In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import massapp as mass
import regutil as reg


In [None]:
# Load the sample sales file. Forward slashes keep this relative path usable on
# Windows, macOS and Linux; the back-slash form only resolved on Windows.
data = pd.read_spss('../SPSS/Data/GISValTechSampleData.sav')

# Cast the whole-number fields to int in a single pass.
data = data.astype({
    name: int
    for name in ['ParcelId', 'SalesPrice', 'Sqft', 'LandSize', 'GarageSize', 'EffAge']
})
data['SaleDate'] = pd.to_datetime(data['SaleDate'])

# Quality is ordinal, so the category order is spelled out from worst to best.
data['Quality'] = pd.Categorical(data['Quality'], ordered = True, categories = [
    'Poor', 'BelowAverage', 'Average', 'AboveAverage', 'Superior'])
# Neighborhood codes are labels rather than quantities: integer-valued, unordered categories.
data['NBHD'] = pd.Categorical(data['NBHD'].astype(int))

# Index by parcel so later per-parcel results line up on ParcelId.
data = data.set_index('ParcelId')
data.head()

In [None]:
# SPPSF = sale price per square foot; summarise it within each quality class.
data['SPPSF'] = data['SalesPrice'].div(data['Sqft'])
(
    data
    .groupby(['Quality'], observed = True)['SPPSF']
    .agg(['count', 'mean', 'median', 'std'])
)

In [None]:
# Same price-per-square-foot summary, this time within each neighborhood code.
(
    data
    .groupby(['NBHD'], observed = True)['SPPSF']
    .agg(['count', 'mean', 'median', 'std'])
)

In [None]:
# Split the sale date into year and month, plus SDATE: the sale month expressed
# as a timestamp (month period converted back to a timestamp).
saleDt = data['SaleDate'].dt
data['SYEAR'] = saleDt.year
data['SMONTH'] = saleDt.month
data['SDATE'] = saleDt.to_period('M').dt.to_timestamp()

# Count of sales for every year (rows) by month (columns).
pd.crosstab(index = data['SYEAR'], columns = data['SMONTH'])

In [None]:
# Study window: first and last day of the sales period being modelled.
startdate = pd.to_datetime('1/1/2023')
basedate = pd.to_datetime('12/31/2023')

# Whole calendar months separating the start month from the base month.
timeperiod = (basedate.year - startdate.year) * 12 + (basedate.month - startdate.month)
timeperiod

In [None]:
# Months: calendar months from startdate's month to each sale's month.
# Computed with column arithmetic instead of one Period subtraction per row
# (which also rebuilt startdate's period on every iteration).
data['Months'] = (
    (data['SaleDate'].dt.year - startdate.year) * 12
    + (data['SaleDate'].dt.month - startdate.month)
).astype('int64')

# Month: the reverse count, i.e. months from the sale's month up to basedate's month.
data['Month'] = timeperiod - data['Months']
data.head()

In [None]:
# Regression frame: dependent variable first, then the intercept column.
modelData = pd.DataFrame({"SalesPrice" : data['SalesPrice'], "const" : 1})

# Time: months since startdate multiplied by square footage.
modelData['MonthsSF'] = data['Months'] * data['Sqft']

# Land Size: only the square-root term is active. The other candidates that were
# tried in this cell (an LN_LandSize term and the .25 / .75 exponents) stay disabled.
modelData['LandSize50'] = data['LandSize'] ** .50

# Quality: one column per class holding Sqft for parcels of that class and 0 for
# every other parcel. Series.where does this column-wise, replacing five full
# row-by-row passes over the frame.
for quality in ['Poor', 'BelowAverage', 'Average', 'AboveAverage', 'Superior']:
    modelData['SQFT_' + quality] = data['Sqft'].where(data['Quality'] == quality, 0)

# Bathrooms
modelData['Bathrooms'] = data['Bathrooms']

# EffAge
def effAgeAdj(effAge, minAge = 0, maxAge = 60):
    """Clamp an effective age to the range [minAge, maxAge].

    Parameters
    ----------
    effAge : number
        Effective age to adjust.
    minAge : number, default 0
        Floor; ages below it are replaced by this value.
    maxAge : number, default 60
        Ceiling; ages above it are replaced by this value.

    Returns
    -------
    number
        ``minAge`` when ``effAge < minAge``, ``maxAge`` when ``effAge > maxAge``,
        otherwise ``effAge`` unchanged.
    """
    if effAge < minAge:
        return minAge
    elif effAge > maxAge:
        return maxAge
    return effAge

# Age term: effective age (clamped by effAgeAdj) times Sqft raised to .75.
# map + column arithmetic replaces the row-by-row iterrows() loop.
modelData['EffageSF75'] = data['EffAge'].map(effAgeAdj) * data['Sqft'] ** .75

# GarageSize
modelData['GarageSize'] = data['GarageSize']

# NBHD
# NOTE(review): reg.add_binaries is defined in regutil; presumably it appends one
# 0/1 column per neighborhood, treating 'NBHD_105' as the omitted base level and
# skipping levels with fewer than min_sales sales -- confirm against regutil.
modelData = reg.add_binaries(modelData, data['NBHD'], 'NBHD_105', 'NBHD', min_sales = 6)

# Fixed seed so the preview shows the same rows on every run.
modelData.sample(10, random_state = 0)

In [None]:
# Fixed column order for the regression; the dependent variable must stay first.
col = ['SalesPrice', 'const', 'NBHD_104', 'SQFT_Average', 'Bathrooms', 'EffageSF75', 'SQFT_Poor', 'MonthsSF', 'NBHD_102', 'SQFT_Superior', 'GarageSize', 'SQFT_BelowAverage', 'NBHD_103', 'LandSize50', 'SQFT_AboveAverage', 'NBHD_101']

# reindex would quietly create an all-NaN column for any name that is not in
# modelData (e.g. after columns were dropped, or a binary was never created),
# so stop here with a clear error instead of feeding NaNs to the regression.
missing = [name for name in col if name not in modelData.columns]
if missing:
    raise KeyError("modelData is missing expected columns: {0}".format(missing))

modelData = modelData.reindex(columns=col)
modelData.head()

In [None]:
# Backward elimination: refit OLS, drop the least significant predictor, and
# repeat until every remaining predictor has a p-value at or below maxPValue.
# This assumes dependent variable is in the first column
maxPValue = .10

while True:
    model = sm.OLS(
        modelData.iloc[:, 0],
        modelData.iloc[:, 1:]
    ).fit()

    # The intercept is never a candidate for removal.
    candidates = model.pvalues.drop('const', errors = 'ignore')
    if candidates.empty:
        # Every predictor has been removed; idxmax() on an empty Series would raise.
        break

    ix = candidates.idxmax()
    if candidates[ix] <= maxPValue:
        break

    print("Dropping {0} p-value: {1} ".format(str(ix), model.pvalues[ix].round(3)))
    # Rebind rather than mutate in place, so the frame built by earlier cells is left intact.
    modelData = modelData.drop(columns = ix)

model.summary2()

In [None]:
# Show the estimated coefficient for each term in the fitted model.
model.params

In [None]:
# ESP: the model's fitted value per parcel, cut down to a whole number by astype(int).
fitted = model.fittedvalues
data['ESP'] = fitted.astype(int)

# Ratio: fitted value relative to the actual sale price.
data['Ratio'] = data['ESP'].div(data['SalesPrice'])
data.head()

In [None]:
# NOTE(review): mass.PRB is defined in massapp; presumably the price-related bias
# of the estimates against sale prices -- confirm against massapp.
mass.PRB(
    data['ESP'],
    data['SalesPrice'],
    show_graph = True,
)

In [None]:
# Ratio statistics of ESP against SalesPrice, grouped by sale year.
mass.ratio_statistics(
    data,
    group = 'SYEAR',
    predicted_value_column = 'ESP',
    sales_price_column = 'SalesPrice',
)

In [None]:
# Ratio against months since startdate, with a LOWESS trend line in red.
p = sns.lmplot(x='Months', y='Ratio', data = data, lowess = True, line_kws={'color': 'red'})
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by SaleDate")
p.ax.ticklabel_format(useOffset=False)

# Put each month label at that month's own 'Months' value. Numbering the labels
# 0..n-1 only lines up when every month has a sale and none precede startdate;
# otherwise the labels drift away from the points they describe.
monthTicks = data[['Months', 'SDATE']].drop_duplicates().sort_values(by='Months')
label = monthTicks['SDATE'].dt.strftime('%b %Y').tolist()
p.ax.set_xticks(monthTicks['Months'].tolist(), labels=label)
p.ax.tick_params(axis="x", labelrotation=45)

# Reference line at a ratio of 1.00.
plt.axhline(y=1.00, color = 'black')

In [None]:
# Ratio against Sqft with a red LOWESS trend; the black line marks a ratio of 1.00.
p = sns.lmplot(
    data = data, x = 'Sqft', y = 'Ratio',
    lowess = True, line_kws = {'color': 'red'},
)
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by Sqft")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color = 'black')

In [None]:
# Ratio against LandSize with a red LOWESS trend; the black line marks a ratio of 1.00.
p = sns.lmplot(
    data = data, x = 'LandSize', y = 'Ratio',
    lowess = True, line_kws = {'color': 'red'},
)
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by LandSize")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color = 'black')

In [None]:
# Ratio against EffAge with a red LOWESS trend; the black line marks a ratio of 1.00.
p = sns.lmplot(
    data = data, x = 'EffAge', y = 'Ratio',
    lowess = True, line_kws = {'color': 'red'},
)
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by EffAge")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color = 'black')

In [None]:
# Ratio against Bathrooms with a red LOWESS trend; the black line marks a ratio of 1.00.
p = sns.lmplot(
    data = data, x = 'Bathrooms', y = 'Ratio',
    lowess = True, line_kws = {'color': 'red'},
)
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by Bathrooms")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color = 'black')

In [None]:
# Ratio against GarageSize with a red LOWESS trend; the black line marks a ratio of 1.00.
p = sns.lmplot(
    data = data, x = 'GarageSize', y = 'Ratio',
    lowess = True, line_kws = {'color': 'red'},
)
p.figure.set_size_inches(8, 4)
p.ax.set_title("Ratio by GarageSize")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color = 'black')

In [None]:
# Ratio statistics of ESP against SalesPrice, grouped by quality class.
mass.ratio_statistics(
    data,
    group = 'Quality',
    predicted_value_column = "ESP",
    sales_price_column = "SalesPrice",
)

In [None]:
# Ratio statistics of ESP against SalesPrice, grouped by neighborhood code.
mass.ratio_statistics(
    data,
    group = 'NBHD',
    predicted_value_column = "ESP",
    sales_price_column = "SalesPrice",
)

In [None]:
# Ratio statistics per sale month (SDATE), with the index relabelled
# in 'Mon YYYY' form for display.
t = mass.ratio_statistics(
    data,
    'SDATE',
    predicted_value_column = "ESP",
    sales_price_column = "SalesPrice",
)
t = t.set_axis(t.index.strftime('%b %Y'))
t

In [None]:
# Sales whose ratio is far from 1: at or above 1.5, or at or below 0.75.
data[data['Ratio'].ge(1.5) | data['Ratio'].le(.75)]

In [None]:
# Write the out-of-range sales (ratio >= 1.5 or <= 0.75) to a workbook for review.
# Forward slashes keep the relative path valid on every OS; with back-slashes a
# POSIX system would create a file literally named '.\reports\additivebadratio.xlsx'.
data.loc[(data['Ratio'] >= 1.5) | (data['Ratio'] <= .75)].to_excel('./reports/additivebadratio.xlsx')