In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import massapp as mass
import regutil as reg


In [None]:
# Load the sample sales file and normalise the column types used downstream.
# Forward slashes keep this relative path valid on Windows, macOS and Linux.
data = pd.read_spss('../SPSS/data/GISValTechSampleData.sav')

# Whole-number fields are cast to integers in a single pass.
data = data.astype({
    'ParcelId': int,
    'SalesPrice': int,
    'Sqft': int,
    'LandSize': int,
    'GarageSize': int,
    'EffAge': int,
})
data['SaleDate'] = pd.to_datetime(data['SaleDate'])
# Quality is an ordered categorical so comparisons and sorted output follow the grade scale.
data['Quality'] = pd.Categorical(data['Quality'], ordered=True, categories=[
    'Poor', 'BelowAverage', 'Average', 'AboveAverage', 'Superior'])
# Neighborhood codes are labels, not quantities: integer codes stored as an unordered categorical.
data['NBHD'] = pd.Categorical(data['NBHD'].astype(int))

# Rebind instead of mutating in place so re-running the cell starts from a fresh frame.
data = data.set_index('ParcelId')
data.head()

In [None]:
# Sale price per square foot, summarised within each quality grade.
data['SPPSF'] = data['SalesPrice'].div(data['Sqft'])
(
    data
    .groupby(['Quality'], observed=True)['SPPSF']
    .agg(['count', 'mean', 'median', 'std'])
)

In [None]:
# Sale price per square foot, summarised within each neighborhood code.
(
    data
    .groupby(['NBHD'], observed=True)['SPPSF']
    .agg(['count', 'mean', 'median', 'std'])
)

In [None]:
# Calendar parts of the sale date; SDATE is the sale date truncated to the first day of its month.
data = data.assign(
    SYEAR=data['SaleDate'].dt.year,
    SMONTH=data['SaleDate'].dt.month,
    SDATE=data['SaleDate'].dt.to_period('M').dt.to_timestamp(),
)
# Sales counts laid out as year (rows) by month (columns).
pd.crosstab(index=data['SYEAR'], columns=data['SMONTH'])

In [None]:
# Study window: first and last day of the sales period.
startdate = pd.to_datetime('1/1/2023')
basedate = pd.to_datetime('12/31/2023')
# Whole calendar months between the month of startdate and the month of basedate.
timeperiod = (basedate.year - startdate.year) * 12 + (basedate.month - startdate.month)
timeperiod

In [None]:
# Months: calendar months elapsed from the start month to each sale's month.
data['Months'] = data['SaleDate'].dt.to_period('M').map(
    lambda sale_month: (sale_month - startdate.to_period('M')).n
)
# Month: the reverse count, i.e. months remaining from the sale month to the base month.
data['Month'] = data['Months'].rsub(timeperiod)
data.head()


In [None]:
def calculatePctGood(eff_age: int) -> float:
    """Return the percent-good factor for an effective age.

    The age is clamped to the range 0..60, and the result is
    ``1 - age / 100`` rounded to two decimals, so it always lies
    between 0.40 and 1.00.
    """
    clamped_age = min(max(eff_age, 0), 60)
    return round(1 - (clamped_age / 100), 2)
        
# Percent-good factor for every parcel, derived element-wise from its effective age.
data['Pct_Good'] = data['EffAge'].map(calculatePctGood)
data.head()
    

In [None]:
# Design matrix: the dependent variable (log of sale price) goes first, followed by the
# intercept column and the sale-month offset.
model_data = (
    np.log(data['SalesPrice'])
    .rename("LN_SalesPrice")
    .to_frame()
    .assign(const=1.00)
)
model_data = pd.concat([model_data, data[['Months']]], axis=1)

# Remaining regressors are appended through the regutil helpers; the order below fixes the
# column order of the matrix.
# NOTE(review): add_continuous presumably appends a log-transformed column under the given
# name, and add_binaries presumably adds indicator columns with the named level as the
# reference, skipping levels with fewer than min_sales sales — confirm in regutil.
model_data = reg.add_continuous(model_data, data['Sqft'], 'LN_Sqft')
model_data = reg.add_continuous(model_data, data['LandSize'], 'LN_LandSize')
model_data = reg.add_binaries(model_data, data['Quality'], 'Quality_Average', 'Quality', min_sales=6)
model_data = reg.add_continuous(model_data, data['Bathrooms'], 'LN_Bathrooms')
model_data = reg.add_continuous(model_data, data['Pct_Good'], 'LN_Pct_Good')
model_data = reg.add_continuous(model_data, data['GarageSize'], 'LN_GarageSize')
# Alternative considered but not used: garage size expressed as a ratio to 480
# ('LN_GarageSize_Ratio').
model_data = reg.add_binaries(model_data, data['NBHD'], 'NBHD_103', 'NBHD', min_sales=6)
model_data

In [None]:
# Backward elimination: refit OLS, drop the least significant regressor, and repeat until
# every remaining regressor has a p-value of at most 0.05.
# This assumes the dependent variable is in the first column of model_data.
while True:
    model = sm.OLS(
        model_data.iloc[:, 0],
        model_data.iloc[:, 1:]
    ).fit()
    # The intercept is never a removal candidate, and NaN p-values cannot be ranked.
    candidates = model.pvalues.drop(labels='const', errors='ignore').dropna()
    if candidates.empty:
        # Nothing left to test (only the intercept remains, or no p-value is defined):
        # stop here instead of letting idxmax() fail on an empty Series.
        break
    ix = candidates.idxmax()
    if candidates[ix] <= .05:
        break
    print(f"Dropping {ix} p-value: {candidates[ix].round(3)} ")
    # Rebind rather than drop in place so the frame built in the previous cell is not mutated.
    model_data = model_data.drop(columns=ix)

model.summary()

In [None]:
# Estimated coefficients of the final fitted model, one per retained column of model_data.
model.params

In [None]:
# In statsmodels, `bse` holds the standard error of each coefficient estimate.
# It is the estimated standard deviation of a regression coefficient, indicating how much
# the coefficient would be expected to vary if the analysis were repeated with a different
# sample from the same population.
model.bse

In [None]:
# Coverage report for the columns retained in model_data.
# NOTE(review): presumably shows how many sales support each model term — confirm against
# regutil.get_parameter_coverage.
reg.get_parameter_coverage(model_data)

In [None]:
# ESP: the model's estimated sale price per parcel, truncated to a whole number.
# NOTE(review): estimate_parcel_value presumably converts the log-scale prediction back to
# a price — confirm in regutil.
data['ESP'] = reg.estimate_parcel_value(model_data, model.params).astype(int)
# Ratio: estimated price over actual sale price, to two decimals.
data['Ratio'] = data['ESP'].div(data['SalesPrice']).round(2)
data.head()

In [None]:
# PRB is centered around 0. Values between -0.05 and 0.05 are generally accepted, as defined
# in the IAAO Standard on Ratio Studies, Section 9.2.7; a value outside that range is a
# concern when it is also statistically significant.
# Positive PRB values indicate progressivity in assessment, while negative values indicate regressivity.
# The default significance level is 0.05.
mass.PRB(data['ESP'], data['SalesPrice'], show_graph = True)

In [None]:
# Ratio statistics of ESP against SalesPrice, keyed on the NBHD column.
# NOTE(review): presumably one row of statistics per neighborhood — confirm in massapp.
mass.ratio_statistics(data, 'NBHD', 'ESP', 'SalesPrice')

In [None]:
# Ratio statistics of ESP against SalesPrice, keyed on the SDATE (sale month) column.
# NOTE(review): presumably one row of statistics per sale month — confirm in massapp.
mass.ratio_statistics(data, 'SDATE', 'ESP', 'SalesPrice')

In [None]:
# Sales ratio against the month offset of the sale, with a red LOWESS trend line and a
# reference line at a ratio of 1.00.
p = sns.lmplot(x='Months', y='Ratio', data=data, lowess=True, line_kws={'color': 'red'})
p.figure.set_figwidth(8)
p.figure.set_figheight(4)
p.ax.set_title("Ratio by SaleDate")
# Must run before set_xticks(labels=...): that call swaps in a fixed formatter.
p.ax.ticklabel_format(useOffset=False)
# One tick per sale month actually present, placed at that month's own 'Months' offset.
# Positioning ticks at 0..n-1 would mislabel the axis whenever a month has no sales or the
# first sale month is not offset 0.
month_ticks = data[['Months', 'SDATE']].drop_duplicates().sort_values('Months')
label = month_ticks['SDATE'].dt.strftime('%b %Y').tolist()
p.ax.set_xticks(month_ticks['Months'].tolist(), labels=label)
p.ax.tick_params(axis="x", labelrotation=45)
plt.axhline(y=1.00, color='black')

In [None]:
# Sales ratio against Sqft, with a red LOWESS trend line and a reference line at 1.00.
p = sns.lmplot(
    data=data,
    x='Sqft',
    y='Ratio',
    lowess=True,
    line_kws=dict(color='red'),
)
p.figure.set_size_inches(8, 4)
p.ax.set(title="Ratio by Sqft")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color='black')

In [None]:
# Sales ratio against LandSize, with a red LOWESS trend line and a reference line at 1.00.
p = sns.lmplot(
    data=data,
    x='LandSize',
    y='Ratio',
    lowess=True,
    line_kws=dict(color='red'),
)
p.figure.set_size_inches(8, 4)
p.ax.set(title="Ratio by LandSize")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color='black')

In [None]:
# Sales ratio against EffAge, with a red LOWESS trend line and a reference line at 1.00.
p = sns.lmplot(
    data=data,
    x='EffAge',
    y='Ratio',
    lowess=True,
    line_kws=dict(color='red'),
)
p.figure.set_size_inches(8, 4)
p.ax.set(title="Ratio by EffAge")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color='black')

In [None]:
# Sales ratio against Bathrooms, with a red LOWESS trend line and a reference line at 1.00.
p = sns.lmplot(
    data=data,
    x='Bathrooms',
    y='Ratio',
    lowess=True,
    line_kws=dict(color='red'),
)
p.figure.set_size_inches(8, 4)
p.ax.set(title="Ratio by Bathrooms")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color='black')

In [None]:
# Sales ratio against GarageSize, with a red LOWESS trend line and a reference line at 1.00.
p = sns.lmplot(
    data=data,
    x='GarageSize',
    y='Ratio',
    lowess=True,
    line_kws=dict(color='red'),
)
p.figure.set_size_inches(8, 4)
p.ax.set(title="Ratio by GarageSize")
p.ax.ticklabel_format(useOffset=False)
p.ax.axhline(y=1.00, color='black')

In [None]:
# Full descriptive statistics of the sales ratio within each quality grade.
(
    data
    .groupby(by='Quality', observed=True)['Ratio']
    .describe()
)

In [None]:
# Compact summary of the sales ratio within each quality grade.
(
    data
    .groupby(['Quality'], observed=True)['Ratio']
    .agg(['count', 'mean', 'median', 'min', 'max'])
)

In [None]:
# Parcels whose ratio is far from 1.00: at or above 1.5, or at or below 0.75.
data[data['Ratio'].ge(1.5) | data['Ratio'].le(.75)]

In [None]:
# Export the outlier-ratio parcels (ratio at or above 1.5, or at or below 0.75) for review.
# Forward slashes keep the relative path valid on Windows, macOS and Linux; with
# backslashes a POSIX host would write a file whose name literally contains them.
# The reports directory must already exist.
data.loc[(data['Ratio'] >= 1.5) | (data['Ratio'] <= .75), ].to_excel('./reports/multiplicativebadratio.xlsx')