In [31]:
from shapely.geometry import Point, Polygon
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

%run ../python_files/feature_selection

In [18]:
# import data
puds = pd.read_csv('../data/final_datasets/master_puds_tracts.csv')

In [19]:
# feature engineering
puds = create_demo_col(puds)
minipuds = count_puds(puds)

In [27]:
# set up dependent var
outcome = 'eviction-rate'

In [69]:
# does number of PUDs in a census tract work as a predictor for eviction rate?

# set up single linear regression
x_cols = minipuds['pud_count']

X = minipuds['pud_count'].values

y = minipuds[outcome]

# fit model
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + ['pud_count']
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.5687
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,0.452
Time:,08:57:07,Log-Likelihood:,-426.08
No. Observations:,173,AIC:,856.2
Df Residuals:,171,BIC:,862.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,2.9364,0.253,11.627,0.000,2.438,3.435
pud_count,-0.0596,0.079,-0.754,0.452,-0.216,0.096

0,1,2,3
Omnibus:,36.424,Durbin-Watson:,0.897
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51.618
Skew:,1.253,Prob(JB):,6.19e-12
Kurtosis:,3.937,Cond. No.,3.82


Based on R-squared of 0.003, pud_count does **not** explain any of the variance in eviction-rate

In [76]:
# can you predict eviction rate based on ward?

# set up single linear regression
encoder = OneHotEncoder(handle_unknown="error", drop='first')
X_cat = encoder.fit_transform(np.array(minipuds['ward']).reshape(-1, 1)).toarray()

X = X_cat
y = minipuds[outcome]

# fit model
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + [("ward_"+str(i)) for i in range(0,7)]
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.575
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,31.93
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,1.05e-27
Time:,09:01:52,Log-Likelihood:,-352.3
No. Observations:,173,AIC:,720.6
Df Residuals:,165,BIC:,745.8
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.6912,0.460,3.673,0.000,0.782,2.600
ward_0,-0.8540,0.619,-1.379,0.170,-2.077,0.369
ward_1,-1.2545,0.642,-1.954,0.052,-2.522,0.013
ward_2,0.6225,0.613,1.015,0.311,-0.588,1.833
ward_3,1.3232,0.597,2.217,0.028,0.145,2.502
ward_4,-0.4840,0.597,-0.811,0.419,-1.662,0.694
ward_5,4.3164,0.597,7.232,0.000,3.138,5.495
ward_6,4.8273,0.626,7.707,0.000,3.591,6.064

0,1,2,3
Omnibus:,88.238,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,702.129
Skew:,1.691,Prob(JB):,3.43e-153
Kurtosis:,12.272,Cond. No.,9.95


In [79]:
# what about looking at more variables?

# set up multiple linear regression
x_cols = ['pct-non-white','poverty-rate', 'pct-renter-occupied','pud_count']

minitest = minipuds[x_cols]

X = minitest.values

encoder = OneHotEncoder(handle_unknown="error", drop='first')
X_cat = encoder.fit_transform(np.array(minipuds['ward']).reshape(-1, 1)).toarray()

X = np.concatenate((X, X_cat), axis = 1)
y = minipuds[outcome]

# fit model01
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + x_cols + [("ward_"+str(i)) for i in range(0,7)]
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.631
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,24.98
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,1.67e-29
Time:,09:02:59,Log-Likelihood:,-340.24
No. Observations:,173,AIC:,704.5
Df Residuals:,161,BIC:,742.3
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-1.3734,0.881,-1.558,0.121,-3.114,0.367
pct-non-white,0.0412,0.011,3.865,0.000,0.020,0.062
poverty-rate,-0.0027,0.020,-0.135,0.893,-0.042,0.036
pct-renter-occupied,0.0115,0.010,1.185,0.238,-0.008,0.031
pud_count,0.0397,0.056,0.712,0.478,-0.070,0.150
ward_0,-0.3024,0.623,-0.485,0.628,-1.533,0.928
ward_1,0.1676,0.680,0.246,0.806,-1.176,1.511
ward_2,-0.0091,0.671,-0.014,0.989,-1.335,1.316
ward_3,0.4171,0.631,0.661,0.510,-0.829,1.663

0,1,2,3
Omnibus:,93.923,Durbin-Watson:,2.072
Prob(Omnibus):,0.0,Jarque-Bera (JB):,893.76
Skew:,1.762,Prob(JB):,8.37e-195
Kurtosis:,13.563,Cond. No.,949.0


In [80]:
# set up single linear regression
x_cols = 'pct-non-white'

X = minipuds[x_cols].values
y = minipuds[outcome]

# # fit model03
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + [x_cols]
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.511
Model:,OLS,Adj. R-squared:,0.508
Method:,Least Squares,F-statistic:,178.9
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,2.19e-28
Time:,09:03:07,Log-Likelihood:,-364.43
No. Observations:,173,AIC:,732.9
Df Residuals:,171,BIC:,739.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-1.7911,0.378,-4.737,0.000,-2.537,-1.045
pct-non-white,0.0693,0.005,13.376,0.000,0.059,0.080

0,1,2,3
Omnibus:,33.245,Durbin-Watson:,1.64
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.18
Skew:,0.892,Prob(JB):,4.26e-15
Kurtosis:,5.449,Cond. No.,181.0


In [81]:
# set up single linear regression
x_cols = 'poverty-rate'

X = minipuds[x_cols].values
y = minipuds[outcome]

# # fit model04
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + [x_cols]
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.381
Model:,OLS,Adj. R-squared:,0.377
Method:,Least Squares,F-statistic:,105.3
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,1.51e-19
Time:,09:03:11,Log-Likelihood:,-384.87
No. Observations:,173,AIC:,773.7
Df Residuals:,171,BIC:,780.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.0474,0.245,4.283,0.000,0.565,1.530
poverty-rate,0.1256,0.012,10.260,0.000,0.101,0.150

0,1,2,3
Omnibus:,82.147,Durbin-Watson:,1.578
Prob(Omnibus):,0.0,Jarque-Bera (JB):,395.424
Skew:,1.753,Prob(JB):,1.3600000000000001e-86
Kurtosis:,9.525,Cond. No.,28.6


In [82]:
# set up single linear regression
x_cols = 'pct-renter-occupied'

X = minipuds[x_cols].values
y = minipuds[outcome]

# # fit model05
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + [x_cols]
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.092
Model:,OLS,Adj. R-squared:,0.087
Method:,Least Squares,F-statistic:,17.41
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,4.78e-05
Time:,09:03:15,Log-Likelihood:,-417.98
No. Observations:,173,AIC:,840.0
Df Residuals:,171,BIC:,846.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,0.4370,0.612,0.714,0.476,-0.771,1.645
pct-renter-occupied,0.0421,0.010,4.173,0.000,0.022,0.062

0,1,2,3
Omnibus:,50.426,Durbin-Watson:,1.019
Prob(Omnibus):,0.0,Jarque-Bera (JB):,99.462
Skew:,1.367,Prob(JB):,2.52e-22
Kurtosis:,5.514,Cond. No.,179.0


In [83]:
# looking at top 2 predictor cols

# set up multiple linear regression
x_cols = ['pct-non-white','poverty-rate']

minitest = minipuds[x_cols]

X = minitest.values
y = minipuds[outcome]

# fit model01
X = sm.add_constant(X)
model = sm.OLS(y, X, hasconst=True )
result = model.fit()
labels = ['intercept'] + x_cols
result.summary(xname=labels)

0,1,2,3
Dep. Variable:,eviction-rate,R-squared:,0.545
Model:,OLS,Adj. R-squared:,0.539
Method:,Least Squares,F-statistic:,101.7
Date:,"Fri, 29 Nov 2019",Prob (F-statistic):,8.78e-30
Time:,09:04:39,Log-Likelihood:,-358.28
No. Observations:,173,AIC:,722.6
Df Residuals:,170,BIC:,732.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-1.4286,0.380,-3.759,0.000,-2.179,-0.678
pct-non-white,0.0531,0.007,7.822,0.000,0.040,0.066
poverty-rate,0.0504,0.014,3.538,0.001,0.022,0.079

0,1,2,3
Omnibus:,59.417,Durbin-Watson:,1.795
Prob(Omnibus):,0.0,Jarque-Bera (JB):,205.933
Skew:,1.317,Prob(JB):,1.91e-45
Kurtosis:,7.651,Cond. No.,194.0


# Graveyard

In [160]:
# set up co-linearity check

y_vif = minipuds[outcome]
## remove Passenger from predictor list
## prepare data for the linear model

X_vif = minipuds[x_cols]
## add intercept term
X_vif = sm.add_constant(X_vif.values)
## fit model
model_vif = sm.OLS(y_vif, X_vif, hasconst=True)
result_vif = model_vif.fit()
## check the r2-score
result_vif.summary()
## calculate vif score directly from r2-score
passenger_vif = 1/(1 - result_vif.rsquared)
passenger_vif

1.1018357699538996

In [116]:
## standard scaling 
# for col in x_cols:
#     ## Here we don't have to do this but still it is a good practice
#     if (type(minipuds[col]) == int) | (type(minipuds[col]) == float):
#         minipuds[col] = (minipuds[col] - minipuds[col].mean())/minipuds[col].std()

In [None]:
test['ward'] = [int(el[-1]) for el in minipuds.ward]