In [51]:
import pandas as pd
import numpy as np

In [52]:
m_df = pd.DataFrame(pd.read_csv('../data/model_data2.csv'))
hpi2 = pd.DataFrame(pd.read_csv('../data/normalized_hpi.csv'))

In [53]:
hpi2 = hpi2.rename(columns={'county': 'county_code'})
m_df = m_df.rename(columns = {'hpi_x': 'hpi'})

In [54]:
m_df = pd.merge(hpi2[['county_code','year','nhpi']], m_df, on=['county_code', 'year'])
m_df.median_year_built = m_df.median_year_built.str.replace('-','')
m_df.median_year_built = m_df.median_year_built.astype('int')

In [55]:
df = m_df[[
    'previous_one', 
    'previous_two',
    'previous_three',
    'previous_four',
    'previous_five',
    'pop_chg_yoy',
    'EMP',
    'ESTAB',
    'PAYANN',
    'one_year_emp',
    'two_years_emp',
    'three_years_emp',
    'four_years_emp',
    'five_years_emp',
    'one_year_estab',
    'two_years_estab',
    'three_years_estab',
    'four_years_estab',
    'five_years_estab',
    'one_year_pay',
    'two_years_pay',
    'three_years_pay',
    'four_years_pay',
    'five_years_pay',
    'single_units',
    'total_units',
    'one_year_pop',
    'two_years_pop',
    'three_years_pop',
    'four_years_pop', 
    'five_years_pop',
    'hh_income', 
    'one_year_inc',
    'two_years_inc',
    'three_years_inc',
    'four_years_inc',
    'five_years_inc',
    'aggregate_rent',
    'aggregate_rooms',
    'bedrooms_total',
    'born_population',
    'family_household',
    'fiftyplus_units',
    'five9_units',
    'fiveplus_rooms',
    'foreign_born',
    'four_rooms',
    'geographic_mobility',
    'household_type',
    'housing_units',
    'married_household', 
    'median_age', 
    'median_rent',
    'median_rooms',
    'median_year_built',
    'native_born',
    'naturalized',
    'nonfamily_household',
    'occupancy_status_total',
    'occupied', 
    'one_room',
    'owner_tenure',
    'population',
    'poverty_count',
    'renter_tenure',
    'single_attached',
    'single_detached', 
    'ten19_units',
    'tenure',
    'three4_units',
    'three_rooms',
    'twenty49_units',
    'two_rooms',
    'two_units',
    'under_18',
    'units_total',
    'vacant',
    'hpi',
    'nhpi',
    'one_year',
    'two_years',
    'three_years',
    'four_years',
    'five_years',
          ]]

In [56]:
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

In [57]:
X = df.drop(['one_year','two_years','three_years','four_years','five_years', 'hpi', 'nhpi'], axis=1)
ys = ['one_year', 'two_years','three_years', 'four_years','five_years','hpi']

print(X.shape)

(5033, 77)


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

y = df[ys]

X_scaler = StandardScaler().fit(X)
X_scaled = X_scaler.transform(X)
y_scaler = StandardScaler().fit(y)
y_scaled = y_scaler.transform(y)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, random_state=73)


model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)


In [59]:
df_corr = pd.DataFrame({
    'one_year_test': [x[0] for x in predictions],
    'two_years_test': [x[1] for x in predictions],
    'three_years_test': [x[2] for x in predictions],
    'four_years_test': [x[3] for x in predictions],
    'five_years_test': [x[4] for x in predictions],
    'hpi_test': [x[5] for x in predictions],
    'one_year_actual': [x[0] for x in y_test],
    'two_years_actual': [x[1] for x in y_test],
    'three_years_actual': [x[2] for x in y_test],
    'four_years_actual': [x[3] for x in y_test],
    'five_years_actual': [x[4] for x in y_test],
    'hpi_actual': [x[5] for x in y_test],
    
})



cor = df_corr.corr()

cor

Unnamed: 0,one_year_test,two_years_test,three_years_test,four_years_test,five_years_test,hpi_test,one_year_actual,two_years_actual,three_years_actual,four_years_actual,five_years_actual,hpi_actual
one_year_test,1.0,0.890657,0.59698,0.330482,0.242667,-0.339925,0.396206,0.444113,0.310305,0.206995,0.160515,-0.249058
two_years_test,0.890657,1.0,0.858582,0.639511,0.579776,-0.159484,0.342661,0.483635,0.422551,0.358696,0.335134,-0.169883
three_years_test,0.59698,0.858582,1.0,0.76415,0.865903,0.085083,0.206879,0.418954,0.480314,0.48689,0.507903,0.013873
four_years_test,0.330482,0.639511,0.76415,1.0,0.88811,0.291878,0.066149,0.247336,0.339936,0.416674,0.44477,0.094974
five_years_test,0.242667,0.579776,0.865903,0.88811,1.0,0.328709,0.04563,0.256394,0.412089,0.501825,0.559935,0.187244
hpi_test,-0.339925,-0.159484,0.085083,0.291878,0.328709,1.0,-0.129955,-0.053149,0.05053,0.15851,0.183188,0.69285
one_year_actual,0.396206,0.342661,0.206879,0.066149,0.04563,-0.129955,1.0,0.705561,0.521994,0.395184,0.340889,-0.106899
two_years_actual,0.444113,0.483635,0.418954,0.247336,0.256394,-0.053149,0.705561,1.0,0.777745,0.662253,0.609988,-0.039912
three_years_actual,0.310305,0.422551,0.480314,0.339936,0.412089,0.05053,0.521994,0.777745,1.0,0.810898,0.784303,0.043502
four_years_actual,0.206995,0.358696,0.48689,0.416674,0.501825,0.15851,0.395184,0.662253,0.810898,1.0,0.863108,0.14086


In [49]:
correlations = [0.396, 0.484, 0.480, 0.417, 0.560, 0.692]
r2_list = []
mse_list = []

for a in ys:
    y = df[[a]]
    y_scaler = StandardScaler().fit(y)
    y_scaled = y_scaler.transform(y)


    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, random_state=73)


    model = LinearRegression()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    MSE = mean_squared_error(y_test, predictions)
    r2 = model.score(X_test, y_test)
    r2_list.append(r2.round(3))
    mse_list.append(MSE.round(3))
    print(f"MSE: {MSE}, R2: {r2}")


MSE: 0.8736673826454913, R2: 0.149815918483715
MSE: 0.7609235139648413, R2: 0.22132490759071977
MSE: 0.78127937610763, R2: 0.22163577165405013
MSE: 0.9564235092663604, R2: 0.03531370197521311
MSE: 0.6739575379897867, R2: 0.3040796369678168
MSE: 0.5144026406913497, R2: 0.47132631523596114


In [50]:
linear_df = pd.DataFrame({
    'r2': r2_list,
    'MSE': mse_list,
    'predict_vs_actual': correlations
}, index=ys)

linear_df

Unnamed: 0,r2,MSE,predict_vs_actual
one_year,0.15,0.874,0.396
two_years,0.221,0.761,0.484
three_years,0.222,0.781,0.48
four_years,0.035,0.956,0.417
five_years,0.304,0.674,0.56
hpi,0.471,0.514,0.692
