In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
import statsmodels.formula.api as smf


In [2]:
df = pd.read_csv('data/train_data.csv', delimiter=',')

cols = df.columns[3:-1]
numFeatures = len(cols)

df = df.dropna(subset=cols) #drop na cells 
df = df[df.bedrooms <= 15]
df = df[df.sqft_lot <= 1250000]
df = df[df.sqft_above <= 7000]
df = df[df.sqft_basement <= 3500]
df = df[df.sqft_lot15 <= 500000]

scores = []
for i in df['yr_renovated']:
    if i != 0:
        scores.append((i - 2015) + 82)
    else:
        scores.append(0)
df['renovation_score'] = scores

landvalue = []
view = list(df['view'])
water = list(df['waterfront'])
living = list(df['sqft_living'])
lot = list(df['sqft_lot'])
grade = list(df['grade'])
condition = list(df['condition'])
for i in range(len(water)):
    value = (view[i] + 1) * living[i]
    if water[i] == 1:
        value  = value * 2
    value = value / lot[i]
    landvalue.append(value)
df['landvalue'] = landvalue

centerpoint= (47.628591, -122.289796)  #center point of city center
dist_df = df.loc[:,['Unique_idx','lat','long','price']]
dist_df['dist_latLon'] = np.sqrt(np.square(dist_df['lat']- centerpoint[0]) + np.square(dist_df['long']-centerpoint[1]))
df['distFromSeattle'] = dist_df['dist_latLon']
# df = df.drop(['lat', 'long'], axis=1)


#202272.96564043706
cols1 = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','waterfront', 'view', 'condition', 'grade', 'sqft_above','sqft_basement', 'yr_built', 'zipcode', 'lat', 'long', 'yr_renovated', 'sqft_living15', 'sqft_lot15', 'price']

#193483.43463007256
cols2 = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','landvalue','waterfront', 'view', 'condition', 'grade', 'sqft_above','sqft_basement', 'yr_built', 'distFromSeattle', 'zipcode', 'lat', 'long', 'yr_renovated', 'renovation_score', 'sqft_living15', 'sqft_lot15', 'price']

#200158.15498220024
cols3 = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','landvalue', 'condition', 'grade', 'sqft_above','sqft_basement', 'yr_built', 'distFromSeattle', 'renovation_score', 'sqft_living15', 'sqft_lot15', 'price']


df_train1 = df.loc[:,cols1]
df_train2 = df.loc[:,cols2]
df_train3 = df.loc[:,cols3]

In [3]:
# train_df = pd.read_csv('./data/train_data.csv')
# val_df = pd.read_csv('./data/val_data.csv')
# sample_df = pd.read_csv('./data/sample_submission.csv')

percent = 0.7  #choose percent to split 
n= df_train2.shape[0] #number of rows in data set before splitting
s= math.floor(percent * n)
y= df_train2.iloc[0:s,-1]
X = df_train2.iloc[0:s,:-1]

y_test= df_train2.iloc[s:n,-1]
X_test = df_train2.iloc[s:n,:-1]

In [5]:
X['price']=y  #need to add price as last col for statsmodel

est03 = smf.ols('price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + landvalue + waterfront + view + condition + grade + sqft_above + sqft_basement + yr_built + zipcode + yr_renovated +renovation_score +distFromSeattle + np.power(distFromSeattle, 2)', X).fit()
#cols2 = ['              bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','landvalue','waterfront', 'view', 'condition', 'grade', 'sqft_above','sqft_basement', 'yr_built', 'distFromSeattle', 'zipcode', 'lat', 'long', 'yr_renovated', 'renovation_score', 'sqft_living15', 'sqft_lot15', 'price']

est03.summary().tables[0]

0,1,2,3
Dep. Variable:,price,R-squared:,0.744
Model:,OLS,Adj. R-squared:,0.743
Method:,Least Squares,F-statistic:,2060.0
Date:,"Wed, 12 Feb 2020",Prob (F-statistic):,0.0
Time:,16:17:00,Log-Likelihood:,-163210.0
No. Observations:,12098,AIC:,326500.0
Df Residuals:,12080,BIC:,326600.0
Df Model:,17,,
Covariance Type:,nonrobust,,


In [6]:
est03.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.072e+07,3.57e+06,22.615,0.000,7.37e+07,8.77e+07
bedrooms,-3.451e+04,2320.638,-14.871,0.000,-3.91e+04,-3e+04
bathrooms,2.154e+04,3835.545,5.615,0.000,1.4e+04,2.91e+04
sqft_living,116.7613,2.546,45.864,0.000,111.771,121.752
sqft_lot,0.2247,0.044,5.138,0.000,0.139,0.310
floors,-4.002e+04,4601.127,-8.698,0.000,-4.9e+04,-3.1e+04
landvalue,1.614e+04,5010.541,3.221,0.001,6317.260,2.6e+04
waterfront,5.31e+05,2.05e+04,25.920,0.000,4.91e+05,5.71e+05
view,4.742e+04,2925.599,16.210,0.000,4.17e+04,5.32e+04


In [7]:
#compare to test data
yhat = est03.predict(X_test)
yhat = yhat.astype(int)
#print(y_test)
#print(yhat)
#print(np.abs(yhat-y_test))
delta = yhat - y_test
mse = np.sqrt(np.mean(np.square(delta)))
print('mse (stats model) = ', mse)
#188662.44865167243

mse (stats model) =  188598.00121326977
