In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy
import statsmodels.api as sm
from scipy import stats
from geopy import distance
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

## Uploading our cleaned data

In [8]:
df = pd.read_csv('data/cleaned_data.csv')

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,distance_sammamish,distance_kirkland,distance_shoreline,median_zip_income,zip_population,sqft_living_x_above,grade_x_price,sqft_living_grade,sqft_bath,zip_has_wh
0,0,7399300360,2022-05-24,675000.0,4,1.0,1180,7140,1.0,0,...,12.6,14.9,21.5,83621.0,24668.0,1392400,4745566.0,8260,1180.0,0
1,1,8910500230,2021-12-13,920000.0,5,2.5,2770,6703,1.0,0,...,16.5,7.4,3.2,77602.0,50153.0,4348900,5355000.0,19390,6925.0,0
2,2,1180000275,2021-09-29,311000.0,6,2.0,2880,6156,1.0,0,...,11.6,12.2,18.4,87788.0,26282.0,4550400,4375000.0,20160,5760.0,0
3,3,1604601802,2021-12-14,775000.0,3,3.0,2160,1400,2.0,0,...,12.3,8.7,13.4,94790.0,48937.0,2354400,6732000.0,19440,6480.0,0
4,4,8562780790,2021-08-24,592500.0,2,2.0,1120,758,2.0,0,...,5.7,11.8,20.0,128556.0,27808.0,1254400,9030000.0,7840,2240.0,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28723 entries, 0 to 28722
Data columns (total 51 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            28723 non-null  int64  
 1   id                    28723 non-null  int64  
 2   date                  28723 non-null  object 
 3   price                 28723 non-null  float64
 4   bedrooms              28723 non-null  int64  
 5   bathrooms             28723 non-null  float64
 6   sqft_living           28723 non-null  int64  
 7   sqft_lot              28723 non-null  int64  
 8   floors                28723 non-null  float64
 9   waterfront            28723 non-null  int64  
 10  greenbelt             28723 non-null  int64  
 11  nuisance              28723 non-null  int64  
 12  view                  28723 non-null  int64  
 13  condition             28723 non-null  int64  
 14  grade                 28723 non-null  int64  
 15  heat_source        

### Creating Train/Test Splits to test our best model

In [11]:
X = df[['sqft_living', 'grade', 'sqft_living_grade', 'yr_renovated', 'sqft_above', 'sqft_basement', 'condition', 
        'floors', 'lower_25_zip', 'middle_50_zip', 'top_75_zip', 'waterfront', 'view', 'year', 'distance_bellvue', 
        'distance_bothell', 'distance_renton', 'distance_redmond', 'distance_kirkland', 'distance_shoreline', 
        'sqft_bath', 'sqft_living_x_above', 'zip_population', 'zip_has_wh', 'median_home_price', 'grade_x_price']]
y = df['price']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [14]:
ss = StandardScaler()

In [15]:
ss.fit(X_train)
X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)

In [16]:
X_standardized_train.mean(axis = 0)

array([-1.13087474e-16, -1.28283603e-16, -9.18835727e-18, -5.76039321e-17,
       -1.15649612e-16, -6.21981108e-17,  1.87301129e-16, -1.39901574e-16,
       -3.56932340e-17,  5.86641272e-17, -1.59029260e-17,  1.37825359e-17,
       -6.92660779e-17, -1.49428134e-13,  9.04699793e-17, -5.62610184e-16,
       -1.12380677e-16,  1.90481714e-16, -1.13794271e-16,  8.48156056e-18,
       -6.00777206e-18, -5.30097535e-17, -1.74843837e-16,  3.55165348e-17,
        1.30403994e-16,  8.37554105e-17])

In [17]:
X_standardized_train.std(axis = 0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [18]:
X_standardized_test.mean(axis = 0)

array([ 0.00521898,  0.01497194,  0.00753786,  0.00849828,  0.00441832,
        0.00538305,  0.01580828, -0.00409213,  0.00737001, -0.02309097,
        0.0209266 ,  0.01796942,  0.00471874,  0.00226719, -0.00377085,
        0.01162301, -0.01910973,  0.00327131,  0.00541811,  0.01122866,
        0.00220066,  0.00189224,  0.00327764, -0.01062958,  0.01696497,
        0.02193659])

In [19]:
X_standardized_test.std(axis = 0)

array([0.99618575, 1.0014287 , 0.99561265, 1.01813096, 0.995101  ,
       0.99811009, 1.00133692, 0.99237112, 1.00445839, 1.00274093,
       1.0157074 , 1.07299073, 1.01665453, 1.00065877, 0.99406054,
       0.98824273, 0.98746204, 0.99159568, 0.99257492, 0.99108309,
       0.98942704, 0.94771666, 0.9828715 , 0.9869852 , 1.02471556,
       1.02974167])

In [20]:
lr_raw = LinearRegression()
lr_raw.fit(X_standardized_train, y_train)
lr_raw.score(X_standardized_train, y_train)

0.7662577371698421

In [21]:
pd.Series(lr_raw.coef_, index = X.columns)

sqft_living           -2.334887e+05
grade                 -1.264322e+05
sqft_living_grade      5.080810e+05
yr_renovated           1.968290e+04
sqft_above             2.348885e+05
sqft_basement          1.152394e+04
condition              4.683492e+04
floors                -3.267831e+04
lower_25_zip           9.201169e+17
middle_50_zip          1.068730e+18
top_75_zip             8.552146e+17
waterfront             3.524542e+04
view                   5.890290e+04
year                   7.756547e+04
distance_bellvue      -1.708765e+05
distance_bothell       3.608798e+05
distance_renton        1.540630e+05
distance_redmond       1.852493e+05
distance_kirkland     -5.681852e+05
distance_shoreline     5.120975e+04
sqft_bath              5.737403e+04
sqft_living_x_above   -1.873351e+05
zip_population        -9.522788e+03
zip_has_wh             1.014724e+04
median_home_price     -8.915135e+04
grade_x_price          3.078681e+05
dtype: float64

In [25]:
y_pred = lr_raw.predict(X_standardized_test)
mean_absolute_error(y_pred, y_test)

187207.4576277456

## The findings
Our model explains approximately 77% of the variance in home prices (Adjusted R-Square = .766)

The MAE is 187207, meaning our prediction will come within $187,207 of the actual price