STEP 1: basic data cleaning

In [5]:
#import packages and data set
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

df = pd.read_csv('../data/kc_house_data.csv')

In [8]:
# inspect data
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19221.0,21534.0,21597.0,21597.0,21597.0,21597.0,17755.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474000.0,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,0.007596,0.233863,3.409825,7.657915,1788.596842,1970.999676,83.636778,98077.951845,47.560093,-122.213982,1986.620318,12758.283512
std,2876736000.0,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,0.086825,0.765686,0.650546,1.1732,827.759761,29.375234,399.946414,53.513072,0.138552,0.140724,685.230472,27274.44195
min,1000102.0,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,370.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,1951.0,0.0,98033.0,47.4711,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,1975.0,0.0,98065.0,47.5718,-122.231,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,2210.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


Zipcode is catagorical data, and will likely need to be bined due to how many distinct zipcodes there likely are. For simplicity we'll bin by city

In [9]:
#probably inefficent function but it will convert zipcode to city
def city_zip(zip_in):
    if zip_in in [98001,98002,98071,98092]:
        return 'Auburn';
    elif zip_in in [98003,98023,98063,98093]:
        return 'Federal_way';
    elif zip_in in [98004,98005,98006,98007,98008,98009,98015]:
        return 'Bellevue';
    elif zip_in == 98010:
        return 'Black_diamond';
    elif zip_in in [98011,98014]:
        return 'Bothell';
    elif zip_in == 98014:
        return 'Carnation'
    elif zip_in == 98019:
        return 'Duball';
    elif zip_in == 98022:
        return 'Enumclaw';
    elif zip_in == 98024:
        return 'Fall city'
    elif zip_in in [98027,98029]:
        return 'Issaquah';
    elif zip_in == 98028:
        return 'Kenmore'
    elif zip_in in [98030,98031,98032,98035,98042,98064,98089]:
        return 'Kent'
    elif zip_in in [98033,98034,98083]:
        return 'Kirkland'
    elif zip_in == 98038:
        return 'Maple Valley'
    elif zip_in == 98039:
        return 'Medina'
    elif zip_in == 98040:
        return 'Mercer Island'
    elif zip_in == 98045:
        return 'North Bend'
    elif zip_in in [98055, 98056,98057,98058,98059]:
        return 'Renton';
    elif zip_in == 98065:
        return 'Snoqualmie';
    elif zip_in == 98079:
        return 'Vashon';
    elif zip_in in [98072,98077]:
        return 'Woodinville';
    elif zip_in in [98074,98075]:
        return 'Sammamish';
    else:
        return 'Seattle';

In [10]:
#apply function to zip codes
df['city'] = df['zipcode'].apply(city_zip)

zip code needs to be one hot encoded so I'm stealing the one hot encoding function from the lab to help make this easier

In [11]:
def encode_and_concat_feature_train(X_train_all_features, feature_name):
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(sparse=False)
    single_feature_df = X_train_all_features[[feature_name]]
    ohe.fit(single_feature_df)
    
    # call helper function that actually encodes the feature and concats it
    X_train_all_features = encode_and_concat_feature(X_train_all_features, feature_name, ohe)
    
    return ohe, X_train_all_features

In [12]:
def encode_and_concat_feature(X, feature_name, ohe):

    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df)
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0], index=single_feature_df.index)
    
    # drop the old feature from X and concat the new one-hot encoded df - two lines of code here
    X.drop([feature_name],axis=1,inplace=True)
    X = pd.concat([X, ohe_df], axis =1)
    
    return X

In [13]:
zip_ohe, df = encode_and_concat_feature_train(df, 'city')

Next lets look at null Values

In [14]:
df.isnull().sum()

id                  0
date                0
price               0
bedrooms            0
bathrooms           0
sqft_living         0
sqft_lot            0
floors              0
waterfront       2376
view               63
condition           0
grade               0
sqft_above          0
sqft_basement       0
yr_built            0
yr_renovated     3842
zipcode             0
lat                 0
long                0
sqft_living15       0
sqft_lot15          0
Auburn              0
Bellevue            0
Black_diamond       0
Bothell             0
Duball              0
Enumclaw            0
Fall city           0
Federal_way         0
Issaquah            0
Kenmore             0
Kent                0
Kirkland            0
Maple Valley        0
Medina              0
Mercer Island       0
North Bend          0
Renton              0
Sammamish           0
Seattle             0
Snoqualmie          0
Woodinville         0
dtype: int64

Renovation cleaning, if not renovated set renovation to date built

In [15]:
df.loc[df['yr_renovated'] ==0, 'yr_renovated'] = np.nan
df['yr_renovated'].fillna(df['yr_built'],inplace = True)


df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,Kirkland,Maple Valley,Medina,Mercer Island,North Bend,Renton,Sammamish,Seattle,Snoqualmie,Woodinville
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Found one outlier with 33 bedrooms, as well as duplicate id sales, should be removed

In [16]:
df.drop(15856, inplace=True)
df.drop_duplicates(subset='id', keep='last' , inplace=True)

In [21]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,Medina,Mercer Island,North Bend,Renton,Sammamish,Seattle,Snoqualmie,Woodinville,year,season
0,221900.0,3,1.0,1180,5650,1.0,,0.0,3,7,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2014,fall
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2014,winter
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015,winter
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2014,winter
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2015,winter


similar to zipcode date needs to be translated and one hot encoded

In [17]:
def seasons(month):
    if month == '12' or month == '1' or month == '2':
        return 'winter'
    elif month == '3' or month == '4' or month =='5':
        return 'spring'
    elif month == '6' or month == '7' or month =='8':
        return 'summer'
    else:
        return 'fall'



In [20]:
df['year'] = df['date'].map(lambda x: int(x.split('/')[2]))
df['month'] = df['date'].map(lambda x: x.split('/')[0])
df['season'] = df['month'].apply(seasons)
ohe = OneHotEncoder(sparse=False)
ohe.fit(df[['season']])
ohe_seasons = pd.DataFrame(ohe.transform(df[['season']]), columns=ohe.get_feature_names())
df = pd.concat([df.drop('season', axis=1), ohe_seasons], axis=1)

KeyError: 'date'

Next finish dealing with nans and change basement to a boolean

In [22]:
df.waterfront = df.waterfront.replace(np.nan, '0')
df.view = df.view.replace(np.nan, '0')
df.sqft_basement = df.sqft_basement.replace('?','0')
df.waterfront = df.waterfront.astype(int)

In [23]:
df['sqft_basement'] = df['sqft_basement'].astype(str).astype(float)

df['sqft_basement'] = df['sqft_basement'].apply(lambda x: '1' if x >= 1 else 0)

In [24]:
df = df.rename(columns={'sqft_basement':'basement'})

With the data cleaned now we just have to begin processing step 1: importing everythin we need

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import statsmodels.api as sm
from statsmodels.formula.api import ols

import pylab 
import scipy.stats as stats

In [28]:
df = pd.read_csv('../data/Clean_data.csv')

In [29]:
df.drop(['zipcode'], axis=1, inplace=True)

In [30]:
df.dropna(inplace=True)

Now we need a base model to try and improve, so we'll start by simply throwing the data set into a linear regression

In [31]:
base_model = df.copy()

X , y = base_model.drop(columns=['price']), base_model[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)



In [32]:
model1 = LinearRegression()
model1.fit(X_train, y_train)

# train_preds = model1.predict(X_train)
# y_train == train_preds 

test_preds = model1.predict(X_test)

cv_results = cross_val_score(model1, X_train, y_train, cv=5)
print(cv_results)
cv_results_mean = np.mean(cross_val_score(model1, X, y, cv=5,  scoring='neg_mean_squared_error'))
print(cv_results_mean)
#model1.score(y_test, test_preds)

[0.77170287 0.73948086 0.75512652 0.73824839 0.74758015]
-34261332108.943726


Decently solid r^2 scores for starting off but we can probably improve them

We were recommended to split the data so let's try making a high grade model(grade 10-13) and a low grade(1-6)

In [34]:
df = pd.read_csv('../data/Clean_data.csv')

In [35]:
df.dropna(inplace=True)
df.drop(['zipcode', 'lat','long'],axis=1, inplace=True)
df = df.loc[df['grade'] > 9]

KeyError: "['zipcode' 'lat' 'long'] not found in axis"

In [37]:
X = df.drop(['price'], axis=1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)




base_lr = LinearRegression()
base_lr.fit(X_train,y_train)

kcross_base = cross_val_score(base_lr,X_train,y_train,cv=5)


kcross_base

array([0.70619833, 0.68362146, 0.61895346, 0.70668599, 0.66750471])

Not fantastic results but let's see if we can improve them, starting with standardization

In [38]:
X = df.drop(['price'], axis=1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

fit_lr = LinearRegression()
fit_lr.fit(X_train,y_train)

kcross_fit = cross_val_score(fit_lr,X_train,y_train,cv=5)


kcross_fit

array([0.70595406, 0.6836197 , 0.63150975, 0.70766655, 0.66750435])

not much improvement but we'll keep the change, next let's see if all of the columns are really necessary, there are a few I suspect can go, starting with yr_renovated which shares a lot of data with 'yr_built'

In [39]:
#expirement 1: drop year renovated
X = df.drop(['price','yr_renovated'], axis=1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

drop_1_lr = LinearRegression()
drop_1_lr.fit(X_train,y_train)

kcross_drop_1 = cross_val_score(drop_1_lr,X_train,y_train,cv=5)


kcross_drop_1

array([0.70662244, 0.68970824, 0.62329572, 0.71864176, 0.65876969])

OK that helped quite a lot, let's go ahead and drop a bunch of the other likely culprits

In [40]:
X = df.drop(['price', 'sqft_lot_diff', 'sqft_living15', 'sqft_lot', 'floors', 'sqft_lot15','yr_renovated'], axis=1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

drop_5_lr = LinearRegression()
drop_5_lr.fit(X_train,y_train)

kcross_drop_5 = cross_val_score(drop_5_lr,X_train,y_train,cv=5)


print(kcross_drop_5)
print(drop_5_lr.score(X_train,y_train))

[0.70380428 0.68954204 0.63652468 0.719109   0.6566056 ]
0.7048972023426302


Overall dropping some of the weaker predictors didn't affect the model much but since they'd just muddle the data, I'll just leave them out

In [41]:
df.drop([ 'sqft_lot_diff', 'sqft_living15', 'sqft_lot', 'floors', 'sqft_lot15'], axis=1, inplace =True)

alright so I heard from one of my teammates that log transforming price helped accuracy a lot so let's give that a shot

In [42]:
X = df.drop(['price'], axis=1)
y = df[['price']]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

y_train = np.log(y_train)
y_test = np.log(y_test)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_scale = StandardScaler()
y_train = y_scale.fit_transform(y_train)
y_test = y_scale.transform(y_test)




y_scale_lr = LinearRegression()
y_scale_lr.fit(X_train,y_train)

kcross_y_scale = cross_val_score(y_scale_lr,X_train,y_train,cv=5)


print(kcross_y_scale)
print(y_scale_lr.score(X_train,y_train))

[0.73717685 0.72632184 0.67453098 0.75816843 0.69391847]
0.7326599459420047


That's a fairly significant improvement, not sure how much it helped overall though.

LOW GRADE MODEL

In [51]:
df = pd.read_csv('../data/Clean_data.csv')

In [52]:
df = df.dropna()
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,Seattle,Snoqualmie,Woodinville,sqft_living_diff,sqft_lot_diff,year,x0_fall,x0_spring,x0_summer,x0_winter
0,221900.0,3.0,1.00,1180.0,5650.0,1.0,0,0.0,3.0,7.0,...,1.0,0.0,0.0,-160.0,0.0,2014.0,1.0,0.0,0.0,0.0
1,538000.0,3.0,2.25,2570.0,7242.0,2.0,0,0.0,3.0,7.0,...,1.0,0.0,0.0,880.0,-397.0,2014.0,0.0,0.0,0.0,1.0
2,180000.0,2.0,1.00,770.0,10000.0,1.0,0,0.0,3.0,6.0,...,0.0,0.0,0.0,-1950.0,1938.0,2015.0,0.0,0.0,0.0,1.0
3,604000.0,4.0,3.00,1960.0,5000.0,1.0,0,0.0,5.0,7.0,...,1.0,0.0,0.0,600.0,0.0,2014.0,0.0,0.0,0.0,1.0
4,510000.0,3.0,2.00,1680.0,8080.0,1.0,0,0.0,3.0,8.0,...,0.0,0.0,0.0,-120.0,577.0,2015.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21414,700000.0,3.0,2.50,2230.0,4006.0,2.0,0,0.0,3.0,8.0,...,1.0,0.0,0.0,0.0,-174.0,2015.0,0.0,1.0,0.0,0.0
21415,800000.0,4.0,3.25,3540.0,159430.0,2.0,0,0.0,3.0,9.0,...,0.0,0.0,0.0,1600.0,-232610.0,2014.0,0.0,0.0,0.0,1.0
21416,899000.0,4.0,3.50,2490.0,5500.0,2.0,0,0.0,3.0,9.0,...,1.0,0.0,0.0,780.0,0.0,2015.0,0.0,0.0,1.0,0.0
21417,465750.0,3.0,2.50,2670.0,4534.0,2.0,0,0.0,3.0,9.0,...,0.0,0.0,0.0,-370.0,-545.0,2014.0,0.0,0.0,0.0,1.0


In [53]:
df['waterfront'] = df['waterfront'].astype('bool')
df2 = df[df["grade"] <= 6]
df2 =df

In [54]:
X = df2.drop(['price'], axis=1)
y = df2.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

base_lr = LinearRegression()
base_lr.fit(X_train,y_train)

kcross_base = cross_val_score(base_lr,X_train,y_train,cv=5)

kcross_base

array([0.75142349, 0.75668097, 0.74778475, 0.75342912, 0.73763169])

Start with a scaler

In [55]:
X = df2.drop(['price'], axis=1)
y = df2.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

fit_lr = LinearRegression()
fit_lr.fit(X_train,y_train)

kcross_fit = cross_val_score(fit_lr,X_train,y_train,cv=5)

kcross_fit

array([0.75140391, 0.75673767, 0.74778471, 0.75343936, 0.73761757])

Now to log values

This whole split data thing hasn't resulted in the strongest models, let's actually take a look at this idea.

In [57]:
df = pd.read_csv('../data/Clean_data.csv')
df.dropna(inplace=True)

Base model revisit

In [61]:
base_model = df.copy()


X , y = base_model.drop(columns=['price']), base_model[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

model1 = LinearRegression()
model1.fit(X_train, y_train)

# train_preds = model1.predict(X_train)
# y_train == train_preds 

test_preds = model1.predict(X_test)

cv_results = cross_val_score(model1, X_train, y_train, cv=5)
print(cv_results)
cv_results_mean = np.mean(cross_val_score(model1, X, y, cv=5,  scoring='neg_mean_squared_error'))
print(cv_results_mean)


[0.77370641 0.74278052 0.75943089 0.74054059 0.75141904]
-33804406979.804382


Try with split high

In [62]:
base_model = df.copy()

base_model = base_model.loc[base_model['grade'] > 8]

X , y = base_model.drop(columns=['price']), base_model[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

model1 = LinearRegression()
model1.fit(X_train, y_train)

# train_preds = model1.predict(X_train)
# y_train == train_preds 

test_preds = model1.predict(X_test)

cv_results = cross_val_score(model1, X_train, y_train, cv=5)
print(cv_results)
cv_results_mean = np.mean(cross_val_score(model1, X, y, cv=5,  scoring='neg_mean_squared_error'))
print(cv_results_mean)
#model1.score(y_test, test_preds)


[0.67220739 0.74410262 0.71293384 0.72246248 0.7372957 ]
-95885745243.7177


Try with split low

In [63]:
base_model = df.copy()

base_model = base_model.loc[base_model['grade'] < 9]

X , y = base_model.drop(columns=['price']), base_model[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

model1 = LinearRegression()
model1.fit(X_train, y_train)

# train_preds = model1.predict(X_train)
# y_train == train_preds 

test_preds = model1.predict(X_test)

cv_results = cross_val_score(model1, X_train, y_train, cv=5)
print(cv_results)
cv_results_mean = np.mean(cross_val_score(model1, X, y, cv=5,  scoring='neg_mean_squared_error'))
print(cv_results_mean)
#model1.score(y_test, test_preds)



[0.68612178 0.70771628 0.70724349 0.68150637 0.68342671]
-11977080157.725552


These are lower scores, most likely the decreased data makes the models not train as much as with all data together, as such it might be best to just stick with a single model, applying all the techniques we figured out could increase accuracy

In [64]:
Final model

SyntaxError: invalid syntax (<ipython-input-64-76a4566580fe>, line 1)

In [66]:
df = pd.read_csv('../data/Clean_data.csv')

In [67]:
df['coord'] = df['lat'] * df['long']
df['yr_till_renovation'] = df['yr_renovated'] - df['yr_built']
df['sqft_living-above'] = df['sqft_above'] - df['sqft_living']
df['yr_since_built'] = df['year'] - df['yr_built']
df.drop(['zipcode','lat','long','sqft_lot_diff','sqft_living15','yr_renovated','basement','year'], axis=1, inplace=True)
df.dropna(inplace=True)

In [70]:
base_model = df.copy()



X , y = base_model.drop(columns=['price']), base_model[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

scaler = StandardScaler()
# Calculate the standard deviation of the X_train dataset
scaler.fit(X_train)
# convert all values into their standard deviation equivalents.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
scaler.fit(y_train)
y_train_scaled = scaler.transform(y_train)
y_test_scaled = scaler.transform(y_test)

df_price_log = np.log(y_train) #['log_price']
df_price_log_test = np.log(y_test)

 
scaler.fit(df_price_log)
y_scaled_log_price_train = scaler.transform(df_price_log)
y_scaled_log_price_test = scaler.transform(df_price_log_test)


model_scaled_log_price = LinearRegression()
model_scaled_log_price.fit(X_train_scaled, y_scaled_log_price_train)


cv_results = cross_val_score(model_scaled_log_price, X_train_scaled, y_scaled_log_price_train, cv=5)
print(cv_results)
cv_results_mean_scaled_ylog = np.mean(cross_val_score(model_scaled_log_price, X_train_scaled, y_scaled_log_price_train, cv=5, ))
print(cv_results_mean_scaled_ylog)

[0.81803179 0.81048513 0.79869004 0.8109573  0.80867051]
0.8093669561957967


Check training 

In [71]:
# calculate predictions on training and test sets
y_hat_train = model_scaled_log_price.predict(X_train_scaled)
y_hat_test = model_scaled_log_price.predict(X_test_scaled)

#compare MSE for the predicted training and test values 
train_mse = mean_squared_error(y_scaled_log_price_train, y_hat_train)
test_mse = mean_squared_error(y_scaled_log_price_test, y_hat_test)


print(train_mse)
print(test_mse)

0.18932484814895398
0.18510730654725704
