In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
plt.style.use('seaborn')
sns.set(style="white")

In [4]:
kc_train = pd.read_csv("kc_house_data_train.csv")
kc_train

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,1,7974200820,20140821T000000,865000.0,5,3.00,2900,6730,1.0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,2,7701450110,20140815T000000,1038000.0,4,2.50,3770,10893,2.0,0,...,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,3,9522300010,20150331T000000,1490000.0,3,3.50,4560,14608,2.0,0,...,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,4,9510861140,20140714T000000,711000.0,3,2.50,2550,5376,2.0,0,...,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,17285,627300195,20150303T000000,750000.0,5,2.50,3240,9960,1.0,0,...,8,2020,1220,1958,0,98008,47.5858,-122.112,2730,10400
17286,17286,8819900270,20140520T000000,440000.0,2,1.75,1300,4000,2.0,0,...,7,1300,0,1948,0,98105,47.6687,-122.288,1350,4013
17287,17287,3816300095,20140514T000000,310000.0,3,1.00,1050,9876,1.0,0,...,7,1050,0,1953,0,98028,47.7635,-122.262,1760,9403
17288,17288,122069107,20141204T000000,427500.0,3,1.50,1900,43186,1.5,0,...,7,1300,600,1971,0,98038,47.4199,-121.990,2080,108028


In [5]:
# turn zipcode into into onehot encoding - easier than binning by lat/long
kc_train = pd.concat([kc_train,pd.get_dummies(kc_train["zipcode"],drop_first=True)],axis=1)
kc_train = kc_train.drop(["zipcode"],axis=1)

In [6]:
# drop the original index column and id because using id as index is
# annoying for scaling
#kc_train.set_index("id",inplace=True)
kc_train.drop("Unnamed: 0",axis=1,inplace=True)
kc_train.drop("id",axis=1,inplace=True)

In [7]:
# can't use datetime form so we convert to timestamp
kc_train["date"] = pd.to_datetime(kc_train.date).apply(lambda x: x.timestamp())

In [8]:
kc_train

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,1.412554e+09,365000.0,4,2.25,2070,8893,2.0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1,1.408579e+09,865000.0,5,3.00,2900,6730,1.0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
2,1.408061e+09,1038000.0,4,2.50,3770,10893,2.0,0,2,3,...,0,0,0,0,0,0,0,0,0,0
3,1.427760e+09,1490000.0,3,3.50,4560,14608,2.0,0,2,3,...,0,0,0,0,0,0,0,0,0,0
4,1.405296e+09,711000.0,3,2.50,2550,5376,2.0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,1.425341e+09,750000.0,5,2.50,3240,9960,1.0,0,1,3,...,0,0,0,0,0,0,0,0,0,0
17286,1.400544e+09,440000.0,2,1.75,1300,4000,2.0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
17287,1.400026e+09,310000.0,3,1.00,1050,9876,1.0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
17288,1.417651e+09,427500.0,3,1.50,1900,43186,1.5,0,0,4,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# removing price so we can plug it into the splitter
target = kc_train["price"]
kc_feat = kc_train.drop("price", axis=1)

In [10]:
kc_feat.columns = map(lambda x: "x" + str(x) if type(x)==int else x, kc_feat.columns)
from statsmodels.stats.outliers_influence import variance_inflation_factor
viffactor = [variance_inflation_factor(kc_feat.values, i) for i in range(kc_feat.shape[1])]

vif = pd.DataFrame()
vif["VIF Factor"] = viffactor
#[variance_inflation_factor(df_features.values, i) for i in range(df_features.shape[1])]
vif["features"] = kc_feat.columns
vif.round(1).sort_values("VIF Factor",ascending=False)
vif[5:]

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF Factor,features
5,2.414804,floors
6,1.254457,waterfront
7,1.518670,view
8,1.313142,condition
9,3.806668,grade
...,...,...
82,7.071312,x98177
83,3.097768,x98178
84,1.684150,x98188
85,1.907646,x98198


In [11]:
kc_feat_lo_vif=kc_feat[vif[5:]["features"]]
kc_feat_lo_vif

Unnamed: 0,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,...,x98146,x98148,x98155,x98166,x98168,x98177,x98178,x98188,x98198,x98199
0,2.0,0,0,4,8,2070,0,1986,0,47.4388,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,0,5,8,1830,1070,1977,0,47.6784,...,0,0,0,0,0,0,0,0,0,0
2,2.0,0,2,3,11,3770,0,1997,0,47.5646,...,0,0,0,0,0,0,0,0,0,0
3,2.0,0,2,3,12,4560,0,1990,0,47.6995,...,0,0,0,0,0,0,0,0,0,0
4,2.0,0,0,3,9,2550,0,2004,0,47.6647,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,1.0,0,1,3,8,2020,1220,1958,0,47.5858,...,0,0,0,0,0,0,0,0,0,0
17286,2.0,0,0,3,7,1300,0,1948,0,47.6687,...,0,0,0,0,0,0,0,0,0,0
17287,1.0,0,0,3,7,1050,0,1953,0,47.7635,...,0,0,0,0,0,0,0,0,0,0
17288,1.5,0,0,4,7,1300,600,1971,0,47.4199,...,0,0,0,0,0,0,0,0,0,0


In [12]:
poly_2 = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly2_data = poly_2.fit_transform(kc_feat)
poly2_columns = poly_2.get_feature_names(kc_feat.columns)
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)
df_poly2.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,x98177 x98178,x98177 x98188,x98177 x98198,x98177 x98199,x98178 x98188,x98178 x98198,x98178 x98199,x98188 x98198,x98188 x98199,x98198 x98199
0,1412554000.0,4.0,2.25,2070.0,8893.0,2.0,0.0,0.0,4.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1408579000.0,5.0,3.0,2900.0,6730.0,1.0,0.0,0.0,5.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1408061000.0,4.0,2.5,3770.0,10893.0,2.0,0.0,2.0,3.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1427760000.0,3.0,3.5,4560.0,14608.0,2.0,0.0,2.0,3.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1405296000.0,3.0,2.5,2550.0,5376.0,2.0,0.0,0.0,3.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
new_columns = []
for column in df_poly2.columns:
    a = 0
    a = df_poly2[column].isin([0]).sum()
    if a != 13832:
        new_columns.append(column)

In [14]:
df_cleanpoly2 = df_poly2[new_columns]

In [15]:
df_cleanpoly2

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,x98177 x98178,x98177 x98188,x98177 x98198,x98177 x98199,x98178 x98188,x98178 x98198,x98178 x98199,x98188 x98198,x98188 x98199,x98198 x98199
0,1.412554e+09,4.0,2.25,2070.0,8893.0,2.0,0.0,0.0,4.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.408579e+09,5.0,3.00,2900.0,6730.0,1.0,0.0,0.0,5.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.408061e+09,4.0,2.50,3770.0,10893.0,2.0,0.0,2.0,3.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.427760e+09,3.0,3.50,4560.0,14608.0,2.0,0.0,2.0,3.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.405296e+09,3.0,2.50,2550.0,5376.0,2.0,0.0,0.0,3.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,1.425341e+09,5.0,2.50,3240.0,9960.0,1.0,0.0,1.0,3.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17286,1.400544e+09,2.0,1.75,1300.0,4000.0,2.0,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17287,1.400026e+09,3.0,1.00,1050.0,9876.0,1.0,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17288,1.417651e+09,3.0,1.50,1900.0,43186.0,1.5,0.0,0.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
xpolytrain = df_cleanpoly2
ypolytrain = target

In [17]:
print(ypolytrain.shape)

(17290,)


In [20]:
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

selector = SelectKBest(f_regression, k=500)

selector.fit(xpolytrain, ypolytrain)

  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


SelectKBest(k=500, score_func=<function f_regression at 0x1a21e0ab00>)

In [21]:
selected_columns = xpolytrain.columns[selector.get_support()]
removed_columns = xpolytrain.columns[~selector.get_support()]

In [22]:
for i in selected_columns:
    print(i)

bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
grade
sqft_above
sqft_basement
yr_renovated
lat
sqft_living15
sqft_lot15
x98002
x98003
x98004
x98006
x98023
x98030
x98031
x98033
x98038
x98039
x98040
x98042
x98058
x98075
x98092
x98102
x98105
x98106
x98112
x98119
x98168
x98198
x98199
date bedrooms
date bathrooms
date sqft_living
date sqft_lot
date floors
date waterfront
date view
date grade
date sqft_above
date sqft_basement
date yr_renovated
date lat
date sqft_living15
date sqft_lot15
date x98002
date x98003
date x98004
date x98006
date x98023
date x98030
date x98031
date x98033
date x98038
date x98039
date x98040
date x98042
date x98058
date x98075
date x98092
date x98102
date x98105
date x98106
date x98112
date x98119
date x98168
date x98198
date x98199
bedrooms bathrooms
bedrooms sqft_living
bedrooms sqft_lot
bedrooms floors
bedrooms waterfront
bedrooms view
bedrooms condition
bedrooms grade
bedrooms sqft_above
bedrooms sqft_basement
bedrooms yr_built
bedrooms yr_renova

In [23]:
from sklearn.feature_selection import RFECV

In [24]:
ols = linear_model.LinearRegression()

In [25]:
display(xpolytrain.shape)
display(xpolytrain[selected_columns].shape)
selected_xpoly = xpolytrain[selected_columns]

(17290, 3828)

(17290, 500)

In [26]:
selector = RFECV(estimator=ols, min_features_to_select=100, step=1, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
selector.fit(selected_xpoly, ypolytrain)


Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.


RFECV(cv=5,
      estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 normalize=False),
      min_features_to_select=100, n_jobs=-1, scoring='neg_mean_squared_error',
      step=1, verbose=1)

In [27]:
selected_rfe = selected_xpoly.columns[selector.support_]
removed_rfe = selected_xpoly.columns[~selector.support_]

In [28]:

lm_rfe = LinearRegression()

lm_rfe = lm_rfe.fit(xpolytrain[selected_rfe], ypolytrain)

y_rfe = lm_rfe.predict(xpolytrain[selected_rfe])


trainRFE_rmse = np.sqrt(metrics.mean_squared_error(ypolytrain, y_rfe))


print('Training Root Mean Squared Error:' , trainRFE_rmse)



Training Root Mean Squared Error: 126996.51533779982


In [29]:
print(len(selected_rfe))
for i in selected_rfe:
    print(i)

477
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
grade
sqft_above
sqft_basement
yr_renovated
lat
sqft_living15
sqft_lot15
x98002
x98003
x98004
x98006
x98023
x98030
x98031
x98033
x98038
x98039
x98040
x98042
x98058
x98075
x98092
x98102
x98105
x98106
x98112
x98119
x98168
x98198
x98199
date sqft_living
date waterfront
date sqft_above
date sqft_basement
date x98004
date x98033
date x98038
date x98039
date x98040
date x98042
date x98058
date x98092
date x98102
date x98105
date x98112
date x98119
bedrooms bathrooms
bedrooms sqft_living
bedrooms sqft_lot
bedrooms floors
bedrooms waterfront
bedrooms view
bedrooms condition
bedrooms grade
bedrooms sqft_above
bedrooms sqft_basement
bedrooms yr_built
bedrooms yr_renovated
bedrooms lat
bedrooms long
bedrooms sqft_living15
bedrooms sqft_lot15
bedrooms x98002
bedrooms x98003
bedrooms x98004
bedrooms x98006
bedrooms x98023
bedrooms x98033
bedrooms x98038
bedrooms x98039
bedrooms x98040
bedrooms x98042
bedrooms x98075
bedrooms x98102


In [30]:
import pickle
pickle.dump([lm_rfe,selected_rfe], open( "model+cols2.p", "wb" ) )