In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, explained_variance_score
from sklearn.externals import joblib
import pandas as pd

In [2]:
#load in the data
df = pd.read_csv("kc_house_data.csv")

In [3]:
#create data frame
df[:15]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,20140512T000000,1230000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
6,1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
8,2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,...,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
9,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


In [4]:
df.drop("id",axis=1,inplace=True)

In [5]:
#predicting 'price' so this column will be used to compare results
y = df['price']

#drop fields that don't correctly predict price
X = df.drop(["price","date"],axis=1)

In [6]:
#split the .csv file to 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
#instantiate the model
tree_mod = RandomForestRegressor(n_jobs=-1, verbose=1)

In [8]:
parameters = {"n_estimators":[250,300,500]}

In [9]:
clf = GridSearchCV(tree_mod, parameters)

In [10]:
clf.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   10.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   10.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tas

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [250, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [11]:
preds = clf.predict(X_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


In [12]:
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [13]:
mean_absolute_error(y_test,preds)

67049.116180064724

In [14]:
r2_score(y_test,preds)

0.88420141833160149

In [15]:
explained_variance_score(y_test,preds)

0.88420811585782499

In [16]:
df_score = pd.DataFrame({"MAE":[mean_absolute_error(y_test,preds)],"R^2":[r2_score(y_test,preds)]})
df_score.to_csv('Model_Scores.csv')

In [17]:
forest = RandomForestRegressor(n_estimators=250,
                              random_state=0)

In [18]:
forest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [19]:
importances = list(forest.feature_importances_)

In [20]:
print(importances)

[0.0030818274425439265, 0.007270759893849477, 0.25330757773935858, 0.01388634664026457, 0.00187363411782786, 0.029490710481106634, 0.010984406432344419, 0.0029153143505231916, 0.33553471537998969, 0.020687349201665605, 0.0052684652017587078, 0.02609301442703181, 0.002153101367798328, 0.013652511354162565, 0.15989311699712491, 0.069292300218895722, 0.031580760828461223, 0.013034087925292891]


In [21]:
col = list(X)

In [22]:
df_feat = pd.DataFrame({'feature':col,'importance':importances})

In [23]:
df_feat

Unnamed: 0,feature,importance
0,bedrooms,0.003082
1,bathrooms,0.007271
2,sqft_living,0.253308
3,sqft_lot,0.013886
4,floors,0.001874
5,waterfront,0.029491
6,view,0.010984
7,condition,0.002915
8,grade,0.335535
9,sqft_above,0.020687


In [24]:
df_feat.to_csv("feature_scores.csv")

In [25]:
joblib.dump(clf.best_estimator_, 'BlairModel.pkl')

['BlairModel.pkl']

In [26]:
model = joblib.load('BlairModel.pkl') 

In [27]:
pickle_test = model.predict(X_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


In [28]:
mean_absolute_error(y_test,preds)

67049.116180064724