In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("/home/nishanth/tensorflow/california_housing/cal_housing_edited.csv")
print data.head()

   longitude  latitude  housingMedianAge  totalRooms  totalBedrooms  \
0    -122.23     37.88              41.0       880.0          129.0   
1    -122.22     37.86              21.0      7099.0         1106.0   
2    -122.24     37.85              52.0      1467.0          190.0   
3    -122.25     37.85              52.0      1274.0          235.0   
4    -122.25     37.85              52.0      1627.0          280.0   

   population  households  medianIncome  rooms_per_household  \
0       322.0       126.0        8.3252             6.984127   
1      2401.0      1138.0        8.3014             6.238137   
2       496.0       177.0        7.2574             8.288136   
3       558.0       219.0        5.6431             5.817352   
4       565.0       259.0        3.8462             6.281853   

   bedrooms_per_room  population_per_household  medianHouseValue  
0           0.146591                  2.555556          452600.0  
1           0.155797                  2.109842        

In [3]:
scalar = StandardScaler()

In [4]:
train, test = train_test_split(np.array(data), test_size=0.2, shuffle=True, random_state=42)

In [5]:
print train.shape, test.shape
x_train, y_train ,x_test, y_test = train[:,:-1], train[:,-1], test[:,:-1], test[:,-1]

(16512, 12) (4128, 12)


In [6]:
#Train the model using linear regression
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()
X = scalar.fit_transform(x_train)
Y = y_train
lreg.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
#calculating root mean square error of model
from sklearn.metrics import mean_squared_error
def error(model, X, Y):
    predictions = model.predict(X)
    mse = mean_squared_error(Y, predictions)
    rmse = np.sqrt(mse)
    return rmse

In [10]:
print error(lreg, X, Y)

68311.8562967575


In [11]:
from sklearn.model_selection import cross_val_score
def cross_val(model):
    scores = cross_val_score(model, X, Y, scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    return rmse_scores

In [12]:
rmse_scores = cross_val(lreg)
print "scores:",rmse_scores
print "mean:",np.mean(rmse_scores)
print "standard deviation:",np.std(rmse_scores)

scores: [65503.19601742 71822.27077741 67676.62308014 66640.50945839
 69192.93621885 66072.79696621 65711.8862025  69367.83077433
 73674.48726124 69735.41305941]
mean: 68539.79498159068
standard deviation: 2594.9543977641724


In [13]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor()
random_forest.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
print error(random_forest, X, Y)

22911.982666883323


In [15]:
rmse_scores = cross_val(random_forest)
print "scores:",rmse_scores
print "mean:",np.mean(rmse_scores)
print "standard deviation:",np.std(rmse_scores)

scores: [52112.8878655  56358.42643414 52829.32203417 54870.28910783
 55899.90323214 50894.44619137 52328.52601934 55386.14143807
 53780.13478363 54604.25147596]
mean: 53906.432858214204
standard deviation: 1718.9089576757333


In [16]:
#random forest regressor worked better, lets do some hyper parameter tuning

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}]

reg = RandomForestRegressor()
grid_search = GridSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, Y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'n_estimators': [3, 10], 'max_features': [2, 3, 4], 'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [18]:
print "Best model:",grid_search.best_estimator_

Best model: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [19]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

(66926.33555438208, {'max_features': 2, 'n_estimators': 3})
(56471.889376170795, {'max_features': 2, 'n_estimators': 10})
(54133.45021531602, {'max_features': 2, 'n_estimators': 30})
(61463.344792676166, {'max_features': 4, 'n_estimators': 3})
(53620.01882275809, {'max_features': 4, 'n_estimators': 10})
(51344.271366321365, {'max_features': 4, 'n_estimators': 30})
(59506.280797152205, {'max_features': 6, 'n_estimators': 3})
(53560.469160220055, {'max_features': 6, 'n_estimators': 10})
(51509.80254841379, {'max_features': 6, 'n_estimators': 30})
(60708.60886528652, {'max_features': 8, 'n_estimators': 3})
(54017.6598259094, {'max_features': 8, 'n_estimators': 10})
(52107.34862496689, {'max_features': 8, 'n_estimators': 30})
(64059.44264475985, {'max_features': 2, 'n_estimators': 3, 'bootstrap': False})
(55719.98784049514, {'max_features': 2, 'n_estimators': 10, 'bootstrap': False})
(61012.03789792791, {'max_features': 3, 'n_estimators': 3, 'bootstrap': False})
(53161.105067472556, {'max_

In [20]:
print "Best model is ","(51127.42695126984, {'max_features': 4, 'n_estimators': 30})"

Best model is  (51127.42695126984, {'max_features': 4, 'n_estimators': 30})


In [21]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [22]:
#feature importance
names = data.columns.values
sorted(zip(feature_importances, names), reverse=True)

[(0.3531039582784274, 'medianIncome'),
 (0.14050440443019557, 'bedrooms_per_room'),
 (0.12772935950491887, 'population_per_household'),
 (0.09688044581680254, 'latitude'),
 (0.09397803422836293, 'longitude'),
 (0.05909052808819658, 'rooms_per_household'),
 (0.05268969496246531, 'housingMedianAge'),
 (0.019891067062357288, 'population'),
 (0.019034202702069826, 'totalBedrooms'),
 (0.01884097868338186, 'totalRooms'),
 (0.01825732624282189, 'households')]

In [23]:
final_model = grid_search.best_estimator_
X_test = scalar.fit_transform(x_test)
print "Test set mse before grid search:",error(random_forest, X_test, y_test)
print "Test set mse after grid search:", error(final_model, X_test, y_test)

Test set mse before grid search: 82288.43423379771
Test set mse after grid search: 72080.28547273835
