In [3]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [6]:
# Read the training and test CSVs. index_col=False tells pandas not to
# use the first column of either file as the row index.
train_data, test_data = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ('col_droptrain.csv', 'col_droptest.csv')
)

In [7]:
# Display the training frame as loaded, before any columns are dropped.
train_data

Unnamed: 0,RegionName,City,State,Rent,total_pop,median_age,median_income,million_dollar_housing_units,dwellings_50_or_more_units,mobile_homes,...,Perc_vacat_housing_units,Perc_other_race,Perc_black,Perc_asian,Perc_hispanic,Perc_commuter_by_public_transport,Perc_worked_at_home,Perc_different_house_year_ago_same_city,Perc_different_house_year_ago_dif_city,perc_housing_built_2005_or_later
0,30458-2015-09,Statesboro,GA,679.0,41829.0,23.2,27908.0,44.0,267.0,1592.0,...,0.130343,0.011738,0.318463,0.021923,0.042626,0.018481,0.051834,0.060628,0.197877,0.013264
1,27701-2018-09,Durham,NC,1212.0,22595.0,32.2,31094.0,15.0,1124.0,105.0,...,0.118276,0.002788,0.445585,0.022660,0.190131,0.125178,0.043449,0.191812,0.088073,0.017151
2,6112-2015-09,Hartford,CT,1080.0,23298.0,28.2,30199.0,39.0,134.0,14.0,...,0.212282,0.002275,0.763241,0.011331,0.142201,0.242533,0.021089,0.107305,0.049532,0.000000
3,73071-2019-09,Norman,OK,873.0,40693.0,28.4,48414.0,0.0,524.0,272.0,...,0.111638,0.031086,0.066449,0.049689,0.087288,0.007434,0.018101,0.135404,0.119799,0.011447
4,91733-2014-09,South El Monte,CA,2372.0,43902.0,31.4,41592.0,15.0,68.0,618.0,...,0.039787,0.006628,0.002870,0.169764,0.786001,0.054054,0.049902,0.018473,0.053414,0.003427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6347,10014-2019-09,New York,NY,3872.0,31794.0,36.8,127973.0,1233.0,6692.0,0.0,...,0.150533,0.005850,0.012896,0.055419,0.062559,0.589270,0.099853,0.110367,0.052054,0.003263
6348,60619-2019-09,Chicago,IL,1108.0,62992.0,39.4,32558.0,0.0,1052.0,7.0,...,0.199877,0.001508,0.958947,0.002445,0.015129,0.313581,0.039935,0.103553,0.024289,0.002487
6349,12307-2015-09,Schenectady,NY,860.0,7619.0,30.6,21875.0,9.0,131.0,36.0,...,0.329583,0.087544,0.399396,0.045938,0.182701,0.187106,0.025612,0.074682,0.061163,0.000000
6350,19147-2015-09,Philadelphia,PA,1442.0,35014.0,34.7,60400.0,150.0,1144.0,35.0,...,0.122492,0.004227,0.091763,0.122237,0.106957,0.258358,0.049072,0.100703,0.054921,0.004813


In [8]:
# Remove the RegionName, City and State columns from both frames so they
# are not carried into the feature matrices built later.

location_cols = ['RegionName', 'City', 'State']
train_data = train_data.drop(columns=location_cols)
test_data = test_data.drop(columns=location_cols)

In [9]:
# Dimensions of the training frame after the RegionName/City/State columns were dropped.
train_data.shape

(6352, 34)

In [10]:
# Dimensions of the test frame; the column count should match the training frame.
test_data.shape

(3129, 34)

In [11]:
# Separate the target column ('Rent') from the predictors in each split.
target_col = 'Rent'
y_train, X_train = train_data[target_col], train_data.drop(columns=[target_col])
y_test, X_test = test_data[target_col], test_data.drop(columns=[target_col])

In [12]:
# Training feature matrix: every column of train_data except 'Rent'.
X_train.shape

(6352, 33)

In [13]:
# Training target vector; its length should equal the number of rows in X_train.
y_train.shape

(6352,)

In [14]:
# Test feature matrix: every column of test_data except 'Rent'.
X_test.shape

(3129, 33)

In [15]:
# Test target vector; its length should equal the number of rows in X_test.
y_test.shape

(3129,)

In [16]:
# Baseline model: a random forest with the library's default
# hyperparameters and a fixed seed so the fit is repeatable.

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# score() is the R^2 of the predictions, so the "error" reported below is
# 1 - R^2 (the share of variance the model leaves unexplained).
baseline_train_r2 = rf.score(X_train, y_train)
baseline_test_r2 = rf.score(X_test, y_test)

print("The training error is : %.5f" % (1 - baseline_train_r2))
print("The test error is: %.5f" % (1 - baseline_test_r2))

The training error is : 0.03209
The test error is: 0.13678


In [17]:
# 5-fold cross-validation of the baseline forest, using the training split
# only; the cell displays the mean of the per-fold scores.

cv_score = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)
cv_score.mean()

0.7614178899744374

In [18]:
# Predict the target for the test features with the baseline forest (rf).
# y_pred is compared against y_test in the metric cells that follow.

y_pred = rf.predict(X_test)

In [19]:
# Test-set metrics for the baseline forest: R^2, MSE, and the square root
# of the MSE. The MSE is computed once and reused for the root.

baseline_mse = mean_squared_error(y_test, y_pred)

print("The r2 score is : %.5f" % r2_score(y_test, y_pred))
print("The mse is : %.5f" % baseline_mse)
print("The mse sqrt is : %.5f" % (baseline_mse**.5))

The r2 score is : 0.86322
The mse is : 71286.30182
The mse sqrt is : 266.99495


In [20]:
# Candidate hyperparameter values for the grid search
# (3 * 2 * 3 * 3 = 54 combinations in total).
# NOTE(review): in the recorded run the search selected the largest
# max_depth, max_features and n_estimators offered here, which suggests the
# grid may be too narrow - consider widening it.

param_grid = dict(
    max_features=['sqrt', 0.2, 0.35],
    min_samples_split=[2, 5],
    n_estimators=[250, 400, 550],
    max_depth=[6, 8, 10],
)

In [21]:
# Exhaustive search: every combination in param_grid is evaluated with
# 5-fold cross-validation on the training split, starting from the
# baseline estimator's settings. Training-fold scores are recorded too.

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

rf_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [6, 8, 10],
                         'max_features': ['sqrt', 0.2, 0.35],
                         'min_samples_split': [2, 5],
                         'n_estimators': [250, 400, 550]},
             return_train_score=True)

In [22]:
# Show the hyperparameter combination the grid search selected as best.

rf_grid.best_params_

{'max_depth': 10,
 'max_features': 0.35,
 'min_samples_split': 2,
 'n_estimators': 550}

In [23]:
# Fit a fresh forest with the hyperparameters chosen by the grid search.
# A new estimator is created (instead of calling set_params on `rf`) so the
# baseline model stays intact when the notebook is re-run.
# The values are read from rf_grid.best_params_ rather than typed in by
# hand, so this cell cannot drift out of sync with the search when the
# data or the grid changes.

rf2 = RandomForestRegressor(random_state=42, **rf_grid.best_params_)

rf2.fit(X_train, y_train)

# score() is the R^2 of the predictions, so the "error" printed here is 1 - R^2.
print("The new training error is : %.5f" % (1 - rf2.score(X_train, y_train)))
print("The new test error is: %.5f" % (1 - rf2.score(X_test, y_test)))

The new training error is : 0.07106
The new test error is: 0.16340


In [24]:
# Predict the target for the test features with the tuned forest (rf2).
# The result gets its own name so it can be compared with the baseline y_pred.

y_new_pred = rf2.predict(X_test)

In [25]:
# Side-by-side test-set metrics: baseline forest first, tuned forest second.
# Each model's MSE is computed once and reused for its square root.

baseline_mse = mean_squared_error(y_test, y_pred)
tuned_mse = mean_squared_error(y_test, y_new_pred)

print("The r2 score is : %.5f" % r2_score(y_test, y_pred))
print("The mse is : %.5f" % baseline_mse)
print("The mse sqrt is : %.5f" % (baseline_mse**.5))

print("The new r2 score is : %.5f" % r2_score(y_test, y_new_pred))
print("The new mse is : %.5f" % tuned_mse)
print("The new mse sqrt is : %.5f" % (tuned_mse**.5))

The r2 score is : 0.86322
The mse is : 71286.30182
The mse sqrt is : 266.99495
The new r2 score is : 0.83660
The new mse is : 85163.70328
The new mse sqrt is : 291.82821
