In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the train and test splits from CSV.
# index_col=0 makes the first column of each file the DataFrame index.
read_opts = {'index_col': 0}
train_data = pd.read_csv('prelim_data_acsTRAIN.csv', **read_opts)
test_data = pd.read_csv('prelim_data_acsTEST.csv', **read_opts)

In [3]:
# Columns removed from both splits before the feature/target split.
# NOTE(review): presumably these are non-numeric identifiers — confirm.
dropped_columns = ['RegionName', 'City', 'State']

train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

In [4]:
# Sanity check: (rows, columns) of the training frame after dropping columns.
train_data.shape

(1244, 229)

In [5]:
# Sanity check: (rows, columns) of the test frame; the column count should
# match the training frame.
test_data.shape

(614, 229)

In [6]:
# '2020-01' is the regression target; every remaining column is a feature.
target_column = '2020-01'

X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

In [7]:
# Training feature matrix: one column fewer than train_data (target removed).
X_train.shape

(1244, 228)

In [8]:
# Training target: a single column, so the shape is one-dimensional.
y_train.shape

(1244,)

In [9]:
# Test feature matrix: one column fewer than test_data (target removed).
X_test.shape

(614, 228)

In [10]:
# Test target: a single column, so the shape is one-dimensional.
y_test.shape

(614,)

In [12]:
# Baseline model: random forest with default hyperparameters and a fixed seed.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# score() returns R^2 for a regressor, so the "error" printed below is 1 - R^2.
rf_train_error = 1 - rf.score(X_train, y_train)
rf_test_error = 1 - rf.score(X_test, y_test)

print("The training error is : %.5f" % rf_train_error)
print("The test error is: %.5f" % rf_test_error)

The training error is : 0.06810
The test error is: 0.12009


In [13]:
# 5-fold cross-validation on the training split only.
# No `scoring` is passed, so each fold reports the estimator's own score
# (R^2 for a regressor); the cell displays the mean over the folds.
cv_score = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)
cv_score.mean()

0.6779894258172506

In [14]:
# Predict the target for the held-out test features with the baseline forest.
# y_pred is reused by the metric cells further down.

y_pred = rf.predict(X_test)

In [15]:
# Test-set metrics for the baseline forest: R^2, MSE and the square root of MSE.
baseline_r2 = r2_score(y_test, y_pred)
baseline_mse = mean_squared_error(y_test, y_pred)

print("The r2 score is : %.5f" % baseline_r2)
print("The mse is : %.5f" % baseline_mse)
print("The mse sqrt is : %.5f" % (baseline_mse ** .5))

The r2 score is : 0.87991
The mse is : 49297.64908
The mse sqrt is : 222.03074


In [16]:
# Hyperparameter grid for the random-forest grid search
# (5 x 4 x 4 x 3 = 240 combinations).
#
# max_features uses 1.0 ("all features") rather than the string 'auto':
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it fails
# parameter validation. For regressors 'auto' meant "use every feature", so
# 1.0 is the equivalent setting and also works on older releases.

param_grid = {
    'max_features': ['sqrt', .25, .5, .75, 1.0],
    'min_samples_split': [2, 5, 10, 15],
    'n_estimators': [50, 150, 250, 350],
    'max_depth': [10, 20, 100]
}

In [17]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# return_train_score=True also records the training-fold scores in cv_results_.
rf_grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    return_train_score=True,
)

# Kept as the last expression so the fitted search object is displayed.
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [10, 20, 100],
                         'max_features': ['sqrt', 0.25, 0.5, 0.75, 'auto'],
                         'min_samples_split': [2, 5, 10, 15],
                         'n_estimators': [50, 150, 250, 350]},
             return_train_score=True)

In [18]:
# Show the hyperparameter combination with the best mean cross-validated
# score found by the grid search.

rf_grid.best_params_

{'max_depth': 10,
 'max_features': 0.25,
 'min_samples_split': 2,
 'n_estimators': 350}

In [19]:
# Build a fresh forest from the grid-search winner instead of calling
# set_params on `rf`, so the baseline model is left untouched when the
# notebook is reused.
# The hyperparameters are read from rf_grid.best_params_ rather than typed in
# by hand, so this cell stays in sync with the search if the data or the grid
# changes. The seed is fixed for reproducibility.

rf2 = RandomForestRegressor(**rf_grid.best_params_, random_state=42)

rf2.fit(X_train, y_train)

# score() is R^2 for a regressor, so the "error" printed here is 1 - R^2.
print("The new training error is : %.5f" % (1 - rf2.score(X_train, y_train)))
print("The new test error is: %.5f" % (1 - rf2.score(X_test, y_test)))

The new training error is : 0.05492
The new test error is: 0.12185


In [20]:
# Predict the target for the held-out test features with the tuned forest (rf2).
# Stored under a separate name so it can be compared with the baseline y_pred.

y_new_pred = rf2.predict(X_test)

In [22]:
# Side-by-side test metrics: baseline forest (y_pred) first, then the tuned
# forest (y_new_pred). Each MSE is computed once and reused for its root.
old_mse = mean_squared_error(y_test, y_pred)
new_mse = mean_squared_error(y_test, y_new_pred)

print("The r2 score is : %.5f" % r2_score(y_test, y_pred))
print("The mse is : %.5f" % old_mse)
print("The mse sqrt is : %.5f" % (old_mse ** .5))

print("The new r2 score is : %.5f" % r2_score(y_test, y_new_pred))
print("The new mse is : %.5f" % new_mse)
print("The new mse sqrt is : %.5f" % (new_mse ** .5))

The r2 score is : 0.87991
The mse is : 49297.64908
The mse sqrt is : 222.03074
The new r2 score is : 0.87815
The new mse is : 50023.00854
The new mse sqrt is : 223.65824
