In [1]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import matplotlib.pyplot as plt

from sklearn import metrics
from preprocessing import *
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the pickled inputs for this notebook. Each file is opened in a `with`
# block so the handle is closed as soon as the object has been read.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files produced by this project.
with open('X.p', 'rb') as fh:
    X = pickle.load(fh)
with open('y.p', 'rb') as fh:
    y = pickle.load(fh)
# `nas` is not referenced by any later cell visible in this notebook.
with open('nas.p', 'rb') as fh:
    nas = pickle.load(fh)

**Train, Val, and Test**

In [3]:
# We use the first half of the dataset for training, the next 25% for
# validation and the final 25% for testing.

# X and y are cut at the same positions below, so they must be equally long;
# fail early instead of silently producing misaligned splits.
assert len(X) == len(y), "X and y must contain the same number of rows"

# n_trn is half the row count; n_valid is cumulative (n_trn plus a quarter of
# the rows). Both are handed to split_vals_test as the two cut points.
# NOTE(review): split_vals_test comes from `preprocessing`; presumably it
# slices [:n_trn], [n_trn:n_valid], [n_valid:] — confirm there.
n_trn = len(X) // 2
n_valid = n_trn + (len(X) // 4)
X_train, X_valid, X_test = split_vals_test(X, n_trn, n_valid)
y_train, y_valid, y_test = split_vals_test(y, n_trn, n_valid)

**Baseline model**

In [4]:
m = RFR(n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m, X_train, y_train, X_valid, y_valid)

  warn("Some inputs do not have OOB scores. "


CPU times: user 39.4 s, sys: 170 ms, total: 39.5 s
Wall time: 7.93 s
[0.07975016230858385, 0.20360450275845207, 0.972181703926252, 0.8275231910300023, -6.274355518398083]


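**Analysis:** The baseline reaches a validation $R^2$ of 0.83. The negative OOB score (together with the "Some inputs do not have OOB scores" warning) indicates that we are not using enough estimators (decision trees) in our forest.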

**Basic Fine-Tuning**

In [None]:
# n_estimators

m1 = RFR(n_estimators=10, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=20, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=80, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

  warn("Some inputs do not have OOB scores. "


CPU times: user 38.8 s, sys: 228 ms, total: 39 s
Wall time: 7.71 s


  warn("Some inputs do not have OOB scores. "


CPU times: user 1min 18s, sys: 1.34 s, total: 1min 20s
Wall time: 13.2 s
CPU times: user 3min 15s, sys: 577 ms, total: 3min 16s
Wall time: 27.8 s


**Analysis:** Judging from the OOB score, adding estimators yields sharply diminishing returns when going from 40 to 80. Thus, we'll stick with 40 for now.

In [None]:
# min_samples_leaf

m1 = RFR(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=40, min_samples_leaf=5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, min_samples_leaf=10, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=40, min_samples_leaf=25, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

**Analysis:** Going from `min_samples_leaf=1` (the default, used in the previous cell) to `min_samples_leaf=3`, the OOB score and the validation $R^2$ both improve slightly, so we'll go with 3.

In [None]:
# max_feaures

m1 = RFR(n_estimators=40, min_samples_leaf=3, max_features=1, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, min_samples_leaf=3, max_features='sqrt', n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=40, min_samples_leaf=3, max_features='log2', n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

**Analysis:** `max_features=0.5` is the clear winner here. We'll now re-train the forest with more estimators to see if that yields any additional gains.

In [None]:
# max_feaures

m1 = RFR(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=60, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=80, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=100, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

**Analysis:** We see some minor improvement going from 40 to 80 estimators, but above that the changes are negligible. We'll stick with 80 estimators (`m3`) as our best model.

In [None]:
# Keep the 80-estimator forest as the best model. `m3` must be the one fitted
# in the cell directly above (n_estimators=80, min_samples_leaf=3,
# max_features=0.5); earlier cells reuse the same name, so run the cells in order.
best_model = m3
# The `with` block guarantees the pickle is flushed and the file closed.
with open('nb5_best.p', 'wb') as fh:
    pickle.dump(best_model, fh)

In [None]:
# Reload the saved forest from disk; the `with` block closes the file handle
# as soon as the object has been read.
with open('nb5_best.p', 'rb') as fh:
    best = pickle.load(fh)

In [None]:
# Persist each split as an [X, y] list in its own pickle file. Each file is
# written inside a `with` block so it is flushed and closed before the next
# one is opened.
for out_path, xy_pair in (
    ('train.p', [X_train, y_train]),
    ('valid.p', [X_valid, y_valid]),
    ('test.p', [X_test, y_test]),
):
    with open(out_path, 'wb') as fh:
        pickle.dump(xy_pair, fh)