In [2]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import matplotlib.pyplot as plt

from sklearn import metrics
from preprocessing import *
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the pickled objects X, y and nas from the working directory.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read, instead of whenever the garbage collector gets to it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that were produced by a trusted step of this project.
with open('X.p', 'rb') as f:
    X = pickle.load(f)
with open('y.p', 'rb') as f:
    y = pickle.load(f)
with open('nas.p', 'rb') as f:
    nas = pickle.load(f)

In [5]:
# Display the target array as a quick sanity check of what was loaded.
y

array([12.46458334, 12.54966235, 12.20557252, ..., 12.89921983,
       12.56024446, 12.64109656])

**Train, Val, and Test**

In [3]:
# Sequential split: the first 50% of the rows are used for training, the
# following 25% for validation, and the remaining 25% for testing.

n_rows = len(X)
n_trn = n_rows // 2               # first cut point handed to split_vals_test
n_valid = n_trn + (n_rows // 4)   # second cut point handed to split_vals_test

# The same two cut points are applied to the features and to the target so the
# resulting pieces stay aligned.
X_train, X_valid, X_test = split_vals_test(X, n_trn, n_valid)
y_train, y_valid, y_test = split_vals_test(y, n_trn, n_valid)

**Baseline model**

In [4]:
# Baseline: a random forest regressor with the library's default
# hyperparameters. Only n_jobs (-1 = use all cores) and oob_score (also compute
# the out-of-bag score) are set explicitly.
# NOTE(review): no random_state is given, so the scores below will differ
# slightly from run to run.
m = RFR(n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
# print_score comes from the star-import of `preprocessing`; it presumably
# prints [train RMSE, valid RMSE, train r^2, valid r^2, oob score] -- TODO confirm.
print_score(m, X_train, y_train, X_valid, y_valid)

  warn("Some inputs do not have OOB scores. "


CPU times: user 39.4 s, sys: 170 ms, total: 39.5 s
Wall time: 7.93 s
[0.07975016230858385, 0.20360450275845207, 0.972181703926252, 0.8275231910300023, -6.274355518398083]


**Analysis:** Validation r^2 is 0.83. The negative oob score (together with the "Some inputs do not have OOB scores" warning) indicates that we are not using enough estimators (decision trees) in our forest.

**Basic Fine-Tuning**

In [5]:
# n_estimators

m1 = RFR(n_estimators=10, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=20, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=80, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

  warn("Some inputs do not have OOB scores. "


CPU times: user 38.8 s, sys: 228 ms, total: 39 s
Wall time: 7.71 s


  warn("Some inputs do not have OOB scores. "


CPU times: user 1min 18s, sys: 1.34 s, total: 1min 20s
Wall time: 13.2 s
CPU times: user 3min 15s, sys: 577 ms, total: 3min 16s
Wall time: 27.8 s
CPU times: user 6min 28s, sys: 535 ms, total: 6min 29s
Wall time: 52.5 s
[0.07975358104638124, 0.20478750681643823, 0.9721793188402297, 0.825513082888144, -6.265388928154166]
[0.07219963204187674, 0.19739455245250206, 0.9771998689981941, 0.8378838511326102, 0.7870207864999709]
[0.06791759552642464, 0.19431219684431486, 0.9798241435914183, 0.8429072741144209, 0.8564221401928943]
[0.06564020962977164, 0.19256999668239622, 0.9811545161374319, 0.8457116275241453, 0.8632949473340099]


**Analysis:** The gains from adding estimators diminish significantly (judging from the oob score) when going from 40 to 80. Thus, we'll stick with 40 for now.

In [6]:
# min_samples_leaf
m1 = RFR(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=40, min_samples_leaf=5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, min_samples_leaf=10, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=40, min_samples_leaf=25, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

CPU times: user 2min 44s, sys: 196 ms, total: 2min 44s
Wall time: 23.5 s
CPU times: user 2min 35s, sys: 187 ms, total: 2min 35s
Wall time: 23.2 s
CPU times: user 2min 16s, sys: 169 ms, total: 2min 16s
Wall time: 19.4 s
CPU times: user 2min 5s, sys: 183 ms, total: 2min 5s
Wall time: 18.6 s
[0.0851868599293174, 0.19382815918790186, 0.9682595855036173, 0.8436889449371754, 0.8573316389709049]
[0.10270202032333724, 0.1949022560433158, 0.9538655499611275, 0.8419517525424696, 0.8564470287231085]
[0.1312313732359156, 0.19902351779966898, 0.9246743790188436, 0.835197138354816, 0.8509094078141303]
[0.16477800055065653, 0.2074100426056273, 0.8812411916037944, 0.8210154626636681, 0.837005371786386]


**Analysis:** Going from min_samples_leaf = 1 (the default, used in the previous cell) to 3, we see the oob score and validation r^2 improve slightly, so we'll go with 3.

In [7]:
# max_feaures

m1 = RFR(n_estimators=40, min_samples_leaf=3, max_features=1, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=40, min_samples_leaf=3, max_features='sqrt', n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=40, min_samples_leaf=3, max_features='log2', n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

CPU times: user 2.71 s, sys: 111 ms, total: 2.82 s
Wall time: 1.62 s
CPU times: user 1min 50s, sys: 222 ms, total: 1min 50s
Wall time: 16.6 s
CPU times: user 16.6 s, sys: 149 ms, total: 16.7 s
Wall time: 3.76 s
CPU times: user 10.2 s, sys: 165 ms, total: 10.4 s
Wall time: 3.17 s
[0.3127580206869916, 0.3548328579380348, 0.5721571428331118, 0.4761540260934751, 0.4932793707139609]
[0.0866491451808251, 0.19280843133471592, 0.9671605451439951, 0.8453293201053674, 0.8601555772093379]
[0.11005641846554352, 0.2092943441313079, 0.9470216869801372, 0.8177485733206248, 0.8367962656639258]
[0.13489458498261578, 0.23266414648865857, 0.9204103845252798, 0.7747758899726638, 0.7945185126551586]


**Analysis:** max_features = 0.5 is the clear winner here. I'll now re-train the forest with more estimators to see if that yields any additional gains.

In [8]:
# max_feaures

m1 = RFR(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m2 = RFR(n_estimators=60, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m3 = RFR(n_estimators=80, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m4 = RFR(n_estimators=100, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
%time m1.fit(X_train, y_train)
%time m2.fit(X_train, y_train)
%time m3.fit(X_train, y_train)
%time m4.fit(X_train, y_train)
print_score(m1, X_train, y_train, X_valid, y_valid)
print_score(m2, X_train, y_train, X_valid, y_valid)
print_score(m3, X_train, y_train, X_valid, y_valid)
print_score(m4, X_train, y_train, X_valid, y_valid)

CPU times: user 1min 56s, sys: 246 ms, total: 1min 56s
Wall time: 20.6 s
CPU times: user 2min 56s, sys: 351 ms, total: 2min 56s
Wall time: 30.2 s
CPU times: user 3min 57s, sys: 555 ms, total: 3min 57s
Wall time: 40.6 s
CPU times: user 4min 50s, sys: 826 ms, total: 4min 51s
Wall time: 48.8 s
[0.08685545606794018, 0.19321837672180223, 0.9670039780406083, 0.8446709055521905, 0.8604889946618992]
[0.08572343459389575, 0.19283930382215253, 0.9678584734072976, 0.8452797843990623, 0.8650044744559671]
[0.08531824533294104, 0.19182696862235832, 0.9681616022445615, 0.8468999686226957, 0.8664799947262254]
[0.0849487953752823, 0.19128687586703105, 0.9684367421494987, 0.8477608675171076, 0.8678186755535187]


**Analysis:** We see some minor improvement going from 40 to 80 estimators, but above that the changes are negligible. We'll stick with 80 as our best model.

In [9]:
# Keep m3 (the 80-tree forest from the previous cell) as the best model and
# persist it. The `with` block guarantees the file is flushed and closed before
# anything tries to read it back.
best_model = m3
with open('nb5_best.p', 'wb') as f:
    pickle.dump(best_model, f)

In [10]:
# Reload the saved model to check that it round-trips through pickle; the
# `with` block closes the file handle once the object has been read.
with open('nb5_best.p', 'rb') as f:
    best = pickle.load(f)

In [11]:
# Persist each split as a two-element list [features, target]. Every file is
# written inside a `with` block so it is flushed and closed deterministically.
for filename, split in (
    ('train.p', [X_train, y_train]),
    ('valid.p', [X_valid, y_valid]),
    ('test.p', [X_test, y_test]),
):
    with open(filename, 'wb') as f:
        pickle.dump(split, f)