In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, log_loss, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Render inline figures as SVG.
# NOTE(review): IPython.display.set_matplotlib_formats is reportedly deprecated in
# newer IPython releases in favour of matplotlib_inline.backend_inline — confirm
# before upgrading the environment.
set_matplotlib_formats('svg')
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

from features.data_provider import get_train_and_test_dataset, get_feature_columns, get_whole_dataset
from models.helpers import get_feature_importance
from notebook_helpers import run_custom_grid_search

In [2]:
# Load the train/test split returned by the project helper for the "home_win"
# key; presumably X_* are the feature tables and y_* the matching targets
# (TODO confirm against features.data_provider).
X_train, y_train, X_test, y_test = get_train_and_test_dataset("home_win")

In [4]:
# Fixed random-forest settings that are passed to the custom grid search.
# NOTE(review): the recorded output of this cell ends in
# "NameError: name 'get_best_params' is not defined"; that name is not used in
# this cell, so the failure presumably originates inside the helper — confirm
# that `results` is really assigned on a fresh kernel.
params = dict(
    oob_score=True,
    bootstrap=True,
    n_jobs=-1,
    n_estimators=5000,
)
results = run_custom_grid_search(
    params,
    X_train,
    y_train,
    X_test,
    y_test,
)

Parameter estimation took:  616.1557083129883


NameError: name 'get_best_params' is not defined

In [5]:
# Write the grid-search table to disk, then display it ordered by highest
# test accuracy first, with ties broken by lowest test log-loss.
results.to_csv("outcome_hyperparam_optimization.csv")
results.sort_values(
    by=["test_acc", "test_logloss"],
    ascending=[False, True],
)

Unnamed: 0,max_depth,max_features,min_samples_leaf,test_acc,test_logloss,test_mae,test_mse
19,5.0,log2,15,0.593286,0.921258,0.59719,0.978142
13,5.0,log2,3,0.592506,0.922284,0.59719,0.976581
25,8.0,log2,5,0.591725,0.918667,0.588603,0.949258
17,5.0,log2,10,0.591725,0.921624,0.600312,0.984387
15,5.0,log2,5,0.591725,0.921874,0.598751,0.979703
20,8.0,sqrt,1,0.590945,0.918577,0.589383,0.950039
21,8.0,log2,1,0.590945,0.919463,0.588603,0.947697
10,5.0,sqrt,1,0.590945,0.920089,0.600312,0.982826
11,5.0,log2,1,0.590945,0.921907,0.600312,0.982826
16,5.0,sqrt,10,0.590164,0.919649,0.602654,0.98829


In [None]:
# Fetch the "home_win" train/test split again.
# NOTE(review): this is the same call as in cell In [2]; presumably repeated so
# this cell can be run on its own — confirm the helper is deterministic.
X_train, y_train, X_test, y_test = get_train_and_test_dataset("home_win")
def get_labeled_df(s, column="home_win"):
    """Return a DataFrame built from ``s`` with outcome codes turned into labels.

    The codes ``1``, ``0`` and ``-1`` in ``column`` become ``"Win"``,
    ``"Draw"`` and ``"Lose"``. Any other value is left as it is.

    Parameters
    ----------
    s : pandas.Series or pandas.DataFrame
        Data containing a column (or, for a Series, a name) equal to
        ``column``.
    column : str, default "home_win"
        Name of the column holding the outcome codes.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input object is never modified.

    Raises
    ------
    KeyError
        If ``column`` is not present in the frame built from ``s``.
    """
    labels = {1: "Win", 0: "Draw", -1: "Lose"}
    # Work on an explicit copy so the caller's data can never be altered.
    df = pd.DataFrame(s).copy()
    # Map only the outcome column. A row-wide boolean assignment
    # (``df[mask] = "Win"``) would overwrite every column of the matching rows
    # and would write strings into an integer column in place.
    df[column] = df[column].map(lambda outcome: labels.get(outcome, outcome))
    return df

# Compare how often each outcome label occurs in the training and test targets.
# plt.subplots() creates its own figure, so no separate plt.figure() call is
# made (it would only leave an empty figure behind).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for ax, (title, target) in zip(axes, [("Train", y_train), ("Test", y_test)]):
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    get_labeled_df(target)["home_win"].value_counts().plot.bar(ax=ax, title=title)
    ax.set_xlabel("home_win")
    ax.set_ylabel("Count")
fig.tight_layout()

In [None]:
# Hyper-parameters chosen after the grid search.
# max_depth is written as an int: scikit-learn validates it as an integer (or
# None), so the float 5.0 is rejected by recent releases.
# NOTE(review): the ranked table recorded above has max_features='log2' with
# min_samples_leaf=15 as its top row — confirm these are the intended values.
best_params = {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5}
params.pop("n_estimators", None)
params.update(best_params)

# Forest sizes to compare. Left empty so the study is skipped by default;
# the previously used list was [200, 500, 1000, 2000, 5000].
n_estimators = []
for k in n_estimators:
    # The forest size is the same for every repeat, so set it once per k.
    params["n_estimators"] = k
    # Use a dedicated list so the grid-search DataFrame `results` is not
    # overwritten.
    accuracies = []
    # Repeat the fit 10 times to measure run-to-run spread of the test accuracy.
    for _ in range(10):
        clf = RandomForestClassifier(**params)
        clf.fit(X_train, y_train)
        accuracies.append(clf.score(X_test, y_test))

    print("K estimators: ", k)
    print("std: ", np.std(accuracies))
    print("Mean: ", np.mean(accuracies))