In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, log_loss, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
%matplotlib inline
plt.style.use('ggplot')

from features.data_provider import get_train_and_test_dataset, get_feature_columns, get_whole_dataset
from models.helpers import get_feature_importance
from notebook_helpers import run_grid_search_for_outcome, get_cv_grid_search_arguments

In [2]:
# Unpack the two values returned by get_whole_dataset for the "home_win" outcome.
# Both are handed to the grid-search helpers in the next cell.
# NOTE(review): presumably X is the feature matrix and y the outcome labels --
# confirm against features.data_provider.
X, y = get_whole_dataset("home_win")

In [3]:
# Fixed RandomForest settings shared by every grid-search candidate:
# bootstrap sampling with out-of-bag scoring, n_jobs=-1, and 2000 trees.
params = dict(oob_score=True, bootstrap=True, n_jobs=-1, n_estimators=2000)

# Build the search arguments from the fixed settings and X, then run the
# search for the outcome on (X, y); `results` holds whatever table the helper returns.
arguments = get_cv_grid_search_arguments(params, X)
results = run_grid_search_for_outcome(arguments, X, y)

In [4]:
# Persist the grid-search table, then display it ranked:
# highest test accuracy first, ties broken by the lowest test log loss.
results.to_csv("outcome_hyperparam_optimization.csv")
results.sort_values(
    by=['test_acc', 'test_logloss'],
    ascending=[False, True],
)

Unnamed: 0,max_depth,max_features,min_samples_leaf,test_acc,test_logloss
36,12.0,sqrt,10,0.572277,0.927287
46,,sqrt,10,0.572277,0.928195
47,,log2,10,0.570377,0.929089
34,12.0,sqrt,5,0.569956,0.93095
19,5.0,log2,15,0.569954,0.927739
38,12.0,sqrt,15,0.569534,0.926176
11,5.0,log2,1,0.569532,0.928157
13,5.0,log2,3,0.569321,0.928183
48,,sqrt,15,0.569111,0.926653
35,12.0,log2,5,0.568901,0.931216


In [None]:
# Unpack the four values returned by get_train_and_test_dataset for "home_win".
# X_train / y_train are used to fit the classifiers and X_test / y_test to score
# them in the n_estimators study further down.
# NOTE(review): how the split is made is not visible here -- confirm in features.data_provider.
X_train, y_train, X_test, y_test = get_train_and_test_dataset("home_win")
def get_labeled_df(s):
    """Return a labelled copy of ``s`` with readable "home_win" outcomes.

    Parameters
    ----------
    s : pandas.Series or pandas.DataFrame
        Data containing a "home_win" column (or a Series named "home_win")
        holding the codes 1, 0 and -1.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the "home_win" codes are replaced by
        "Win" (1), "Draw" (0) and "Lose" (-1). Values outside that set and
        all other columns are left unchanged.

    Raises
    ------
    KeyError
        If the resulting frame has no "home_win" column.
    """
    # Work on an explicit copy: pd.DataFrame(obj) may share data with `obj`,
    # and the caller's labels must stay numeric for model fitting.
    df = pd.DataFrame(s).copy()
    outcome_names = {1: "Win", 0: "Draw", -1: "Lose"}
    # Relabel only the "home_win" column; a row-wise boolean assignment
    # (df[mask] = ...) would overwrite every column of the matching rows.
    df["home_win"] = df["home_win"].replace(outcome_names)
    return df

# Compare the class balance of the train and test splits side by side.
# A single plt.subplots call creates the figure; a separate plt.figure() call
# beforehand would only leave an empty, unused figure behind.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
get_labeled_df(y_train)["home_win"].value_counts().plot.bar(ax=axes[0], title="Train")
get_labeled_df(y_test)["home_win"].value_counts().plot.bar(ax=axes[1], title="Test");

In [None]:
# Hyperparameters used for the n_estimators stability study.
# max_depth is an int: scikit-learn expects an integer (or None) for tree depth.
# NOTE(review): the grid-search table displayed earlier ranks
# max_depth=12 / max_features=sqrt / min_samples_leaf=10 first -- confirm that
# max_depth=5 / min_samples_leaf=5 is the intended choice.
best_params = {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5}
params.pop("n_estimators", None)
params.update(best_params)

# Forest sizes to evaluate. Left empty, so the study below is skipped;
# sizes tried previously: [200, 500, 1000, 2000, 5000].
n_estimators = []
N_REPEATS = 10  # independent fits per forest size, to estimate run-to-run spread
for k in n_estimators:
    # Loop-invariant for the repeats below, so set it once per forest size.
    params["n_estimators"] = k
    # Named separately so the grid-search `results` table is not overwritten.
    test_accuracies = []
    for repeat in range(N_REPEATS):
        clf = RandomForestClassifier(**params)
        clf.fit(X_train, y_train)
        test_accuracies.append(clf.score(X_test, y_test))

    print("K estimators: ", k)
    print("std: ", np.std(test_accuracies))
    print("Mean: ", np.mean(test_accuracies))