In [1]:
import functools
import numpy as np
import pandas as pd
import random
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
%matplotlib inline
plt.style.use('ggplot')

from features.data_provider import get_train_test_wc_dataset, get_feature_columns, get_whole_dataset
from simulation.predictor import ScorePredictor
from simulation.simulation import run_simulation, get_match_win_probability
from models.grid_search import run_custom_grid_search
from models.helpers import get_best_params, get_feature_importance

In [2]:
# Fetch the split tuples for each target column from the project data provider.
# `home` / `away` stay bound so they can still be inspected afterwards.
home = get_train_test_wc_dataset("home_score")
away = get_train_test_wc_dataset("away_score")

# Stack the home-score rows on top of the away-score rows, position by position.
# Positions 0..5 of each tuple are consumed in this order:
#   0 -> X_train, 1 -> y_train, 2 -> X_test, 3 -> y_test, 4 -> X_wc, 5 -> y_wc
# NOTE(review): "wc" presumably means the World Cup hold-out set — confirm in
# features.data_provider.
X_train, y_train, X_test, y_test, X_wc, y_wc = (
    pd.concat([home[position], away[position]]) for position in range(6)
)

In [3]:
# Forest settings held fixed while the project's custom grid search runs.
# NOTE(review): the test and wc splits are handed to the search as well — confirm
# inside models.grid_search that they are not what the candidates are ranked on,
# otherwise later accuracy figures on those splits are optimistic.
params = dict(oob_score=True, bootstrap=True, n_jobs=-1, n_estimators=5000)
results = run_custom_grid_search(
    params,
    X_train, y_train,
    X_test, y_test,
    X_wc, y_wc,
    classifier=False,
)

Parameter estimation took:  956.6009328365326


In [5]:
# Reduce the grid-search `results` to the chosen hyper-parameter set using the
# project helper (the selection criterion lives in models.helpers and is not
# visible here), then echo it as the cell's output.
best_params = get_best_params(results)
best_params

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 5}

In [10]:
# Stability sweep: how do accuracy and its run-to-run spread change with the
# number of trees?
#
# For every forest size the model is refit several times. Each fit is scored by
# the fraction of exactly-matching rounded predictions on the test split plus
# the same fraction on the wc split, so the combined score lies in [0, 2].
# The standard deviation and mean of those combined scores are printed per size.

# Replace the grid-search tree count with the tuned hyper-parameters; the tree
# count itself is set per sweep step below.
params.pop("n_estimators", None)
params.update(best_params)

n_estimators = [200, 500, 1000, 2000, 5000, 10000]
n_repeats = 10

for k in n_estimators:
    params["n_estimators"] = k  # constant across the repeats of this size

    # Collected under its own name so the grid-search `results` object that
    # get_best_params() consumes is not overwritten by a list of floats.
    combined_scores = []
    for seed in range(n_repeats):
        # A distinct but fixed seed per repeat keeps the spread measurable
        # while making the printed statistics reproducible on re-run.
        rfr = RandomForestRegressor(random_state=seed, **params)
        rfr.fit(X_train, y_train)

        # Predictions are continuous; round to the nearest integer before
        # comparing with the observed values.
        test_acc = np.mean(np.around(rfr.predict(X_test)) == y_test)
        wc_acc = np.mean(np.around(rfr.predict(X_wc)) == y_wc)

        combined_scores.append(test_acc + wc_acc)

    print("K estimators: ", k)
    print("std: ", np.std(combined_scores))
    print("Mean: ", np.mean(combined_scores))

K estimators:  200
std:  0.008251554556017157
Mean:  0.7015886627906976
K estimators:  500
std:  0.003921554389215534
Mean:  0.6968546511627908
K estimators:  1000
std:  0.006350003804809443
Mean:  0.7022772529069767
K estimators:  2000
std:  0.005864203369363642
Mean:  0.7016820494186046
K estimators:  5000
std:  0.002514374616817575
Mean:  0.7017470930232558


KeyboardInterrupt: 

In [20]:
# Single-row prediction latency for forests of increasing size.
#
# For each tree count the forest is refit on the training split, then one
# predict() call on the last test row is timed and the elapsed seconds printed.

# Slice the probe row once, outside the timed region, so the measurement covers
# only predict() and not the pandas .tail() calls.
probe_X = X_test.tail(1)
y_true = y_test.tail(1)

for n_trees in (200, 5000, 10000):
    params["n_estimators"] = n_trees
    rfr = RandomForestRegressor(**params)
    rfr.fit(X_train, y_train)

    start = time()
    y_pred = rfr.predict(probe_X)
    elapsed = time() - start
    print(elapsed)

0.10280704498291016
1.022695541381836
1.9442050457000732
