In [1]:
import functools
import numpy as np
import pandas as pd
import random
from time import time
from scipy.stats import randint as sp_randint
from multiprocessing import Pool, cpu_count

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
%matplotlib inline
plt.style.use('ggplot')

from features.data_provider import get_train_and_test_dataset, get_feature_columns, get_whole_dataset
from simulation.predictor import ScorePredictor, MaxProbabilityScorePredictor
from simulation.simulation import run_simulation
from models.grid_search import run_custom_grid_search
from models.score_model import get_model
from models.helpers import get_best_params

In [4]:
home = get_train_and_test_dataset("home_score")
away = get_train_and_test_dataset("away_score")

X_train = pd.concat([home[0], away[0]])
y_train = pd.concat([home[1], away[1]])

X_test = pd.concat([home[2], away[2]])
y_test = pd.concat([home[3], away[3]])

X_test_home = home[2]
X_test_away = away[2]

_, _, _, true_outcomes = get_train_and_test_dataset("home_win")

def get_model_accuracy(params):
    predicted_outcomes = []               

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    res = {
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
        "max_features": params["max_features"],
        "n_estimators": params["n_estimators"],
    }

    for i in range(X_test_home.shape[0]):
        home_fv = [X_test_home.iloc[i].as_matrix()]
        away_fv = [X_test_away.iloc[i].as_matrix()]
        home_mu = model.predict(home_fv)
        away_mu = model.predict(away_fv)

        goal_matrix = ScorePredictor.get_goal_matrix(home_mu, away_mu)
        away_win, draw, home_win = ScorePredictor.get_outcome_probabilities(goal_matrix)

        if home_win > away_win and home_win > draw:
            outcome = 1
        elif away_win > home_win and away_win > draw:
            outcome = -1
        elif draw > home_win and draw > away_win:
            outcome = 0                
        else:
            print("IDENTICAL PROBABILITIES", away_win, draw, home_win)
            outcome = 1
        predicted_outcomes.append(outcome)

    print(accuracy_score(true_outcomes.values, predicted_outcomes))

    res["test_acc"] = accuracy_score(true_outcomes.values, predicted_outcomes)
    res["test_mae"] = mean_absolute_error(true_outcomes.values, predicted_outcomes)
    res["test_mse"] = mean_squared_error(true_outcomes.values, predicted_outcomes)

    return res

def run_custom_grid_search(org_params):
    start = time()

    results = []
    param_array = []
    for depth in [3, 5, 8, 12, None]:
        for min_samples in [1, 3, 5, 10, 15]:
            for max_features in ["sqrt", "log2"]:
                params = org_params.copy()
                params["max_depth"] = depth
                params["min_samples_leaf"] = min_samples
                params["max_features"] = max_features
                param_array.append(params)
    
    pool = Pool(cpu_count())
    results = pool.map(get_model_accuracy, param_array)
    print("Parameter estimation took: ", time() - start)
    return results

In [5]:
params = {"oob_score":True, "bootstrap":True, "n_estimators": 5000} 
results = run_custom_grid_search(params)
get_best_params(results)

0.5519125683060109
0.5519125683060109
0.5511319281811085
0.5519125683060109
0.5519125683060109
0.5573770491803278
0.5573770491803278
0.5589383294301327
0.5589383294301327
0.5581576893052302
0.5636221701795472
0.5604996096799375
0.5480093676814989
0.5480093676814989
0.5480093676814989
0.5480093676814989
0.5472287275565965
0.5597189695550351
0.5589383294301327
0.5589383294301327
0.5597189695550351
0.5589383294301327
0.5589383294301327
0.5581576893052302
0.5620608899297423
0.5589383294301327
0.5604996096799375
0.555815768930523
0.5526932084309133
0.5565964090554254
0.5581576893052302
0.5565964090554254
0.5472287275565965
0.5550351288056206
0.546448087431694
0.5487900078064013
0.5612802498048399
0.5612802498048399
0.5636221701795472
0.5589383294301327
0.5589383294301327
0.5597189695550351
0.5581576893052302
0.555815768930523
0.5550351288056206
0.5542544886807181
0.5495706479313037
0.5417642466822795
0.5604996096799375
0.5589383294301327
Parameter estimation took:  3648.574865579605


TypeError: list indices must be integers or slices, not str

In [6]:
def get_best_params(results, verbose=False):
    best_params = results.loc[results['test_acc'].idxmax(), ["max_depth", "max_features", "min_samples_leaf"]]
    if verbose:
        columns = ["max_depth", "max_features", "min_samples_leaf", "test_acc", "wc_acc"]
        print("Test set bests")
        print(results.loc[results['test_acc'].idxmax(), columns])
        print(results.loc[results['test_mae'].idxmin(), columns])
        print(results.loc[results['test_mse'].idxmin(), columns])
        print()
        print("BEST COMBO")
        print(best_params)
     
    best_params = best_params.replace({np.nan:None})
    return best_params.to_dict()

rr = pd.DataFrame(results)
get_best_params(rr)

{'max_depth': 8.0, 'max_features': 'sqrt', 'min_samples_leaf': 1}

In [7]:
best_params = {'max_depth': 8.0, 'max_features': 'sqrt', 'min_samples_leaf': 1}
params.pop("n_estimators", None)
params.update(best_params)

n_estimators = [200, 500, 1000, 2000, 5000]
param_array = []
for k in n_estimators:
    k_params = params.copy()
    k_params["n_estimators"] = k
    param_array.append(params)


#pool = Pool(cpu_count())
#results = pool.map(get_model_accuracy, param_array)

Unnamed: 0,max_depth,max_features,min_samples_leaf,test_acc,test_mae,test_mse
0,3.0,sqrt,1,0.551913,0.679938,1.143638
1,3.0,log2,1,0.548009,0.687744,1.159251
2,3.0,sqrt,3,0.551913,0.679938,1.143638
3,3.0,log2,3,0.548009,0.687744,1.159251
4,3.0,sqrt,5,0.551913,0.679938,1.143638
5,3.0,log2,5,0.548009,0.687744,1.159251
6,3.0,sqrt,10,0.551913,0.679938,1.143638
7,3.0,log2,10,0.547229,0.689305,1.162373
8,3.0,sqrt,15,0.551132,0.681499,1.14676
9,3.0,log2,15,0.548009,0.687744,1.159251
