In [3]:
import functools
import numpy as np
import pandas as pd
import random
from time import time
from scipy.stats import randint as sp_randint
from multiprocessing import Pool, cpu_count

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Ask the inline backend to emit figures as SVG.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated in recent
# IPython releases in favour of matplotlib_inline.backend_inline — confirm the
# installed version before upgrading.
set_matplotlib_formats('svg')
# Render matplotlib figures inside the notebook's output cells.
%matplotlib inline
# Use matplotlib's "ggplot" style sheet for every figure drawn after this point.
plt.style.use('ggplot')

from features.data_provider import get_train_and_test_dataset, get_feature_columns, get_whole_dataset
from simulation.predictor import ScorePredictor, MaxProbabilityScorePredictor
from simulation.simulation import run_simulation
from models.grid_search import run_custom_grid_search
from models.score_model import get_model
from models.helpers import get_best_params

In [None]:
# Process pool with one worker per CPU core.
# NOTE(review): the pool is started before the functions defined later in this
# cell exist; with a fork-based start method the workers may not be able to
# resolve them — confirm.
pool = Pool(processes=cpu_count())

# Per-side score datasets. Judging by how they are indexed below, each one is
# (train features, train targets, test features, test targets).
home = get_train_and_test_dataset("home_score")
away = get_train_and_test_dataset("away_score")

# Home rows stacked on top of away rows, for both the train and the test split.
X_train, y_train = pd.concat([home[0], away[0]]), pd.concat([home[1], away[1]])
X_test, y_test = pd.concat([home[2], away[2]]), pd.concat([home[3], away[3]])

# Un-stacked test features, one frame per side.
X_test_home, X_test_away = home[2], away[2]

# Only the last element of the "home_win" dataset is kept.
_, _, _, true_outcomes = get_train_and_test_dataset("home_win")

def get_model_accuracy(params):
    """Fit one random forest and score its match-outcome predictions.

    The forest is trained on the module-level ``X_train`` / ``y_train``. For
    every test fixture it predicts a value from the home feature row
    (``X_test_home``) and from the away feature row (``X_test_away``), turns
    the pair into outcome probabilities via ``ScorePredictor`` and picks the
    most probable outcome (1 = home win, 0 = draw, -1 = away win). Those picks
    are compared with the module-level ``true_outcomes``.

    Kept as a module-level function so it can be handed to
    ``multiprocessing.Pool.map``.

    Args:
        params: Keyword arguments for ``RandomForestRegressor``. Must contain
            ``max_depth``, ``min_samples_leaf`` and ``max_features``.

    Returns:
        dict: the three hyper-parameters above plus ``test_acc``, ``test_mae``
        and ``test_mse`` computed on the predicted outcome labels.

    Raises:
        KeyError: if one of the three required hyper-parameters is missing.
    """
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    res = {key: params[key] for key in ("max_depth", "min_samples_leaf", "max_features")}

    # Predict every fixture in one call per side: forest predictions are
    # row-independent, so this yields the same values as predicting row by row
    # while traversing the trees only twice.
    home_mus = model.predict(X_test_home)
    away_mus = model.predict(X_test_away)

    predicted_outcomes = []
    for i in range(X_test_home.shape[0]):
        # One-element slices keep the (1,)-shaped arrays that a single-row
        # ``model.predict`` call would hand to ``get_goal_matrix``.
        goal_matrix = ScorePredictor.get_goal_matrix(home_mus[i:i + 1], away_mus[i:i + 1])
        away_win, draw, home_win = ScorePredictor.get_outcome_probabilities(goal_matrix)

        if home_win > away_win and home_win > draw:
            outcome = 1
        elif away_win > home_win and away_win > draw:
            outcome = -1
        elif draw > home_win and draw > away_win:
            outcome = 0
        else:
            # No strict maximum (tie between at least two outcomes):
            # report it and fall back to a home win.
            print("IDENTICAL PROBABILITIES", away_win, draw, home_win)
            outcome = 1
        predicted_outcomes.append(outcome)

    actual = true_outcomes.values
    test_acc = accuracy_score(actual, predicted_outcomes)
    print(test_acc)

    res["test_acc"] = test_acc
    res["test_mae"] = mean_absolute_error(actual, predicted_outcomes)
    res["test_mse"] = mean_squared_error(actual, predicted_outcomes)

    # Return the record instead of appending to a global list: the caller
    # collects results from the return values, and a list mutated inside a
    # worker process would never reach the parent anyway.
    return res

def run_custom_grid_search(org_params):
    """Evaluate every hyper-parameter combination of the grid in parallel.

    The grid is the Cartesian product of five ``max_depth`` values, five
    ``min_samples_leaf`` values and two ``max_features`` values (50 candidates).
    Each candidate is ``org_params`` with those three keys overridden and is
    scored by ``get_model_accuracy`` in a worker process.

    Note: this definition shadows the ``run_custom_grid_search`` imported from
    ``models.grid_search`` at the top of the notebook.

    Args:
        org_params: Base ``RandomForestRegressor`` keyword arguments shared by
            all candidates. It is not modified.

    Returns:
        list: one entry per candidate, in grid order, holding whatever
        ``get_model_accuracy`` returned for it.
    """
    start = time()

    # Build an independent dict per combination. Re-using and re-appending a
    # single dict would leave every list entry pointing at the same object,
    # i.e. 50 copies of the last combination.
    param_array = [
        dict(
            org_params,
            max_depth=depth,
            min_samples_leaf=min_samples,
            max_features=max_features,
        )
        for depth in [3, 5, 8, 12, None]
        for min_samples in [1, 3, 5, 10, 15]
        for max_features in ["sqrt", "log2"]
    ]

    # Start the pool here rather than relying on one created earlier: forked
    # workers only know the functions and data that existed when the pool was
    # started, and the context manager makes sure the workers are shut down.
    with Pool(cpu_count()) as workers:
        results = workers.map(get_model_accuracy, param_array)

    print("Parameter estimation took: ", time() - start)
    return results

In [None]:
# Forest settings held constant for every candidate; the grid search supplies
# max_depth, min_samples_leaf and max_features on top of these.
params = dict(oob_score=True, bootstrap=True, n_estimators=5000)
results = run_custom_grid_search(params)
# Left as the last expression so the notebook displays its value.
get_best_params(results)