In [1]:
import functools
import numpy as np
import pandas as pd
import random
from time import time
from scipy.stats import randint as sp_randint
from multiprocessing import Pool, cpu_count

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, accuracy_score, log_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Plotting configuration for the notebook: ask the inline backend for SVG
# figure output, enable inline rendering, and apply the 'ggplot' style sheet.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated in newer
# IPython releases in favour of matplotlib_inline.backend_inline -- confirm
# against the IPython version pinned for this project before upgrading.
set_matplotlib_formats('svg')
%matplotlib inline
plt.style.use('ggplot')

from features.data_provider import get_feature_columns, get_whole_dataset
from simulation.predictor import ScorePredictor, MaxProbabilityScorePredictor
from simulation.simulation import run_simulation
from models.grid_search import run_custom_grid_search
from models.score_model import get_model
from models.helpers import get_best_params
from notebook_helpers import get_cv_grid_search_arguments, run_grid_search_for_score, get_score_model_metrics

In [2]:
# Fetch the three datasets in one pass. Each get_whole_dataset call yields a
# pair: the first element is bound to the X* name and the second to the
# target name. For "home_win" only the second element is kept (as
# `outcomes`); the first is discarded into `_`.
# presumably the first element is a feature matrix and the second the target
# column named by the argument -- verify against features.data_provider.
(Xhome, yhome), (Xaway, yaway), (_, outcomes) = (
    get_whole_dataset(target_name)
    for target_name in ("home_score", "away_score", "home_win")
)

In [None]:
# Settings handed to the grid-search argument builder: out-of-bag scoring and
# bootstrap sampling switched on, with n_estimators fixed at 2000.
params = {
    "oob_score": True,
    "bootstrap": True,
    "n_estimators": 2000,
}

# Build the search arguments from the settings and Xhome, then run the search
# over the home/away score data together with the match outcomes.
arguments = get_cv_grid_search_arguments(params, Xhome)
results = run_grid_search_for_score(
    arguments,
    Xhome, yhome,
    Xaway, yaway,
    outcomes,
)

# Persist the results table to disk before displaying it.
# assumes `results` is a pandas DataFrame (it exposes to_csv / sort_values)
# -- TODO confirm against notebook_helpers.
results.to_csv("score_hyperparam_optimization.csv")

# Cell output: rows ordered by 'test_acc' descending, with ties broken by
# 'test_logloss' ascending.
results.sort_values(by=['test_acc', 'test_logloss'], ascending=[False, True])