In [1]:
import functools
import numpy as np
import pandas as pd
import random
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
%matplotlib inline
plt.style.use('ggplot')

from features.data_provider import get_train_test_wc_dataset, get_feature_columns, get_whole_dataset
from simulation.predictor import ScorePredictor
from simulation.simulation import run_simulation, get_match_win_probability

In [2]:
# One call per score column; each result is indexed at positions 0-5 below.
home = get_train_test_wc_dataset("home_score")
away = get_train_test_wc_dataset("away_score")

# Stack the home rows on top of the away rows, position by position.
# Going by the names they are bound to, positions 0-5 hold the train, test and
# World-Cup feature/target pairs — TODO confirm against data_provider.
X_train, y_train, X_test, y_test, X_wc, y_wc = (
    pd.concat([home[position], away[position]]) for position in range(6)
)

In [3]:
def get_feature_importance(clf):
    """Print each feature with its importance, highest importance first.

    Feature names come from ``get_feature_columns()`` and are paired
    positionally with ``clf.feature_importances_``.  One ``name: importance``
    line is printed per pair; nothing is returned.
    """
    ranked = list(zip(get_feature_columns(), clf.feature_importances_))
    # Stable descending sort: features with equal importance keep column order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    for name, score in ranked:
        print(f"{name}: {score}")

In [4]:
def _rounded_accuracy(y_true, y_pred):
    """Return the fraction of predictions equal to the target after ``np.around``."""
    hits = np.around(y_pred) == np.asarray(y_true)
    # An empty evaluation set has no defined accuracy; report NaN instead of
    # dividing by zero.
    return float("nan") if hits.size == 0 else np.mean(hits)


def run_grid_search(clf, param_dist,
                    Xtrain, ytrain,
                    Xtest, ytest,
                    X_wc, y_wc,
                    custom_report_for_model=None,
                    cv=10):
    """Grid-search ``clf`` and print rounded-prediction accuracy on three sets.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    param_dist : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    Xtrain, ytrain
        Data the search is fitted on; also scored as the "TRAIN" set.
    Xtest, ytest
        Data scored as the "TEST" set.
    X_wc, y_wc
        Data scored as the "WC" set.
    custom_report_for_model : callable, optional
        When given, called with the best estimator after the metrics have
        been printed.
    cv : optional
        Forwarded to ``GridSearchCV``; defaults to 10 folds.

    Returns
    -------
    The search's ``best_estimator_``.

    Each printed metric is the share of rows whose prediction, rounded with
    ``np.around``, equals the true value.
    """
    grid_search = GridSearchCV(clf, param_grid=param_dist, cv=cv)
    grid_search.fit(Xtrain, ytrain)

    # With GridSearchCV's default refit=True, best_estimator_ has already been
    # retrained on all of Xtrain with the winning parameters.  Fitting it again
    # would only double the cost and, for an unseeded model, make the train
    # metric describe a different fit than the test/WC metrics.
    best_clf = grid_search.best_estimator_

    print("TRAIN SET METRICS:")
    print(_rounded_accuracy(ytrain, best_clf.predict(Xtrain)))
    print("BEST PARAMS: ", grid_search.best_params_)
    print()
    print()

    print("TEST SET METRICS:")
    # Scored against the Xtest/ytest arguments, not the notebook-level X_test.
    print(_rounded_accuracy(ytest, best_clf.predict(Xtest)))

    print()
    print()
    print("WC SET METRICS:")
    print(_rounded_accuracy(y_wc, best_clf.predict(X_wc)))

    print()
    print()
    if custom_report_for_model:
        custom_report_for_model(best_clf)
    return best_clf

In [5]:
# A fixed random_state makes the forest's bootstrap samples and per-split
# feature sub-sampling repeatable, so the metrics and importances printed
# below can be reproduced on a fresh kernel.
clf = RandomForestRegressor(oob_score=True, bootstrap=True, n_jobs=-1,
                            random_state=42)

# Tree count is held at 500 in this pass; only depth, leaf size and the
# per-split feature budget are searched.
param_grid = {"n_estimators": [500],
              "max_depth": [3, 5, 8, 12, None],
              "min_samples_leaf": [5, 10, 15],
              "max_features": ["sqrt", "log2"]}
best_clf = run_grid_search(clf, param_grid,
                           X_train, y_train,
                           X_test, y_test,
                           X_wc, y_wc,
                           custom_report_for_model=get_feature_importance)

TRAIN SET METRICS:
0.5472237100517765
BEST PARAMS:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 500}


TEST SET METRICS:
0.34


WC SET METRICS:
0.34375


elo_diff: 0.09995063840711806
rating_diff: 0.04756279146014814
potential_diff: 0.045647726196349346
reactions_diff: 0.04051493771223519
dribbling_diff: 0.03764668166244962
crossing_diff: 0.03506667132768345
away_goal_mean: 0.03314843873645014
internationl_repuatiotion_diff: 0.03274073862532834
short_passing_diff: 0.030970110177454567
home_goal_mean: 0.02950280487234112
ball_control_diff: 0.029331684158026623
long_shots_diff: 0.029091629500535142
home_goals_with_away: 0.028292038311723295
finishing_diff: 0.02685074565941523
long_passing_diff: 0.025355189112818838
fk_accuracy_diff: 0.024012722750545248
penalties_diff: 0.022249370928846806
shot_power_diff: 0.022146722116957524
age_diff: 0.021974938879645984
marking_diff: 0.02146304504734742
gk_kicking_diff: 0.02123142392986853
stamina_diff: 0.021078

In [8]:
# Carry over every hyper-parameter of the current best model except the tree
# count, which is the only value searched in this cell.
params = {
    name: value
    for name, value in best_clf.get_params().items()
    if name != 'n_estimators'
}
clf = RandomForestRegressor(**params)

param_grid = {"n_estimators": [200, 500, 1000, 2000, 5000]}
best_clf = run_grid_search(
    clf, param_grid,
    X_train, y_train,
    X_test, y_test,
    X_wc, y_wc,
    custom_report_for_model=get_feature_importance,
)

TRAIN SET METRICS:
0.5486520264238529
BEST PARAMS:  {'n_estimators': 500}


TEST SET METRICS:
0.34


WC SET METRICS:
0.34765625


elo_diff: 0.10511278045057323
potential_diff: 0.04803115846840489
rating_diff: 0.04727494283058675
reactions_diff: 0.03884626977991815
dribbling_diff: 0.03865266571293273
internationl_repuatiotion_diff: 0.03348620583350371
away_goal_mean: 0.03311032825730059
crossing_diff: 0.03261772932223328
ball_control_diff: 0.030862915616560623
home_goal_mean: 0.03022212538083436
long_shots_diff: 0.027936798905091917
home_goals_with_away: 0.02766470649114126
short_passing_diff: 0.026975058160217967
long_passing_diff: 0.02678373907501
finishing_diff: 0.025626508919293516
fk_accuracy_diff: 0.0249536079420678
penalties_diff: 0.022933211683837072
marking_diff: 0.021894659044483986
age_diff: 0.021774012700186512
shot_power_diff: 0.021770297944482202
standing_tackle_diff: 0.020980283868569552
gk_kicking_diff: 0.02087010891310977
height_diff: 0.020553307896433712
strength_diff: