In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# boosting & bagging
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
# Number of cross-validation folds — presumably meant for the CV splitters imported above; verify usage.
n_splits = 5

In [3]:
# Voting & Stacking
from sklearn.ensemble import VotingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

In [7]:
# Read the prepared dataset from disk.
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Encoding: a single LabelEncoder is fitted on the names of both player columns
# stacked together, so a given name gets the same integer code in either column.
le = LabelEncoder().fit(pd.concat([df['player_1_name'], df['player_2_name']]))
df = df.assign(
    player_1_name_encoded=le.transform(df['player_1_name']),
    player_2_name_encoded=le.transform(df['player_2_name']),
)

# Drop the raw name columns (their encoded versions were added just above), then
# one-hot encode the remaining object/categorical columns.
df = pd.get_dummies(df.drop(columns=['player_1_name', 'player_2_name']))

# Split: the first 70% of the rows become the training set and the remaining
# rows the test set; nothing is shuffled.
# NOTE(review): this presumably relies on the rows being in chronological order — confirm.
nb_rows_train = round(len(df) * 0.7)
X_train = df.iloc[:nb_rows_train].drop(columns='player_1_win')
y_train = df.iloc[:nb_rows_train]['player_1_win']
X_test = df.iloc[nb_rows_train:].drop(columns='player_1_win')
y_test = df.iloc[nb_rows_train:]['player_1_win']

# Standardisation: the scaler's statistics are learned from the training
# features only, and that same fitted transform is then applied to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Grid search over gradient-boosting hyper-parameters, scored by accuracy.
# The folds come from TimeSeriesSplit, so every validation fold sits after its
# training rows in row order. The fold count is read from the notebook-level
# `n_splits` constant rather than repeating the literal here, so the two cannot
# drift apart.
# NOTE(review): `scaler` was fitted on the whole training set before the folds
# are cut, so validation folds are not fully unseen by the preprocessing step;
# wrap scaler + classifier in a Pipeline if that matters for this study.
tscv = TimeSeriesSplit(n_splits=n_splits)
param_grid = {
    'n_estimators': range(40, 101, 20),  # 40, 60, 80, 100
    'learning_rate': [0.1, 0.2, 0.5],
    'max_depth': [1, 3],
}
clf = GradientBoostingClassifier(random_state=0)  # fixed seed so repeated runs agree
gridcv = GridSearchCV(clf, param_grid=param_grid, cv=tscv, scoring="accuracy")

# Model training: fits every parameter combination on each fold of the scaled
# training data.
gridcv.fit(X_train_scaled, y_train)

# Best estimator found and its mean cross-validated accuracy.
print(gridcv.best_estimator_)
print(gridcv.best_score_)

# Model evaluation: accuracy on the held-out (scaled) test set.
print(gridcv.score(X_test_scaled, y_test))

GradientBoostingClassifier(max_depth=1, n_estimators=60, random_state=0)
0.7007177033492823
0.674186046511628


In [8]:
# Display the hyper-parameter combination selected by the grid search.
gridcv.best_params_

{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 60}

In [9]:
from joblib import dump, load

# Persist the fitted GridSearchCV object (the whole search, not only its best
# estimator) to disk.
# NOTE(review): the search was fitted on standardised features, but `scaler` is
# not saved here — anyone reloading this file needs the same fitted scaler;
# confirm it is persisted elsewhere.
dump(gridcv, 'Best_GBC.joblib')

['Best_GBC.joblib']

In [10]:
# Reload the persisted grid-search object to check the save/load round trip.
# joblib.load unpickles the file, so only load files from a trusted source.
loaded_model = load('Best_GBC.joblib')

# NOTE: this is not the last expression of the cell, so with default notebook
# settings the predictions are computed but neither stored nor displayed.
loaded_model.predict(X_test_scaled)

# Accuracy of the reloaded model on the scaled test set; expected to equal the
# score obtained before saving.
loaded_model.score(X_test_scaled, y_test)

0.674186046511628