In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# boosting & bagging
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
# Fold count constant. It is not referenced by any of the cells visible below.
# NOTE(review): presumably meant for TimeSeriesSplit / cross_validate (both imported
# above) — confirm it is still needed, otherwise remove it.
n_splits = 5

In [3]:
# Voting & Stacking
from sklearn.ensemble import VotingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from joblib import dump, load

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

# Voting Classifier

## les différentes combinaisons possibles de LR + RF + GBC + XGB en vote hard

In [54]:
# Hard-voting benchmark.
# Fits a VotingClassifier (voting='hard') for every combination of two or more of
# the four tuned base models (LR, RF, GBC, XGB) and records each ensemble's
# score on the held-out rows in the `vclf` dict.

# --- Load -------------------------------------------------------------------
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# --- Encode player names ----------------------------------------------------
# A single LabelEncoder is fitted on both name columns together so a given
# player gets the same integer code whether listed as player 1 or player 2.
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Drop the raw name columns (non-mutating form so re-running the cell is safe).
df = df.drop(columns=['player_1_name', 'player_2_name'])

# One-hot encode whatever object/categorical columns remain.
df = pd.get_dummies(df)

# --- Split ------------------------------------------------------------------
# Ordered split without shuffling: the first 70% of rows train, the rest test.
# NOTE(review): this assumes the CSV rows are in chronological order — confirm.
nb_rows_train = int(round(len(df) * 0.7, 0))
X_all = df.drop(columns='player_1_win')
y_all = df['player_1_win']
X_train = X_all.iloc[:nb_rows_train]
y_train = y_all.iloc[:nb_rows_train]
X_test = X_all.iloc[nb_rows_train:]
y_test = y_all.iloc[nb_rows_train:]

# --- Standardise ------------------------------------------------------------
# The scaler is fitted on the training rows only, then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Base models (fixed hyper-parameters) -----------------------------------
clf_lr = LogisticRegression(C=0.005, max_iter=2000, penalty='l1', random_state=42, solver='liblinear')
clf_rf = RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=75, random_state=10)
clf_gbc = GradientBoostingClassifier(learning_rate=0.1, max_depth=1, n_estimators=60, random_state=0)
clf_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
                            colsample_bytree=1, learning_rate=0.1, max_depth=2, min_child_weight=15,
                            n_estimators=50, n_jobs=-1, random_state=0, subsample=1, tree_method='exact')

base_estimators = {'lr': clf_lr, 'rf': clf_rf, 'gbc': clf_gbc, 'xgb': clf_xgb}

# Combinations to evaluate, in the historical vclf_1 ... vclf_11 order so that
# both the variable numbering and the insertion order of `vclf` are unchanged.
estimator_combinations = [
    ('lr', 'rf', 'gbc', 'xgb'),  # vclf_1
    ('rf', 'gbc', 'xgb'),        # vclf_2
    ('lr', 'gbc', 'xgb'),        # vclf_3
    ('lr', 'rf', 'xgb'),         # vclf_4
    ('lr', 'rf', 'gbc'),         # vclf_5
    ('lr', 'rf'),                # vclf_6
    ('lr', 'gbc'),               # vclf_7
    ('lr', 'xgb'),               # vclf_8
    ('rf', 'gbc'),               # vclf_9
    ('rf', 'xgb'),               # vclf_10
    ('gbc', 'xgb'),              # vclf_11
]

# Maps "dict_keys([...])" of the estimator names -> score on the test rows.
vclf = {}
fitted_voters = []
for names in estimator_combinations:
    # VotingClassifier fits clones of the estimators it is given, so sharing
    # the same base model objects across ensembles does not leak fitted state.
    voter = VotingClassifier(estimators=[(name, base_estimators[name]) for name in names], voting='hard')
    voter.fit(X_train_scaled, y_train)
    vclf[str(voter.named_estimators_.keys())] = voter.score(X_test_scaled, y_test)
    fitted_voters.append(voter)

# Keep the individual names: later cells refer to vclf_6 and vclf_7 directly.
(vclf_1, vclf_2, vclf_3, vclf_4, vclf_5, vclf_6,
 vclf_7, vclf_8, vclf_9, vclf_10, vclf_11) = fitted_voters

In [55]:
# Display the test-set score recorded for each hard-voting estimator combination.
vclf

{"dict_keys(['lr', 'rf', 'gbc', 'xgb'])": 0.6744186046511628,
 "dict_keys(['rf', 'gbc', 'xgb'])": 0.673953488372093,
 "dict_keys(['lr', 'gbc', 'xgb'])": 0.6753488372093023,
 "dict_keys(['lr', 'rf', 'xgb'])": 0.6747674418604651,
 "dict_keys(['lr', 'rf', 'gbc'])": 0.6740697674418604,
 "dict_keys(['lr', 'rf'])": 0.6753488372093023,
 "dict_keys(['lr', 'gbc'])": 0.6758139534883721,
 "dict_keys(['lr', 'xgb'])": 0.674186046511628,
 "dict_keys(['rf', 'gbc'])": 0.6733720930232558,
 "dict_keys(['rf', 'xgb'])": 0.6731395348837209,
 "dict_keys(['gbc', 'xgb'])": 0.6734883720930233}

In [56]:
# Re-score vclf_7 (LR + GBC), the highest-scoring entry in the table above.
vclf_7.score(X_test_scaled, y_test)

0.6758139534883721

In [62]:
# Repeats the joblib import already made in the "Voting & Stacking" import cell.
from joblib import dump, load

# Persist two hard-voting ensembles: vclf_7 (LR + GBC, top score) and
# vclf_6 (LR + RF, tied with LR + GBC + XGB for the second-highest score).
# NOTE(review): only the classifiers are saved; the fitted `scaler` and `le`
# they depend on are not persisted in this cell.
# NOTE(review): the soft-voting section further down dumps its own model to the
# same 'Best_VC_LR_GBC.joblib' path, which overwrites this file when the
# notebook is run top to bottom — consider distinct file names.
dump(vclf_7, 'Best_VC_LR_GBC.joblib')
dump(vclf_6, '2nd_Best_VC_LR_RF.joblib')

['2nd_Best_VC_LR_RF.joblib']

In [63]:
# Round-trip check: reload the persisted LR + GBC ensemble and score it on the
# test rows; the value should match the score measured before saving.
# score() predicts internally, so no separate predict() call is needed.
loaded_model = load('Best_VC_LR_GBC.joblib')
loaded_model.score(X_test_scaled, y_test)

0.6758139534883721

In [64]:
# Round-trip check: reload the persisted LR + RF ensemble and score it on the
# test rows; the value should match the score measured before saving.
# score() predicts internally, so no separate predict() call is needed.
loaded_model = load('2nd_Best_VC_LR_RF.joblib')
loaded_model.score(X_test_scaled, y_test)

0.6753488372093023

## les différentes combinaisons possibles de LR + RF + GBC + XGB en vote soft

In [5]:
# Soft-voting benchmark.
# Fits a VotingClassifier (voting='soft') for every combination of two or more of
# the four tuned base models (LR, RF, GBC, XGB) and records each ensemble's
# score on the held-out rows in the `vclf` dict.

# --- Load -------------------------------------------------------------------
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# --- Encode player names ----------------------------------------------------
# A single LabelEncoder is fitted on both name columns together so a given
# player gets the same integer code whether listed as player 1 or player 2.
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Drop the raw name columns (non-mutating form so re-running the cell is safe).
df = df.drop(columns=['player_1_name', 'player_2_name'])

# One-hot encode whatever object/categorical columns remain.
df = pd.get_dummies(df)

# --- Split ------------------------------------------------------------------
# Ordered split without shuffling: the first 70% of rows train, the rest test.
# NOTE(review): this assumes the CSV rows are in chronological order — confirm.
nb_rows_train = int(round(len(df) * 0.7, 0))
X_all = df.drop(columns='player_1_win')
y_all = df['player_1_win']
X_train = X_all.iloc[:nb_rows_train]
y_train = y_all.iloc[:nb_rows_train]
X_test = X_all.iloc[nb_rows_train:]
y_test = y_all.iloc[nb_rows_train:]

# --- Standardise ------------------------------------------------------------
# The scaler is fitted on the training rows only, then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Base models (fixed hyper-parameters) -----------------------------------
clf_lr = LogisticRegression(C=0.005, max_iter=2000, penalty='l1', random_state=42, solver='liblinear')
clf_rf = RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=75, random_state=10)
clf_gbc = GradientBoostingClassifier(learning_rate=0.1, max_depth=1, n_estimators=60, random_state=0)
clf_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
                            colsample_bytree=1, learning_rate=0.1, max_depth=2, min_child_weight=15,
                            n_estimators=50, n_jobs=-1, random_state=0, subsample=1, tree_method='exact')

base_estimators = {'lr': clf_lr, 'rf': clf_rf, 'gbc': clf_gbc, 'xgb': clf_xgb}

# Combinations to evaluate, in the historical vclf_1 ... vclf_11 order so that
# both the variable numbering and the insertion order of `vclf` are unchanged.
estimator_combinations = [
    ('lr', 'rf', 'gbc', 'xgb'),  # vclf_1
    ('rf', 'gbc', 'xgb'),        # vclf_2
    ('lr', 'gbc', 'xgb'),        # vclf_3
    ('lr', 'rf', 'xgb'),         # vclf_4
    ('lr', 'rf', 'gbc'),         # vclf_5
    ('lr', 'rf'),                # vclf_6
    ('lr', 'gbc'),               # vclf_7
    ('lr', 'xgb'),               # vclf_8
    ('rf', 'gbc'),               # vclf_9
    ('rf', 'xgb'),               # vclf_10
    ('gbc', 'xgb'),              # vclf_11
]

# Maps "dict_keys([...])" of the estimator names -> score on the test rows.
vclf = {}
fitted_voters = []
for names in estimator_combinations:
    # VotingClassifier fits clones of the estimators it is given, so sharing
    # the same base model objects across ensembles does not leak fitted state.
    voter = VotingClassifier(estimators=[(name, base_estimators[name]) for name in names], voting='soft')
    voter.fit(X_train_scaled, y_train)
    vclf[str(voter.named_estimators_.keys())] = voter.score(X_test_scaled, y_test)
    fitted_voters.append(voter)

# Keep the individual names: later cells refer to vclf_4 and vclf_7 directly.
(vclf_1, vclf_2, vclf_3, vclf_4, vclf_5, vclf_6,
 vclf_7, vclf_8, vclf_9, vclf_10, vclf_11) = fitted_voters

In [6]:
# Display the test-set score recorded for each soft-voting estimator combination.
vclf

{"dict_keys(['lr', 'rf', 'gbc', 'xgb'])": 0.6743023255813954,
 "dict_keys(['rf', 'gbc', 'xgb'])": 0.6727906976744186,
 "dict_keys(['lr', 'gbc', 'xgb'])": 0.6744186046511628,
 "dict_keys(['lr', 'rf', 'xgb'])": 0.6747674418604651,
 "dict_keys(['lr', 'rf', 'gbc'])": 0.6743023255813954,
 "dict_keys(['lr', 'rf'])": 0.6738372093023256,
 "dict_keys(['lr', 'gbc'])": 0.6748837209302325,
 "dict_keys(['lr', 'xgb'])": 0.6748837209302325,
 "dict_keys(['rf', 'gbc'])": 0.6722093023255814,
 "dict_keys(['rf', 'xgb'])": 0.6722093023255814,
 "dict_keys(['gbc', 'xgb'])": 0.674186046511628}

In [7]:
# Repeats the joblib import already made in the "Voting & Stacking" import cell.
from joblib import dump, load

# Persist two soft-voting ensembles: vclf_7 (LR + GBC, tied with LR + XGB for
# the top score) and vclf_4 (LR + RF + XGB, next highest).
# NOTE(review): 'Best_VC_LR_GBC.joblib' is the same path the hard-voting section
# dumps its vclf_7 to, so this call overwrites that file when the notebook is
# run top to bottom — consider a distinct name for the soft-voting model.
# NOTE(review): only the classifiers are saved; the fitted `scaler` and `le`
# they depend on are not persisted in this cell.
dump(vclf_7, 'Best_VC_LR_GBC.joblib')
dump(vclf_4, '2nd_Best_VC_LR_RF_XGB.joblib')

['2nd_Best_VC_LR_RF_XGB.joblib']

In [8]:
# Round-trip check: reload the persisted LR + GBC ensemble and score it on the
# test rows; the value should match the score measured before saving.
# score() predicts internally, so no separate predict() call is needed.
loaded_model = load('Best_VC_LR_GBC.joblib')
loaded_model.score(X_test_scaled, y_test)

0.6748837209302325

In [9]:
# Round-trip check: reload the persisted LR + RF + XGB ensemble and score it on
# the test rows; the value should match the score measured before saving.
# score() predicts internally, so no separate predict() call is needed.
loaded_model = load('2nd_Best_VC_LR_RF_XGB.joblib')
loaded_model.score(X_test_scaled, y_test)

0.6747674418604651

# Stacking Classifier

In [50]:
# Stacking experiment: logistic regression and random forest as base learners,
# with the same logistic regression configuration as the final estimator.

# Read the prepared match dataset.
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Integer-encode player names with one encoder fitted on both name columns,
# so a player has the same code on either side of a match.
le = LabelEncoder()
all_player_names = pd.concat([df['player_1_name'], df['player_2_name']], axis=0)
le.fit(all_player_names)
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# The raw name columns are no longer needed once encoded.
df = df.drop(columns=['player_1_name', 'player_2_name'])

# Expand remaining object/categorical columns into indicator columns.
df = pd.get_dummies(df)

# Ordered 70/30 split: leading rows for training, trailing rows for testing.
nb_rows_train = int(round(len(df) * 0.7, 0))
X_train = df.drop(columns='player_1_win').iloc[:nb_rows_train]
X_test = df.drop(columns='player_1_win').iloc[nb_rows_train:]
y_train = df['player_1_win'].iloc[:nb_rows_train]
y_test = df['player_1_win'].iloc[nb_rows_train:]

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base models with fixed hyper-parameters.
clf_lr = LogisticRegression(
    C=0.005,
    max_iter=2000,
    penalty='l1',
    random_state=42,
    solver='liblinear',
)
clf_rf = RandomForestClassifier(
    criterion='entropy',
    max_depth=5,
    n_estimators=75,
    random_state=10,
)
clf_gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=1,
    n_estimators=60,
    random_state=0,
)
clf_xgb = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=15,
    n_estimators=50,
    n_jobs=-1,
    random_state=0,
    subsample=1,
    tree_method='exact',
)

# Stack LR + RF under an LR final estimator. No `cv` argument is passed, so the
# StackingClassifier uses its default internal cross-validation.
# NOTE(review): clf_gbc and clf_xgb are defined above but not used by sclf_1 in
# this cell — presumably kept for later stacking variants; confirm.
stack_members = [('lr', clf_lr), ('rf', clf_rf)]
sclf_1 = StackingClassifier(estimators=stack_members, final_estimator=clf_lr)
sclf_1.fit(X_train_scaled, y_train)

# Last expression of the cell: score on the held-out rows.
sclf_1.score(X_test_scaled, y_test)

0.6738372093023256