In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score

In [5]:
df = pd.read_csv('df_final_rolling_ready_50_with_bookodds.csv')

# Encodage, Dichotomisation...

In [6]:
# Encodage de player_1_name et player_2_name
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

In [7]:
# Suppression de player_1_name et player_2_name
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

In [8]:
# dichotomisation
df = pd.get_dummies(df)

# Split & Standardisation

In [9]:
# séparation en train et test selon le temps
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

In [10]:
# Normalisation et Standardisation des features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Machine à Vecteurs de Support

In [11]:
# entrainement d'un SVM
from sklearn import svm
clf1 = svm.SVC(random_state=0) # Par défaut C=1, gamma='scale' et kernel='rbf'

clf1.fit(X_train_scaled, y_train)

SVC(random_state=0)

In [12]:
print(clf1.score(X_train_scaled, y_train))
print(clf1.score(X_test_scaled, y_test))

0.7539120901026612
0.6627906976744186


In [13]:
from sklearn.metrics import classification_report
y_pred_1 = clf1.predict(X_test_scaled)
display(pd.crosstab(y_test, y_pred_1, rownames=['Classe réelle'], colnames=['Classe prédite']))
print(classification_report(y_test, y_pred_1))

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2862,1489
1,1411,2838


              precision    recall  f1-score   support

           0       0.67      0.66      0.66      4351
           1       0.66      0.67      0.66      4249

    accuracy                           0.66      8600
   macro avg       0.66      0.66      0.66      8600
weighted avg       0.66      0.66      0.66      8600



# SVM (changement des paramètres manuellement)

In [14]:
clf2 = svm.SVC(random_state=1, C=0.1, gamma = 0.1, kernel = 'poly')
clf2.fit(X_train_scaled, y_train)

SVC(C=0.1, gamma=0.1, kernel='poly', random_state=1)

In [15]:
print(clf2.score(X_train_scaled, y_train))
print(clf2.score(X_test_scaled, y_test))

0.9984550981760192
0.5616279069767441


In [16]:
y_pred_2 = clf2.predict(X_test_scaled)
display(pd.crosstab(y_test, y_pred_2, rownames=['Classe réelle'], colnames=['Classe prédite']))
print(classification_report(y_test, y_pred_2))

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2422,1929
1,1841,2408


              precision    recall  f1-score   support

           0       0.57      0.56      0.56      4351
           1       0.56      0.57      0.56      4249

    accuracy                           0.56      8600
   macro avg       0.56      0.56      0.56      8600
weighted avg       0.56      0.56      0.56      8600



# SVM (recherche des meilleurs hyperparamètres)
## Time Series Split Cross-Validation

In [18]:
df = pd.read_csv('df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Paramétrisation SVM & GridSearch
clf = svm.SVC(max_iter=2000, random_state=42)
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5], 
               'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 'scale', 'auto'], 'degree' : [2, 3]}
gridcv = GridSearchCV(clf, param_grid=param_grid, cv = tscv, scoring='accuracy')

# Entrainement du modèle
gridcv.fit(X_train_scaled, y_train)

print(gridcv.best_estimator_)
print(gridcv.best_score_)

# Prédiction du modèle
print(gridcv.score(X_test_scaled, y_test))









































































































































SVC(C=0.5, degree=2, gamma=0.01, max_iter=2000, random_state=42)
0.6316387559808613
0.5394186046511628


In [23]:
df = pd.read_csv('df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM avec les meilleurs paramètres
clf = svm.SVC(C=0.5, degree=2, gamma=0.01)

# Entrainement du modèle
clf.fit(X_train_scaled, y_train)

# Prédiction du modèle
print(clf.score(X_train_scaled, y_train))
print(clf.score(X_test_scaled, y_test))

0.740456493571215
0.6622093023255814


In [20]:
df = pd.read_csv('df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM avec les meilleurs paramètres
clf = svm.SVC(gamma=0.005, kernel='rbf')

# Entrainement du modèle
clf.fit(X_train_scaled, y_train)

# Prédiction du modèle
print(clf.score(X_train_scaled, y_train))
print(clf.score(X_test_scaled, y_test))

0.7280474434366591
0.6665116279069767


In [24]:
df = pd.read_csv('df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM avec les meilleurs paramètres
clf = svm.SVC(gamma=0.006, kernel='rbf')

# Entrainement du modèle
clf.fit(X_train_scaled, y_train)

# Prédiction du modèle
print(clf.score(X_train_scaled, y_train))
print(clf.score(X_test_scaled, y_test))

0.7361706368982358
0.6641860465116279


In [21]:
from joblib import dump, load

dump(clf, 'Best_SVM.joblib')

['Best_SVM.joblib']

In [22]:
loaded_model = load('Best_SVM.joblib')

loaded_model.predict(X_test_scaled)

loaded_model.score(X_test_scaled, y_test)

0.6665116279069767