In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score

In [3]:
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Encodage, Dichotomisation...

In [4]:
# Encodage de player_1_name et player_2_name
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

In [5]:
# Suppression de player_1_name et player_2_name
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

In [6]:
# dichotomisation
df = pd.get_dummies(df)

# Split & Standardisation

In [7]:
# séparation en train et test selon le temps
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

In [8]:
# Normalisation et Standardisation des features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression simple

In [9]:
# entrainement d'une LogisticRegression
from sklearn.linear_model import LogisticRegression
clf1 = LogisticRegression(random_state=0)

clf1.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [10]:
print(clf1.score(X_train_scaled, y_train))
print(clf1.score(X_test_scaled, y_test))

0.7008372371175122
0.6680232558139535


In [11]:
from sklearn.metrics import classification_report
y_pred_1 = clf1.predict(X_test_scaled)
print(pd.crosstab(y_test, y_pred_1, rownames=['Classe réelle'], colnames=['Classe prédite']))
print(classification_report(y_test, y_pred_1))

Classe prédite     0     1
Classe réelle             
0               2997  1354
1               1501  2748
              precision    recall  f1-score   support

           0       0.67      0.69      0.68      4351
           1       0.67      0.65      0.66      4249

    accuracy                           0.67      8600
   macro avg       0.67      0.67      0.67      8600
weighted avg       0.67      0.67      0.67      8600



# Logistic Regression (changement des paramètres manuellement)

In [12]:
clf2 = LogisticRegression(random_state=1, penalty='elasticnet', C=0.1, max_iter=2000, solver='saga', l1_ratio=0.5)
clf2.fit(X_train_scaled, y_train)

LogisticRegression(C=0.1, l1_ratio=0.5, max_iter=2000, penalty='elasticnet',
                   random_state=1, solver='saga')

In [13]:
print(clf2.score(X_train_scaled, y_train))
print(clf2.score(X_test_scaled, y_test))

0.7019336190571115
0.6691860465116279


In [14]:
y_pred_2 = clf2.predict(X_test_scaled)
print(pd.crosstab(y_test, y_pred_2, rownames=['Classe réelle'], colnames=['Classe prédite']))
print(classification_report(y_test, y_pred_2))

Classe prédite     0     1
Classe réelle             
0               2969  1382
1               1463  2786
              precision    recall  f1-score   support

           0       0.67      0.68      0.68      4351
           1       0.67      0.66      0.66      4249

    accuracy                           0.67      8600
   macro avg       0.67      0.67      0.67      8600
weighted avg       0.67      0.67      0.67      8600



# Logistic Regression (recherche des meilleurs hyperparamètres)
## Time Series Split Cross-Validation

In [15]:
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Paramétrisation LogReg & GridSearch
clf_lr = LogisticRegression(max_iter=2000, random_state=42)
tscv = TimeSeriesSplit(n_splits=5)
param_grid = [{'solver': ['lbfgs'], 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'penalty':['l2']},
                  {'solver': ['liblinear'], 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'penalty':['l1', 'l2']}]
gridcv = GridSearchCV(clf_lr, param_grid=param_grid, cv = tscv, scoring='accuracy')

# Entrainement du modèle
gridcv.fit(X_train_scaled, y_train)

print(gridcv.best_estimator_)
print(gridcv.best_score_)

# Prédiction du modèle
print(gridcv.score(X_test_scaled, y_test))

LogisticRegression(C=0.005, max_iter=2000, penalty='l1', random_state=42,
                   solver='liblinear')
0.7006578947368421
0.6754651162790698


In [17]:
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Encodage
le = LabelEncoder()
le.fit(pd.concat([df['player_1_name'], df['player_2_name']], axis=0))
df['player_1_name_encoded'] = le.transform(df['player_1_name'])
df['player_2_name_encoded'] = le.transform(df['player_2_name'])

# Suppression player_1_name & player_2
df.drop(['player_1_name', 'player_2_name'], axis=1, inplace=True)

# Dichotomisation
df = pd.get_dummies(df)

# Split
nb_rows_train = int(round(len(df)*0.7,0))
X_train = df.drop('player_1_win', axis=1)[:nb_rows_train]
y_train = df['player_1_win'][:nb_rows_train]
X_test = df.drop('player_1_win', axis=1)[nb_rows_train:]
y_test = df['player_1_win'][nb_rows_train:]

# Standardisation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Paramétrisation LogReg & GridSearch
clf_lr = LogisticRegression(C=0.005, max_iter=2000, penalty='l1', random_state=42, solver='liblinear')

# Entrainement du modèle
clf_lr.fit(X_train_scaled, y_train)

# Prédiction du modèle
print(clf_lr.score(X_test_scaled, y_test))

0.6754651162790698


In [18]:
from joblib import dump, load

dump(clf_lr, 'Best_LR.joblib')

['Best_LR.joblib']

In [20]:
loaded_model = load('Best_LR.joblib')

loaded_model.predict(X_test_scaled)

loaded_model.score(X_test_scaled, y_test)

0.6754651162790698