In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [3]:
# Load the prepared match dataset (path is relative to the working directory).
df = pd.read_csv('Sources/df_final_rolling_ready_50_with_bookodds.csv')

# Encode player names with ONE encoder fitted on both name columns, so a given
# player receives the same integer code whether listed as player 1 or player 2.
le = LabelEncoder()
all_player_names = pd.concat([df['player_1_name'], df['player_2_name']], axis=0)
le.fit(all_player_names)

# Append the encoded ids, then drop the raw name columns they replace.
df = (
    df
    .assign(
        player_1_name_encoded=lambda frame: le.transform(frame['player_1_name']),
        player_2_name_encoded=lambda frame: le.transform(frame['player_2_name']),
    )
    .drop(columns=['player_1_name', 'player_2_name'])
)

# One-hot encode whatever object/category columns are still present.
df = pd.get_dummies(df)

# Ordered 70/30 split: the first 70% of rows are used for training and the
# remaining rows for testing (no shuffling).
# NOTE(review): a positional split is only a valid temporal hold-out if the CSV
# rows are already in chronological order — confirm upstream.
nb_rows_train = int(round(len(df) * 0.7))

# Separate the target from the features once, then slice both positionally.
features = df.drop(columns='player_1_win')
target = df['player_1_win']
X_train, X_test = features.iloc[:nb_rows_train], features.iloc[nb_rows_train:]
y_train, y_test = target.iloc[:nb_rows_train], target.iloc[nb_rows_train:]

# Standardisation: statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN hyper-parameter search: n_neighbors in 200, 400, ..., 1800 with the
# Manhattan metric, scored by accuracy over 5 forward-chaining time-series folds.
# NOTE(review): X_train_scaled was standardised with statistics from the whole
# training split, so every CV validation fold has already influenced the
# scaling. Wrapping scaler + KNN in a Pipeline would avoid that, but would change
# the inputs `gridcv` expects (raw instead of scaled features).
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'n_neighbors': [200 * step for step in range(1, 10)],
    'metric': ['manhattan'],
}
clf = KNeighborsClassifier()
gridcv = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=tscv)

# Fit the search; the best configuration is refit on the full training split.
gridcv.fit(X_train_scaled, y_train)

# Selected estimator and its mean cross-validated accuracy.
print(gridcv.best_estimator_, gridcv.best_score_, sep='\n')

# Accuracy of the refit best estimator on the held-out test rows.
test_accuracy = gridcv.score(X_test_scaled, y_test)
print(test_accuracy)

KNeighborsClassifier(metric='manhattan', n_neighbors=600)
0.6742224880382774
0.6577906976744186


In [5]:
# Re-display the accuracy of the refit best estimator on the held-out test rows.
test_accuracy = gridcv.score(X_test_scaled, y_test)
print(test_accuracy)

0.6577906976744186


In [7]:
from joblib import dump, load

# Persist the whole fitted GridSearchCV object (search results + refit model).
# NOTE(review): the model was trained on standardised features, but the fitted
# `scaler` is not saved here — a fresh session would need it to prepare inputs.
dump(value=gridcv, filename='Best_KNN.joblib')

['Best_KNN.joblib']

In [8]:
# Reload the persisted search object to check that it survives a round trip.
# joblib files are pickle-based: only load artifacts from a trusted source
# (this one was written by this notebook just above).
loaded_model = load('Best_KNN.joblib')

# Keep the predictions in a variable: a bare expression that is not the last
# line of a cell is neither displayed nor stored, so the original call ran a
# full KNN query and threw the result away.
y_pred_loaded = loaded_model.predict(X_test_scaled)

# Test accuracy of the reloaded model; it should match the in-memory `gridcv`.
loaded_model.score(X_test_scaled, y_test)

0.6577906976744186