In [13]:
## Imports
import os
import sys
import time
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC

## Data importation

In [14]:
## Paths
dirname = os.getcwd()
rel_dataset_export_path = 'OneDrive - Association Cesi Viacesi mail\\Bureau\\Projects\\winAi\\Datasets\\processed\\stats-bomb'
dataset_export_path = os.path.join(dirname, rel_dataset_export_path)

In [15]:
## Load the most recent dataset
dataset_files = os.listdir(dataset_export_path)
dataset_files.sort()
dataset_file = dataset_files[-1]
dataset_path = os.path.join(dataset_export_path, dataset_file)

## Load the dataset
with open(dataset_path, 'r') as f:
    data_cleaned = pd.read_csv(dataset_path)
if data_cleaned is not None:
    print(f"Loading dataset from {dataset_path}")

Loading dataset from C:\Users\NyveK\OneDrive - Association Cesi Viacesi mail\Bureau\Projects\winAi\Datasets\processed\stats-bomb\20240805-1441_matches_stats.csv


## Features & Labels - Preparation

In [16]:
# Définir les features et le label
features = data_cleaned.drop(columns=['match_id', 'season', 'competition', 'home_team', 'away_team', 'stadium', 'referee', 'kick_off', 'home_score', 'away_score'])
labels = (data_cleaned['home_score'] > data_cleaned['away_score']).astype(int) # 1 pour victoire, 0 pour défaite ou nul

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Afficher un aperçu des données d'entraînement
X_train.head(), y_train.head()

(     home_team_id  away_team_id  goals_home  goals_away   xG_home   xG_away  \
 114           169            33         4.0         5.0  7.403860  4.209352   
 7             217           215         3.0         0.0  0.967318  0.822713   
 137           969           971         0.0         0.0  0.052880  2.771112   
 331          1207           863         0.0         0.0  0.022443  1.697006   
 304           965           974         2.0         2.0  1.242963  2.164964   
 
       xG_TOTAL  shots_on_target_home  shots_on_target_away  \
 114  11.613212                  12.0                   8.0   
 7     1.790031                   6.0                   4.0   
 137   2.823992                   0.0                   0.0   
 331   1.719449                   0.0                   0.0   
 304   3.407927                   7.0                  12.0   
 
      shots_off_target_home  shots_off_target_away  key_passes_home  \
 114                   12.0                    3.0             33.0

## Normalizing the data
----------------------

In [17]:
# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Training
---

### Linear Regression -

In [18]:
# Entraînement du modèle de régression logistique
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

### SVM (Support vector machine) -

In [19]:
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)

## Prediction & Evaluation
---

### Linear Regression -

In [20]:
# Prédiction et évaluation
y_pred = model.predict(X_test_scaled)
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       0.85      1.00      0.92        29

    accuracy                           0.93        69
   macro avg       0.93      0.94      0.93        69
weighted avg       0.94      0.93      0.93        69



SVM

In [21]:
# Prédiction et évaluation
y_pred_svm = svm_model.predict(X_test_scaled)
svm_report = classification_report(y_test, y_pred_svm)

print(svm_report)

              precision    recall  f1-score   support

           0       0.92      0.88      0.90        40
           1       0.84      0.90      0.87        29

    accuracy                           0.88        69
   macro avg       0.88      0.89      0.88        69
weighted avg       0.89      0.88      0.88        69



### Random Forest

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


# Définir les features et le label
features = data_cleaned.drop(columns=['match_id', 'season', 'competition', 'home_team', 'away_team', 'stadium', 'referee', 'kick_off', 'home_score', 'away_score'])
labels = (data_cleaned['home_score'] > data_cleaned['away_score']).astype(int)  # 1 pour victoire, 0 pour défaite ou nul

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Définir les paramètres à tester
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialiser le modèle Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Initialiser GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Entraîner GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Meilleurs paramètres
best_params = grid_search.best_params_

# Meilleur modèle
best_rf_model = grid_search.best_estimator_

# Prédiction et évaluation avec le meilleur modèle
y_pred_best_rf = best_rf_model.predict(X_test_scaled)
report_best_rf = classification_report(y_test, y_pred_best_rf)

best_params, report_best_rf


Fitting 3 folds for each of 108 candidates, totalling 324 fits


({'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 '              precision    recall  f1-score   support\n\n           0       0.89      0.80      0.84        40\n           1       0.76      0.86      0.81        29\n\n    accuracy                           0.83        69\n   macro avg       0.82      0.83      0.82        69\nweighted avg       0.83      0.83      0.83        69\n')

### Prédiction RandomForest en utilisant les identifiants des équipes uniquement


In [27]:
# Remplacer les statistiques de match par des valeurs neutres (zéros) pour la prédiction
X_test_simplified = X_test.copy()
for column in X_test_simplified.columns:
    if column not in ['home_team_id', 'away_team_id']:
        X_test_simplified[column] = 0

# Normalisation des données simplifiées
X_test_simplified_scaled = scaler.transform(X_test_simplified)

# Prédiction avec les données simplifiées
y_pred_simplified = best_rf_model.predict(X_test_simplified_scaled)
report_simplified_stats = classification_report(y_test, y_pred_simplified)

print("Rapport de classification avec les identifiants des équipes uniquement :\n", report_simplified_stats)


Rapport de classification avec les identifiants des équipes uniquement :
               precision    recall  f1-score   support

           0       0.58      1.00      0.73        40
           1       0.00      0.00      0.00        29

    accuracy                           0.58        69
   macro avg       0.29      0.50      0.37        69
weighted avg       0.34      0.58      0.43        69



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Predict specific team

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Meilleurs paramètres trouvés : {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Rapport de classification avec les meilleurs paramètres :
               precision    recall  f1-score   support

           0       0.89      0.80      0.84        40
           1       0.76      0.86      0.81        29

    accuracy                           0.83        69
   macro avg       0.82      0.83      0.82        69
weighted avg       0.83      0.83      0.83        69

Prédiction pour le match spécifique: Défaite ou match nul de l'équipe à domicile
