In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor, plot_tree
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor


# ***Chargement des données d'entraînement et de test***

In [None]:
# Load the home/away team statistics for the training and test sets, plus the
# training targets. Files are read in the order they are listed; all paths are
# relative to the notebook's working directory.
(
    home_team_train,
    away_team_train,
    home_team_test,
    away_team_test,
    y_train,
) = map(
    pd.read_csv,
    (
        "train_home_team_statistics_df.csv",
        "train_away_team_statistics_df.csv",
        "test_home_team_statistics_df.csv",
        "test_away_team_statistics_df.csv",
        "Y_train_1rknArQ.csv",
    ),
)

# ***Préparation et nettoyage des données***

In [None]:
# Keep an independent copy of the test-set match identifiers; they are
# re-attached to the predictions when the submission file is written.
ids_test = home_team_test["ID"].values.copy()

# Feature-cleaning helper shared by the training and test data.
def clean_and_prepare(home_df, away_df):
    """Build one numeric feature matrix from home-team and away-team frames.

    The columns 'ID', 'LEAGUE' and 'TEAM_NAME' are removed from each frame
    when present (absent ones are ignored). Remaining home columns are
    prefixed with 'HOME_' and away columns with 'AWAY_', then both frames are
    concatenated column-wise (rows are aligned on the index by ``pd.concat``).

    Parameters
    ----------
    home_df : pandas.DataFrame
        Statistics of the home teams.
    away_df : pandas.DataFrame
        Statistics of the away teams.

    Returns
    -------
    pandas.DataFrame
        The combined frame restricted to numeric columns.
    """
    identifier_columns = ['ID', 'LEAGUE', 'TEAM_NAME']

    def prefixed_features(frame, prefix):
        # Drop identifier columns, then tag every remaining column with its side.
        features = frame.drop(columns=identifier_columns, errors='ignore')
        return features.rename(columns=lambda label: prefix + label)

    combined = pd.concat(
        [prefixed_features(home_df, 'HOME_'), prefixed_features(away_df, 'AWAY_')],
        axis=1,
    )
    return combined.select_dtypes(include=[np.number])

In [None]:
# Build the numeric feature matrices for the training set and for the final
# (submission) test set with the same cleaning routine.
X_train = clean_and_prepare(home_df=home_team_train, away_df=away_team_train)
X_final_test = clean_and_prepare(home_df=home_team_test, away_df=away_team_test)

 **Enlever les NaN des deux dataframes X_train et X_final_test**

In [None]:
# Impute missing feature values with the per-column means of the TRAINING set.
# The same training means are reused for the final test set so that both
# frames go through identical preprocessing (imputing the test set with its
# own means would apply different statistics to train and test).
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_final_test = X_final_test.fillna(train_feature_means)

# Index labels of the target rows that contain at least one NaN.
nan_index_y_train = y_train[y_train.isna().any(axis=1)].index

# Drop those rows from both the features and the targets so they stay aligned.
X_train = X_train.drop(index=nan_index_y_train)
y_train = y_train.drop(index=nan_index_y_train)

# Sanity check: features and targets must have the same number of rows.
print(X_train.shape, y_train.shape)
print(X_final_test.shape)


(12303, 280) (12303, 4)
(25368, 280)


# ***Définition des metrics (variables explicatives) pour notre modèle de prédiction***

In [None]:
# Feature columns used as model inputs. The variable is still called
# `metrics` so existing references keep working.
# 'AWAY_TEAM_SHOTS_ON_TARGET_season_average' used to be listed twice, which
# made `X_train[metrics]` contain the same column two times; it is now listed
# once.
metrics = [
    'AWAY_TEAM_BALL_POSSESSION_season_average',
    'HOME_TEAM_SHOTS_ON_TARGET_season_sum',
    'HOME_TEAM_SHOTS_ON_TARGET_5_last_match_average',
    'HOME_TEAM_GAME_WON_season_sum',
    'AWAY_TEAM_GAME_LOST_season_sum',
    'HOME_TEAM_SHOTS_TOTAL_season_average',
    'HOME_TEAM_GAME_LOST_season_sum',
    'HOME_TEAM_BALL_POSSESSION_season_average',
    'HOME_TEAM_SHOTS_INSIDEBOX_season_average',
    'HOME_TEAM_SUCCESSFUL_PASSES_season_average',
    'HOME_TEAM_GAME_WON_season_average',
    'AWAY_TEAM_GAME_WON_season_average',
    'AWAY_TEAM_GAME_WON_5_last_match_average',
    'HOME_TEAM_GAME_LOST_season_average',
    'AWAY_TEAM_SHOTS_ON_TARGET_season_average',
    'HOME_TEAM_DANGEROUS_ATTACKS_season_sum',
    'HOME_TEAM_DANGEROUS_ATTACKS_5_last_match_sum',
    'AWAY_TEAM_DANGEROUS_ATTACKS_5_last_match_sum',
    'AWAY_TEAM_GAME_WON_season_sum',
    'AWAY_TEAM_GOALS_season_average',
    'AWAY_TEAM_GOALS_season_sum',
    'AWAY_TEAM_GOALS_5_last_match_sum',
    'AWAY_TEAM_BALL_POSSESSION_5_last_match_average',
    'HOME_TEAM_SHOTS_INSIDEBOX_season_sum',
    'AWAY_TEAM_GAME_LOST_season_average',
    'AWAY_TEAM_GAME_LOST_5_last_match_average',
    'AWAY_TEAM_FOULS_season_std',
    'HOME_TEAM_FOULS_season_std',
    'HOME_TEAM_SHOTS_ON_TARGET_season_average',
    'AWAY_TEAM_GAME_DRAW_season_sum',
    'HOME_TEAM_INJURIES_season_average',
]

# Guard against re-introducing a duplicated feature name.
assert len(metrics) == len(set(metrics)), "duplicate feature names in `metrics`"

# Restrict both feature matrices to the selected columns, in the same order.
X_train = X_train[metrics]

X_final_test = X_final_test[metrics]

In [None]:


# Collapse the three outcome columns into one class label per match: the label
# is the name of the column holding the row-wise maximum (class names are kept
# as-is).
y_train_labels = y_train.loc[:, ['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)

# Hold out part of the labelled data for evaluation:
# - X_train / y_train : 80% of the rows, used to fit the models
# - X_test  / y_test  : remaining 20%, used to evaluate them
# NOTE: this rebinds X_train and y_train, so re-running this cell requires
# re-running the cells above it first.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train_labels,
    train_size=0.8,
    random_state=42,
)


# ***Tester les différents modèles afin de choisir le modèle avec les meilleurs résultats***

In [None]:
# Benchmark several classifiers on the hold-out split to pick the one with the
# best results.
from sklearn.neural_network import MLPClassifier

# Candidate estimators, keyed by the name printed in the report header.
models = dict(
    RandomForest=RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
    ),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    NaiveBayes=GaussianNB(),
    NeuralNetwork=MLPClassifier(
        hidden_layer_sizes=(10,),
        activation='relu',
        solver='adam',
        max_iter=300,
        random_state=42,
    ),
)

for name, model in models.items():
    print(f"\n=== {name} ===")

    # Fit on the training split, then predict the hold-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Report accuracy, confusion matrix and per-class metrics.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))



=== RandomForest ===
Accuracy: 0.49573344169036976
[[325   5 428]
 [165   6 449]
 [187   7 889]]
              precision    recall  f1-score   support

   AWAY_WINS       0.48      0.43      0.45       758
        DRAW       0.33      0.01      0.02       620
   HOME_WINS       0.50      0.82      0.62      1083

    accuracy                           0.50      2461
   macro avg       0.44      0.42      0.37      2461
weighted avg       0.45      0.50      0.42      2461


=== LogisticRegression ===
Accuracy: 0.5050792360828932
[[367   3 388]
 [187   3 430]
 [207   3 873]]
              precision    recall  f1-score   support

   AWAY_WINS       0.48      0.48      0.48       758
        DRAW       0.33      0.00      0.01       620
   HOME_WINS       0.52      0.81      0.63      1083

    accuracy                           0.51      2461
   macro avg       0.44      0.43      0.37      2461
weighted avg       0.46      0.51      0.43      2461


=== DecisionTree ===
Accuracy: 0.386

# ***Modèle Logistic Regression : Entraînement et Évaluation du modèle***

In [None]:
# === Fit the logistic regression model on the training split
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# === Evaluate on the hold-out split
y_pred = lr_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# === Predict the matches of the final test set (the ones to submit)
y_pred = lr_model.predict(X_final_test)

# === One-hot encode the predicted labels. `reindex` puts the three outcome
# columns in the expected order and adds any class that was never predicted as
# an all-zero column.
outcome_columns = ['HOME_WINS', 'DRAW', 'AWAY_WINS']
pred_df = (
    pd.get_dummies(pd.Series(y_pred), dtype=int)
    .reindex(columns=outcome_columns, fill_value=0)
)

# === Put the match identifiers in the first column
pred_df.insert(0, 'ID', ids_test)

# === Write the submission file
pred_df.to_csv("resultats.csv", index=False)




Accuracy: 0.5050792360828932
[[367   3 388]
 [187   3 430]
 [207   3 873]]
              precision    recall  f1-score   support

   AWAY_WINS       0.48      0.48      0.48       758
        DRAW       0.33      0.00      0.01       620
   HOME_WINS       0.52      0.81      0.63      1083

    accuracy                           0.51      2461
   macro avg       0.44      0.43      0.37      2461
weighted avg       0.46      0.51      0.43      2461

