Projet Prediction
SEARLE Oliver B11

In [1]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

# Silence every Python warning for the rest of the session.
# NOTE(review): the 'ignore' action is installed without any category/module
# filter, so library deprecation and performance warnings are hidden too.
warnings.filterwarnings('ignore')

In [2]:
# === Step 1: load the data ===
def _read_indexed(csv_path):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=0)


# Training inputs (home / away team statistics) and the match outcomes.
train_home = _read_indexed('X_Train_Data/train_home_team_statistics_df.csv')
train_away = _read_indexed('X_Train_Data/train_away_team_statistics_df.csv')
y_train = _read_indexed('Y_train_1rknArQ.csv')

# Test inputs (no outcomes available).
test_home = _read_indexed('X_Test_Data/test_home_team_statistics_df.csv')
test_away = _read_indexed('X_Test_Data/test_away_team_statistics_df.csv')

In [3]:
# === Étape 2 : Prétraitement (suppression & renommage) ===
def prep(df, prefix, drop_first=False):
    """
    Return a new frame whose column names all carry ``prefix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    prefix : str
        Text prepended to every column name (e.g. 'HOME_').
    drop_first : bool, default False
        When True, the first two columns are removed before prefixing.

    Returns
    -------
    pandas.DataFrame
        A new frame with the (optionally reduced) prefixed columns.
    """
    # Both branches yield a new object, so the relabelling below can never leak
    # into the caller's frame; calling prep twice on the same input is safe.
    out = df.drop(df.columns[:2], axis=1) if drop_first else df.copy()
    out.columns = [prefix + col for col in out.columns]
    return out

# Prefix each side's columns; the training files additionally lose their two
# leading columns. Results are bound to new names so the raw frames loaded
# from disk stay intact and this cell can be re-run without dropping further
# columns or stacking prefixes.
train_home_feat = prep(train_home, 'HOME_', drop_first=True)
train_away_feat = prep(train_away, 'AWAY_', drop_first=True)
# .copy() keeps the raw test frames' column labels untouched even when prep
# relabels the frame it receives.
test_home_feat = prep(test_home.copy(), 'HOME_')
test_away_feat = prep(test_away.copy(), 'AWAY_')

# Merge home & away statistics side by side for training and for test.
# NOTE(review): training rows containing any missing value are dropped, whereas
# missing test values are replaced by 0. The model therefore never sees
# imputed zeros during training; consider handling missing values the same
# way on both sides.
X_train_raw = pd.concat([train_home_feat, train_away_feat], axis=1).dropna()
X_test_raw = pd.concat([test_home_feat, test_away_feat], axis=1).fillna(0)



In [4]:
# === Step 3: encode the target ===
# 0 = home win, 1 = draw, 2 = away win
# Only the matches that survived the row filtering of X_train_raw are kept.
# NOTE(review): this expects y_train to be the outcome table loaded from CSV;
# the train/validation split cell later rebinds y_train, so reload the data
# before re-running this cell.
outcomes = y_train.loc[X_train_raw.index]
y = 0 * outcomes['HOME_WINS'] + 1 * outcomes['DRAW'] + 2 * outcomes['AWAY_WINS']

In [5]:
# === Étape 4 : Ingénierie des caractéristiques ===
def features(df):
    """Add home-minus-away difference columns to ``df`` and return it.

    The frame is modified in place (callers pass a copy when the input must be
    preserved). Ten columns are appended, in this order: GOALS_AVG_DIFF,
    SHOTS_AVG_DIFF, SOT_AVG_DIFF, SHOT_CONV_DIFF, POSSESSION_DIFF,
    PASS_ACC_DIFF, CORNERS_DIFF, YEL_DIFF, RED_DIFF, LAST5_GOALS_DIFF.
    """
    def home_minus_away(stat):
        # Difference between the home and the away value of one statistic.
        return df['HOME_TEAM_' + stat] - df['AWAY_TEAM_' + stat]

    def conversion_rate(side):
        # Season goals per season shot; zero shots become NaN so the ratio is
        # missing instead of infinite.
        goals = df[side + '_TEAM_GOALS_season_sum']
        shots = df[side + '_TEAM_SHOTS_TOTAL_season_sum'].replace(0, np.nan)
        return goals / shots

    leading_gaps = (
        ('GOALS_AVG_DIFF', 'GOALS_season_average'),
        ('SHOTS_AVG_DIFF', 'SHOTS_TOTAL_season_average'),
        ('SOT_AVG_DIFF', 'SHOTS_ON_TARGET_season_average'),
    )
    trailing_gaps = (
        ('POSSESSION_DIFF', 'BALL_POSSESSION_season_average'),
        ('PASS_ACC_DIFF', 'SUCCESSFUL_PASSES_PERCENTAGE_season_average'),
        ('CORNERS_DIFF', 'CORNERS_season_average'),
        ('YEL_DIFF', 'YELLOWCARDS_season_average'),
        ('RED_DIFF', 'REDCARDS_season_average'),
        ('LAST5_GOALS_DIFF', 'GOALS_5_last_match_sum'),
    )

    for new_col, stat in leading_gaps:
        df[new_col] = home_minus_away(stat)

    # Inserted between the two groups to keep the resulting column order.
    df['SHOT_CONV_DIFF'] = conversion_rate('HOME') - conversion_rate('AWAY')

    for new_col, stat in trailing_gaps:
        df[new_col] = home_minus_away(stat)

    return df

# Apply the feature engineering to copies, so the merged raw frames stay as they are.
df_train, df_test = (features(raw.copy()) for raw in (X_train_raw, X_test_raw))

In [6]:
# === Step 5: train/validation split ===
# Stratified 80/20 hold-out. Despite their names, X_test / y_test are the
# validation fold; the frame to predict for the submission is df_test.
# NOTE(review): this rebinds y_train (until now the outcome table read from
# CSV) to the training-fold labels, so the label-encoding cell cannot be
# re-run afterwards without reloading the data.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# === Step 6: feature selection ===
# A default XGBoost classifier is fitted on the training fold only; its
# importances then drive the selection.
selector_settings = dict(
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    eval_metric='mlogloss',
    random_state=42,
)
base = xgb.XGBClassifier(**selector_settings)
base.fit(X_train, y_train)

# prefit=True: reuse the fitted model as-is; the cut-off is the median importance.
sel = SelectFromModel(base, threshold='median', prefit=True)
X_train_sel = sel.transform(X_train)

# Names of the retained columns, taken from the boolean support mask.
kept_mask = sel.get_support()
feature_names = df_train.columns[kept_mask]
print(f"Sélection de {len(feature_names)} caractéristiques :", list(feature_names))

Sélection de 145 caractéristiques : ['HOME_TEAM_SHOTS_TOTAL_season_sum', 'HOME_TEAM_SHOTS_ON_TARGET_season_sum', 'HOME_TEAM_PASSES_season_sum', 'HOME_TEAM_SAVES_season_sum', 'HOME_TEAM_REDCARDS_season_sum', 'HOME_TEAM_ATTACKS_season_sum', 'HOME_TEAM_PENALTIES_season_sum', 'HOME_TEAM_INJURIES_season_sum', 'HOME_TEAM_GAME_WON_season_sum', 'HOME_TEAM_GAME_LOST_season_sum', 'HOME_TEAM_SHOTS_TOTAL_season_average', 'HOME_TEAM_SHOTS_INSIDEBOX_season_average', 'HOME_TEAM_SHOTS_OUTSIDEBOX_season_average', 'HOME_TEAM_PASSES_season_average', 'HOME_TEAM_SUCCESSFUL_PASSES_season_average', 'HOME_TEAM_CORNERS_season_average', 'HOME_TEAM_BALL_POSSESSION_season_average', 'HOME_TEAM_REDCARDS_season_average', 'HOME_TEAM_OFFSIDES_season_average', 'HOME_TEAM_ATTACKS_season_average', 'HOME_TEAM_SUBSTITUTIONS_season_average', 'HOME_TEAM_BALL_SAFE_season_average', 'HOME_TEAM_GAME_WON_season_average', 'HOME_TEAM_GAME_DRAW_season_average', 'HOME_TEAM_GAME_LOST_season_average', 'HOME_TEAM_SHOTS_INSIDEBOX_season_

In [8]:
# === Step 7: hyper-parameter search ===
# Candidate values sampled by the randomized search.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],       # boosting learning rate
    max_depth=[4, 6, 8],                   # maximum tree depth
    subsample=[0.7, 0.85, 1.0],            # row sampling ratio
    colsample_bytree=[0.7, 0.85, 1.0],     # column sampling ratio per tree
    n_estimators=[100, 300, 500],          # number of trees
)

# Stratified, shuffled 5-fold cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Estimator whose hyper-parameters are tuned.
search_estimator = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    eval_metric='mlogloss',
    random_state=42,
)

# 20 random combinations from param_grid, scored by accuracy, on all cores.
# NOTE(review): the columns of X_train_sel were selected on the whole training
# fold before cross-validation, so the CV scores may be slightly optimistic.
search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train_sel, y_train)

best = search.best_estimator_
print("Meilleurs paramètres :", search.best_params_)

Meilleurs paramètres : {'subsample': 0.7, 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.7}


In [None]:
# === Step 7 (shortcut): use the best hyper-parameters directly ===
# Builds and fits the model from fixed values instead of running the search;
# this rebinds `best`. The values are the ones printed by the search above and
# must be updated by hand if the search is re-run with a different outcome.
tuned_params = dict(
    subsample=0.7,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.01,
    colsample_bytree=0.7,
)
best = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    eval_metric='mlogloss',
    random_state=42,
    **tuned_params,
)
best.fit(X_train_sel, y_train)


In [78]:
# === Step 8: evaluation on the validation fold ===
# Apply the same column selection as for training, then score the model.
X_test_sel = sel.transform(X_test)
y_pred = best.predict(X_test_sel)
print("Précision validation :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Baseline: accuracy obtained by always predicting class 0 (home win).
# The variables are named after what they hold (home wins, not away wins).
all_home = np.full_like(y_test, fill_value=0)
home_only_acc = accuracy_score(y_test, all_home)
print("Précision pour prédiction uniquement 'home_wins' :", np.round(home_only_acc, 4))

Précision validation : 0.49364791288566245
              precision    recall  f1-score   support

           0       0.51      0.79      0.62       726
           1       0.24      0.01      0.03       420
           2       0.46      0.47      0.47       507

    accuracy                           0.49      1653
   macro avg       0.41      0.42      0.37      1653
weighted avg       0.43      0.49      0.42      1653

Précision pour prédiction uniquement 'home_wins' : 0.4392


In [79]:
# === Step 9: final training & prediction ===
# Fit an unfitted copy of the tuned model on every labelled row. `best` itself
# is left as validated, so re-running the evaluation cell afterwards still
# scores a model that has not seen the validation fold.
final_model = clone(best)
final_model.fit(sel.transform(df_train), y)
pred_test = final_model.predict(sel.transform(df_test))

# One-hot encode the predicted class (0 = home win, 1 = draw, 2 = away win)
# into the three submission columns, keeping the test index.
submission = pd.DataFrame(
    {
        'HOME_WINS': (pred_test == 0).astype(int),
        'DRAW': (pred_test == 1).astype(int),
        'AWAY_WINS': (pred_test == 2).astype(int),
    },
    index=df_test.index,
)
# reset_index() turns the index into a regular first column of the CSV.
submission.reset_index().to_csv('resultat_prediction.csv', index=False)
print("Fichier de soumission généré : resultat_prediction.csv")

Fichier de soumission généré : resultat_prediction.csv
