In [18]:
import wandb
import pandas as pd
import numpy as np
import shap
import optuna
import matplotlib.pyplot as plt
from ift6758.data import load_cached_season_dataframe, load_cached_seasons_dataframe, new_variables, goal_rate_by_percentile
from ift6758.data.graphs import plot_goal_curve
from sklearn.calibration import CalibrationDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GroupShuffleSplit, StratifiedKFold, RandomizedSearchCV, StratifiedGroupKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df_training = load_cached_seasons_dataframe(2016,2019) 
df_test = load_cached_season_dataframe(2020) # on y touche pas jusqu'à la fin
df = new_variables(df_training)

## Séparation des données d'entrainement et de validation

In [3]:
# One hot encode pour typeShot et lastEvent
cat_cols = [c for c in ["typeShot", "lastEvent", "goalStrenght"] if c in df.columns]
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, dummy_na=True)

# transforme les inf en NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Filtrer les NaN 
mask_tr = df.notna().all(axis=1)

df = df[mask_tr]

splitter = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

train_idxs, val_idxs = next(splitter.split(df, groups=df['gameId']))
df_train = df.iloc[train_idxs].copy()
df_val = df.iloc[val_idxs].copy()

X_train = df_train.drop(columns=["isGoal","timeInPeriod","typeEvent", "shooter","goalie", "teamShot"])
X_val   = df_val.drop(columns=["isGoal","timeInPeriod","typeEvent", "shooter","goalie", "teamShot"])

y_train = df_train["isGoal"].astype(int)
y_val = df_val["isGoal"].astype(int)

# Random Forests

In [4]:
run = wandb.init(
    project="projet-hockey-ai", 
    name="random-forest",
    tags=["all-features", "random-forest"],
    save_code=True
)

rf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)

rf.fit(X_train, y_train)

# Prédictions
y_pred_rf = rf.predict(X_val)
y_proba_rf = rf.predict_proba(X_val)

# 6. Calcul des métriques
acc_rf = accuracy_score(y_val, y_pred_rf)
auc_rf = roc_auc_score(y_val, y_proba_rf[:, 1])
ll_rf = log_loss(y_val, y_proba_rf[:, 1])

print(f"Accuracy: {acc_rf}")
print(f"AUC: {auc_rf}")
print(f"Log Loss: {ll_rf}")

# Logging automatique des courbes dans WandB
# WandB crée automatiquement la courbe ROC, la matrice de confusion et la courbe de précision-rappel
wandb.sklearn.plot_classifier(
    rf, X_train, X_val, y_train, y_val,
    y_pred_rf, y_proba_rf, 
    labels=['Non-Goal', 'Goal'], 
    model_name='Random Forest', 
    feature_names=list(X_train.columns)
)

wandb.log({"accuracy": acc_rf, "auc": auc_rf, "log_loss": ll_rf})

run.finish()

wandb: Currently logged in as: stefan-sucatu (stefan-sucatu-polytechnique-montr-al) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Accuracy: 0.9106942388103412
AUC: 0.7583785894948027
Log Loss: 0.27224530759332766


wandb: 
wandb: Plotting Random Forest.
wandb: Logged feature importances.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision-recall curve.


0,1
accuracy,▁
auc,▁
log_loss,▁

0,1
accuracy,0.91069
auc,0.75838
log_loss,0.27225


## Hyperparameter tuning

In [5]:
cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
groups = df_train["gameId"]


def objective(trial):
    
    max_cols = X_train.shape[1]
    k_value = trial.suggest_int('k', 10, max_cols)

    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000, step=100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2", 0.3, 0.5]),
        'class_weight': trial.suggest_categorical('class_weight', [None, "balanced", "balanced_subsample"]),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 1e-3),
    }

    pipeline = Pipeline([
        ('selection', SelectKBest(score_func=f_classif, k=k_value)),
        ('clf', RandomForestClassifier(**rf_params, bootstrap=True, random_state=42, n_jobs=-1))
    ])

    scores = cross_val_score(
        pipeline, 
        X_train, 
        y_train, 
        groups=groups, 
        cv=cv, 
        scoring="neg_log_loss",
        n_jobs=1 
    )

    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Meilleurs paramètres trouvés :")
print(study.best_params)
print(f"Meilleur score (neg_log_loss) : {study.best_value}")

# 1. On sépare le paramètre 'k' des paramètres du Random Forest
best_params = study.best_params.copy()
best_k = best_params.pop('k') # On extrait k et on l'enlève du dictionnaire

# 2. On reconstruit le pipeline gagnant
final_pipeline = Pipeline([
    ('selection', SelectKBest(score_func=f_classif, k=best_k)),
    ('clf', RandomForestClassifier(**best_params, bootstrap=True, random_state=42, n_jobs=-1))
])

# 3. On entraîne sur tout le dataset d'entraînement
final_pipeline.fit(X_train, y_train)

# Si tu veux voir quelles colonnes ont été choisies au final :
selected_mask = final_pipeline.named_steps['selection'].get_support()
selected_features = X_train.columns[selected_mask]
print(f"Features finales retenues ({len(selected_features)}) :", list(selected_features))

[I 2026-01-28 12:38:51,218] A new study created in memory with name: no-name-a22d1036-9ae9-4e9d-8497-c04f470bd485
  f = msb / msw
  f = msb / msw
  f = msb / msw
[I 2026-01-28 12:40:08,667] Trial 0 finished with value: -0.5901151809129019 and parameters: {'k': 20, 'n_estimators': 500, 'max_depth': 9, 'min_samples_split': 18, 'min_samples_leaf': 9, 'max_features': 'log2', 'class_weight': 'balanced_subsample', 'min_impurity_decrease': 0.00042545693234134877}. Best is trial 0 with value: -0.5901151809129019.
  f = msb / msw
  f = msb / msw
  f = msb / msw
[I 2026-01-28 12:41:24,250] Trial 1 finished with value: -0.5844009797544004 and parameters: {'k': 10, 'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 'log2', 'class_weight': 'balanced', 'min_impurity_decrease': 0.00013899377181588523}. Best is trial 1 with value: -0.5844009797544004.
  f = msb / msw
  f = msb / msw
  f = msb / msw
[I 2026-01-28 12:44:21,259] Trial 2 finished with valu

Meilleurs paramètres trouvés :
{'k': 47, 'n_estimators': 400, 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.3, 'class_weight': None, 'min_impurity_decrease': 6.338129671633395e-06}
Meilleur score (neg_log_loss) : -0.2616652521752987


  f = msb / msw


Features finales retenues (47) : ['gameSeconds', 'period', 'eventId', 'x', 'y', 'openNet', 'lastEventX', 'lastEventY', 'timeSinceLastEvent', 'distanceSinceLastEvent', 'friendlySkaters', 'opposingSkaters', 'timeInPowerPlay', 'gameId', 'season', 'gameType', 'attack_sign', 'x_adj', 'y_adj', 'shotDistance', 'shotAngle', 'isEmpty', 'isRebound', 'angleDifference', 'speed', 'typeShot_backhand', 'typeShot_deflected', 'typeShot_slap', 'typeShot_snap', 'typeShot_tip-in', 'typeShot_wrap-around', 'typeShot_wrist', 'typeShot_nan', 'lastEvent_blocked-shot', 'lastEvent_faceoff', 'lastEvent_giveaway', 'lastEvent_goal', 'lastEvent_hit', 'lastEvent_missed-shot', 'lastEvent_penalty', 'lastEvent_shot-on-goal', 'lastEvent_takeaway', 'lastEvent_nan', 'goalStrenght_EV', 'goalStrenght_PP', 'goalStrenght_SH', 'goalStrenght_nan']


# Neural Networks (Multilayer Percetron)

In [14]:
X_train_baseline = X_train.copy()
X_val_baseline = X_val.copy()

scaler_baseline = StandardScaler()
X_train_scaled = scaler_baseline.fit_transform(X_train_baseline)
X_val_scaled = scaler_baseline.transform(X_val_baseline)

In [15]:
run = wandb.init(
    project="projet-hockey-ai", 
    name="MLP-Baseline",
    tags=["all-features", "MLP"],
    save_code=True
)

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50), 
    activation='relu',
    solver='adam', 
    alpha=0.0001, 
    batch_size='auto',
    learning_rate='adaptive', 
    max_iter=500, 
    early_stopping=True,
    random_state=42
)

# On entraîne sur la version scalée
mlp.fit(X_train_scaled, y_train)

# Prédictions sur la version scalée
y_pred_mlp = mlp.predict(X_val_scaled)
y_proba_mlp = mlp.predict_proba(X_val_scaled)

acc_mlp = accuracy_score(y_val, y_pred_mlp)
auc_mlp = roc_auc_score(y_val, y_proba_mlp[:, 1])
ll_mlp = log_loss(y_val, y_proba_mlp[:, 1])

print(f"Accuracy: {acc_mlp}")
print(f"AUC: {auc_mlp}")
print(f"Log Loss: {ll_mlp}")

wandb.sklearn.plot_classifier(
    mlp, X_train_scaled, X_val_scaled, y_train, y_val, 
    y_pred_mlp, y_proba_mlp, 
    labels=['Non-Goal', 'Goal'], 
    model_name='MLP Baseline', 
    feature_names=list(X_train.columns) # On utilise X_train ici juste pour avoir les noms !
)

wandb.log({"accuracy": acc_mlp, "auc": auc_mlp, "log_loss": ll_mlp})
run.finish()

Accuracy: 0.9107593186144509
AUC: 0.7675296783944106
Log Loss: 0.26636425940374936


wandb: 
wandb: Plotting MLP Baseline.
wandb: Logged feature importances.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision-recall curve.


0,1
accuracy,▁
auc,▁
log_loss,▁

0,1
accuracy,0.91076
auc,0.76753
log_loss,0.26636


In [19]:
cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
groups = df_train["gameId"]

def objective(trial):
    
    max_cols = X_train.shape[1]
    k_value = trial.suggest_int('k', 10, max_cols)

    layer_options = [(50,), (100,), (50, 25), (100, 50), (100, 50, 25)]
    selected_index = trial.suggest_categorical('hidden_layer_index', range(len(layer_options)))
    hidden_layers = layer_options[selected_index]    
    
    mlp_params = {
        'hidden_layer_sizes': hidden_layers,
        'activation': trial.suggest_categorical('activation', ['relu', 'tanh']),
        'solver': 'adam', # 'adam' est généralement le meilleur choix par défaut
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-1, log=True), # Régularisation L2 (échelle log)
        'learning_rate_init': trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128, 'auto']),
    }

    pipeline = Pipeline([
        ('cleanup', VarianceThreshold(threshold=0)), 
        ('scaler', StandardScaler()),                
        ('selection', SelectKBest(score_func=f_classif, k=k_value)),
        ('clf', MLPClassifier(
            **mlp_params, 
            max_iter=300,        
            early_stopping=True, 
            random_state=42
        ))
    ])

    # 4. Validation Croisée
    scores = cross_val_score(
        pipeline, 
        X_train, 
        y_train, 
        groups=groups, 
        cv=cv, 
        scoring="neg_log_loss",
        n_jobs=1 
    )

    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20) 

print("Meilleurs paramètres trouvés :")
print(study.best_params)
print(f"Meilleur score (neg_log_loss) : {study.best_value}")

best_params = study.best_params.copy()
best_k = best_params.pop('k') 

final_pipeline = Pipeline([
    ('cleanup', VarianceThreshold(threshold=0)),
    ('scaler', StandardScaler()), 
    ('selection', SelectKBest(score_func=f_classif, k=best_k)),
    ('clf', MLPClassifier(
        **best_params, 
        solver='adam',       
        max_iter=300, 
        early_stopping=True,
        random_state=42
    ))
])

final_pipeline.fit(X_train, y_train)

n_features_in = final_pipeline.named_steps['selection'].n_features_in_
n_features_out = best_k
print(f"Features sélectionnées : {n_features_out} (sur {n_features_in} disponibles après nettoyage)")

[I 2026-01-28 13:40:23,791] A new study created in memory with name: no-name-0118af8c-f98f-40b3-8a61-53e09f276255
[I 2026-01-28 13:41:17,828] Trial 0 finished with value: -0.26621726133824536 and parameters: {'k': 40, 'hidden_layer_sizes': (100, 50), 'activation': 'relu', 'alpha': 0.07509144084336435, 'learning_rate_init': 0.0009115983463619342, 'batch_size': 128}. Best is trial 0 with value: -0.26621726133824536.
[I 2026-01-28 13:41:44,276] Trial 1 finished with value: -0.2675607684273507 and parameters: {'k': 16, 'hidden_layer_sizes': (50, 25), 'activation': 'relu', 'alpha': 0.0803607356924783, 'learning_rate_init': 0.0009538436883425712, 'batch_size': 'auto'}. Best is trial 0 with value: -0.26621726133824536.
[I 2026-01-28 13:42:56,510] Trial 2 finished with value: -0.26551227275608186 and parameters: {'k': 31, 'hidden_layer_sizes': (100, 50), 'activation': 'tanh', 'alpha': 5.296547086146484e-05, 'learning_rate_init': 0.0002418522160108974, 'batch_size': 'auto'}. Best is trial 2 wit

Meilleurs paramètres trouvés :
{'k': 37, 'hidden_layer_sizes': (50,), 'activation': 'tanh', 'alpha': 0.0011465499908929738, 'learning_rate_init': 0.00021717964398152416, 'batch_size': 32}
Meilleur score (neg_log_loss) : -0.2650772449429608
Features sélectionnées : 37 (sur 45 disponibles après nettoyage)


# Gaussian Mixture Model (clustering method)

# Support Vector Machines

# Model Comparaison

In [None]:
# Proba
pos_idx_rf  = np.where(rf.classes_  == 1)[0][0]
pos_idx_mlp = np.where(mlp.classes_ == 1)[0][0]
pos_idx_gmm = np.where(gmm.classes_ == 1)[0][0]
pos_idx_svm = np.where(svm.classes_ == 1)[0][0]

proba_rf  = rf.predict_proba(X_val_rf)[:,  pos_idx_rf]
proba_mlp = mlp.predict_proba(X_val_mlp)[:,  pos_idx_mlp]
proba_gmm  = gmm.predict_proba(X_val_)[:,  pos_idx_gmm]
proba_svm  = svm.predict_proba(X_val_)[:,  pos_idx_svm]

# ROC-AUC
fpr_rf,  tpr_rf,  _ = roc_curve(y_val_rf, proba_rf)
auc_rf = roc_auc_score(y_val_rf, proba_rf)
fpr_mlp, tpr_mlp, _ = roc_curve(y_val_mlp, proba_mlp)
auc_mlp = roc_auc_score(y_val_mlp, proba_mlp)
fpr_gmm, tpr_gmm, _ = roc_curve(y_val_gmm, proba_gmm)
auc_gmm = roc_auc_score(y_val_gmm, proba_gmm)
fpr_svm, tpr_svm, _ = roc_curve(y_val_svm, proba_svm)
auc_svm = roc_auc_score(y_val_svm, proba_svm)


plt.plot(fpr_rf,  tpr_rf,  label=f"Random Forest (AUC={auc_rf:.3f})")
plt.plot(fpr_mlp,  tpr_mlp,  label=f"Multilayer Percerton (AUC={auc_mlp:.3f})")
plt.plot(fpr_gmm,  tpr_gmm,  label=f"Gaussian Mixture Model (AUC={auc_gmm:.3f})")
plt.plot(fpr_svm,  tpr_svm,  label=f"Support Vector Machines (AUC={auc_svm:.3f})")


plt.plot([0, 1], [0, 1], linestyle="--", label="Random 50% (AUC=0.500)")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC - AUC")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
x_distance_angle, goal_rate_distance_angle = goal_rate_by_percentile(y_val_distance_angle, proba_distance_angle, step=5)
goal_rate_percent_distance_angle = 100.0 * goal_rate_distance_angle

x_all, goal_rate_all = goal_rate_by_percentile(y_val_all, proba_all, step=5)
goal_rate_percent_all = 100.0 * goal_rate_all

x_top15, goal_rate_top15 = goal_rate_by_percentile(y_val_all, proba_top15, step=5)
goal_rate_percent_top15 = 100.0 * goal_rate_top15

plt.plot(x_distance_angle,  goal_rate_percent_distance_angle,  label="Baseline")
plt.plot(x_all, goal_rate_percent_all, label="All features")
plt.plot(x_top15, goal_rate_percent_top15, label="Top 15")

plt.title("Goal Rate")
plt.xlabel("Shot probability model percentile")
plt.ylabel("Goals / (Shots + Goals)")
plt.grid(alpha=0.3)

plt.xlim(100, 0)
plt.xticks(np.arange(0, 101, 10))
plt.ylim(0, 100)

plt.legend()
plt.tight_layout()
plt.show()