In [6]:
import wandb
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from ift6758.data import load_cached_season_dataframe, load_cached_seasons_dataframe, new_variables, goal_rate_by_percentile
from ift6758.data.graphs import plot_goal_curve
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, GroupShuffleSplit, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [7]:
df_training = load_cached_seasons_dataframe(2016,2019) 
df_test = load_cached_season_dataframe(2020) # on y touche pas jusqu'à la fin
df = new_variables(df_training)

## Séparation des données d'entrainement et de validation

In [8]:
# One hot encode pour typeShot et lastEvent
cat_cols = [c for c in ["typeShot", "lastEvent","teamShot","goalStrenght"] if c in df.columns]
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, dummy_na=True)

# transforme les inf en NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Filtrer les NaN 
mask_tr = df.notna().all(axis=1)


splitter = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

train_idxs, val_idxs = next(splitter.split(df, groups=df['gameId']))
df_train = df.iloc[train_idxs].copy()
df_val = df.iloc[val_idxs].copy()



X_train = df_train.drop(columns=["isGoal","timeInPeriod","typeEvent", "shooter","goalie"])
X_val   = df_val.drop(columns=["isGoal","timeInPeriod","typeEvent", "shooter","goalie"])

y_train = df_train["isGoal"].astype(int)
y_val = df_val["isGoal"].astype(int)

# Random Forests

In [10]:

run = wandb.init(
    project="projet-hockey-ai", 
    name="random-forest",
    tags=["all-features", "random-forest"],
    save_code=True
)

rf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)

rf.fit(X_train, y_train)

# Prédictions
y_pred_rf = rf.predict(X_val)
y_proba_rf = rf.predict_proba(X_val)

# 6. Calcul des métriques
acc_rf = accuracy_score(y_val, y_pred_rf)
auc_rf = roc_auc_score(y_val, y_proba_rf[:, 1])
ll_rf = log_loss(y_val, y_proba_rf[:, 1])

print(f"Accuracy: {acc_rf}")
print(f"AUC: {auc_rf}")
print(f"Log Loss: {ll_rf}")

# Logging automatique des courbes dans WandB
# WandB crée automatiquement la courbe ROC, la matrice de confusion et la courbe de précision-rappel
wandb.sklearn.plot_classifier(
    rf, X_train, X_val, y_train, y_val,
    y_pred_rf, y_proba_rf, 
    labels=['Non-Goal', 'Goal'], 
    model_name='Random Forest', 
    feature_names=list(X_train.columns)
)

wandb.log({"accuracy": acc_rf, "auc": auc_rf, "log_loss": ll_rf})

run.finish()

Accuracy: 0.9094579450729955
AUC: 0.7649165576203626
Log Loss: 0.27407266486043697


wandb: 
wandb: Plotting Random Forest.
wandb: Logged feature importances.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision-recall curve.


0,1
accuracy,▁
auc,▁
log_loss,▁

0,1
accuracy,0.90946
auc,0.76492
log_loss,0.27407


## Hyperparameter tuning

In [15]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [16]:
rf_param_grid = {
    "n_estimators": [300, 600, 1000, 1500],
    "max_depth": [None, 6, 10, 14, 18],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["sqrt", "log2", 0.3, 0.5, 0.8],
    "bootstrap": [True],
    "class_weight": [None, "balanced", "balanced_subsample"],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],
}

base_model = RandomForestClassifier(
    n_estimators=600,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features="sqrt",
    bootstrap=True,
    class_weight="balanced_subsample",  # utile vu le déséquilibre buts/non-buts
    random_state=42,
    n_jobs=-1,
)

In [None]:
search = RandomizedSearchCV(
    rf,
    param_distributions=rf_param_grid,
    n_iter=40,                  # peut augmenter mais risque d'être très long
    scoring="neg_log_loss",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

search.fit(X_train, y_train, groups=df_train["gameId"])
best_rf = search.best_estimator_
print(search.best_score_, search.best_params_)

Fitting 3 folds for each of 40 candidates, totalling 120 fits




## Feature Selection (K-Best features)

# Neural Networks (Multilayer Percetron)

# Gaussian Mixture Model (clustering method)

# Support Vector Machines

# Model Comparaison

In [None]:
# Proba
pos_idx_rf  = np.where(rf.classes_  == 1)[0][0]
pos_idx_mlp = np.where(mlp.classes_ == 1)[0][0]
pos_idx_gmm = np.where(gmm.classes_ == 1)[0][0]
pos_idx_svm = np.where(svm.classes_ == 1)[0][0]

proba_rf  = rf.predict_proba(X_val_rf)[:,  pos_idx_rf]
proba_mlp = mlp.predict_proba(X_val_mlp)[:,  pos_idx_mlp]
proba_gmm  = gmm.predict_proba(X_val_)[:,  pos_idx_gmm]
proba_svm  = svm.predict_proba(X_val_)[:,  pos_idx_svm]

# ROC-AUC
fpr_rf,  tpr_rf,  _ = roc_curve(y_val_rf, proba_rf)
auc_rf = roc_auc_score(y_val_rf, proba_rf)
fpr_mlp, tpr_mlp, _ = roc_curve(y_val_mlp, proba_mlp)
auc_mlp = roc_auc_score(y_val_mlp, proba_mlp)
fpr_gmm, tpr_gmm, _ = roc_curve(y_val_gmm, proba_gmm)
auc_gmm = roc_auc_score(y_val_gmm, proba_gmm)
fpr_svm, tpr_svm, _ = roc_curve(y_val_svm, proba_svm)
auc_svm = roc_auc_score(y_val_svm, proba_svm)


plt.plot(fpr_rf,  tpr_rf,  label=f"Random Forest (AUC={auc_rf:.3f})")
plt.plot(fpr_mlp,  tpr_mlp,  label=f"Multilayer Percerton (AUC={auc_mlp:.3f})")
plt.plot(fpr_gmm,  tpr_gmm,  label=f"Gaussian Mixture Model (AUC={auc_gmm:.3f})")
plt.plot(fpr_svm,  tpr_svm,  label=f"Support Vector Machines (AUC={auc_svm:.3f})")


plt.plot([0, 1], [0, 1], linestyle="--", label="Random 50% (AUC=0.500)")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC - AUC")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
x_distance_angle, goal_rate_distance_angle = goal_rate_by_percentile(y_val_distance_angle, proba_distance_angle, step=5)
goal_rate_percent_distance_angle = 100.0 * goal_rate_distance_angle

x_all, goal_rate_all = goal_rate_by_percentile(y_val_all, proba_all, step=5)
goal_rate_percent_all = 100.0 * goal_rate_all

x_top15, goal_rate_top15 = goal_rate_by_percentile(y_val_all, proba_top15, step=5)
goal_rate_percent_top15 = 100.0 * goal_rate_top15

plt.plot(x_distance_angle,  goal_rate_percent_distance_angle,  label="Baseline")
plt.plot(x_all, goal_rate_percent_all, label="All features")
plt.plot(x_top15, goal_rate_percent_top15, label="Top 15")

plt.title("Goal Rate")
plt.xlabel("Shot probability model percentile")
plt.ylabel("Goals / (Shots + Goals)")
plt.grid(alpha=0.3)

plt.xlim(100, 0)
plt.xticks(np.arange(0, 101, 10))
plt.ylim(0, 100)

plt.legend()
plt.tight_layout()
plt.show()