In [10]:
import wandb
import joblib
import pandas as pd
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBClassifier

In [11]:
# Authenticate with Weights & Biases, then open a run in the "Models Test" project
wandb.login()
wandb.init(project="Models Test")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: thierry-bedard-cortey (IFT67582024-A07). Use `wandb login --relogin` to force relogin


### Load test dataset

In [None]:
# Read the 2020-to-2021 play-by-play export (relative path)
play_by_play_path = "../../data/dataframe_2020_to_2021.csv"
play_by_play = pd.read_csv(play_by_play_path)

# Keep regular-season rows only, then discard every row that has any missing value
play_by_play = play_by_play[play_by_play["gameType"] == "regular-season"].dropna()

# Drop the columns that are not used as model inputs. They are selected by
# position, so this list is tied to the exact column order of the CSV above.
X_all_features = play_by_play.drop(
    columns=play_by_play.columns[[1, 2, 3, 5, 6, 14, 15, 16, 18, 19, 20, 21]]
)
# Categorical columns that will be one-hot encoded
features_to_encode = ["previousEventType", "shotType"]

# Encode categorical features
def encode_and_bind(df, feature):
    """Return a new frame in which ``feature`` is replaced by its one-hot columns.

    The indicator columns are produced by ``pd.get_dummies`` with ``feature`` as
    prefix and are appended after the remaining columns of ``df``. ``df`` itself
    is not modified.
    """
    one_hot = pd.get_dummies(df[feature], prefix=feature)
    remaining = df.drop(columns=feature)
    return pd.concat([remaining, one_hot], axis=1)

# One-hot encode each categorical column in turn
for categorical_column in features_to_encode:
    X_all_features = encode_and_bind(X_all_features, categorical_column)

# Reduced feature sets: distance only, angle only, and both together
X_shot_distance = play_by_play.loc[:, ["shotDistance"]]
X_shot_angle = play_by_play.loc[:, ["shotAngle"]]
X_shot_distance_angle = play_by_play.loc[:, ["shotDistance", "shotAngle"]]

# Target vector taken from the "isGoal" column
y = play_by_play["isGoal"].values

### Load the models from WandB

Pour des raisons de compatibilité entre versions de Python, nous n'avons pas pu utiliser les fichiers .pkl de WandB pour les modèles de régression logistique ; ils sont donc chargés depuis des fichiers .pkl locaux.

In [16]:
# Logistic regression read from a local pickle file
# (it is evaluated on X_shot_distance further down)
model_path = "logistic_regression_dist.pkl"
lr_distance_model = joblib.load(filename=model_path)

In [15]:
# Logistic regression read from a local pickle file
# (it is evaluated on X_shot_angle further down)
model_path = "logistic_regression_angle.pkl"
lr_angle_model = joblib.load(filename=model_path)

In [14]:
# Logistic regression read from a local pickle file
# (it is evaluated on X_shot_distance_angle further down)
model_path = "logistic_regression_dist_angle.pkl"
lr_distance_angle_model = joblib.load(filename=model_path)

In [12]:
# Fetch version v4 of the XGBoost artifact from the W&B model registry
xgboost_artifact = wandb.use_artifact(
    'philippe-bergeron-7-universit-de-montr-al-org/wandb-registry-model/XGBoost:v4',
    type='model',
)
xgboost_artifact_dir = xgboost_artifact.download()

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a registry you trust.
model_path = f"{xgboost_artifact_dir}/xgboost_all_features.pkl"
xgboost_model = joblib.load(filename=model_path)

wandb:   1 of 1 files downloaded.  


In [13]:
# Fetch version v0 of the Random Forest artifact from the W&B model registry
random_forest_artifact = wandb.use_artifact(
    'philippe-bergeron-7-universit-de-montr-al-org/wandb-registry-model/Random Forest:v0',
    type='model',
)
random_forest_artifact_dir = random_forest_artifact.download()

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a registry you trust.
model_path = f"{random_forest_artifact_dir}/RandomForest_Model.pkl"
rf_model = joblib.load(filename=model_path)

wandb:   1 of 1 files downloaded.  


In [12]:
def _goal_rate_by_decile(y_true, prob):
    """Return the goal rate (a fraction in [0, 1]) inside each decile of ``prob``."""
    edges = np.percentile(prob, np.arange(0, 101, 10))
    last_bin = len(edges) - 2
    rates = []
    for i in range(len(edges) - 1):
        lower, upper = edges[i], edges[i + 1]
        # Bins are half-open [lower, upper) except the last one, which is closed
        # so that the shots with the highest predicted value are not dropped.
        below_upper = (prob <= upper) if i == last_bin else (prob < upper)
        in_bin = (prob >= lower) & below_upper
        # Heavily tied predictions can leave a bin empty; report 0 for it.
        rates.append(y_true[in_bin].mean() if in_bin.any() else 0)
    return rates


def _cumulative_goal_share(y_true, prob):
    """Return the cumulative share of goals with shots ranked from highest to lowest ``prob``."""
    ranked = y_true[np.argsort(prob)[::-1]]
    total_goals = ranked.sum()
    if total_goals == 0:
        # No goal at all: avoid a 0/0 division and return a flat line instead of NaNs.
        return np.zeros(len(ranked))
    return np.cumsum(ranked) / total_goals


def generate_figures(model, X, y, model_name):
    """Show four diagnostic figures comparing ``model`` with a uniform-random baseline.

    The figures are, in order: the ROC curve (with AUC in the legend), the goal
    rate per decile of predicted probability, the cumulative proportion of goals
    by predicted-probability rank, and the reliability (calibration) diagram.
    Each figure is displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba`` (column 1 is used as the score) or, failing
        that, ``decision_function``.
    X : array-like
        Feature matrix passed to the model as is.
    y : array-like
        Binary outcome labels aligned with the rows of ``X`` (the notebook passes
        the ``isGoal`` column). Converted to a NumPy array so that all indexing
        below is positional, even if a pandas Series is given.
    model_name : str
        Name inserted in every figure title.
    """
    y_true = np.asarray(y)

    if hasattr(model, "predict_proba"):
        y_pred_probs = np.asarray(model.predict_proba(X))[:, 1]
    else:
        # NOTE(review): decision_function scores are not probabilities and may fall
        # outside [0, 1], which calibration_curve is not expected to accept --
        # TODO confirm before using a model without predict_proba.
        y_pred_probs = np.asarray(model.decision_function(X))

    # Random baseline. A private RandomState(42) yields the same draws as
    # np.random.seed(42) followed by np.random.uniform, without reseeding the
    # global NumPy generator for the rest of the notebook.
    prob_random = np.random.RandomState(42).uniform(0, 1, len(y_true))

    curves = [
        ("Modèle", y_pred_probs),
        ("Aléatoire", prob_random),
    ]

    # 1. ROC curve and AUC
    _, ax = plt.subplots(figsize=(8, 6))
    for name, prob in curves:
        fpr, tpr, _ = roc_curve(y_true, prob)
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
    ax.set_xlabel("Taux de faux positifs (FPR)")
    ax.set_ylabel("Taux de vrais positifs (TPR)")
    ax.set_title(f"Courbe ROC - {model_name}")
    ax.legend()
    ax.grid()
    plt.show()

    # 2. Goal rate by decile of predicted probability
    _, ax = plt.subplots(figsize=(8, 6))
    decile_starts = np.arange(0, 100, 10)
    for name, prob in curves:
        goal_rates = _goal_rate_by_decile(y_true, prob)
        ax.plot(decile_starts, [rate * 100 for rate in goal_rates], label=name)
    ax.set_xlabel("Centile de la probabilité prédite")
    ax.set_ylabel("Taux de buts (%)")
    ax.set_title(f"Taux de buts par centile de probabilité - {model_name}")
    ax.set_ylim(0, 100)
    ax.grid()
    ax.legend()
    # Highest centiles are drawn on the left
    ax.invert_xaxis()
    plt.show()

    # 3. Cumulative proportion of goals
    _, ax = plt.subplots(figsize=(8, 6))
    for name, prob in curves:
        cumulative_goal_proportion = _cumulative_goal_share(y_true, prob)
        # Shots are ranked from highest to lowest score, i.e. from centile 100 down to 0
        centiles = np.linspace(100, 0, len(cumulative_goal_proportion))
        ax.plot(centiles, cumulative_goal_proportion * 100, label=name)
    ax.set_xlabel("Centile de la probabilité prédite")
    ax.set_ylabel("Proportion cumulée des buts (%)")
    ax.set_title(f"Proportion cumulée des buts par centile de probabilité - {model_name}")
    ax.set_ylim(0, 100)
    ax.grid()
    ax.legend()
    ax.invert_xaxis()
    plt.show()

    # 4. Reliability diagram (calibration curve)
    _, ax = plt.subplots(figsize=(8, 6))
    for name, prob in curves:
        fraction_of_positives, mean_predicted_value = calibration_curve(
            y_true, prob, n_bins=10, strategy="quantile"
        )
        ax.plot(mean_predicted_value, fraction_of_positives, label=name)
    ax.plot([0, 1], [0, 1], "k--", label="Calibration parfaite")
    ax.set_xlabel("Probabilité prédite")
    ax.set_ylabel("Fréquence observée (empirique)")
    ax.set_title(f"Diagramme de fiabilité (Calibration) - {model_name}")
    ax.legend()
    ax.grid()
    plt.show()

In [None]:
# Produce the diagnostic figures for every model, each paired with its own feature matrix
for fitted_model, feature_matrix, figure_label in [
    (lr_distance_model, X_shot_distance, "Logistic Regression (Distance)"),
    (lr_angle_model, X_shot_angle, "Logistic Regression (Angle)"),
    (lr_distance_angle_model, X_shot_distance_angle, "Logistic Regression (Distance + Angle)"),
    (rf_model, X_all_features, "Random Forest"),
    (xgboost_model, X_all_features, "XGBoost"),
]:
    generate_figures(fitted_model, feature_matrix, y, figure_label)