In [3]:
import os
from dotenv import load_dotenv
from comet_ml import Experiment
import joblib
load_dotenv('..\.env')

True

In [58]:
import pandas as pd
import numpy as np
import os 

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import src.features.feature_selection as fsel

In [73]:
input_path = os.path.join('..\data\datasets\csv_files','2016-2020-v2.csv')
df = pd.read_csv(input_path)

$$ \textbf{Traitement des valeurs manquantes dans la colonne 'strength':} $$

In [74]:
df = fsel.remove_nan_from_strength(df)

$$ \textbf{Encodage des caractéristiques : } $$

In [75]:
# Encodage des caractéristiques de type catégorielle :
# On utilise LabelEncoder() pour les variables pour lesquelles l'ordre n'est pas important

# Colonne pour lesquels l'ordre n'est pas important
categorical_columns_1 = ['period_type', 'attacking_team_name', 'shooter', 'goalie', 'rebound', 'last_event_type', 'home_team']

# Colonne pour laquelle l'ordre est important
# Sachant que certains types de tirs sont plus efficaces en moyenne que d'autre, on encode les 
# types de tirs les plus efficaces avec des valeurs élevées
# (Au Milestone 1, on a vu que les 'Tip-in' sont les plus efficaces et que les 'Wrap-around' sont les moins
# efficaces)

shot_type_classified = [['Wrap-around',0], ['Slap Shot', 1], ['Snap Shot', 2], ['Wrist Shot', 3], ['Backhand', 4], ['Deflected', 5], ['Tip-In',6]]

# La caractéristique 'strength' doit aussi étre encodée de manière ordinale, étant donné
# que lorsqu'une équipe est en 'Power Play', elle a plus de chances de marquer tandis que lorsqu'elle est
# 'Short handed', ses chances de marquer diminuent
strength_classified = [['Short Handed',0], ['Even', 1], ['Power Play', 2]]

In [76]:
df = df.dropna()
df = fsel.encode_categorical_features(df, categorical_columns_1, shot_type_classified, strength_classified)

$$ \textbf{Sélection des caractéristiques + Séparation des données (entrainement, validation, test):} $$

$$\textbf{Méthode de filtrage (K-best) } $$

In [77]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import recall_score, f1_score

In [78]:
def get_test_df(df: pd.DataFrame, test_year : int) :
    df = df.copy()
    # Ajout d'une colonne pour l'année
    df['year'] = df['gameID'].apply(lambda x : x//1000000)

    # Récupération du DataFrame de test
    test_df = df[df['year'] == test_year]
    test_df.drop(columns = 'year')

    # Récupération du DataFrame d'entrainement et validation
    train_val_df = df[df['year'] == test_year]
    train_val_df.drop(columns = 'year')

    return test_df, train_val_df

In [79]:
# Récupération des deux DataFrames
test_df, train_val_df = get_test_df(df, 2020)

In [80]:
X = train_val_df.drop(columns=['is_goal', 'period_time'])
Y = train_val_df['is_goal']

# On récupère le dataset avec les K-meilleures caractéristiques
X_Kbest, Kbest_features = fsel.get_features_KBest(X, Y, 10)

  f = msb / msw


Unbalanced dataset

In [81]:
X_Kbest.to_csv('../data/datasets/csv_files/Train_features.csv', index=False)

In [82]:
Y.to_csv('../data/datasets/csv_files/Train_labels.csv', index = False)

In [83]:
# Mise à jour de notre ensemble de test
test_label = test_df['is_goal']
test_features = test_df[Kbest_features]

In [84]:
test_features.to_csv('../data/datasets/csv_files/Test_features.csv', index = False)
test_label.to_csv('../data/datasets/csv_files/Test_labels.csv', index = False)

In [26]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_Kbest, Y, train_size = 0.8, random_state = 42 
)

In [None]:
X

In [36]:
X_val

Unnamed: 0,shot_type,strength,last_event_type,time_since_last_event,distance_from_last_event,home_team_players,away_team_players,distance_to_net,rebound,speed
347783,6,1,7,21,154.081147,5,5,16.0,1,7.337197
373714,3,1,7,19,17.464249,5,5,32.0,1,0.919171
368766,3,1,2,2,13.000000,5,5,11.0,0,6.500000
358433,3,1,2,6,127.200629,5,5,6.0,0,21.200105
351560,3,1,1,20,20.248457,5,5,10.0,0,1.012423
...,...,...,...,...,...,...,...,...,...,...
354786,3,1,4,7,45.177428,5,5,36.0,0,6.453918
341880,3,0,7,4,5.830952,4,5,15.0,1,1.457738
375363,6,1,1,9,146.768525,5,5,14.0,0,16.307614
373127,3,1,1,58,39.395431,5,5,68.0,0,0.679232


$$ \textbf{Réequilibrage des données : }  $$ 

In [27]:
# 1ere approche : Oversampling en utilisant la méthode SMOTE 
X_train_over, Y_train_over = fsel.oversample_dataset(X_train, Y_train)

Execute the following two rows to get a unique dataset on which you will continue to work on for the rest of the experiences

In [28]:
X_train_over.to_csv('../data/datasets/csv_files/Oversampled_train_dataset.csv', index = False)

In [29]:
Y_train_over.to_csv('../data/datasets/csv_files/Oversampled_train_labels.csv', index = False)

In [37]:
X_val.to_csv('../data/datasets/csv_files/Validation_set_features.csv', index = False)

In [38]:
Y_val.to_csv('../data/datasets/csv_files/Validation_set_labels.csv', index = False)

In [91]:
# 2e approche : Utilisation de RandomUnderSampler pour réduire la taille de la classe majoritaire
from imblearn.under_sampling import RandomUnderSampler
# https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html#imblearn.under_sampling.RandomUnderSampler

In [92]:
rus = RandomUnderSampler(random_state = 42)
X_train_res, Y_train_res = rus.fit_resample(X_train, Y_train)

In [95]:
rf = RandomForestClassifier(random_state = 42)

rf.fit(X_train_res, Y_train_res)

y_pred = rf.predict(X_val)

print("Accuracy_score :",accuracy_score(y_pred, Y_val))
print("Recall score :" , recall_score(y_pred, Y_val))
print("F1-score :" , f1_score(y_pred, Y_val))
print("Roc_auc :", roc_auc_score(y_pred, Y_val))

Accuracy_score : 0.6433651506644208
Recall score : 0.15031722791605662
F1-score : 0.24429902835613723
Roc_auc : 0.5501889721851079


$$ \textbf{Entrainement d'un modèle RandomForest} $$ 

In [31]:
RandomForest_model = RandomForestClassifier(random_state = 42)

In [32]:
# Entrainement du modèle sur les données sur-échantillonées par SMOTE
RandomForest_model.fit(X_train_over,Y_train_over)

In [33]:
Y_pred = RandomForest_model.predict(X_val)

In [35]:
confusion_matrix(Y_val,Y_pred)

array([[9108,  633],
       [ 806,  139]], dtype=int64)

In [30]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))

Accuracy_score : 0.8664607898184541
Recall score : 0.17432432432432432
F1-score : 0.15311572700296736


$$ \textbf{Méthode de wrapping : Recursive feature elimination (RFE)} $$

In [58]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

In [59]:
X_1 = train_val_df.drop(columns=['is_goal', 'period_time'])
Y_1 = train_val_df['is_goal']

In [60]:
estimator = SVC(kernel = 'linear')
selector = RFE(estimator, n_features_to_select = 10, step = 1)

selector = selector.fit(X_1,Y_1)

In [61]:
selector.get_feature_names_out()

array(['game_seconds', 'gameID', 'shooter', 'goalie', 'last_event_x',
       'time_since_last_event', 'distance_from_last_event',
       'powerplay_duration', 'distance_to_net', 'speed'], dtype=object)

$$ \textbf{Optimisation des hyperparamètres du modèle : Cross-Validation} $$ 

In [40]:
from sklearn.model_selection import RandomizedSearchCV 

In [41]:
from scipy.stats import randint

In [42]:
param_dist = {'n_estimators' : [100, 150, 200, 250, 300, 400, 500],
              'max_depth' : [5, 10, 15, 20]}

rf = RandomForestClassifier(random_state = 42)

rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, scoring='f1', n_iter = 15, cv = 5)

rand_search.fit(X_train_over, Y_train_over)

best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'n_estimators': 200, 'max_depth': 20}


In [48]:
rf_opti = RandomForestClassifier(n_estimators = 200, max_depth = 20, random_state = 42)

rf_opti.fit(X_train_over,Y_train_over)

In [49]:
Y_pred = rf_opti.predict(X_val)

In [50]:
from sklearn.metrics import roc_auc_score

In [51]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))
print("Roc_auc :", roc_auc_score(Y_pred, Y_val))

Accuracy_score : 0.8504585438892008
Recall score : 0.18454106280193236
F1-score : 0.19292929292929292
Roc_auc : 0.5532072218993601


In [47]:
confusion_matrix(Y_pred, Y_val)

array([[9741,  411],
       [   0,  534]], dtype=int64)

In [54]:
# Randomized Search CV on the original data
param_dist = {'n_estimators' : [100, 150, 200, 250, 300, 400, 500],
              'max_depth' : [5, 10, 15, 20]}

rf = RandomForestClassifier(random_state = 42)

rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, scoring='f1', n_iter = 15, cv = 5)

rand_search.fit(X_Kbest, Y)

best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'n_estimators': 100, 'max_depth': 20}


In [55]:
rf_opti_1 = RandomForestClassifier(n_estimators = 100, max_depth = 20, random_state = 42)

rf_opti_1.fit(X_Kbest,Y)

In [56]:
Y_pred = rf_opti_1.predict(X_val)

In [57]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))
print("Roc_auc :", roc_auc_score(Y_pred, Y_val))

Accuracy_score : 0.9606962380685008
Recall score : 1.0
F1-score : 0.7142857142857143
Roc_auc : 0.9793327428402716


In [96]:
y_pred_1 = rf_opti_1.predict(test_features)
#test_labels

In [99]:
print("Accuracy_score :",accuracy_score(y_pred_1, test_label))
print("Recall score :" , recall_score(y_pred_1, test_label))
print("F1-score :" , f1_score(y_pred_1, test_label))
print("Roc_auc :", roc_auc_score(y_pred_1, test_label))

Accuracy_score : 0.9598884438288474
Recall score : 1.0
F1-score : 0.7185817465528562
Roc_auc : 0.9788617084237522


In [100]:
confusion_matrix(y_pred_1, test_label)

array([[48547,  2143],
       [    0,  2736]], dtype=int64)

$$ \textbf{Utilisation d'un autre modèle de classification : AdaBoost} $$

In [86]:
from sklearn.ensemble import AdaBoostClassifier

In [101]:
Ada_clf = AdaBoostClassifier(n_estimators = 100, random_state = 42)

Ada_clf.fit(X_train_over, Y_train_over)

In [102]:
Y_pred = Ada_clf.predict(X_val)

In [103]:
print("Accuracy_score :",accuracy_score(Y_pred, Y_val))
print("Recall score :" , recall_score(Y_pred, Y_val))
print("F1-score :" , f1_score(Y_pred, Y_val))

Accuracy_score : 0.7427475201197828
Recall score : 0.1693548387096774
F1-score : 0.2515654778110536


In [104]:
confusion_matrix(Y_val, Y_pred)

array([[7475, 2266],
       [ 483,  462]], dtype=int64)