In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, 
accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

# Display all columns
pd.set_option('display.max_columns', None)
# Display all rows
pd.set_option('display.max_rows', None)

# Load the data
df = pd.read_csv('shotData.csv')

# One hot encoding
one_hot_columns = ['locationOnNet', 'manpowerSituation', 
                   'shotType', 'ozEntryType', 'entryType', 
                   'precedingEventOneType', 'precedingEventTwoType', 
                   'precedingEventThreeType', 'period', 'strength']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

# Label encoding for binary columns
label_encoders = {}
for column in ['screen', 'shooterUnderPressure', 'oneTimer', 'lastFaceoffOutcome']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Handle deflection columns
df['deflection'] = ~df['deflectionX'].isna() * 1
df = df.drop(columns=['deflectionX', 'deflectionY'])

df['is_goal'] = df['shotResult'].apply(lambda x: 1 if x == 'goal' else 0)
X = df.drop(columns=['shotResult', 'is_goal', 'playerId'])
y = df['is_goal']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Imputation for missing values
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Check for NaNs
assert not pd.isna(X_train_imputed).any(), "NaN values found in X_train after imputation"
assert not pd.isna(X_test_imputed).any(), "NaN values found in X_test after imputation"

# Normalizing the data
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

X_train_imputed_normalized = scaler.fit_transform(X_train_imputed)
X_test_imputed_normalized = scaler.transform(X_test_imputed)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed_normalized, y_train)

# Check the distribution of the target variable after SMOTE
print("Distribution of y_train before SMOTE:", y_train.value_counts())
print("Distribution of y_train after SMOTE:", y_train_resampled.value_counts())

# Training & Evaluating models
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), X_train_resampled, X_test_imputed_normalized),
    'Random Forest': (RandomForestClassifier(), X_train_resampled, X_test_imputed_normalized),
    'SVM': (SVC(probability=True), X_train_resampled, X_test_imputed_normalized),
    'Gradient Boosted Trees (XGBoost)': (xgb.XGBClassifier(), X_train_resampled, X_test_normalized),
    'Gradient Boosted Trees (LightGBM)': (lgb.LGBMClassifier(), X_train_resampled, X_test_normalized),
    'Gradient Boosted Trees (CatBoost)': (CatBoostClassifier(silent=True), X_train_resampled, X_test_normalized)
}

results = {}
for name, (model, X_train_model, X_test_model) in models.items():
    model.fit(X_train_model, y_train_resampled)  
    y_pred = model.predict(X_test_model)
    y_prob = model.predict_proba(X_test_model)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    
    results[name] = [accuracy, precision, recall, auc]
    print(f"Results for {name}:\n")
    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", auc)
    print("---------------------------\n")

# Select model
best_model_name = max(results, key=lambda k: results[k][3])
best_model, _, _ = models[best_model_name]
print(f"The best model is: {best_model_name} with ROC AUC: {results[best_model_name][3]:.4f}")

# Ranking players by shooting ability using the best model
players = df['playerId'].unique()
player_goal_prob = {}

for player in players:
    player_shots = df[df['playerId'] == player].drop(columns=['shotResult', 'is_goal', 'playerId'])
    
    player_shots_imputed = imputer.transform(player_shots)
    
    player_shots_normalized = scaler.transform(player_shots_imputed)
    
    player_goal_prob[player] = best_model.predict_proba(player_shots_normalized)[:, 1].mean()

sorted_players = sorted(player_goal_prob.items(), key=lambda x: x[1], reverse=True)
print("\nPlayers ranked from best shooting ability to worst using the best model:")
for rank, (player, prob) in enumerate(sorted_players, 1):
    print(f"{rank}. Player {player} with average goal probability: {prob:.4f}")


Distribution of y_train before SMOTE: is_goal
0    758
1     42
Name: count, dtype: int64
Distribution of y_train after SMOTE: is_goal
0    758
1    758
Name: count, dtype: int64
Results for Logistic Regression:

              precision    recall  f1-score   support

           0       0.97      0.81      0.89       188
           1       0.19      0.67      0.29        12

    accuracy                           0.81       200
   macro avg       0.58      0.74      0.59       200
weighted avg       0.93      0.81      0.85       200

ROC AUC Score: 0.8754432624113475
---------------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for Random Forest:

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       188
           1       0.00      0.00      0.00        12

    accuracy                           0.94       200
   macro avg       0.47      0.50      0.48       200
weighted avg       0.88      0.94      0.91       200

ROC AUC Score: 0.7865691489361702
---------------------------

Results for SVM:

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       188
           1       0.00      0.00      0.00        12

    accuracy                           0.92       200
   macro avg       0.47      0.49      0.48       200
weighted avg       0.88      0.92      0.90       200

ROC AUC Score: 0.7938829787234043
---------------------------

Results for Gradient Boosted Trees (XGBoost):

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       188
           1      



Results for Gradient Boosted Trees (LightGBM):

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       188
           1       0.12      0.08      0.10        12

    accuracy                           0.91       200
   macro avg       0.53      0.52      0.53       200
weighted avg       0.89      0.91      0.90       200

ROC AUC Score: 0.8280141843971631
---------------------------

Results for Gradient Boosted Trees (CatBoost):

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       188
           1       0.00      0.00      0.00        12

    accuracy                           0.94       200
   macro avg       0.47      0.50      0.48       200
weighted avg       0.88      0.94      0.91       200

ROC AUC Score: 0.7699468085106383
---------------------------

The best model is: Logistic Regression with ROC AUC: 0.8754

Players ranked from best shooting ability to worst using the be

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
y_train

29     0
535    0
695    0
557    0
836    0
596    0
165    0
918    0
495    1
824    0
65     0
141    0
925    0
827    0
655    0
331    0
664    0
249    0
907    0
708    0
305    0
734    0
975    1
49     0
896    0
2      0
544    0
350    0
904    0
536    0
344    0
994    0
481    0
575    0
33     0
31     0
231    0
963    0
192    0
333    0
3      0
204    1
514    0
799    0
306    0
109    0
430    0
77     1
84     0
286    0
82     0
991    0
789    0
894    0
398    0
323    0
519    0
916    0
922    0
5      0
731    0
465    0
97     0
266    0
357    0
868    0
798    0
380    0
631    0
381    0
490    0
118    0
900    1
250    0
523    0
9      0
196    0
603    0
81     0
783    0
587    0
797    0
239    0
290    0
211    0
717    0
359    0
449    0
227    0
950    0
946    0
796    0
501    0
464    0
362    0
468    0
935    0
428    0
7      0
155    0
541    0
440    0
482    0
422    0
778    0
949    0
334    0
576    0
934    0
567    0
594    0
5