In [1]:
!pip install xgboost
!pip install lightgbm
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, make_scorer, brier_score_loss
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from joblib import dump, load
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, randint as sp_randint, uniform as sp_uniform
from scipy.cluster import hierarchy

In [2]:
# Single random seed for the notebook; passed as random_state to train_test_split and
# to the LogisticRegression estimators below.
seed = 42

In [3]:
# McFadden's pseudo R-squared, wrapped as a scikit-learn scorer
def mcfadden_r2(y, y_pred):
    """Return McFadden's pseudo R-squared: ``1 - log_loss(model) / log_loss(null)``.

    The null model predicts the mean of ``y`` for every observation.

    Parameters
    ----------
    y : array-like exposing ``.mean()`` (NumPy array or pandas Series)
        Observed binary outcomes.
    y_pred : array-like
        Predicted probabilities, scored against ``y`` with ``log_loss``.

    Returns
    -------
    float
        Higher is better; 0 means the predictions are no better than the null model.
    """
    null_pred = np.full(len(y), y.mean())
    return 1 - (log_loss(y, y_pred) / log_loss(y, null_pred))


# needs_proba=True makes the scorer call the estimator's probability output.
# NOTE(review): newer scikit-learn releases replace needs_proba with response_method --
# check the installed version before upgrading.
pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)

In [4]:
# Scorers passed to cross_validate; each result is reported under the key 'test_<name>'.
# NOTE(review): 'roc_aug' looks like a typo for 'roc_auc', but the nested-CV cells read
# 'test_roc_aug', so the key and those lookups have to be renamed together.
scoring = {'roc_aug': 'roc_auc', 'mcfaddens_r2': pseudo_r2_scorer}

In [5]:
# Load the shots dataset from a CSV file in the working directory.
# NOTE(review): the saved output of this cell echoes this line the way a Python warning does
# (possibly a pandas mixed-type DtypeWarning) -- confirm, and consider passing explicit dtypes.
df = pd.read_csv('Statsbomb_Shots_WO_Outliers.csv')

  df = pd.read_csv('Statsbomb_Shots_WO_Outliers.csv')


In [6]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# A non-inplace drop with errors='ignore' keeps the cell re-runnable: the original in-place
# drop raised KeyError the second time the cell was executed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,match_id,id,eventSec,period,goal,team_id,team_name,player_id,shot_type_name,x,...,gk_player_id,firstName,middleName,lastName,Name,area_shot,area_goal,opps_in_n_angle,goalkeeper_x,goalkeeper_y
0,2275050,25e512a2-6ce0-41db-8749-88d988947058,780.422,2,False,969,Birmingham City WFC,15580.0,open_play,91.158333,...,33664.0,Claudia,,Walker,Claudia Walker,3836.901495,757.206131,1.0,101.975,29.606667
1,2275050,3ece6827-76fa-4ae1-8c4c-ef4af1ae4c4e,950.81,2,False,969,Birmingham City WFC,31563.0,free_kick,101.333333,...,33664.0,Rachel,,Williams,Rachel Williams,5272.748045,200.239061,3.0,104.45,36.4705
2,2275050,dcc3c9f6-1b25-4fb4-ae1a-5dcf52d16769,2386.037,1,False,969,Birmingham City WFC,15562.0,throw_in,84.471429,...,33664.0,Lucy,,Staniforth,Lucy Staniforth,5509.133803,286.591404,2.0,102.066667,31.621
3,2275050,9bdbfc3d-959d-4fc4-bac2-6b5d154191e9,746.245,1,False,974,Reading WFC,26570.0,free_kick,88.328571,...,22032.0,Amalie,Vevle,Eikeland,Amalie Vevle Eikeland,3791.877712,1048.194587,1.0,103.9,31.5295
4,2275050,a2068f0f-d4fd-44e3-b1a2-2497d7b47b0a,902.629,1,False,974,Reading WFC,26570.0,free_kick,93.725,...,22032.0,Amalie,Vevle,Eikeland,Amalie Vevle Eikeland,317.530346,1126.366907,1.0,104.175,35.83
5,2275050,8439b96b-fc49-4511-9628-059c2c996a83,2310.041,1,True,974,Reading WFC,26570.0,free_kick,85.671429,...,22032.0,Amalie,Vevle,Eikeland,Amalie Vevle Eikeland,3724.209813,1794.567066,0.0,99.408333,31.5295
6,2275050,db9cc459-2dbe-42b8-8c75-e50d1b49bdc5,1468.335,2,False,974,Reading WFC,26570.0,open_play,95.191667,...,22032.0,Amalie,Vevle,Eikeland,Amalie Vevle Eikeland,3969.499272,1197.269435,0.0,102.341667,37.843333
7,2275050,537f7379-0bf7-4bd4-be7f-7cc25c43de98,2626.523,2,False,974,Reading WFC,26570.0,open_play,94.733333,...,22032.0,Amalie,Vevle,Eikeland,Amalie Vevle Eikeland,6900.467557,613.895436,2.0,104.45,32.4445
8,2275050,ce34292d-68c7-40e9-a949-31c5fc7f2ec7,781.452,1,False,974,Reading WFC,15723.0,open_play,84.471429,...,22032.0,Brooke,,Chaplen,Brooke Chaplen,1896.902245,1086.456635,1.0,104.45,36.6535
9,2275050,9c0ec486-fade-48fe-b57d-28a6a5e175ae,2748.546,1,False,974,Reading WFC,15723.0,open_play,93.541667,...,22032.0,Brooke,,Chaplen,Brooke Chaplen,4442.202604,1003.381131,0.0,102.25,31.255


In [8]:
# Recode strong_foot from 'left' / 'right' / 'both' (presumably the player's preferred foot)
# into a boolean flag: True when the shot's body part matches that foot, with 'both'
# matching either foot.
# The column is overwritten, so the guard makes the cell safe to re-run: once strong_foot is
# boolean, recomputing from it would silently set every row to False.
if not pd.api.types.is_bool_dtype(df['strong_foot']):
    df['strong_foot'] = (((df.strong_foot.isin(['left', 'both'])) & (df.body_part_name == 'Left Foot')) |
                         ((df.strong_foot.isin(['right', 'both'])) & (df.body_part_name == 'Right Foot')))

In [9]:
# Column names, non-null counts and dtypes of the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32680 entries, 0 to 32679
Data columns (total 43 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   match_id                            32680 non-null  int64  
 1   id                                  32680 non-null  object 
 2   eventSec                            32680 non-null  float64
 3   period                              32680 non-null  int64  
 4   goal                                32680 non-null  bool   
 5   team_id                             32680 non-null  int64  
 6   team_name                           32680 non-null  object 
 7   player_id                           32680 non-null  float64
 8   shot_type_name                      32680 non-null  object 
 9   x                                   32680 non-null  float64
 10  y                                   32680 non-null  float64
 11  counter_attack                      32680

In [10]:
# Drop identifier / name columns, then separate the features from the target.
# The drop is non-inplace with errors='ignore' so the cell can be re-run: the original
# in-place drop raised KeyError on a second execution because the columns were already gone.
# NOTE: errors='ignore' also skips misspelled names silently -- keep this list in sync with df.
df = df.drop(columns=['match_id', 'id', 'eventSec', 'period', 'player_id', 'competition_gender',
                      'team_name', 'firstName', 'middleName', 'lastName', 'Name', 'team_id'],
             errors='ignore')
X = df.drop(columns='goal')  # every remaining column except the target
y = df.goal                  # boolean target: whether the shot was a goal

In [11]:
# Hold out 20% of the shots for testing; stratify on the target so both parts keep the same
# goal rate, and fix the shuffle with the notebook seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=seed,
)


In [12]:
# Report size, number of goals and goal percentage for the train and the test targets.
for label, goals in (('Shots train', y_train), ('Shots test', y_test)):
    print(label, len(goals), ';Number goals', goals.sum(),
          ';Goals %: ', round(goals.mean()*100, 1))

Shots train 26144 ;Number goals 2921 ;Goals %:  11.2
Shots test 6536 ;Number goals 730 ;Goals %:  11.2


**LOGISTIC REGRESSION** 
                          

In [13]:
# Columns excluded from the logistic-regression models.
logisticReg_drop_cols = [
    # Raw shot location: logistic regression does not deal well with dependent features,
    # so the distance / angle features capture the location instead.
    'x', 'y',
    # The columns below have many missing values because they come from StatsBomb data only.
    # They are not missing at random, so imputing them would not be fair, and logistic
    # regression does not accept missing values -- hence they are removed.
    'pass_end_y', 'pass_end_x',  # <- these were in Wyscout too, but were often just the shot location
    'gk_player_id',
    'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',
    'under_pressure', 'area_shot', 'area_goal', 'opps_in_n_angle',
]
# Independent copies, so later edits never touch X_train / X_test.
X_train_logistic = X_train.drop(columns=logisticReg_drop_cols).copy()
X_test_logistic = X_test.drop(columns=logisticReg_drop_cols).copy()

In [14]:
# Columns, non-null counts and dtypes left for the logistic-regression models.
X_train_logistic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26144 entries, 8111 to 2939
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   shot_type_name                      26144 non-null  object 
 1   counter_attack                      26144 non-null  bool   
 2   fast_break                          26144 non-null  bool   
 3   strong_foot                         26144 non-null  bool   
 4   body_part_name                      26144 non-null  object 
 5   assist_type                         26144 non-null  object 
 6   pass_switch                         18493 non-null  object 
 7   pass_cross                          18493 non-null  object 
 8   pass_cut_back                       18493 non-null  object 
 9   pass_height_name                    18493 non-null  object 
 10  pass_technique_name                 18493 non-null  object 
 11  visible_angle                       261

In [15]:
def split(X, y):
    """Partition shots into pass-assisted shots and all other shots.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing an ``assist_type`` column.
    y : pandas.Series
        Target aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_pass, y_pass, X_other, y_other)``. ``X_pass`` holds the rows whose
        ``assist_type`` equals ``'pass'``, without the ``assist_type`` column. ``X_other``
        holds the remaining rows, without any column that is entirely missing in that subset.

    Notes
    -----
    Which columns are removed from ``X_other`` depends on the data passed in, so two calls
    (e.g. train and test) are not guaranteed to return the same columns.
    """
    is_pass = X['assist_type'].eq('pass')
    pass_shots = X.loc[is_pass].drop(columns='assist_type').copy()
    other_shots = X.loc[~is_pass].dropna(axis='columns', how='all').copy()
    return pass_shots, y[is_pass], other_shots, y[~is_pass]

In [16]:
# Apply the same pass / other partition to the training and the test data.
(X_train_pass, y_train_pass,
 X_train_other, y_train_other) = split(X_train_logistic, y_train)
(X_test_pass, y_test_pass,
 X_test_other, y_test_other) = split(X_test_logistic, y_test)

In [17]:
# Columns to one-hot encode for the pass model and, per column, its categories in a fixed order.
# With drop='first' the first category of each list is the one that gets no indicator column.
cols = ['shot_type_name', 'body_part_name', 'pass_technique_name', 'pass_height_name']
cats = [
    ['open_play', 'free_kick', 'corner', 'throw_in'],
    ['Right Foot', 'Left Foot', 'Other'],
    ['other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging'],
    ['Ground/ Low Pass', 'High Pass'],
]
# Encode the listed columns; every other column is passed through unchanged.
pass_one_hot = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(categories=cats, drop='first'), cols)],
    remainder='passthrough',
)
# encode -> impute -> standardise -> logistic regression
pipe_pass = Pipeline(steps=[
    ('one_hot', pass_one_hot),
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(random_state=seed)),
])

In [18]:
# Feature names for the pass model: every category except the first of each encoded column
# (the first one is dropped by the encoder), followed by the columns that are not encoded.
original_cols_remain = [name for name in X_train_pass.columns if name not in cols]
new_cols_pass = [level for levels in cats for level in levels[1:]] + original_cols_remain

In [19]:
# Show the feature-name list built for the pass pipeline.
print(new_cols_pass)

['free_kick', 'corner', 'throw_in', 'Left Foot', 'Other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging', 'High Pass', 'counter_attack', 'fast_break', 'strong_foot', 'pass_switch', 'pass_cross', 'pass_cut_back', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle_interaction', 'log_distance']


In [20]:
# Recode assist_type 'direct' as 'recovery' so that it is not encoded twice
# (it is already covered by shot_type_name == 'direct_set_piece').
X_train_other.loc[X_train_other['assist_type'].eq('direct'), 'assist_type'] = 'recovery'
X_test_other.loc[X_test_other['assist_type'].eq('direct'), 'assist_type'] = 'recovery'

# Columns to one-hot encode for the non-pass model and, per column, its categories in a fixed
# order. With drop='first' the first category of each list gets no indicator column.
cols = ['shot_type_name', 'body_part_name', 'assist_type']
cats = [
    ['open_play', 'free_kick', 'corner', 'throw_in', 'direct_set_piece'],
    ['Right Foot', 'Left Foot', 'Other'],
    ['recovery', 'clearance', 'rebound'],
]
# Encode the listed columns; every other column is passed through unchanged.
other_one_hot = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(categories=cats, drop='first'), cols)],
    remainder='passthrough',
)
# encode -> impute -> standardise -> logistic regression
pipe_other = Pipeline(steps=[
    ('one_hot', other_one_hot),
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(random_state=seed)),
])

In [21]:
# Feature names for the non-pass model: every category except the first of each encoded column
# (the first one is dropped by the encoder), followed by the columns that are not encoded.
original_cols_remain = [name for name in X_train_other.columns if name not in cols]
new_cols_other = [level for levels in cats for level in levels[1:]] + original_cols_remain

In [22]:
# Show the feature-name list built for the non-pass pipeline.
print(new_cols_other)

['free_kick', 'corner', 'throw_in', 'direct_set_piece', 'Left Foot', 'Other', 'clearance', 'rebound', 'counter_attack', 'fast_break', 'strong_foot', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle_interaction', 'log_distance']


In [23]:
# Search grid for the 'lr' pipeline step: 100 values of C, log-spaced from 10**-3 to 10**0.1.
param_grid = {'lr__C': np.logspace(-3, 0.1, 100)}

In [24]:
# Tune C for the pass model by cross-validated log loss, then report the selected value.
clf_pass = GridSearchCV(
    estimator=pipe_pass,
    param_grid=param_grid,
    scoring='neg_log_loss',
    n_jobs=-1,
)
clf_pass.fit(X_train_pass, y_train_pass)
print('C:', clf_pass.best_estimator_.named_steps['lr'].C)

C: 0.049080512716538564


In [25]:
# Tune C for the non-pass model by cross-validated log loss, then report the selected value.
clf_other = GridSearchCV(
    estimator=pipe_other,
    param_grid=param_grid,
    scoring='neg_log_loss',
    n_jobs=-1,
)
clf_other.fit(X_train_other, y_train_other)
print('C:', clf_other.best_estimator_.named_steps['lr'].C)

C: 0.1253082884091345


In [None]:
# Nested cross-validation for the pass model: the grid search is re-fitted inside every outer
# fold, and each fold is scored with the metrics in `scoring`.
nested_score_pass = cross_validate(
    clf_pass,
    X=X_train_pass,
    y=y_train_pass,
    scoring=scoring,
    n_jobs=-1,
)
print('ROC AUC for shots assisted by passes:',
      np.mean(nested_score_pass['test_roc_aug']))
print("McFadden's Pseudo R-squared shots assisted by passes:",
      np.mean(nested_score_pass['test_mcfaddens_r2']))

  * (last_sum / last_over_new_count - new_sum) ** 2
  * (last_sum / last_over_new_count - new_sum) ** 2


In [None]:
# Nested cross-validation for the non-pass model: the grid search is re-fitted inside every
# outer fold, and each fold is scored with the metrics in `scoring`.
nested_score_other = cross_validate(
    clf_other,
    X=X_train_other,
    y=y_train_other,
    scoring=scoring,
    n_jobs=-1,
)
print('ROC AUC for other model:',
      np.mean(nested_score_other['test_roc_aug']))
print("McFadden's Pseudo R-squared for other model:",
      np.mean(nested_score_other['test_mcfaddens_r2']))

**LightGBM**

In [30]:
from imblearn.over_sampling import SMOTENC

In [31]:
# Columns, non-null counts and dtypes of the full training feature frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26144 entries, 8111 to 2939
Data columns (total 30 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   shot_type_name                      26144 non-null  object 
 1   x                                   26144 non-null  float64
 2   y                                   26144 non-null  float64
 3   counter_attack                      26144 non-null  bool   
 4   fast_break                          26144 non-null  bool   
 5   strong_foot                         26144 non-null  bool   
 6   body_part_name                      26144 non-null  object 
 7   assist_type                         26144 non-null  object 
 8   pass_end_x                          18493 non-null  float64
 9   pass_end_y                          18493 non-null  float64
 10  pass_switch                         18493 non-null  object 
 11  pass_cross                          184

In [32]:
# Size, non-null count and dtype of the training target.
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 26144 entries, 8111 to 2939
Series name: goal
Non-Null Count  Dtype
--------------  -----
26144 non-null  bool 
dtypes: bool(1)
memory usage: 229.8 KB


In [33]:
# Columns of X_train whose missing values are imputed here.
columns_with_missing = ['pass_end_x', 'pass_end_y', 'pass_switch', 'pass_cross', 'pass_cut_back',
                        'pass_height_name', 'pass_technique_name', 'shot_one_on_one', 'shot_open_goal',
                        'opps_in_n_angle', 'goalkeeper_x', 'goalkeeper_y']

# Replace each missing value with the most frequent value of its column, learned from the
# training data only.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train[columns_with_missing])

# transform() returns a single NumPy array for the whole selection. Because the selection mixes
# numeric and object columns, that array is object-dtype, and assigning it back directly would
# turn the numeric columns (e.g. pass_end_x, pass_end_y) into object columns. Rebuild a frame on
# the original index and cast every column back to the dtype it had before imputation.
original_dtypes = X_train[columns_with_missing].dtypes.to_dict()
imputed = pd.DataFrame(imputer.transform(X_train[columns_with_missing]),
                       columns=columns_with_missing, index=X_train.index)
X_train[columns_with_missing] = imputed.astype(original_dtypes)


In [None]:
# Oversample the minority class with SMOTENC, which takes the indices of the categorical
# columns of X_train. random_state uses the notebook-wide `seed` (same value as the previously
# hard-coded 42) so that every stochastic step is controlled from one place.
# NOTE(review): the indices are hard-coded positions in X_train -- re-check them whenever the
# column set or column order changes.
smote_nc = SMOTENC(
    categorical_features=[0, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 19, 21, 24, 23],
    random_state=seed,
    sampling_strategy='minority',
    k_neighbors=10,
)
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)


In [None]:
# Display X_train for inspection.
X_train

In [None]:
# Column indices treated as categorical (the same values are passed to SMOTENC) and the subset
# listed as boolean.
categorical_features = [0, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 19, 21, 24, 23]
bool_features = [3, 4, 5, 19, 24]
# list.extend() works in place and returns None, so printing its return value only showed
# "None". Extend first, then print the list. Indices that are already present are skipped so
# the list never contains duplicates.
categorical_features.extend([idx for idx in bool_features if idx not in categorical_features])
print(categorical_features)

In [None]:
# Display X_train again (same expression as an earlier cell).
X_train