In [1]:
import os
import pandas as pd
import numpy as np


from sklearn.metrics import log_loss, roc_auc_score, roc_curve, make_scorer, brier_score_loss
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve

from joblib import dump, load
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

Metric for comparison: McFadden's Pseudo R-Squared

In [2]:
def mcfadden_r2(y, y_pred):
    """Return McFadden's pseudo R-squared, ``1 - LL_model / LL_null``.

    Parameters
    ----------
    y : array-like
        True binary outcomes. Any array-like is accepted (list, numpy array,
        pandas Series).
    y_pred : array-like
        Predicted probabilities, passed straight to ``log_loss``.

    Returns
    -------
    float
        One minus the ratio of the model log loss to the log loss of a null
        model that predicts the observed mean of ``y`` for every sample.
    """
    ll = log_loss(y, y_pred)
    # np.mean (rather than y.mean()) so plain Python lists work as well.
    base_rate = np.mean(y)
    ll_null = log_loss(y, np.full(len(y), base_rate))
    return 1 - (ll/ll_null)
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in favour of
# `response_method='predict_proba'` - confirm against the installed version before upgrading.
pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)

In [3]:
# Metrics reported by cross_validate. The key spelling 'roc_aug' is kept as-is
# because the result look-ups further down read 'test_roc_aug'.
scoring = dict(roc_aug='roc_auc', mcfaddens_r2=pseudo_r2_scorer)

Load the data

In [4]:
# Paths are resolved relative to the directory the notebook is run from.
cwd = os.getcwd()
shots_path = os.path.join(cwd, 'data', 'shots.parquet')
df = pd.read_parquet(shots_path)

# Logistic regression

First drop the columns that carry no useful information, such as ids and names. Also drop the columns that are unsuitable for logistic regression: LR does not deal well with dependent features such as x and y, so we use the distance/angle features to capture the shot location instead. The same applies to features with missing data (the ones that come from StatsBomb only).
Split the data into penalty and non-penalty shots and subset the dataset for logistic regression.

In [5]:
# Columns removed before modelling (identifiers, names, raw coordinates and the
# other columns listed in the markdown above).
lr_excluded_cols = [
    'x', 'y', 'match_id', 'statsbomb_id', 'statsbomb_team_id', 'player_id_statsbomb',
    'competition_gender', 'team_name', 'player_id', 'firstName', 'middleName', 'lastName',
    'Name', 'dataset', 'wyscout_id', 'wyscout_team_id', 'team_id', 'player_id_wyscout',
    'competition_name', 'minute', 'shot_zone', 'match_week', 'pass_end_y',
    'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',
    'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'smart_pass', 'pass_end_x',
]
df = df.drop(columns=lr_excluded_cols)

# Penalties are modelled separately from all other shots.
mask_penalty = df.shot_type_name == 'penalty'

# Columns that are additionally removed from the penalty subset.
penalty_excluded_cols = [
    'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle',
    'log_distance_to_goal', 'assist_type', 'pass_switch', 'pass_cross', 'pass_cut_back',
    'counter_attack', 'pass_height_name', 'pass_technique_name', 'shot_zone_number',
    'shot_zone_player_number', 'fast_break', 'strong_foot', 'body_part_name', 'shot_type_name',
]
df_penalty = df.loc[mask_penalty].drop(columns=penalty_excluded_cols)
df_non_penalty = df.loc[~mask_penalty].copy()

# Feature matrices and targets ('goal') for both subsets.
X_penalty = df_penalty.drop(columns='goal')
y_penalty = df_penalty['goal']
X = df_non_penalty.drop(columns='goal')
y = df_non_penalty['goal']

Split into train and test datasets

In [6]:
# Same 80/20 stratified split (fixed seed) for the non-penalty and the penalty data.
split_kwargs = dict(train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_kwargs)
X_penalty_train, X_penalty_test, y_penalty_train, y_penalty_test = train_test_split(
    X_penalty, y_penalty, stratify=y_penalty, **split_kwargs)

Split the dataset for logistic regression into shots assisted by passes and shots with other assist types. They have different columns of information.

In [7]:
def split(X, y):
    """Separate pass-assisted shots from all other shots.

    Rows where ``X.assist_type == 'pass'`` form the pass subset, which is
    returned without the ``assist_type`` column. The remaining rows form the
    other subset, from which columns that are entirely missing are dropped.

    Returns
    -------
    tuple
        ``(X_pass, y_pass, X_other, y_other)``.
    """
    is_pass = X.assist_type == 'pass'
    is_other = ~is_pass
    X_pass = X.loc[is_pass].drop(columns='assist_type').copy()
    X_other = X.loc[is_other].dropna(axis=1, how='all').copy()
    return X_pass, y[is_pass], X_other, y[is_other]

In [8]:
X_train_pass, y_train_pass, X_train_other, y_train_other = split(X_train, y_train)
X_test_pass, y_test_pass, X_test_other, y_test_other = split(X_test, y_test)
# split() drops all-NaN columns separately for each frame, so the train and test
# "other" frames could end up with different columns. Align the test frame to the
# training columns so the fitted pipeline always sees the same features.
X_test_other = X_test_other.reindex(columns=X_train_other.columns)

Pipeline for cleaning pass assists

In [9]:
# Categorical feature -> ordered list of levels. With drop='first' the first
# level listed for each feature is the one left out of the encoding.
pass_categories = {
    'competition_type': ['League', 'Cup'],
    'competition_part': ['start', 'middle', 'final', 'group', 'knockout'],
    'H_A_column': ['Home Team', 'Away Team'],
    'match_moment': ['0-15', '15-30', '30-45', '45-60', '60-75', '75-90'],
    'shot_type_name': ['open_play', 'corner', 'throw_in', 'free_kick'],
    'body_part_name': ['Right Foot', 'Left Foot', 'Other'],
    'pass_technique_name': ['other', 'Outswinging', 'Through Ball', 'Inswinging', 'Straight'],
    'pass_height_name': ['Ground/ Low Pass', 'High Pass'],
}
cols = list(pass_categories)
cats = list(pass_categories.values())

# One-hot encode the categorical columns and pass every other column through unchanged.
pass_encoder = OneHotEncoder(drop='first', categories=cats)
pass_one_hot = ColumnTransformer([('encoder', pass_encoder, cols)], remainder='passthrough')

# Encode -> impute -> standardise -> logistic regression.
pipe_pass = Pipeline(steps=[
    ('one_hot', pass_one_hot),
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(random_state=42)),
])

Column names of transformed pass data

In [10]:
# Encoded columns come first (every level except the first of each feature),
# followed by the untouched columns in their original order.
original_cols_remain = [name for name in X_train_pass.columns if name not in cols]
new_cols_pass = [level for levels in cats for level in levels[1:]] + original_cols_remain

Pipeline for cleaning other assists

In [11]:
# Relabel 'direct' assists as 'recovery' so they are not encoded twice
# (they are also covered by shot_type_name == 'direct_set_piece').
for other_frame in (X_train_other, X_test_other):
    other_frame.loc[other_frame.assist_type == 'direct', 'assist_type'] = 'recovery'

# Categorical feature -> ordered list of levels. With drop='first' the first
# level listed for each feature is the one left out of the encoding.
other_categories = {
    'competition_type': ['League', 'Cup'],
    'competition_part': ['start', 'middle', 'final', 'group', 'knockout'],
    'H_A_column': ['Home Team', 'Away Team'],
    'match_moment': ['0-15', '15-30', '30-45', '45-60', '60-75', '75-90'],
    'shot_type_name': ['free_kick', 'direct_set_piece', 'corner', 'open_play', 'throw_in'],
    'body_part_name': ['Other', 'Right Foot', 'Left Foot'],
    'assist_type': ['rebound', 'recovery', 'clearance'],
}
cols = list(other_categories)
cats = list(other_categories.values())

# One-hot encode the categorical columns and pass every other column through unchanged.
other_encoder = OneHotEncoder(drop='first', categories=cats)
other_one_hot = ColumnTransformer([('encoder', other_encoder, cols)], remainder='passthrough')

# Encode -> impute -> standardise -> logistic regression.
pipe_other = Pipeline(steps=[
    ('one_hot', other_one_hot),
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(random_state=42)),
])

Column names of transformed other-assist data

In [12]:

# Encoded columns come first (every level except the first of each feature),
# followed by the untouched columns in their original order.
original_cols_remain = [name for name in X_train_other.columns if name not in cols]
new_cols_other = [level for levels in cats for level in levels[1:]] + original_cols_remain

Pipeline for cleaning penalties

In [13]:
# Categorical feature -> ordered list of levels. With drop='first' the first
# level listed for each feature is the one left out of the encoding.
penalty_categories = {
    'competition_type': ['League', 'Cup'],
    'competition_part': ['start', 'middle', 'final', 'group', 'knockout'],
    'H_A_column': ['Home Team', 'Away Team'],
    'match_moment': ['0-15', '15-30', '30-45', '45-60', '60-75', '75-90'],
}
cols = list(penalty_categories)
cats = list(penalty_categories.values())

# One-hot encode the categorical columns and pass every other column through unchanged.
penalty_encoder = OneHotEncoder(drop='first', categories=cats)
penalty_one_hot = ColumnTransformer([('encoder', penalty_encoder, cols)], remainder='passthrough')

# Encode -> impute -> standardise -> logistic regression.
pipe_penalty = Pipeline(steps=[
    ('one_hot', penalty_one_hot),
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(random_state=42)),
])

Column names of transformed penalties

In [14]:
# Encoded columns come first (every level except the first of each feature),
# followed by the untouched columns in their original order.
original_cols_remain = [name for name in X_penalty_train.columns if name not in cols]
new_cols_penalties = [level for levels in cats for level in levels[1:]] + original_cols_remain

Search parameters for gridsearchcv

In [15]:
# 100 log-spaced values of C between 10**-3 and 10**0.1 for the pipeline's 'lr' step.
param_grid = dict(lr__C=np.logspace(start=-3, stop=0.1, num=100))

Fit the inner grid search for shots assisted by passes

In [16]:
# Tune C for the pass pipeline by cross-validated log loss.
clf_pass = GridSearchCV(pipe_pass, param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_pass.fit(X_train_pass, y_train_pass)
# Keep a handle on the logistic regression step of the best pipeline.
best_pipe_pass = clf_pass.best_estimator_
lr = best_pipe_pass.named_steps['lr']

In [17]:
# Nested cross-validation: the grid search is re-fitted inside every outer fold.
# The outer loop runs serially because clf_pass already uses n_jobs=-1. Asking for
# all cores at both levels was followed by a TerminatedWorkerError (worker killed,
# possibly through excessive memory use).
nested_score_pass = cross_validate(clf_pass, X=X_train_pass, y=y_train_pass, scoring=scoring)
print('ROC AUC for shots assisted by passes:', nested_score_pass['test_roc_aug'].mean())
print("McFadden's Pseudo R-squared shots assisted by passes:", nested_score_pass['test_mcfaddens_r2'].mean())

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


Fit the inner grid search for shots assisted by something other than a pass

In [None]:
# Tune C for the other-assist pipeline by cross-validated log loss.
clf_other = GridSearchCV(estimator=pipe_other, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_other.fit(X_train_other, y_train_other)
# Nested cross-validation: the outer loop runs serially because the inner grid
# search already uses all cores. Requesting n_jobs=-1 at both levels multiplies the
# number of concurrent fits and can get workers killed.
nested_score_other = cross_validate(clf_other, X=X_train_other, y=y_train_other, scoring=scoring)
print('ROC AUC for other model:', nested_score_other['test_roc_aug'].mean())
print("McFadden's Pseudo R-squared for other model:", nested_score_other['test_mcfaddens_r2'].mean())

Fit the inner grid search for penalty shots

In [None]:
# Tune C for the penalty pipeline by cross-validated log loss.
clf_penalty = GridSearchCV(estimator=pipe_penalty, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_penalty.fit(X_penalty_train, y_penalty_train)
# Nested cross-validation: the outer loop runs serially because the inner grid
# search already uses all cores. Requesting n_jobs=-1 at both levels multiplies the
# number of concurrent fits and can get workers killed.
nested_score_penalty = cross_validate(clf_penalty, X=X_penalty_train, y=y_penalty_train, scoring=scoring)
print('ROC AUC for penalty model:', nested_score_penalty['test_roc_aug'].mean())
print("McFadden's Pseudo R-squared for penalty:", nested_score_penalty['test_mcfaddens_r2'].mean())

# Test

In [None]:
# Class probabilities on the held-out data from each sub-model.
y_pred_lr_pass = clf_pass.predict_proba(X_test_pass)
y_pred_lr_other = clf_other.predict_proba(X_test_other)

# Stack predictions and labels in the same order (pass first, then other).
y_pred_lr = np.concatenate((y_pred_lr_pass, y_pred_lr_other))
y_true_test = np.concatenate((y_test_pass, y_test_other))

# Calibration curve on the second probability column, using 10 bins.
calibration_lr = calibration_curve(y_true_test, y_pred_lr[:, 1], n_bins=10)
fraction_of_positives_lr, mean_predicted_value_lr = calibration_lr

Plot calibration curve on test data

In [None]:
# Two stacked panels: calibration curve on top, histogram of predictions below.
plt.style.use('dark_background')
fig = plt.figure(constrained_layout=True, figsize=(10, 15))
gs = fig.add_gridspec(ncols=1, nrows=2, height_ratios=(2/3, 1/3))
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Top panel: observed fraction of goals against mean predicted probability per bin.
ax1.plot(mean_predicted_value_lr, fraction_of_positives_lr, "-o", color='#dbdf4a', label='Logistic regression')
ax1.plot([0, 1], [0, 1], "--", color='#e7aeca', label="Perfectly calibrated")
ax1.set_title('Calibration curve', fontsize=20, pad=10)
ax1.set_xlabel('Mean predicted value', fontsize=15)
ax1.set_ylabel('Fraction of positives', fontsize=15)

# Bottom panel: distribution of the predicted probabilities.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm the pinned version.
sns.distplot(y_pred_lr[:, 1], color='#4fe4e4', label='Logistic regression', kde=False, ax=ax2)
ax2.set_title('Distribution of predictions', fontsize=20, pad=10)
ax2.set_xlabel('Predicted value', fontsize=15)
ax2.set_ylabel('Count', fontsize=15)

# Shared legend and tick formatting for both panels.
for panel in (ax1, ax2):
    panel.legend(fontsize=15)
    panel.tick_params(labelsize=15)

fig.savefig(os.path.join(cwd, 'figures', '22_calibration_curve.png'), bbox_inches = 'tight', pad_inches = 0.2)

In [None]:
# Test-set metrics for the combined pass + other predictions.
# The messages say "Grid Search" because the models were tuned with GridSearchCV.
print("The log loss of the model with Grid Search is: " + str(log_loss(y_true_test, y_pred_lr)))
print("The ROC AUC score of the model with Grid Search is: " +str(roc_auc_score(y_true_test, y_pred_lr[:,1])))
print('Pseudo R-squared, logistic regression:', mcfadden_r2(y_true_test, y_pred_lr[:,1]))
print('Brier score, logistic regression:',brier_score_loss(y_true_test, y_pred_lr[:,1], pos_label=y_true_test.max()))

In [None]:
# Class probabilities for the held-out penalties and their calibration curve (10 bins).
y_true_test_p = y_penalty_test
y_pred_lr_p = clf_penalty.predict_proba(X_penalty_test)
calibration_lr_p = calibration_curve(y_true_test_p, y_pred_lr_p[:, 1], n_bins=10)
fraction_of_positives_lr_p, mean_predicted_value_lr_p = calibration_lr_p

In [None]:
# Penalty model: calibration curve on top, histogram of predictions below.
fig = plt.figure(constrained_layout=True, figsize=(10, 15))
gs = fig.add_gridspec(ncols=1, nrows=2, height_ratios=(2/3, 1/3))
ax1 = fig.add_subplot(gs[0])
ax1.plot(mean_predicted_value_lr_p, fraction_of_positives_lr_p, "-o", color='#dbdf4a', label='Logistic regression')
ax1.plot([0, 1], [0, 1], "--", color='#e7aeca', label="Perfectly calibrated")
ax1.set_xlabel('Mean predicted value', fontsize=15)
ax1.set_ylabel('Fraction of positives', fontsize=15)
ax1.set_title('Calibration curve', fontsize=20, pad=10)
ax1.legend(fontsize=15)
ax1.tick_params(labelsize=15)
ax2 = fig.add_subplot(gs[1])
sns.distplot(y_pred_lr_p[:, 1], color='#4fe4e4', label='Logistic regression', kde=False, ax=ax2)
ax2.set_xlabel('Predicted value', fontsize=15)
ax2.set_ylabel('Count', fontsize=15)
ax2.tick_params(labelsize=15)
ax2.legend(fontsize=15)
ax2.set_title('Distribution of predictions', fontsize=20, pad=10);
# Saved under its own file name so it does not overwrite the calibration figure of
# the non-penalty model, which is written to '22_calibration_curve.png'.
fig.savefig(os.path.join(cwd, 'figures', '22_calibration_curve_penalty.png'), bbox_inches = 'tight', pad_inches = 0.2)

In [None]:
# Test-set metrics for the penalty model.
# The messages say "Grid Search" because the model was tuned with GridSearchCV.
print("The log loss of the model with Grid Search is: " + str(log_loss(y_true_test_p, y_pred_lr_p)))
print("The ROC AUC score of the model with Grid Search is: " +str(roc_auc_score(y_true_test_p, y_pred_lr_p[:,1])))
print('Pseudo R-squared, logistic regression:', mcfadden_r2(y_true_test_p, y_pred_lr_p[:,1]))
print('Brier score, logistic regression:',brier_score_loss(y_true_test_p, y_pred_lr_p[:,1], pos_label=y_true_test_p.max()))

# Save models

In [None]:
# Persist the best pass pipeline found by the grid search.
lr_pass_model_path = os.path.join(cwd, 'models', 'lr_pass.joblib')
dump(clf_pass.best_estimator_, lr_pass_model_path)

In [None]:
# Persist the best other-assist pipeline found by the grid search.
lr_other_model_path = os.path.join(cwd, 'models', 'lr_other.joblib')
dump(clf_other.best_estimator_, lr_other_model_path)

In [None]:
# Persist the best penalty pipeline found by the grid search.
lr_penalty_model_path = os.path.join(cwd, 'models', 'lr_penalty.joblib')
dump(clf_penalty.best_estimator_, lr_penalty_model_path)

# Save data

Reload shot dataset for ids

In [None]:
# Re-read the shots file and keep only the id columns, which are joined back
# onto the modelling frames by index below.
id_columns = ['match_id', 'wyscout_id', 'statsbomb_id']
shots_all = pd.read_parquet(os.path.join(cwd, 'data', 'shots.parquet'))
df = shots_all[id_columns].copy()

In [None]:
# Attach the target and a train/test marker to each frame (in place), stack them,
# join the ids on the original row index and write the result to parquet.
for features, target, split_name in ((X_train_other, y_train_other, 'train'),
                                     (X_test_other, y_test_other, 'test')):
    features['goal'] = target
    features['split'] = split_name
df_other = pd.concat([X_train_other, X_test_other])
df_other = df_other.merge(df, how='left', left_index=True, right_index=True, validate='1:1')
df_other = df_other.reset_index(drop=True)
lr_other_data_path = os.path.join(cwd, 'data', 'modelling', 'lr_other.parquet')
df_other.to_parquet(lr_other_data_path)

In [None]:
# Attach the target and a train/test marker to each frame (in place), stack them,
# join the ids on the original row index and write the result to parquet.
for features, target, split_name in ((X_train_pass, y_train_pass, 'train'),
                                     (X_test_pass, y_test_pass, 'test')):
    features['goal'] = target
    features['split'] = split_name
df_pass = pd.concat([X_train_pass, X_test_pass])
df_pass = df_pass.merge(df, how='left', left_index=True, right_index=True, validate='1:1')
df_pass = df_pass.reset_index(drop=True)
lr_pass_data_path = os.path.join(cwd, 'data', 'modelling', 'lr_pass.parquet')
df_pass.to_parquet(lr_pass_data_path)

In [None]:
# Attach the target and a train/test marker to each frame (in place), stack them,
# join the ids on the original row index and write the result to parquet.
for features, target, split_name in ((X_penalty_train, y_penalty_train, 'train'),
                                     (X_penalty_test, y_penalty_test, 'test')):
    features['goal'] = target
    features['split'] = split_name
df_penalty = pd.concat([X_penalty_train, X_penalty_test])
df_penalty = df_penalty.merge(df, how='left', left_index=True, right_index=True, validate='1:1')
df_penalty = df_penalty.reset_index(drop=True)
lr_penalty_data_path = os.path.join(cwd, 'data', 'modelling', 'lr_penalty.parquet')
df_penalty.to_parquet(lr_penalty_data_path)