In [3]:
import pandas as pd
import sqlite3
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import csv

# Import XGBoost classifier
from xgboost import XGBClassifier

# Import scikit-learn functions
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Import scikit-plot functions
from scikitplot.metrics import plot_roc_curve
from scikitplot.metrics import plot_precision_recall_curve
from scikitplot.metrics import plot_calibration_curve

# Import SciPy function
from scipy.spatial import distance

pd.set_option("display.max_columns", None)




In [4]:
# Data Directory
data_dir = "../../Data/Big-Data-Cup-2021"
bucketless_data_dir = '../../Data/bdc/data'
tracking_data_dir = "{}/TrackingData".format(data_dir)
db_path = '/Users/keltim01/Documents/databases/'

#connect to database 
conn = sqlite3.connect(db_path + 'bdc_2022.db')
cursor = conn.cursor()

In [5]:
df_train = pd.read_sql(
    '''
    SELECT * FROM pbp_training 
    WHERE (is_shot = 1 OR is_goal = 1)
    '''
,conn)

In [6]:
df_test = pd.read_sql(
    '''
    SELECT * FROM play_by_play
    WHERE is_shot = 1
    '''
,conn)

## Choosing Features of the Expected Goals Model
### Source:
source/inspiration: https://hockey-graphs.com/2019/08/12/expected-goals-model-with-pre-shot-movement-part-1-the-model/

- Unblocked Shots (Fenwick Shots)


### Features:
- shot type, 
- shot location 
- distance from the net
- angle from the net
- type event prior 
- location event prior
- time 
- distance from last event
- time since last event
- angle change between the shot and its prior event
- score state
- strength state

- Dataset mostly contains neutral site games so the home and away should not be as important

In [7]:
df_test.columns

Index(['index', 'game_date', 'season_year', 'team_name', 'opp_team_name',
       'venue', 'period', 'clock_seconds', 'situation_type', 'goals_for',
       'goals_against', 'player_name', 'event', 'event_successful', 'x_coord',
       'y_coord', 'event_type', 'player_name_2', 'x_coord_2', 'y_coord_2',
       'event_detail_1', 'event_detail_2', 'event_detail_3', 'frame_id_1',
       'frame_id_2', 'home_team', 'away_team', 'game_id', 'is_shot', 'is_goal',
       'event_id', 'team_id', 'player_id', 'detail_1_code', 'goal_diff',
       'game_seconds_remaining', 'event_code', 'event_type_code',
       'skaters_for', 'skaters_against', 'strength_state', 'distance_to_goal',
       'angle_to_goal', 'prev_event', 'prev_event_code', 'prev_event_type',
       'prev_event_type_code', 'prev_event_x_coord', 'prev_event_y_coord',
       'prev_event_game_seconds_remaining', 'prev_event_distance_to_goal',
       'prev_event_angle_to_goal', 'time_diff_last_event',
       'angle_diff_last_event', 'distanc

In [8]:
df_train.columns

Index(['index', 'game_date', 'home_team', 'away_team', 'period', 'clock',
       'home_team_skaters', 'away_team_skaters', 'home_team_goals',
       'away_team_goals', 'team', 'player', 'event', 'x_coord', 'y_coord',
       'detail_1', 'detail_2', 'detail_3', 'detail_4', 'player_2',
       'x_coordinate_2', 'y_coordinate_2', 'game_id', 'event_id', 'team_id',
       'player_id', 'is_shot', 'is_goal', 'detail_1_code', 'detail_2_code',
       'detail_3_code', 'detail_4_code', 'is_home', 'goal_diff',
       'game_seconds_remaining', 'event_code', 'strength_state',
       'distance_to_goal', 'angle_to_goal', 'prev_event', 'prev_event_code',
       'prev_event_type', 'prev_event_type_code', 'prev_event_x_coord',
       'prev_event_y_coord', 'prev_event_game_seconds_remaining',
       'prev_event_distance_to_goal', 'prev_event_angle_to_goal',
       'time_diff_last_event', 'angle_diff_last_event',
       'distance_diff_last_event', 'second_prev_event',
       'second_prev_event_code', 'second

In [9]:
df_train['detail_1'].unique(), df_train['detail_2'].unique()

(array(['Snapshot', 'Fan', 'Slapshot', 'Wristshot', 'Deflection',
        'Wrap Around'], dtype=object),
 array(['On Net', 'Missed', 'Blocked'], dtype=object))

In [10]:
df_test['event_type'].unique(), df_test['event_detail_1'].unique()

(array(['Slapshot', 'Snapshot', 'Fan', 'Wristshot', 'Deflection',
        'Wrap Around'], dtype=object),
 array(['Blocked', 'Missed', 'On Net'], dtype=object))

In [11]:
df_test.head()

Unnamed: 0,index,game_date,season_year,team_name,opp_team_name,venue,period,clock_seconds,situation_type,goals_for,goals_against,player_name,event,event_successful,x_coord,y_coord,event_type,player_name_2,x_coord_2,y_coord_2,event_detail_1,event_detail_2,event_detail_3,frame_id_1,frame_id_2,home_team,away_team,game_id,is_shot,is_goal,event_id,team_id,player_id,detail_1_code,goal_diff,game_seconds_remaining,event_code,event_type_code,skaters_for,skaters_against,strength_state,distance_to_goal,angle_to_goal,prev_event,prev_event_code,prev_event_type,prev_event_type_code,prev_event_x_coord,prev_event_y_coord,prev_event_game_seconds_remaining,prev_event_distance_to_goal,prev_event_angle_to_goal,time_diff_last_event,angle_diff_last_event,distance_diff_last_event,second_prev_event,second_prev_event_code,second_prev_event_type,second_prev_event_type_code,second_prev_event_x_coord,second_prev_event_y_coord,second_prev_event_game_seconds_remaining,second_prev_event_distance_to_goal,second_prev_event_angle_to_goal,prev_time_diff_last_event,prev_angle_diff_last_event,prev_distance_diff_last_event
0,12,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1174,5 on 5,0,0,Jocelyne Larocque,Shot,0,149,13,Slapshot,,,,Blocked,1,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,1,0,5,0,42,0,0,3574.0,5,19,5,5,0,50.5099,1.389831,Play,3.0,Direct,6.0,153.0,76.0,3576.0,49.912423,1.104478,-2.0,0.285353,0.597477,Puck Recovery,4.0,,-1.0,170.0,79.0,3577.0,41.620308,0.547945,-1.0,0.556532,8.292116
1,18,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1165,5 on 5,0,0,Sarah Fillier,Shot,0,148,4,Snapshot,,,,Missed,1,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,1,0,5,0,87,1,0,3565.0,5,21,5,5,0,56.975872,1.090909,Puck Recovery,4.0,,-1.0,146.0,8.0,3565.0,55.912879,1.275362,0.0,-0.184453,1.062993,Play,3.0,Direct,6.0,130.0,67.0,3567.0,64.809336,2.44898,-2.0,-1.173617,-8.896457
2,58,8/2/2022,2021,Olympic (Women) - United States,Olympic (Women) - Canada,home,1,1095,5 on 5,0,0,Abby Roque,Shot,0,23,57,Snapshot,,,,On Net,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,1,0,5,4,1,2,0,3495.0,5,21,5,5,0,167.628309,11.517241,Puck Recovery,4.0,,-1.0,50.0,69.0,3499.0,142.485964,5.283019,-4.0,6.234223,25.142345,Puck Recovery,4.0,,-1.0,10.0,9.0,3502.0,183.090824,5.373134,-3.0,-0.090115,-40.60486
3,68,8/2/2022,2021,Olympic (Women) - United States,Olympic (Women) - Canada,home,1,1083,5 on 5,0,0,Kelly Pannek,Shot,0,37,32,Snapshot,,,,On Net,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,1,0,5,4,46,2,0,3483.0,5,21,5,5,0,153.359871,14.571429,Puck Recovery,4.0,,-1.0,37.0,32.0,3483.0,153.359871,14.571429,0.0,0.0,0.0,Play,3.0,Direct,6.0,66.0,3.0,3485.0,130.139348,3.139241,-2.0,11.432188,23.220522
4,82,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1061,5 on 5,0,0,Sarah Nurse,Shot,0,154,2,Snapshot,,,,Missed,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,1,0,5,0,89,1,0,3461.0,5,21,5,5,0,54.187176,0.888889,Puck Recovery,4.0,,-1.0,151.0,5.0,3462.0,54.104066,1.04,-1.0,-0.151111,0.083109,Dump In/Out,0.0,,-1.0,165.0,82.0,3464.0,46.746658,0.632911,-2.0,0.407089,7.357409


In [12]:
df_test_model = pd.read_sql(
    '''
    SELECT player_name, is_goal, event_type_code, x_coord, y_coord, distance_to_goal, angle_to_goal, prev_event_code, prev_event_x_coord, prev_event_y_coord, second_prev_event_code, second_prev_event_x_coord, second_prev_event_y_coord, game_seconds_remaining, time_diff_last_event, angle_diff_last_event, distance_diff_last_event, prev_time_diff_last_event, prev_angle_diff_last_event, prev_distance_diff_last_event, goal_diff, strength_state
    FROM play_by_play
    WHERE is_shot = 1
    '''
,conn)

In [13]:
df_test['event_detail_1'].unique()

array(['Blocked', 'Missed', 'On Net'], dtype=object)

In [14]:
df_test_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709 entries, 0 to 708
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   player_name                    709 non-null    object 
 1   is_goal                        709 non-null    int64  
 2   event_type_code                709 non-null    int64  
 3   x_coord                        709 non-null    int64  
 4   y_coord                        709 non-null    int64  
 5   distance_to_goal               709 non-null    float64
 6   angle_to_goal                  709 non-null    float64
 7   prev_event_code                709 non-null    float64
 8   prev_event_x_coord             709 non-null    float64
 9   prev_event_y_coord             709 non-null    float64
 10  second_prev_event_code         709 non-null    float64
 11  second_prev_event_x_coord      709 non-null    float64
 12  second_prev_event_y_coord      709 non-null    flo

In [15]:
df_test_model.shape

(709, 22)

In [16]:
df_train_model = pd.read_sql(
    '''
    SELECT is_goal, detail_1_code event_type_code, x_coord, y_coord, distance_to_goal, angle_to_goal, prev_event_code, prev_event_x_coord, prev_event_y_coord, second_prev_event_code, second_prev_event_x_coord, second_prev_event_y_coord, game_seconds_remaining, time_diff_last_event, angle_diff_last_event, distance_diff_last_event, prev_time_diff_last_event, prev_angle_diff_last_event, prev_distance_diff_last_event, goal_diff, strength_state
    FROM pbp_training 
    WHERE (is_shot = 1 OR is_goal = 1)
    '''
,conn)

In [17]:
df_train_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3656 entries, 0 to 3655
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   is_goal                        3656 non-null   int64  
 1   event_type_code                3656 non-null   int64  
 2   x_coord                        3656 non-null   int64  
 3   y_coord                        3656 non-null   int64  
 4   distance_to_goal               3656 non-null   float64
 5   angle_to_goal                  3656 non-null   float64
 6   prev_event_code                3656 non-null   float64
 7   prev_event_x_coord             3656 non-null   float64
 8   prev_event_y_coord             3656 non-null   float64
 9   second_prev_event_code         3656 non-null   float64
 10  second_prev_event_x_coord      3656 non-null   float64
 11  second_prev_event_y_coord      3656 non-null   float64
 12  game_seconds_remaining         3656 non-null   i

In [18]:
df_train_model.shape, df_test_model.shape

((3656, 21), (709, 22))

In [19]:
df_test_model['event_type_code'].unique()

array([19, 21,  8, 26,  5, 25])

In [20]:
features = ['event_type_code', 'x_coord', 'y_coord', 'distance_to_goal', 'angle_to_goal', 'prev_event_code', 'prev_event_x_coord', 'prev_event_y_coord', 'second_prev_event_code', 'second_prev_event_x_coord', 'second_prev_event_y_coord', 'game_seconds_remaining', 'time_diff_last_event', 'angle_diff_last_event', 'distance_diff_last_event','prev_time_diff_last_event', 'prev_angle_diff_last_event', 'prev_distance_diff_last_event', 'goal_diff', 'strength_state']
label = ['is_goal']
X_train = df_train_model[features]
y_train = df_train_model[label]
X_test = df_test_model[features]
y_test = df_test_model[label]

categorical_features_indices = np.where(X_train.dtypes != np.float)[0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]


In [21]:
parameters = {
    'nthread': [4],
    'objective': ['binary:logistic'],
    'max_depth': [3,4,5,6],
    'learning_rate': [0.01],
    'n_estimators': [100, 500, 1000],
    'seed': [42]
}

In [22]:
df_xg_model = pd.DataFrame()
auc_roc = []
kf = KFold(10, shuffle=True)
vals_y_test = pd.DataFrame()
vals_y_pred = np.ndarray(shape=(0,))

for train_idx, text_idx in kf.split(df_test_model):
    df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True) 
    df_test_data = df_test_model.iloc[text_idx].copy()
    X_train = df_train_data[features]
    y_train = df_train_data[label]
    X_test = df_test_data[features]
    y_test = df_test_data[label]

    classifier = XGBClassifier()
    classifier = GridSearchCV(classifier, parameters, cv=5, scoring='roc_auc', verbose=2)
    classifier.fit(X_train, y_train)
    dfs_predictions = {}
    y_pred = classifier.predict_proba(X_test)
    dfs_predictions[label[0]] = pd.Series(y_pred[:, 1],index=df_test_data.index)
    df_predictions = pd.concat(dfs_predictions, axis=1)
    df_xg_model = df_xg_model.append(df_predictions)

    vals_y_test = vals_y_test.append(y_test) 
    vals_y_pred = np.concatenate([vals_y_pred,y_pred[:, 1]], axis=0)
    auc_roc.append(roc_auc_score(y_test, y_pred[:, 1]))

  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.9s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)
  df_train_data = df_train_model.append(df_test_model.iloc[train_idx].copy(), ignore_index=True)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

  df_xg_model = df_xg_model.append(df_predictions)
  vals_y_test = vals_y_test.append(y_test)


In [23]:
new_auc_roc = roc_auc_score(vals_y_test, vals_y_pred)
new_auc_roc

0.7702957094674091

In [24]:
df_test_model['expected_goals'] = df_xg_model['is_goal']

In [25]:
df_test_model.columns

Index(['player_name', 'is_goal', 'event_type_code', 'x_coord', 'y_coord',
       'distance_to_goal', 'angle_to_goal', 'prev_event_code',
       'prev_event_x_coord', 'prev_event_y_coord', 'second_prev_event_code',
       'second_prev_event_x_coord', 'second_prev_event_y_coord',
       'game_seconds_remaining', 'time_diff_last_event',
       'angle_diff_last_event', 'distance_diff_last_event',
       'prev_time_diff_last_event', 'prev_angle_diff_last_event',
       'prev_distance_diff_last_event', 'goal_diff', 'strength_state',
       'expected_goals'],
      dtype='object')

In [26]:
df_test_model['is_goal'].sum(), df_test_model['expected_goals'].sum()

(38, 32.24613)

In [27]:
df_xg_model

Unnamed: 0,is_goal
2,0.015879
3,0.019445
14,0.013177
25,0.251420
29,0.046159
...,...
626,0.009907
641,0.014726
673,0.019312
688,0.040376


In [28]:
df_shots = pd.read_sql(
    '''
    SELECT * FROM play_by_play
    WHERE is_shot = 1
    '''
,conn)

In [29]:
df_shots['expected_goals'] = df_xg_model['is_goal']

In [30]:
df_shots.columns

Index(['index', 'game_date', 'season_year', 'team_name', 'opp_team_name',
       'venue', 'period', 'clock_seconds', 'situation_type', 'goals_for',
       'goals_against', 'player_name', 'event', 'event_successful', 'x_coord',
       'y_coord', 'event_type', 'player_name_2', 'x_coord_2', 'y_coord_2',
       'event_detail_1', 'event_detail_2', 'event_detail_3', 'frame_id_1',
       'frame_id_2', 'home_team', 'away_team', 'game_id', 'is_shot', 'is_goal',
       'event_id', 'team_id', 'player_id', 'detail_1_code', 'goal_diff',
       'game_seconds_remaining', 'event_code', 'event_type_code',
       'skaters_for', 'skaters_against', 'strength_state', 'distance_to_goal',
       'angle_to_goal', 'prev_event', 'prev_event_code', 'prev_event_type',
       'prev_event_type_code', 'prev_event_x_coord', 'prev_event_y_coord',
       'prev_event_game_seconds_remaining', 'prev_event_distance_to_goal',
       'prev_event_angle_to_goal', 'time_diff_last_event',
       'angle_diff_last_event', 'distanc

In [31]:
df_shots.to_sql('pbp_shots', conn, if_exists='replace')

709

In [32]:
df_shots['expected_goals'].describe()

count    709.000000
mean       0.045481
std        0.055233
min        0.001793
25%        0.013341
50%        0.023971
75%        0.054055
max        0.456082
Name: expected_goals, dtype: float64

In [33]:
df_shots.loc[df_shots['player_name'] == 'Michelle Karvinen',['expected_goals', 'is_goal','player_name']].sort_values(by='expected_goals', ascending=False)

Unnamed: 0,expected_goals,is_goal,player_name
161,0.456082,1,Michelle Karvinen
126,0.212136,0,Michelle Karvinen
513,0.136488,0,Michelle Karvinen
212,0.105233,0,Michelle Karvinen
140,0.097461,0,Michelle Karvinen
625,0.087007,0,Michelle Karvinen
708,0.078098,1,Michelle Karvinen
576,0.075666,0,Michelle Karvinen
514,0.06507,0,Michelle Karvinen
177,0.064705,0,Michelle Karvinen


In [34]:
df_shots.loc[(df_shots['player_name'] == 'Alina Muller') & (df_shots['strength_state'] > 0)]

Unnamed: 0,index,game_date,season_year,team_name,opp_team_name,venue,period,clock_seconds,situation_type,goals_for,goals_against,player_name,event,event_successful,x_coord,y_coord,event_type,player_name_2,x_coord_2,y_coord_2,event_detail_1,event_detail_2,event_detail_3,frame_id_1,frame_id_2,home_team,away_team,game_id,is_shot,is_goal,event_id,team_id,player_id,detail_1_code,goal_diff,game_seconds_remaining,event_code,event_type_code,skaters_for,skaters_against,strength_state,distance_to_goal,angle_to_goal,prev_event,prev_event_code,prev_event_type,prev_event_type_code,prev_event_x_coord,prev_event_y_coord,prev_event_game_seconds_remaining,prev_event_distance_to_goal,prev_event_angle_to_goal,time_diff_last_event,angle_diff_last_event,distance_diff_last_event,second_prev_event,second_prev_event_code,second_prev_event_type,second_prev_event_type_code,second_prev_event_x_coord,second_prev_event_y_coord,second_prev_event_game_seconds_remaining,second_prev_event_distance_to_goal,second_prev_event_angle_to_goal,prev_time_diff_last_event,prev_angle_diff_last_event,prev_distance_diff_last_event,expected_goals
321,4996,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,737,5 on 4,2,1,Alina Muller,Shot,0,59,26,Wristshot,,,,Blocked,1,0,219.0,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,0,1,737.0,5,26,5,4,1,132.035033,7.939394,Play,3.0,Direct,6.0,68.0,42.0,739.0,122.001025,244.0,-2.0,-236.060606,10.034009,Play,3.0,Direct,6.0,50.0,5.0,741.0,144.93533,3.733333,-2.0,240.266667,-22.934306,0.089728
327,5049,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,642,5 on 4,2,1,Alina Muller,Shot,0,40,27,Slapshot,,,,On Net,0,0,4887.0,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,2,1,642.0,5,19,5,4,1,150.798707,9.677419,Play,3.0,Direct,6.0,38.0,63.0,643.0,153.376172,7.414634,-1.0,2.262785,-2.577465,Play,3.0,Direct,6.0,66.0,38.0,645.0,124.081626,27.555556,-2.0,-20.140921,29.294545,0.020704
687,10165,16/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Finland,away,3,960,5 on 4,0,2,Alina Muller,Shot,0,51,20,Wristshot,,,,Blocked,1,0,3642.0,,Olympic (Women) - Finland,Olympic (Women) - Switzerland,3,1,0,5,3,6,0,-2,960.0,5,26,5,4,1,140.809268,6.177778,Zone Entry,7.0,Carried,2.0,74.0,16.0,961.0,118.988445,4.377358,-1.0,1.800419,21.820823,Play,3.0,Direct,6.0,168.0,81.0,967.0,44.342418,0.571429,-6.0,3.80593,74.646027,0.01835


In [35]:
df_test_model.loc[(df_test_model['player_name'] == 'Alina Muller')]

Unnamed: 0,player_name,is_goal,event_type_code,x_coord,y_coord,distance_to_goal,angle_to_goal,prev_event_code,prev_event_x_coord,prev_event_y_coord,second_prev_event_code,second_prev_event_x_coord,second_prev_event_y_coord,game_seconds_remaining,time_diff_last_event,angle_diff_last_event,distance_diff_last_event,prev_time_diff_last_event,prev_angle_diff_last_event,prev_distance_diff_last_event,goal_diff,strength_state,expected_goals
268,Alina Muller,0,26,179,50,13.313527,1.466667,4.0,179.0,50.0,5.0,163.0,67.0,2382.0,0.0,0.0,0.0,-4.0,0.364626,-23.145354,0,0,0.218222
286,Alina Muller,0,26,172,50,19.5,2.4,3.0,177.0,3.0,4.0,193.0,13.0,1897.0,-2.0,2.070886,-22.084252,-1.0,0.430809,11.932102,0,0,0.070378
293,Alina Muller,0,21,164,46,26.234519,7.428571,3.0,193.0,32.0,4.0,195.0,64.0,1647.0,-1.0,7.714286,15.314354,-5.0,-0.053156,-11.153575,0,0,0.205424
298,Alina Muller,0,21,146,4,58.465802,1.142857,4.0,147.0,2.0,3.0,192.0,31.0,1406.0,0.0,0.081129,-0.604072,-3.0,1.235641,47.397256,1,0,0.001848
304,Alina Muller,0,21,20,50,170.165361,22.666667,4.0,20.0,50.0,5.0,35.0,7.0,1188.0,0.0,0.0,0.0,-2.0,18.300469,11.151997,0,0,0.037961
321,Alina Muller,0,26,59,26,132.035033,7.939394,3.0,68.0,42.0,3.0,50.0,5.0,737.0,-2.0,-236.060606,10.034009,-2.0,240.266667,-22.934306,1,1,0.089728
327,Alina Muller,0,19,40,27,150.798707,9.677419,3.0,38.0,63.0,3.0,66.0,38.0,642.0,-1.0,2.262785,-2.577465,-2.0,-20.140921,29.294545,1,1,0.020704
338,Alina Muller,0,21,72,31,118.559057,10.26087,4.0,20.0,20.0,3.0,8.0,32.0,211.0,-5.0,2.705314,-52.923449,-3.0,-9.777778,-10.820127,1,0,0.017552
343,Alina Muller,1,26,14,38,176.057519,39.111111,3.0,29.0,63.0,3.0,65.0,62.0,158.0,0.0,31.257453,13.757642,-2.0,1.443402,35.78802,0,0,0.061524
352,Alina Muller,1,21,165,21,32.973474,1.162791,4.0,193.0,58.0,5.0,192.0,14.0,5.0,-3.0,1.356339,17.185821,-1.0,-0.123373,-12.782436,1,-2,0.00887


In [36]:
df_test.loc[(df_test['player_name'] == 'Alina Muller')]

Unnamed: 0,index,game_date,season_year,team_name,opp_team_name,venue,period,clock_seconds,situation_type,goals_for,goals_against,player_name,event,event_successful,x_coord,y_coord,event_type,player_name_2,x_coord_2,y_coord_2,event_detail_1,event_detail_2,event_detail_3,frame_id_1,frame_id_2,home_team,away_team,game_id,is_shot,is_goal,event_id,team_id,player_id,detail_1_code,goal_diff,game_seconds_remaining,event_code,event_type_code,skaters_for,skaters_against,strength_state,distance_to_goal,angle_to_goal,prev_event,prev_event_code,prev_event_type,prev_event_type_code,prev_event_x_coord,prev_event_y_coord,prev_event_game_seconds_remaining,prev_event_distance_to_goal,prev_event_angle_to_goal,time_diff_last_event,angle_diff_last_event,distance_diff_last_event,second_prev_event,second_prev_event_code,second_prev_event_type,second_prev_event_type_code,second_prev_event_x_coord,second_prev_event_y_coord,second_prev_event_game_seconds_remaining,second_prev_event_distance_to_goal,second_prev_event_angle_to_goal,prev_time_diff_last_event,prev_angle_diff_last_event,prev_distance_diff_last_event
268,4183,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,2,1182,5 on 5,0,0,Alina Muller,Shot,0,179,50,Wristshot,,,,Missed,0,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,1,0,2382.0,5,26,5,5,0,13.313527,1.466667,Puck Recovery,4.0,,-1.0,179.0,50.0,2382.0,13.313527,1.466667,0.0,0.0,0.0,Shot,5.0,Wristshot,26.0,163.0,67.0,2386.0,36.458881,1.102041,-4.0,0.364626,-23.145354
286,4428,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,2,697,5 on 5,0,0,Alina Muller,Shot,0,172,50,Wristshot,,,,On Net,0,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,2,0,1897.0,5,26,5,5,0,19.5,2.4,Play,3.0,Direct,6.0,177.0,3.0,1899.0,41.584252,0.329114,-2.0,2.070886,-22.084252,Puck Recovery,4.0,,-1.0,193.0,13.0,1900.0,29.65215,-0.101695,-1.0,0.430809,11.932102
293,4551,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,2,447,5 on 5,0,0,Alina Muller,Shot,0,164,46,Snapshot,,,,Missed,1,1,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,1,0,1647.0,5,21,5,5,0,26.234519,7.428571,Play,3.0,Direct,6.0,193.0,32.0,1648.0,10.920165,-0.285714,-1.0,7.714286,15.314354,Puck Recovery,4.0,,-1.0,195.0,64.0,1653.0,22.07374,-0.232558,-5.0,-0.053156,-11.153575
298,4666,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,2,206,5 on 5,1,0,Alina Muller,Shot,0,146,4,Snapshot,,,,Missed,1,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,1,1,1406.0,5,21,5,5,0,58.465802,1.142857,Puck Recovery,4.0,,-1.0,147.0,2.0,1406.0,59.069874,1.061728,0.0,0.081129,-0.604072,Play,3.0,Direct,6.0,192.0,31.0,1409.0,11.672618,-0.173913,-3.0,1.235641,47.397256
304,4773,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,1188,5 on 5,1,1,Alina Muller,Shot,0,20,50,Snapshot,,,,On Net,0,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,2,0,1188.0,5,21,5,5,0,170.165361,22.666667,Puck Recovery,4.0,,-1.0,20.0,50.0,1188.0,170.165361,22.666667,0.0,0.0,0.0,Shot,5.0,Wristshot,26.0,35.0,7.0,1190.0,159.013364,4.366197,-2.0,18.300469,11.151997
321,4996,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,737,5 on 4,2,1,Alina Muller,Shot,0,59,26,Wristshot,,,,Blocked,1,0,219.0,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,0,1,737.0,5,26,5,4,1,132.035033,7.939394,Play,3.0,Direct,6.0,68.0,42.0,739.0,122.001025,244.0,-2.0,-236.060606,10.034009,Play,3.0,Direct,6.0,50.0,5.0,741.0,144.93533,3.733333,-2.0,240.266667,-22.934306
327,5049,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,642,5 on 4,2,1,Alina Muller,Shot,0,40,27,Slapshot,,,,On Net,0,0,4887.0,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,2,1,642.0,5,19,5,4,1,150.798707,9.677419,Play,3.0,Direct,6.0,38.0,63.0,643.0,153.376172,7.414634,-1.0,2.262785,-2.577465,Play,3.0,Direct,6.0,66.0,38.0,645.0,124.081626,27.555556,-2.0,-20.140921,29.294545
338,5216,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,211,5 on 5,2,1,Alina Muller,Shot,0,72,31,Snapshot,,,,Missed,1,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,0,5,3,6,1,1,211.0,5,21,5,5,0,118.559057,10.26087,Puck Recovery,4.0,,-1.0,20.0,20.0,216.0,171.482506,7.555556,-5.0,2.705314,-52.923449,Play,3.0,Direct,6.0,8.0,32.0,219.0,182.302633,17.333333,-3.0,-9.777778,-10.820127
343,5245,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,158,5 on 5,2,2,Alina Muller,Shot,1,14,38,Wristshot,,,,On Net,0,1,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,1,5,3,6,2,0,158.0,5,26,5,5,0,176.057519,39.111111,Play,3.0,Direct,6.0,29.0,63.0,158.0,162.299877,7.853659,0.0,31.257453,13.757642,Play,3.0,Direct,6.0,65.0,62.0,160.0,126.511857,6.410256,-2.0,1.443402,35.78802
352,5311,12/2/2022,2021,Olympic (Women) - Switzerland,Olympic (Women) - Olympic Athletes from Russia,away,3,5,4 on 6,3,2,Alina Muller,Shot,1,165,21,Snapshot,,,,On Net,0,0,,,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Switzerland,0,1,1,5,3,6,2,1,5.0,5,21,4,6,-2,32.973474,1.162791,Puck Recovery,4.0,,-1.0,193.0,58.0,8.0,15.787653,-0.193548,-3.0,1.356339,17.185821,Shot,5.0,Snapshot,21.0,192.0,14.0,9.0,28.570089,-0.070175,-1.0,-0.123373,-12.782436


In [37]:
df_test.shape

(709, 67)

In [38]:
df_test[df_test['strength_state'] >0].shape, df_test_model[df_test_model['strength_state'] >0].shape

((124, 67), (124, 23))

## xG values for all coordinates

In [39]:
df_non_shot_train = pd.read_sql(
    '''
    SELECT is_goal, detail_1_code event_type_code, x_coord, y_coord, distance_to_goal, angle_to_goal, prev_event_code, prev_event_x_coord, prev_event_y_coord, second_prev_event_code, second_prev_event_x_coord, second_prev_event_y_coord, game_seconds_remaining, time_diff_last_event, angle_diff_last_event, distance_diff_last_event, prev_time_diff_last_event, prev_angle_diff_last_event, prev_distance_diff_last_event, goal_diff, strength_state
    FROM pbp_training 
    WHERE (is_shot = 1 OR is_goal = 1)
    '''
,conn)

In [40]:
df_non_shot_test = pd.read_sql(
    '''
    SELECT * FROM play_by_play
    '''
,conn)

In [41]:
df_non_shot_train.shape, df_non_shot_test.shape

((3656, 21), (10609, 67))

In [42]:
non_shot_features = ['x_coord', 'y_coord', 'distance_to_goal', 'angle_to_goal', 'prev_event_code', 'prev_event_x_coord', 'prev_event_y_coord', 'second_prev_event_code', 'second_prev_event_x_coord', 'second_prev_event_y_coord', 'game_seconds_remaining', 'time_diff_last_event', 'angle_diff_last_event', 'distance_diff_last_event','prev_time_diff_last_event', 'prev_angle_diff_last_event', 'prev_distance_diff_last_event', 'goal_diff', 'strength_state']
non_shot_label = ['is_goal']
X_non_shot_train = df_non_shot_train[non_shot_features]
y_non_shot_train = df_non_shot_train[non_shot_label]
X_non_shot_test = df_non_shot_test[non_shot_features]
y_non_shot_test = df_non_shot_test[non_shot_label]

In [43]:
df_non_shot_xg = pd.DataFrame()

non_shot_classifier = XGBClassifier()
non_shot_classifier = GridSearchCV(non_shot_classifier, parameters, cv=5, scoring='roc_auc', verbose=2)
non_shot_classifier.fit(X_non_shot_train, y_non_shot_train)
dfs_predictions_non_shot_xg = {}
y_non_shot_pred = non_shot_classifier.predict_proba(X_non_shot_test)
dfs_predictions_non_shot_xg[non_shot_label[0]] = pd.Series(y_non_shot_pred[:, 1],index=df_non_shot_test.index)
df_non_shot_xg = pd.concat(dfs_predictions_non_shot_xg, axis=1)

non_shot_auc_roc = roc_auc_score(y_non_shot_test, y_non_shot_pred[:, 1])
non_shot_auc_roc

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, nthread=4, objective=binary:logistic, seed=42; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=500, nthread=4, objective=binary:logistic, seed=42; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=50

0.47365184790564063

In [44]:
df_non_shot_test['non_shot_expected_goals'] = df_non_shot_xg['is_goal']

In [45]:
df_non_shot_model = df_non_shot_test.merge(df_shots[['index','expected_goals']], on='index', how='left')

In [46]:
df_non_shot_model.loc[df_non_shot_model['is_shot']==1,['player_name','index','expected_goals','non_shot_expected_goals']].head()

Unnamed: 0,player_name,index,expected_goals,non_shot_expected_goals
12,Jocelyne Larocque,12,0.00236,0.189897
18,Sarah Fillier,18,0.007372,0.189897
58,Abby Roque,58,0.015879,0.32833
68,Kelly Pannek,68,0.019445,0.32833
82,Sarah Nurse,82,0.001793,0.189897


In [47]:
df_non_shot_model.to_sql('play_by_play', conn, if_exists='replace')

10609

In [61]:
df_non_shot_model.loc[(df_non_shot_model['venue'] == 'home') & (df_non_shot_model['period'] == 2),['expected_goals','non_shot_expected_goals']].describe()

Unnamed: 0,expected_goals,non_shot_expected_goals
count,136.0,1934.0
mean,0.047989,0.277688
std,0.046024,0.067059
min,0.001879,0.189897
25%,0.017865,0.197509
50%,0.030366,0.32833
75%,0.062793,0.32833
max,0.225509,0.615189


In [62]:
df_non_shot_model.loc[(df_non_shot_model['venue'] == 'away') & (df_non_shot_model['period'] == 2),['expected_goals','non_shot_expected_goals']].describe()

Unnamed: 0,expected_goals,non_shot_expected_goals
count,116.0,1641.0
mean,0.049866,0.279285
std,0.061529,0.066238
min,0.001848,0.189897
25%,0.011807,0.197509
50%,0.019503,0.32833
75%,0.0611,0.32833
max,0.306325,0.391686


In [63]:
df_non_shot_model.loc[(df_non_shot_model['venue'] == 'home') & (df_non_shot_model['period'] == 1 | 3 ),['expected_goals','non_shot_expected_goals']].describe()

Unnamed: 0,expected_goals,non_shot_expected_goals
count,137.0,1864.0
mean,0.04306,0.271089
std,0.052648,0.069069
min,0.001937,0.189897
25%,0.013368,0.189897
50%,0.024071,0.32833
75%,0.051976,0.32833
max,0.368627,0.472939


In [64]:
df_non_shot_model.loc[(df_non_shot_model['venue'] == 'away') & (df_non_shot_model['period'] == 3 | 1),['expected_goals','non_shot_expected_goals']].describe()

Unnamed: 0,expected_goals,non_shot_expected_goals
count,91.0,1532.0
mean,0.035975,0.276645
std,0.031434,0.068646
min,0.003279,0.189897
25%,0.013205,0.197226
50%,0.025072,0.32833
75%,0.044665,0.32833
max,0.164171,0.615756


In [65]:
df_non_shot_model.head()

Unnamed: 0,index,game_date,season_year,team_name,opp_team_name,venue,period,clock_seconds,situation_type,goals_for,goals_against,player_name,event,event_successful,x_coord,y_coord,event_type,player_name_2,x_coord_2,y_coord_2,event_detail_1,event_detail_2,event_detail_3,frame_id_1,frame_id_2,home_team,away_team,game_id,is_shot,is_goal,event_id,team_id,player_id,detail_1_code,goal_diff,game_seconds_remaining,event_code,event_type_code,skaters_for,skaters_against,strength_state,distance_to_goal,angle_to_goal,prev_event,prev_event_code,prev_event_type,prev_event_type_code,prev_event_x_coord,prev_event_y_coord,prev_event_game_seconds_remaining,prev_event_distance_to_goal,prev_event_angle_to_goal,time_diff_last_event,angle_diff_last_event,distance_diff_last_event,second_prev_event,second_prev_event_code,second_prev_event_type,second_prev_event_type_code,second_prev_event_x_coord,second_prev_event_y_coord,second_prev_event_game_seconds_remaining,second_prev_event_distance_to_goal,second_prev_event_angle_to_goal,prev_time_diff_last_event,prev_angle_diff_last_event,prev_distance_diff_last_event,non_shot_expected_goals,expected_goals
0,0,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1200,5 on 5,0,0,Marie-Philip Poulin,Faceoff Win,1,100,42,Backhand,Hannah Brandt,,,,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,0,0,1,0,60,-1,0,3600.0,1,0,5,5,0,90.001389,180.0,,,,,,,,,,,,,,,,,,,,,,,,,0.32833,
1,1,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1199,5 on 5,0,0,Jocelyne Larocque,Puck Recovery,1,86,31,,,,,,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,0,0,4,0,42,-1,0,3599.0,4,-1,5,5,0,104.633886,9.043478,Faceoff Win,1.0,Backhand,0.0,100.0,42.0,3600.0,90.001389,180.0,-1.0,-170.956522,14.632497,,,,,,,,,,,,,0.32833,
2,2,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1198,5 on 5,0,0,Jocelyne Larocque,Dump In/Out,0,96,15,,,,,,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,0,0,0,0,42,-1,0,3598.0,0,-1,5,5,0,97.940033,3.418182,Puck Recovery,4.0,,-1.0,86.0,31.0,3599.0,104.633886,9.043478,-1.0,-5.625296,-6.693853,Faceoff Win,1.0,Backhand,0.0,100.0,42.0,3600.0,90.001389,180.0,-1.0,-170.956522,14.632497,0.32833,
3,3,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1197,5 on 5,0,0,Jocelyne Larocque,Zone Entry,1,124,1,Dumped,Cayla Barnes,,,,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,0,0,7,0,42,-1,0,3597.0,7,7,5,5,0,77.963132,1.590361,Dump In/Out,0.0,,-1.0,96.0,15.0,3598.0,97.940033,3.418182,-1.0,-1.82782,-19.9769,Puck Recovery,4.0,,-1.0,86.0,31.0,3599.0,104.633886,9.043478,-1.0,-5.625296,-6.693853,0.32833,
4,4,8/2/2022,2021,Olympic (Women) - United States,Olympic (Women) - Canada,home,1,1194,5 on 5,0,0,Lee Stecklein,Puck Recovery,1,194,73,,,,,,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,5,0,0,4,4,53,-1,0,3594.0,4,-1,5,5,0,30.761177,-0.131148,Zone Entry,7.0,Dumped,7.0,124.0,1.0,3597.0,77.963132,1.590361,-3.0,-1.721509,-47.201955,Dump In/Out,0.0,,-1.0,96.0,15.0,3598.0,97.940033,3.418182,-1.0,-1.82782,-19.9769,0.189897,
