In [414]:
import pandas as pd
import sqlite3
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import csv

# Import XGBoost classifier
from xgboost import XGBClassifier

# Import scikit-learn functions
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Import scikit-plot functions
from scikitplot.metrics import plot_roc_curve
from scikitplot.metrics import plot_precision_recall_curve
from scikitplot.metrics import plot_calibration_curve

# Import SciPy function
from scipy.spatial import distance

# Show every column when a wide DataFrame is displayed (no truncation).
pd.options.display.max_columns = None


In [415]:
# Data directories (relative to the notebook's location)
data_dir = "../../Data/Big-Data-Cup-2021"
bucketless_data_dir = '../../Data/bdc/data'
tracking_data_dir = "{}/TrackingData".format(data_dir)

# Folder holding the SQLite database. The default is the original author's
# local folder; set the BDC_DB_PATH environment variable to run the notebook
# on another machine without editing this cell.
db_path = os.environ.get('BDC_DB_PATH', '/Users/keltim01/Documents/databases/')
db_file = os.path.join(db_path, 'bdc_2022.db')

# sqlite3.connect() silently creates an empty database when the file does not
# exist, which would only surface later as a "no such table" error. Fail fast.
if not os.path.isfile(db_file):
    raise FileNotFoundError(
        "SQLite database not found: {} (set BDC_DB_PATH to its folder)".format(db_file)
    )

# Connect to the database
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

In [416]:
# Training shots: every unblocked shot or goal row from pbp_training, all columns.
df_train = pd.read_sql(
    sql='''
    SELECT * FROM pbp_training 
    WHERE (is_shot = 1 OR is_goal = 1) AND detail_2 NOT IN ('Blocked')
    ''',
    con=conn,
)

In [417]:
# Test shots: every unblocked shot row from play_by_play, all columns.
df_test = pd.read_sql(
    sql='''
    SELECT * FROM play_by_play
    WHERE is_shot = 1 AND event_detail_1 NOT IN ('Blocked')
    ''',
    con=conn,
)

## Choosing Features of the Expected Goals Model
### Source:
source/inspiration: https://hockey-graphs.com/2019/08/12/expected-goals-model-with-pre-shot-movement-part-1-the-model/

- Unblocked Shots (Fenwick Shots)
- using catboost from the bonsai-tree package (note: the modelling cells in this notebook currently fit an XGBoost `XGBClassifier` instead)


### Features:
- shot type
- shot location
- distance from the net
- angle from the net
- type of the prior event
- location of the prior event
- time (game seconds remaining)
- distance from the last event
- time since the last event
- angle change between the shot and its prior event
- score state
- strength state

- The dataset mostly contains neutral-site games, so home/away status should not be as important.

In [418]:
# List the columns available in df_test (shots pulled from play_by_play).
df_test.columns

Index(['index', 'game_date', 'season_year', 'team_name', 'opp_team_name',
       'venue', 'period', 'clock_seconds', 'situation_type', 'goals_for',
       'goals_against', 'player_name', 'event', 'event_successful', 'x_coord',
       'y_coord', 'event_type', 'player_name_2', 'x_coord_2', 'y_coord_2',
       'event_detail_1', 'event_detail_2', 'event_detail_3', 'frame_id_1',
       'frame_id_2', 'home_team', 'away_team', 'is_shot', 'is_goal',
       'event_id', 'team_id', 'player_id', 'detail_1_code', 'goal_diff',
       'game_seconds_remaining', 'event_code', 'event_type_code',
       'skaters_for', 'skaters_against', 'strength_state', 'distance_to_goal',
       'angle_to_goal', 'prev_event', 'prev_event_code', 'prev_event_type',
       'prev_event_type_code', 'prev_event_x_coord', 'prev_event_y_coord',
       'prev_event_game_seconds_remaining', 'prev_event_distance_to_goal',
       'prev_event_angle_to_goal', 'time_diff_last_event',
       'angle_diff_last_event', 'distance_diff_last

In [419]:
# List the columns available in df_train (shots pulled from pbp_training).
df_train.columns

Index(['index', 'game_date', 'home_team', 'away_team', 'period', 'clock',
       'home_team_skaters', 'away_team_skaters', 'home_team_goals',
       'away_team_goals', 'team', 'player', 'event', 'x_coord', 'y_coord',
       'detail_1', 'detail_2', 'detail_3', 'detail_4', 'player_2',
       'x_coordinate_2', 'y_coordinate_2', 'game_id', 'event_id', 'team_id',
       'player_id', 'is_shot', 'is_goal', 'detail_1_code', 'detail_2_code',
       'detail_3_code', 'detail_4_code', 'is_home', 'goal_diff',
       'game_seconds_remaining', 'event_code', 'strength_state',
       'distance_to_goal', 'angle_to_goal', 'prev_event', 'prev_event_code',
       'prev_event_type', 'prev_event_type_code', 'prev_event_x_coord',
       'prev_event_y_coord', 'prev_event_game_seconds_remaining',
       'prev_event_distance_to_goal', 'prev_event_angle_to_goal',
       'time_diff_last_event', 'angle_diff_last_event',
       'distance_diff_last_event'],
      dtype='object')

In [420]:
# Distinct detail_1 / detail_2 values among the training shots.
df_train['detail_1'].unique(), df_train['detail_2'].unique()

(array(['Snapshot', 'Fan', 'Slapshot', 'Wristshot', 'Deflection',
        'Wrap Around'], dtype=object),
 array(['On Net', 'Missed'], dtype=object))

In [421]:
# Distinct event_type / event_detail_1 values among the test shots.
df_test['event_type'].unique(), df_test['event_detail_1'].unique()

(array(['Snapshot', 'Fan', 'Wristshot', 'Slapshot', 'Deflection',
        'Wrap Around'], dtype=object),
 array(['Missed', 'On Net'], dtype=object))

In [422]:
# Preview the first rows of df_test.
df_test.head()

Unnamed: 0,index,game_date,season_year,team_name,opp_team_name,venue,period,clock_seconds,situation_type,goals_for,goals_against,player_name,event,event_successful,x_coord,y_coord,event_type,player_name_2,x_coord_2,y_coord_2,event_detail_1,event_detail_2,event_detail_3,frame_id_1,frame_id_2,home_team,away_team,is_shot,is_goal,event_id,team_id,player_id,detail_1_code,goal_diff,game_seconds_remaining,event_code,event_type_code,skaters_for,skaters_against,strength_state,distance_to_goal,angle_to_goal,prev_event,prev_event_code,prev_event_type,prev_event_type_code,prev_event_x_coord,prev_event_y_coord,prev_event_game_seconds_remaining,prev_event_distance_to_goal,prev_event_angle_to_goal,time_diff_last_event,angle_diff_last_event,distance_diff_last_event
0,18,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1165,5 on 5,0,0,Sarah Fillier,Shot,0,148,4,Snapshot,,,,Missed,1,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,1,0,5,0,87,1,0,3565.0,5,21,5,5,0,56.975872,1.090909,Puck Recovery,4.0,,-1.0,146.0,8.0,3565.0,55.912879,1.275362,0.0,-0.184453,1.062993
1,58,8/2/2022,2021,Olympic (Women) - United States,Olympic (Women) - Canada,home,1,1095,5 on 5,0,0,Abby Roque,Shot,0,23,57,Snapshot,,,,On Net,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,1,0,5,4,1,2,0,3495.0,5,21,5,5,0,167.628309,11.517241,Puck Recovery,4.0,,-1.0,50.0,69.0,3499.0,142.485964,5.283019,-4.0,6.234223,25.142345
2,68,8/2/2022,2021,Olympic (Women) - United States,Olympic (Women) - Canada,home,1,1083,5 on 5,0,0,Kelly Pannek,Shot,0,37,32,Snapshot,,,,On Net,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,1,0,5,4,46,2,0,3483.0,5,21,5,5,0,153.359871,14.571429,Puck Recovery,4.0,,-1.0,37.0,32.0,3483.0,153.359871,14.571429,0.0,0.0,0.0
3,82,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1061,5 on 5,0,0,Sarah Nurse,Shot,0,154,2,Snapshot,,,,Missed,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,1,0,5,0,89,1,0,3461.0,5,21,5,5,0,54.187176,0.888889,Puck Recovery,4.0,,-1.0,151.0,5.0,3462.0,54.104066,1.04,-1.0,-0.151111,0.083109
4,106,8/2/2022,2021,Olympic (Women) - Canada,Olympic (Women) - United States,away,1,1017,5 on 5,0,0,Natalie Spooner,Shot,0,158,50,Snapshot,,,,Missed,0,0,,,Olympic (Women) - United States,Olympic (Women) - Canada,1,0,5,0,68,1,0,3417.0,5,21,5,5,0,32.867157,4.266667,Takeaway,6.0,,-1.0,196.0,60.0,3421.0,18.5,-0.342857,-4.0,4.609524,14.367157


In [423]:
# Test modelling frame: the target (is_goal) plus the model features,
# restricted to unblocked shots in play_by_play.
df_test_model = pd.read_sql(
    sql='''
    SELECT is_goal, event_type_code, x_coord, y_coord, distance_to_goal, angle_to_goal, prev_event_code, prev_event_x_coord, prev_event_y_coord, game_seconds_remaining, time_diff_last_event, angle_diff_last_event, distance_diff_last_event, goal_diff, strength_state
    FROM play_by_play
    WHERE is_shot = 1 AND event_detail_1 NOT IN ('Blocked')
    ''',
    con=conn,
)

In [424]:
# Check dtypes and non-null counts of the test modelling frame.
df_test_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   is_goal                   502 non-null    int64  
 1   event_type_code           502 non-null    int64  
 2   x_coord                   502 non-null    int64  
 3   y_coord                   502 non-null    int64  
 4   distance_to_goal          502 non-null    float64
 5   angle_to_goal             502 non-null    float64
 6   prev_event_code           502 non-null    float64
 7   prev_event_x_coord        502 non-null    float64
 8   prev_event_y_coord        502 non-null    float64
 9   game_seconds_remaining    502 non-null    float64
 10  time_diff_last_event      502 non-null    float64
 11  angle_diff_last_event     502 non-null    float64
 12  distance_diff_last_event  502 non-null    float64
 13  goal_diff                 502 non-null    int64  
 14  strength_s

In [425]:
# (rows, columns) of the test modelling frame.
df_test_model.shape

(502, 15)

In [426]:
# Training modelling frame: same column layout as df_test_model, built from
# the unblocked shots/goals in pbp_training. The query aliases detail_1_code
# to event_type_code so both frames expose identical feature names.
# NOTE(review): this assumes pbp_training.detail_1_code uses the same integer
# encoding as play_by_play.event_type_code - verify before trusting the feature.
df_train_model = pd.read_sql(
    sql='''
    SELECT is_goal, detail_1_code event_type_code, x_coord, y_coord, distance_to_goal, angle_to_goal, prev_event_code, prev_event_x_coord, prev_event_y_coord, game_seconds_remaining, time_diff_last_event, angle_diff_last_event, distance_diff_last_event, goal_diff, strength_state
    FROM pbp_training 
    WHERE (is_shot = 1 OR is_goal = 1) AND detail_2 NOT IN ('Blocked')
    ''',
    con=conn,
)

In [427]:
# Check dtypes and non-null counts of the training modelling frame.
df_train_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   is_goal                   2652 non-null   int64  
 1   event_type_code           2652 non-null   int64  
 2   x_coord                   2652 non-null   int64  
 3   y_coord                   2652 non-null   int64  
 4   distance_to_goal          2652 non-null   float64
 5   angle_to_goal             2652 non-null   float64
 6   prev_event_code           2652 non-null   float64
 7   prev_event_x_coord        2652 non-null   float64
 8   prev_event_y_coord        2652 non-null   float64
 9   game_seconds_remaining    2652 non-null   int64  
 10  time_diff_last_event      2652 non-null   float64
 11  angle_diff_last_event     2652 non-null   float64
 12  distance_diff_last_event  2652 non-null   float64
 13  goal_diff                 2652 non-null   int64  
 14  strength

In [428]:
# Compare the (rows, columns) shapes of the training and test modelling frames.
df_train_model.shape, df_test_model.shape

((2652, 15), (502, 15))

In [429]:
# Distinct event_type_code values present in the test modelling frame.
df_test_model['event_type_code'].unique()

array([21,  8, 26, 19,  5, 25])

In [430]:
# Columns fed to the model and the target column. `label` is kept as a
# one-element list, so the y_* objects below are single-column DataFrames.
features = [
    'event_type_code', 'x_coord', 'y_coord', 'distance_to_goal', 'angle_to_goal',
    'prev_event_code', 'prev_event_x_coord', 'prev_event_y_coord',
    'game_seconds_remaining', 'time_diff_last_event', 'angle_diff_last_event',
    'distance_diff_last_event', 'goal_diff', 'strength_state',
]
label = ['is_goal']
X_train = df_train_model[features]
y_train = df_train_model[label]
X_test = df_test_model[features]
y_test = df_test_model[label]

# Positions of the non-float feature columns. `np.float` was only an alias for
# the builtin `float` and was removed in NumPy 1.24, so compare against the
# builtin directly (same result, works on every NumPy version).
categorical_features_indices = np.where(X_train.dtypes != float)[0]

In [431]:
# Hyper-parameter grid for the XGBoost classifier: every value is a list of
# candidates, so single-element lists pin a setting while max_depth and
# n_estimators are actually searched.
parameters = dict(
    nthread=[4],
    objective=['binary:logistic'],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01],
    n_estimators=[100, 500, 1000],
    seed=[42],
)

In [None]:
# 10-fold cross-validation over the test shots (df_test_model).
# For each fold the classifier is trained on ALL of df_train_model plus the
# in-fold part of df_test_model, then scored on the held-out part, so every
# test shot receives exactly one out-of-fold goal probability.
#
# Outputs (same names/types as before):
#   df_xg_model  - DataFrame, column 'is_goal' = predicted goal probability,
#                  indexed by the df_test_model row it belongs to
#   auc_roc      - list with one ROC-AUC value per fold
#   vals_y_test  - DataFrame of held-out labels, in fold order
#   vals_y_pred  - 1-D float array of held-out probabilities, in fold order

# A fixed random_state makes the shuffled folds (and therefore the reported
# AUC and expected-goals values) reproducible across runs.
kf = KFold(10, shuffle=True, random_state=42)
auc_roc = []

# Per-fold pieces are collected in lists and concatenated once after the loop;
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
fold_prediction_frames = []
fold_label_frames = []
fold_goal_probas = []

for train_idx, test_idx in kf.split(df_test_model):
    df_train_data = pd.concat(
        [df_train_model, df_test_model.iloc[train_idx]],
        ignore_index=True,
    )
    df_test_data = df_test_model.iloc[test_idx].copy()
    X_train = df_train_data[features]
    y_train = df_train_data[label]
    X_test = df_test_data[features]
    y_test = df_test_data[label]

    # Inner 5-fold grid search selects max_depth / n_estimators by ROC-AUC.
    classifier = GridSearchCV(XGBClassifier(), parameters, cv=5, scoring='roc_auc', verbose=2)
    classifier.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class (goal).
    y_pred = classifier.predict_proba(X_test)
    goal_proba = y_pred[:, 1]

    # Keep the original df_test_model index so predictions can be joined back.
    df_predictions = pd.Series(goal_proba, index=df_test_data.index).to_frame(label[0])

    fold_prediction_frames.append(df_predictions)
    fold_label_frames.append(y_test)
    fold_goal_probas.append(goal_proba)
    auc_roc.append(roc_auc_score(y_test, goal_proba))

df_xg_model = pd.concat(fold_prediction_frames)
vals_y_test = pd.concat(fold_label_frames)
# Cast to float64 to match the dtype of the accumulator used previously.
vals_y_pred = np.concatenate(fold_goal_probas).astype(np.float64)

In [439]:
# Pooled ROC-AUC over all held-out predictions (one score across the folds,
# as opposed to the per-fold values stored in auc_roc).
new_auc_roc = roc_auc_score(y_true=vals_y_test, y_score=vals_y_pred)
new_auc_roc

0.7325034029038112

In [444]:
# Attach the out-of-fold goal probability to each test shot. The assignment
# aligns on the index: df_xg_model rows carry the df_test_model index of the
# shot they were predicted for, so fold order does not matter here.
df_test_model['expected_goals'] = df_xg_model['is_goal']

In [449]:
# Confirm that expected_goals now appears among the columns.
df_test_model.columns

Index(['is_goal', 'event_type_code', 'x_coord', 'y_coord', 'distance_to_goal',
       'angle_to_goal', 'prev_event_code', 'prev_event_x_coord',
       'prev_event_y_coord', 'game_seconds_remaining', 'time_diff_last_event',
       'angle_diff_last_event', 'distance_diff_last_event', 'goal_diff',
       'strength_state', 'expected_goals'],
      dtype='object')

In [453]:
# Compare actual goals with the sum of predicted goal probabilities (total expected goals).
df_test_model['is_goal'].sum(), df_test_model['expected_goals'].sum()

(38, 30.986307)

In [441]:
# Out-of-fold predicted goal probabilities, indexed by the df_test_model row they belong to.
df_xg_model

Unnamed: 0,is_goal
8,0.024361
32,0.042356
47,0.035301
52,0.092760
57,0.049870
...,...
422,0.043602
447,0.023864
490,0.032824
493,0.053627
