# Big Data Cup 2021 
## How to value Zone Entries and other actions that are not shots or goals
### VAEP (Valuing actions by estimating probabilities) framework for Hockey 
Inspired by paper of the Soccer version [Actions Speak Louder Than Goals: Valuing Player Actions in Soccer](https://arxiv.org/abs/1802.07127) by Tom Decroos, Lotte Bransen, Jan Van Haaren and Jesse Davis. Very helpful was the Tutorial as part of the Friends of Tracking initiative by Lotte Bransen and Jan Van Haaren: [Friends of Tracking: Valuing actions in football](https://github.com/SciSports-Labs/fot-valuing-actions)

In [12]:
%load_ext nb_black
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold
from xgboost import XGBClassifier, plot_importance

import shap
from ipywidgets import interact_manual, fixed, widgets
%matplotlib inline

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

# Importing data, renaming columns, creating extra columns

In [13]:
# Import and Data Frame for womens data
project_dir = '/Users/keltim01/git_repos/TK5/Data/Big-Data-Cup-2021/'
womens = pd.read_csv(project_dir + 'hackathon_womens.csv')
nwhl = pd.read_csv(project_dir + 'hackathon_nwhl.csv')
womens = womens.append(nwhl, ignore_index=True)
# important numbers for the hockey rink 
ICE_LENGTH = 200
ICE_WIDTH = 85
GOAL_X = ICE_LENGTH - 10
GOAL_Y = ICE_WIDTH / 2\

womens.columns = ['game_date', 'home_team', 'away_team', 'period', 'clock', 'home_team_skaters', 'away_team_skaters', 'home_team_goals','away_team_goals', 'team', 'player', 'event', 'x_coord', 'y_coord', 'detail_1', 'detail_2', 'detail_3', 'detail_4', 'player_2', 'x_coord_2', 'y_coord_2']
womens['game_id'] = womens.loc[:, ['game_date', 'home_team', 'away_team']].sum(axis=1).astype('category').cat.codes
womens['is_home'] = 0
womens['is_shot'] = 0
womens['is_goal'] = 0
womens['event_id'] = womens['event'].astype('category').cat.codes
womens['team_id'] = womens['team'].astype('category').cat.codes
womens['player_id'] = womens['player'].astype('category').cat.codes

for x in range(1,5):
    womens[f'detail_{x}_code'] = womens[f'detail_{x}'].astype('category').cat.codes
womens.loc[womens['home_team'] == womens['team'], 'is_home'] = 1
womens.loc[womens['event']=='Shot', 'is_shot'] = 1
womens.loc[womens['event']=='Goal', 'is_goal'] = 1
womens['goal_diff'] = womens['home_team_goals'].sub(womens['away_team_goals'])
womens['clock'] = pd.to_datetime(womens['clock'], format='%M:%S')
womens['seconds_remaining'] = womens['clock'].dt.minute.mul(60).add(womens['clock'].dt.second)

<IPython.core.display.Javascript object>

#  Calculate differences in disctance for actions
## create endpoint for actions
### Shot \n"
* on net: position of the goal
* missed/blocked possesion lost or Retained: location next event -> Puck Recovery
### Goal 
* position of the goal
### Takeaway
* same position
### Puck Recovery
* same position
### Dump In/Out
* Possession Lost or Retained: location next event -> Puck Recovery
### Zone Entry
* same position
### Faceoff Wins
* same position
### Penalty Taken
* same position

In [14]:
womens.loc[(womens['event']=='Shot') & (womens['detail_2'] == 'On Net'),['x_coord_2','y_coord_2']] = [GOAL_X,GOAL_Y]
shifted_coords = womens.loc[:,['x_coord','y_coord']].shift(-1)
womens2 = womens.loc[:]
womens2.loc[:,['x_coord','y_coord']] = shifted_coords
womens.loc[(womens['event']=='Shot') & (womens['detail_2'] == 'Blocked'),'x_coord_2'] = womens2.loc[(womens2['event']=='Shot') & (womens2['detail_2'] == 'Blocked'),'x_coord']
womens.loc[(womens['event']=='Shot') & (womens['detail_2'] == 'Blocked'),'y_coord_2'] = womens2.loc[(womens2['event']=='Shot') & (womens2['detail_2'] == 'Blocked'),'y_coord']
womens.loc[(womens['event']=='Shot') & (womens['detail_2'] == 'Missed'),'x_coord_2'] = womens2.loc[(womens2['event']=='Shot') & (womens2['detail_2'] == 'Missed'),'x_coord']
womens.loc[(womens['event']=='Shot') & (womens['detail_2'] == 'Missed'),'y_coord_2'] = womens2.loc[(womens2['event']=='Shot') & (womens2['detail_2'] == 'Missed'),'y_coord']
womens.loc[womens['event']=='Goal',['x_coord_2','y_coord_2']] = [GOAL_X,GOAL_Y]
womens.loc[womens['event']=='Takeaway','x_coord_2'] = womens.loc[womens['event']=='Takeaway','x_coord']
womens.loc[womens['event']=='Takeaway','y_coord_2'] = womens.loc[womens['event']=='Takeaway','y_coord']
womens.loc[womens['event']=='Puck Recovery','x_coord_2'] = womens.loc[womens['event']=='Puck Recovery','x_coord']
womens.loc[womens['event']=='Puck Recovery','y_coord_2'] = womens.loc[womens['event']=='Puck Recovery','y_coord']
womens.loc[womens['event']=='Dump In/Out','x_coord_2'] = womens2.loc[womens2['event']=='Dump In/Out','x_coord']
womens.loc[womens['event']=='Dump In/Out','y_coord_2'] = womens2.loc[womens2['event']=='Dump In/Out','y_coord']
womens.loc[womens['event']=='Zone Entry','x_coord_2'] = womens.loc[womens['event']=='Zone Entry','x_coord']
womens.loc[womens['event']=='Zone Entry','y_coord_2'] = womens.loc[womens['event']=='Zone Entry','y_coord']
womens.loc[womens['event']=='Faceoff Win','x_coord_2'] = womens.loc[womens['event']=='Faceoff Win','x_coord']
womens.loc[womens['event']=='Faceoff Win','y_coord_2'] = womens.loc[womens['event']=='Faceoff Win','y_coord']
womens.loc[womens['event']=='Penalty Taken','x_coord_2'] = womens.loc[womens['event']=='Penalty Taken','x_coord']
womens.loc[womens['event']=='Penalty Taken','y_coord_2'] = womens.loc[womens['event']=='Penalty Taken','y_coord']

<IPython.core.display.Javascript object>

In [15]:
diff_x1 = GOAL_X - womens['x_coord']
diff_y1 = abs(GOAL_Y - womens['y_coord'])
diff_x2 = GOAL_X - womens['x_coord_2']
diff_y2 = abs(GOAL_Y - womens['y_coord_2'])
womens['start_distance_to_goal'] = np.sqrt(diff_x1 ** 2 + diff_y1 ** 2)
womens['end_distance_to_goal'] = np.sqrt(diff_x2 ** 2 + diff_y2 ** 2)
womens['diff_x'] = womens['x_coord_2'] - womens['x_coord']
womens['diff_y'] = womens['y_coord_2'] - womens['y_coord']
womens['distance_covered'] = np.sqrt((womens['x_coord_2'] - womens['x_coord']) ** 2 + (womens['y_coord_2'] - womens['y_coord']) ** 2)

<IPython.core.display.Javascript object>

# Columns to add
* Zone the puck is in from the possessing players perspective. One for start of the action and end of the action
* possession change afterwards
* skater differences

# create Labels

In [16]:
goals = womens['event'].str.contains('Goal')
y = pd.concat([womens.loc[:, 'is_goal'], womens.loc[:,'team_id']], axis = 1)
y.columns = ['goal','team_id']
for i in range(1, 10):
    for col in ['team_id', 'goal']:
        shifted = y[col].shift(-i)
        shifted[-i:] = y[col][len(y) - 1]
        y[f'{col}+{i}'] = shifted.astype(int)

scores = y['goal']
concedes = y['goal']
for i in range(1, 10):
    goal_scored = y[f'goal+{i}'] & (y[f'team_id+{i}'] == y['team_id'])
    goal_opponent = y[f'goal+{i}'] & (y[f'team_id+{i}'] != y['team_id'])
    scores = scores | goal_scored
    concedes = concedes | goal_opponent
label_scores = pd.DataFrame(scores, columns=['scores'])
label_concedes = pd.DataFrame(concedes, columns=['concedes'])
df_labels = pd.concat([label_scores, label_concedes], axis=1)

<IPython.core.display.Javascript object>

# Features

In [17]:
features = ['game_id','team_id', 'player_id', 'period', 'x_coord', 'y_coord', 'x_coord_2',
       'y_coord_2', 'is_home', 'is_shot', 'is_goal', 'event_id',
       'goal_diff', 'seconds_remaining','diff_x', 'diff_y', 'distance_covered', 'start_distance_to_goal', 'end_distance_to_goal']
df_delays = [womens[features].shift(step).add_suffix(f'-{step}') for step in range(0,3)]
df_features = pd.concat(df_delays, axis=1)

<IPython.core.display.Javascript object>

In [18]:
for step in range(0,3):
    df_features[f'team-{step}'] = df_features['team_id-0'] == df_features[f'team_id-{step}']

for step in range(0,3):
    df_features.loc[~(df_features[f'team-{step}']),f'x_coord-{step}'] = ICE_LENGTH - df_features[f'x_coord-{step}']
    df_features.loc[~(df_features[f'team-{step}']),f'x_coord_2-{step}'] = ICE_LENGTH - df_features[f'x_coord_2-{step}']
    df_features.loc[~(df_features[f'team-{step}']),f'y_coord-{step}'] = ICE_WIDTH - df_features[f'y_coord-{step}']
    df_features.loc[~(df_features[f'team-{step}']),f'y_coord_2-{step}'] = ICE_WIDTH - df_features[f'y_coord_2-{step}']

<IPython.core.display.Javascript object>

In [19]:
for step in range(0,3):
    start_diff_x = GOAL_X - df_features[f'x_coord-{step}']
    start_diff_y = abs(GOAL_Y - df_features[f'y_coord-{step}'])
    df_features[f'start_distance_to_goal-{step}'] = np.sqrt(start_diff_x ** 2 + start_diff_y ** 2)
    end_diff_x = GOAL_X - df_features[f'x_coord_2-{step}']
    end_diff_y = abs(GOAL_Y - df_features[f'y_coord_2-{step}'])
    df_features[f'end_distance_to_goal-{step}'] = np.sqrt(end_diff_x ** 2 + end_diff_y ** 2)
    df_features[f'diff_x-{step}'] = df_features[f'x_coord_2-{step}'] - df_features[f'x_coord-{step}']
    df_features[f'diff_y-{step}'] = df_features[f'y_coord_2-{step}'] - df_features[f'y_coord-{step}']
    df_features[f'distance_covered-{step}'] = np.sqrt((df_features[f'x_coord_2-{step}'] - df_features[f'x_coord-{step}']) ** 2 + (df_features[f'y_coord_2-{step}'] - df_features[f'y_coord-{step}']) ** 2)

<IPython.core.display.Javascript object>

In [20]:
df_features['xdiff_sequenc_pre'] = df_features['x_coord-0'] - df_features['x_coord-2']
df_features['ydiff_sequenc_pre'] = df_features['y_coord-0'] - df_features['y_coord-2']
df_features['time_sequence_pre'] = df_features['seconds_remaining-0'] - df_features['seconds_remaining-2']
df_features[['start_distance_to_goal-0', 'end_distance_to_goal-0', 'start_distance_to_goal-1', 'end_distance_to_goal-1', 'start_distance_to_goal-2', 'end_distance_to_goal-2', 'team-1', 'team-2']].isnull().sum()
df_features[['start_distance_to_goal-0', 'end_distance_to_goal-0', 'start_distance_to_goal-1', 'end_distance_to_goal-1', 'start_distance_to_goal-2', 'end_distance_to_goal-2', 'team-1', 'team-2']]
            

Unnamed: 0,start_distance_to_goal-0,end_distance_to_goal-0,start_distance_to_goal-1,end_distance_to_goal-1,start_distance_to_goal-2,end_distance_to_goal-2,team-1,team-2
0,90.001389,90.001389,,,,,False,False
1,101.986519,101.986519,90.001389,90.001389,,,True,False
2,92.402651,92.402651,101.986519,101.986519,90.001389,90.001389,True,True
3,92.402651,92.402651,92.402651,92.402651,101.986519,101.986519,True,True
4,46.970736,46.970736,92.402651,92.402651,92.402651,92.402651,True,True
...,...,...,...,...,...,...,...,...
50879,182.937831,182.937831,162.059403,18.607794,149.141711,162.059403,False,False
50880,182.937831,165.774697,182.937831,182.937831,162.059403,18.607794,True,False
50881,161.425679,69.615013,182.937831,165.774697,182.937831,182.937831,True,True
50882,69.615013,69.615013,47.940067,127.930645,18.607794,43.832066,False,False


<IPython.core.display.Javascript object>

# Split Dataset & Train Classifiers

In [21]:
labels = ['scores','concedes']
feat = ['start_distance_to_goal-0', 'end_distance_to_goal-0', 'start_distance_to_goal-1', 'end_distance_to_goal-1', 'start_distance_to_goal-2', 'end_distance_to_goal-2','team-1', 'team-2','seconds_remaining-0','goal_diff-0']
            

<IPython.core.display.Javascript object>

In [23]:
df_model = pd.concat([df_features,df_labels],axis=1)
df_score_concede_prob = pd.DataFrame()
kf = KFold(10, shuffle=True)

for train_idx, test_idx in kf.split(df_model):
    train_data = df_model.iloc[train_idx].copy()
    test_data = df_model.iloc[test_idx].copy()

    models = {}
    for label in tqdm(labels):
        model = XGBClassifier(
            n_estimators=50,
            max_depth=3
        )
        model.fit(
            X=train_data[feat],
            y=train_data[label]
        )
        models[label] = model

    dfs_predictions = {}
    for label in tqdm(labels):
        model = models[label]
        probabilities = model.predict_proba(test_data[feat])
        predictions = probabilities[:, 1]
        print(np.isnan(probabilities).sum())
        dfs_predictions[label] = pd.Series(predictions, index=test_data.index)
    df_predictions = pd.concat(dfs_predictions, axis=1)
    df_score_concede_prob = df_score_concede_prob.append(df_predictions)

100%|██████████| 2/2 [00:00<00:00,  2.04it/s]
100%|██████████| 2/2 [00:00<00:00, 101.05it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:01<00:00,  1.96it/s]
100%|██████████| 2/2 [00:00<00:00, 110.50it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:00<00:00,  2.05it/s]
100%|██████████| 2/2 [00:00<00:00, 119.10it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:00<00:00,  2.05it/s]
100%|██████████| 2/2 [00:00<00:00, 118.76it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:00<00:00,  2.01it/s]
100%|██████████| 2/2 [00:00<00:00, 89.00it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:01<00:00,  1.97it/s]
100%|██████████| 2/2 [00:00<00:00, 114.98it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:01<00:00,  1.93it/s]
100%|██████████| 2/2 [00:00<00:00, 114.73it/s]
  0%|          | 0/2 [00:00<?, ?it/s]0
0
100%|██████████| 2/2 [00:01<00:00,  1.91it/s]
100%|██████████| 

<IPython.core.display.Javascript object>

In [24]:
dfs_actions = []
dfs_actions.append(womens)
df_actions = pd.concat(dfs_actions).reset_index(drop=True)

df_actions_predictions = pd.concat([df_actions, df_score_concede_prob], axis=1)
df_actions_predictions = df_actions_predictions.dropna(subset=['start_distance_to_goal', 'end_distance_to_goal', 'diff_x', 'diff_y',
       'distance_covered', 'scores', 'concedes'])

<IPython.core.display.Javascript object>

# calculate the VAEP value\

In [25]:
def prev(x: pd.Series) -> pd.Series:
    prev_x = x.shift(1)
    prev_x[:1] = x.values[0]
    return prev_x

<IPython.core.display.Javascript object>

In [26]:
dfs_values = []
df_values = pd.DataFrame()

sameteam = prev(df_actions_predictions.team_id) == df_actions_predictions.team_id
prev_scores = prev(df_actions_predictions.scores) * sameteam + prev(df_actions_predictions.concedes) * (~sameteam)
prev_concedes = prev(df_actions_predictions.concedes) * sameteam + prev(df_actions_predictions.scores) * (~sameteam)

toolong_idx = abs(prev(df_actions_predictions.seconds_remaining) - df_actions_predictions.seconds_remaining) > 10
prev_scores[toolong_idx] = 0
prev_concedes[toolong_idx] = 0

prevgoal_idx = prev(df_actions_predictions.event) == 'Goal'
prev_scores[prevgoal_idx] = 0
prev_concedes[prevgoal_idx] = 0

df_values['offensive_value'] = df_actions_predictions.scores - prev_scores
df_values['defensive_value'] = df_actions_predictions.concedes - prev_concedes
df_values['vaep'] = df_values['offensive_value'] + df_values['defensive_value']

<IPython.core.display.Javascript object>

# Analysis

In [27]:
df_final = pd.concat([df_actions_predictions,df_values],axis=1).dropna(subset=['vaep'])

df_ranking = (df_final[['player','team','vaep']]
.groupby(['player','team'])
.agg(vaep_count=('vaep','count'),
vaep_mean=('vaep','mean'),
vaep_sum=('vaep','sum'))
.sort_values('vaep_sum',ascending=False)
.reset_index()
)

df_rank_events = (df_final[['event','vaep']]
.groupby(['event'])
.agg(vaep_count=('vaep','count'),
vaep_mean=('vaep','mean'),
vaep_sum=('vaep','sum'))
.sort_values('vaep_sum',ascending=False)
.reset_index()
)

df_zone_entries = (df_final.loc[womens['event']=='Zone Entry',['detail_1','defensive_value','offensive_value','vaep']]
.groupby(['detail_1'])
.agg(vaep_count=('vaep','count'),
vaep_mean=('vaep','mean'),
vaep_sum=('vaep','sum'))
.sort_values('vaep_sum',ascending=False)
.reset_index()
)

<IPython.core.display.Javascript object>

In [28]:
df_ranking.head(10)

Unnamed: 0,player,team,vaep_count,vaep_mean,vaep_sum
0,Rebecca Johnston,Olympic (Women) - Canada,686,0.007739,5.309006
1,Sarah-Eve Coutu Godbout,Toronto Six,274,0.014365,3.935986
2,Kendall Coyne Schofield,Olympic (Women) - United States,466,0.007787,3.628755
3,Meghan Agosta,Olympic (Women) - Canada,324,0.011066,3.585374
4,Mikyla Grant-Mentis,Toronto Six,422,0.008114,3.42395
5,Christina Putigna,Boston Pride,365,0.008743,3.191183
6,Natalie Spooner,Olympic (Women) - Canada,411,0.007523,3.091791
7,McKenna Brand,Boston Pride,439,0.006588,2.892184
8,Autumn MacDougall,Buffalo Beauts,285,0.010106,2.880242
9,Hilary Knight,Olympic (Women) - United States,447,0.006086,2.720245


<IPython.core.display.Javascript object>

In [29]:
df_rank_events

Unnamed: 0,event,vaep_count,vaep_mean,vaep_sum
0,Shot,3524,0.053652,189.069397
1,Goal,132,0.203525,26.865257
2,Play,14673,0.00134,19.657185
3,Incomplete Play,6111,0.002888,17.64809
4,Zone Entry,3744,0.004306,16.121828
5,Dump In/Out,3545,0.001585,5.617347
6,Penalty Taken,260,-0.003776,-0.981676
7,Takeaway,2092,-0.004512,-9.438867
8,Faceoff Win,1629,-0.035515,-57.853642
9,Puck Recovery,15174,-0.010863,-164.842377


<IPython.core.display.Javascript object>

## What looks wrong here:
* Takeaway gains possesion and has negative value
* Incomplete Play loses Possession and has positive value
* Faceoff Win: alth negative value
* Puck Recovery

## What looks right:
* Shot: high value overall and mean
* Goal: highest value mean
* Play
* zone entry: kinda right.
* Dump in/out: low value. seems right
* penalty taken: negative value

In [30]:
df_zone_entries

Unnamed: 0,detail_1,vaep_count,vaep_mean,vaep_sum
0,Carried,2316,0.006827,15.810225
1,Dumped,1167,0.000243,0.283256
2,Played,261,0.000109,0.028348


<IPython.core.display.Javascript object>