# Hockey VAEP

In [302]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import lightgbm as lgb

In [304]:
project_dir = '/Users/keltim01/git_repos/TK5/Data/Big-Data-Cup-2021/'

In [306]:
womens = pd.read_csv(project_dir + 'hackathon_womens.csv')
womens.head()

Unnamed: 0,game_date,Home Team,Away Team,Period,Clock,Home Team Skaters,Away Team Skaters,Home Team Goals,Away Team Goals,Team,...,Event,X Coordinate,Y Coordinate,Detail 1,Detail 2,Detail 3,Detail 4,Player 2,X Coordinate 2,Y Coordinate 2
0,2018-02-11,Olympic (Women) - Canada,Olympic (Women) - Olympic Athletes from Russia,1,20:00,5,5,0,0,Olympic (Women) - Canada,...,Faceoff Win,100,42,Backhand,,,,Lyudmila Belyakova,,
1,2018-02-11,Olympic (Women) - Canada,Olympic (Women) - Olympic Athletes from Russia,1,19:59,5,5,0,0,Olympic (Women) - Canada,...,Puck Recovery,91,67,,,,,,,
2,2018-02-11,Olympic (Women) - Canada,Olympic (Women) - Olympic Athletes from Russia,1,19:59,5,5,0,0,Olympic (Women) - Canada,...,Dump In/Out,106,81,Retained,,,,,,
3,2018-02-11,Olympic (Women) - Canada,Olympic (Women) - Olympic Athletes from Russia,1,19:59,5,5,0,0,Olympic (Women) - Canada,...,Zone Entry,106,81,Dumped,,,,Liana Ganeyeva,,
4,2018-02-11,Olympic (Women) - Canada,Olympic (Women) - Olympic Athletes from Russia,1,19:53,5,5,0,0,Olympic (Women) - Canada,...,Puck Recovery,168,1,,,,,,,


## Defining the Actions

In [308]:
womens.columns

Index(['game_date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2'],
      dtype='object')

In [310]:
womens['Event'].unique()

array(['Faceoff Win', 'Puck Recovery', 'Dump In/Out', 'Zone Entry',
       'Play', 'Takeaway', 'Incomplete Play', 'Shot', 'Penalty Taken',
       'Goal'], dtype=object)

In [311]:
womens['game_id'] = womens.loc[:, ['game_date', 'Home Team', 'Away Team']].sum(axis=1).astype('category').cat.codes
womens['is_home'] = 0 
womens['is_shot'] = 0
womens['is_goal'] = 0
womens['event_id'] = womens['Event'].astype('category').cat.codes
womens['team_id'] = womens['Team'].astype('category').cat.codes
womens['player_id'] = womens['Player'].astype('category').cat.codes

for x in range(1,5):
    womens[f'Detail {x}_code'] = womens[f'Detail {x}'].astype('category').cat.codes
womens.loc[womens['Home Team'] == womens['Team'], 'is_home'] = 1
womens.loc[womens['Event']=='Shot', 'is_shot'] = 1
womens.loc[womens['Event']=='Goal', 'is_goal'] = 1
womens['goal_diff'] = womens['Home Team Goals'].sub(womens['Away Team Goals'])
womens['Clock'] = pd.to_datetime(womens['Clock'], format='%M:%S')
womens['seconds_remaining'] = womens['Clock'].dt.minute.mul(60).add(womens['Clock'].dt.second)


In [312]:
womens['team_id'].unique()

array([1, 3, 2, 4, 5, 0], dtype=int8)

In [313]:
womens['Team'].unique()

array(['Olympic (Women) - Canada',
       'Olympic (Women) - Olympic Athletes from Russia',
       'Olympic (Women) - Finland', 'Olympic (Women) - United States',
       'St. Lawrence Saints', 'Clarkson Golden Knights'], dtype=object)

In [314]:
womens.loc[womens['team_id'] == 3,'Team']

6       Olympic (Women) - Olympic Athletes from Russia
7       Olympic (Women) - Olympic Athletes from Russia
8       Olympic (Women) - Olympic Athletes from Russia
9       Olympic (Women) - Olympic Athletes from Russia
10      Olympic (Women) - Olympic Athletes from Russia
                             ...                      
6937    Olympic (Women) - Olympic Athletes from Russia
6952    Olympic (Women) - Olympic Athletes from Russia
6953    Olympic (Women) - Olympic Athletes from Russia
6966    Olympic (Women) - Olympic Athletes from Russia
6967    Olympic (Women) - Olympic Athletes from Russia
Name: Team, Length: 1219, dtype: object

In [315]:
PITCH_LENGTH = 200
PITCH_WIDTH = 85 

In [316]:
action_id = 5


In [317]:
womens[action_id:action_id+1][['X Coordinate',
       'Y Coordinate', 'X Coordinate 2', 'Y Coordinate 2']]

Unnamed: 0,X Coordinate,Y Coordinate,X Coordinate 2,Y Coordinate 2
5,168,1,197.0,26.0


In [318]:
GOAL_X = PITCH_LENGTH - 10 
GOAL_Y = PITCH_WIDTH / 2

In [319]:
diff_x1 = GOAL_X - womens['X Coordinate']
diff_y1 = abs(GOAL_Y - womens['Y Coordinate'])
diff_x2 = GOAL_X - womens['X Coordinate 2']
diff_y2 = abs(GOAL_Y - womens['Y Coordinate 2'])
womens['start_distance_to_goal'] = np.sqrt(diff_x1 ** 2 + diff_y1 ** 2)
womens['end_distance_to_goal'] = np.sqrt(diff_x2 ** 2 + diff_y2 ** 2)


In [320]:
womens['diff_x'] = womens['X Coordinate 2'] - womens['X Coordinate']
womens['diff_y'] = womens['Y Coordinate 2'] - womens['Y Coordinate']
womens['distance_covered'] = np.sqrt((womens['X Coordinate 2'] - womens['X Coordinate']) ** 2 + (womens['Y Coordinate 2'] - womens['Y Coordinate']) ** 2)


In [321]:
womens['distance_covered']

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
23997    50.159745
23998    22.360680
23999    44.204072
24000    20.124612
24001          NaN
Name: distance_covered, Length: 24002, dtype: float64

In [322]:
womens.columns

Index(['game_date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2', 'game_id', 'is_home',
       'is_shot', 'is_goal', 'event_id', 'team_id', 'player_id',
       'Detail 1_code', 'Detail 2_code', 'Detail 3_code', 'Detail 4_code',
       'goal_diff', 'seconds_remaining', 'start_distance_to_goal',
       'end_distance_to_goal', 'diff_x', 'diff_y', 'distance_covered'],
      dtype='object')

## Sequence Features

In [323]:
womens.dtypes

game_date                         object
Home Team                         object
Away Team                         object
Period                             int64
Clock                     datetime64[ns]
Home Team Skaters                  int64
Away Team Skaters                  int64
Home Team Goals                    int64
Away Team Goals                    int64
Team                              object
Player                            object
Event                             object
X Coordinate                       int64
Y Coordinate                       int64
Detail 1                          object
Detail 2                          object
Detail 3                          object
Detail 4                          object
Player 2                          object
X Coordinate 2                   float64
Y Coordinate 2                   float64
game_id                             int8
is_home                            int64
is_shot                            int64
is_goal         

In [299]:
_womens = pd.DataFrame()
for gid in womens['game_id'].unique():
    _df = womens.loc[womens['game_id']==gid].sort_values(['Period','seconds_remaining'], ascending=[1,0]).reset_index(drop=True).copy()
    _df['goal_n10'] = _df.loc[:, 'is_goal'].rolling(10, min_periods=1).sum().shift(-9)
    _df = _df.merge(_df.shift(1), left_index=True, right_index=True, suffixes=('', '_1back')).merge(_df.shift(2), left_index=True, right_index=True, suffixes=('', '_2back'))
    _womens = _womens.append(_df)
womens = _womens
womens['is_goal_n10'] = womens['goal_n10'].astype(bool).astype(int)


In [300]:
womens['goal_n10']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
2139    NaN
2140    NaN
2141    NaN
2142    NaN
2143    NaN
Name: goal_n10, Length: 24002, dtype: float64

In [301]:
model_data = womens.dropna(subset=['goal_diff_2back']).copy()
model_data['goal_n10'] = model_data['goal_n10'].fillna(0)

0.0    23428
1.0      545
2.0        3
Name: goal_n10, dtype: int64

In [303]:
features = ['Period', 'Home Team Skaters', 'Away Team Skaters', 'X Coordinate',
       'Y Coordinate', 'is_home', 'event_id', 'Detail 1_code', 'Detail 2_code', 'Detail 3_code', 'Detail 4_code',
       'goal_diff', 'seconds_remaining']
features = features + [f"{x}_1back" for x in features] + [f"{x}_2back" for x in features]
features = [x for x in features if x not in ['event_id', 'Detail 1_code', 'Detail 2_code', 'Detail 3_code', 'Detail 4_code']]
target = 'is_goal_n10'


## Train model ? 

In [305]:
df_goal_prob_pred = pd.DataFrame()
kf = KFold(10, shuffle=True)

for train_idx, test_idx in kf.split(model_data):
    train_data = model_data.iloc[train_idx].copy()
    test_data = model_data.iloc[test_idx].copy()
    test_data[f'{target}_naive_pred'] = model_data[target].mean()

# linear 
model = LogisticRegression(max_iter=300)
model.fit(train_data.loc[:, features], train_data[target])
test_data[f'{target}_linear_pred'] = pd.DataFrame(model.predict_proba(test_data.loc[:, features]), index=test_data.index)[1]

# rf 
model = RandomForestClassifier()
model.fit(train_data.loc[:, features], train_data[target])
test_data[f'{target}_rf_pred'] = pd.DataFrame(model.predict_proba(test_data.loc[:, features]), index=test_data.index)[1]

# gbm 
lgb_train = lgb.Dataset(train_data.loc[:, features], train_data.loc[:, target])
lgb_test = lgb.Dataset(test_data.loc[:, features], test_data.loc[:, target])
p = dict(
    objective='binary',
    num_iterations=2000
)

model = lgb.train(p, lgb_train, early_stopping_rounds=200, valid_sets=lgb_test)
test_data[f"{target}_gbm_pred"] = pd.Series(model.predict(test_data.loc[:, features]), index=test_data.index)
df_goal_prob_pred = df_goal_prob_pred.append(test_data)

df_goal_prob_pred.shape


[LightGBM] [Info] Number of positive: 581, number of negative: 20998
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1798
[LightGBM] [Info] Number of data points in the train set: 21579, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026924 -> initscore=-3.587432
[LightGBM] [Info] Start training from score -3.587432
[1]	valid_0's binary_logloss: 0.128984
Training until validation scores don't improve for 200 rounds
[2]	valid_0's binary_logloss: 0.123203
[3]	valid_0's binary_logloss: 0.119477
[4]	valid_0's binary_logloss: 0.11757
[5]	valid_0's binary_logloss: 0.115151
[6]	valid_0's binary_logloss: 0.112857
[7]	valid_0's binary_logloss: 0.111454
[8]	valid_0's binary_logloss: 0.110633
[9]	valid_0's binary_logloss: 0.109843
[10]	valid_0's binary_logloss: 0.106863
[11]	valid_0's binary_logloss: 0.106137
[12]	valid_0's binary_logloss: 0.103153
[13]	valid_0's b

(2397, 125)

In [307]:
goal_prob_metrics = {}
goal_prob_metrics['naive_loss'] = log_loss(df_goal_prob_pred[target], df_goal_prob_pred[f"{target}_naive_pred"])
goal_prob_metrics['naive_acc'] = accuracy_score(df_goal_prob_pred[target], round(df_goal_prob_pred[f"{target}_naive_pred"],0))

goal_prob_metrics['linear_loss'] = log_loss(df_goal_prob_pred[target], df_goal_prob_pred[f"{target}_linear_pred"])
goal_prob_metrics['linear_acc'] = accuracy_score(df_goal_prob_pred[target], round(df_goal_prob_pred[f"{target}_linear_pred"], 0)) 

goal_prob_metrics['rf_loss'] = log_loss(df_goal_prob_pred[target], df_goal_prob_pred[f"{target}_rf_pred"])
goal_prob_metrics['rf_acc'] = accuracy_score(df_goal_prob_pred[target], round(df_goal_prob_pred[f"{target}_rf_pred"], 0)) 

goal_prob_metrics['gbm_loss'] = log_loss(df_goal_prob_pred[target], df_goal_prob_pred[f"{target}_gbm_pred"])
goal_prob_metrics['gbm_acc'] = accuracy_score(df_goal_prob_pred[target], round(df_goal_prob_pred[f"{target}_gbm_pred"], 0)) 

for model_type in ['naive', 'linear', 'rf', 'gbm']:
  print(f"""{model_type}_loss: {goal_prob_metrics[f'{model_type}_loss']}\n{model_type}_acc: {goal_prob_metrics[f'{model_type}_acc']}\n""")

naive_loss: 0.15277506236701402
naive_acc: 0.9649561952440551

linear_loss: 0.1453695353367623
linear_acc: 0.9649561952440551

rf_loss: 0.08198430318512152
rf_acc: 0.9745515227367543

gbm_loss: 0.07103260079311756
gbm_acc: 0.9766374634960368



In [309]:
df_goal_prob_pred.T

Unnamed: 0,5,23,24,33,39,48,60,68,103,104,...,2012,2015,2066,2072,2075,2090,2098,2111,2140,2143
game_date,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,2018-02-11,...,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14,2019-04-14
Home Team,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,Olympic (Women) - Canada,...,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland,Olympic (Women) - Finland
Away Team,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,Olympic (Women) - Olympic Athletes from Russia,...,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States,Olympic (Women) - United States
Period,1,1,1,1,1,1,1,1,1,1,...,4,4,4,4,4,4,4,4,4,4
Clock,1900-01-01 00:19:53,1900-01-01 00:19:16,1900-01-01 00:19:16,1900-01-01 00:18:58,1900-01-01 00:18:51,1900-01-01 00:18:26,1900-01-01 00:18:10,1900-01-01 00:17:56,1900-01-01 00:16:38,1900-01-01 00:16:36,...,1900-01-01 00:05:34,1900-01-01 00:05:23,1900-01-01 00:02:52,1900-01-01 00:02:47,1900-01-01 00:02:43,1900-01-01 00:02:03,1900-01-01 00:01:49,1900-01-01 00:01:15,1900-01-01 00:00:07,1900-01-01 00:00:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
is_goal_n10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
is_goal_n10_naive_pred,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,...,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361,0.0277361
is_goal_n10_linear_pred,0.0159856,0.00786753,0.00918289,0.0116487,0.00747685,0.0250762,0.0227551,0.0152011,0.0145233,0.0152378,...,0.0467222,0.0732697,0.037135,0.0748763,0.0868145,0.0212466,0.114722,0.0942207,0.131859,0.0677592
is_goal_n10_rf_pred,0.08,0.01,0,0.01,0.01,0.01,0.04,0,0,0.01,...,0.01,0.02,0.01,0,0.02,0.03,0.05,0.04,0.88,0.84
