In [73]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, mean_squared_error
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [74]:
# Folder that holds the hackathon CSV files. The BDC_DATA_DIR environment variable
# overrides the original author's machine-specific default so the notebook can run
# elsewhere without editing this cell. os.path.join(..., '') guarantees a trailing
# separator, which the plain string concatenation in the loading cell relies on.
project_dir = os.path.join(
    os.environ.get('BDC_DATA_DIR', '/Users/keltim01/git_repos/TK5/Data/Big-Data-Cup-2021/'),
    '',
)

In [75]:
# Load the two event-level CSV exports that live directly under project_dir.
womens_csv = f'{project_dir}hackathon_womens.csv'
nwhl_csv = f'{project_dir}hackathon_nwhl.csv'
womens = pd.read_csv(womens_csv)
nwhl = pd.read_csv(nwhl_csv)

In [76]:
# Stack the NWHL events under the women's events with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent and produces the same frame.
# NOTE: this rebinds `womens`, so re-running the cell without re-running the
# loading cell stacks the NWHL rows a second time.
womens = pd.concat([womens, nwhl], ignore_index=True)

In [77]:
# Integer game id: the date and both team names are concatenated row-wise into a
# single string, and the category code of that string is used as the id.
game_key = womens.loc[:, ['game_date', 'Home Team', 'Away Team']].sum(axis=1)
womens['game_id'] = game_key.astype('category').cat.codes

# 0/1 indicator columns (int64): acting team is the home team, event is a
# 'Shot', event is a 'Goal'.
womens['is_home'] = (womens['Home Team'] == womens['Team']).astype('int64')
womens['is_shot'] = (womens['Event'] == 'Shot').astype('int64')
womens['is_goal'] = (womens['Event'] == 'Goal').astype('int64')

# Integer category codes for the event type, team and player names.
id_sources = {'event_id': 'Event', 'team_id': 'Team', 'player_id': 'Player'}
for id_col, source_col in id_sources.items():
    womens[id_col] = womens[source_col].astype('category').cat.codes

# Same encoding for the four free-form detail columns ('Detail 1' .. 'Detail 4').
for detail_col in [f'Detail {n}' for n in range(1, 5)]:
    womens[f'{detail_col}_code'] = womens[detail_col].astype('category').cat.codes

# Score difference from the home team's point of view.
womens['goal_diff'] = womens['Home Team Goals'] - womens['Away Team Goals']

# Parse the 'MM:SS' clock and express it in seconds
# (presumably time left in the current period -- confirm against the data dictionary).
womens['Clock'] = pd.to_datetime(womens['Clock'], format='%M:%S')
clock_parts = womens['Clock'].dt
womens['seconds_remaining'] = clock_parts.minute * 60 + clock_parts.second

In [78]:
# Playing-surface extents used when mirroring coordinates and measuring distances
# (presumably a 200 x 85 ft rink -- confirm the units against the data source).
PITCH_LENGTH, PITCH_WIDTH = 200, 85

# Not referenced by any cell visible in this part of the notebook.
action_id = 5

# Target location for the distance-to-goal features: 10 units in from the far end
# of the surface, centred across its width.
GOAL_X, GOAL_Y = PITCH_LENGTH - 10, PITCH_WIDTH / 2

In [79]:
# Offsets of each event's first and second coordinate pair from (GOAL_X, GOAL_Y).
# The second pair is missing for many events, which propagates NaN into the
# "end"/"diff"/"covered" columns.
start_dx = GOAL_X - womens['X Coordinate']
start_dy = (GOAL_Y - womens['Y Coordinate']).abs()
end_dx = GOAL_X - womens['X Coordinate 2']
end_dy = (GOAL_Y - womens['Y Coordinate 2']).abs()

# Euclidean distance from each coordinate pair to the goal position.
womens['start_distance_to_goal'] = np.sqrt(start_dx ** 2 + start_dy ** 2)
womens['end_distance_to_goal'] = np.sqrt(end_dx ** 2 + end_dy ** 2)

# Displacement between the two coordinate pairs and its Euclidean length.
womens['diff_x'] = womens['X Coordinate 2'] - womens['X Coordinate']
womens['diff_y'] = womens['Y Coordinate 2'] - womens['Y Coordinate']
womens['distance_covered'] = np.sqrt(womens['diff_x'] ** 2 + womens['diff_y'] ** 2)

In [80]:
# List the distinct team names present after the two data sets were combined.
womens['Team'].unique()

array(['Olympic (Women) - Canada',
       'Olympic (Women) - Olympic Athletes from Russia',
       'Olympic (Women) - Finland', 'Olympic (Women) - United States',
       'St. Lawrence Saints', 'Clarkson Golden Knights', 'Boston Pride',
       'Minnesota Whitecaps', 'Connecticut Whale', 'Buffalo Beauts',
       'Toronto Six', 'Metropolitan Riveters'], dtype=object)

## Labels

In [81]:
# Size of the label window, counted in events and including the current one.
# An event is labelled positive when a goal falls inside this window.
NR_ACTIONS = 10

# Not read again in this cell; kept because other cells may still reference it.
goals = womens['Event'].str.contains('Goal')

y = pd.concat([womens.loc[:, 'is_goal'], womens.loc[:, 'team_id']], axis=1)
y.columns = ['goal', 'team_id']

# Look ahead within each game only. Shifting the concatenated frame as a whole
# would let the last events of one game "see" the first events of the next game.
by_game = womens.groupby('game_id', sort=False)
source_cols = {'team_id': 'team_id', 'goal': 'is_goal'}
last_in_game = {col: by_game[src].transform('last') for col, src in source_cols.items()}
for i in range(1, NR_ACTIONS):
    for col, src in source_cols.items():
        # Rows with fewer than i events left in their game have no look-ahead
        # value; pad them with the game's final value.
        shifted = by_game[src].shift(-i).fillna(last_in_game[col])
        y[f'{col}+{i}'] = shifted.astype(int)

# scores:   the acting team scores on this event or within the window.
# concedes: the opposing team scores within the window.
# concedes must start empty: seeding it with y['goal'] would label every goal
# event as also conceded by the team that just scored.
scores = y['goal'].astype(bool)
concedes = pd.Series(False, index=y.index)
for i in range(1, NR_ACTIONS):
    goal_ahead = y[f'goal+{i}'].astype(bool)
    same_team = y[f'team_id+{i}'] == y['team_id']
    scores = scores | (goal_ahead & same_team)
    concedes = concedes | (goal_ahead & ~same_team)

# One boolean column per label, indexed like `womens`.
label_scores = scores.to_frame(name='scores')
label_concedes = concedes.to_frame(name='concedes')

In [82]:
# Put both label columns side by side; the two frames share the same index.
df_labels = label_scores.join(label_concedes)

## Features 

In [83]:
# Columns of `womens` used as model inputs, grouped by theme.
features = [
    # game context
    'team_id', 'Period', 'Home Team Skaters', 'Away Team Skaters',
    # raw event coordinates
    'X Coordinate', 'Y Coordinate', 'X Coordinate 2', 'Y Coordinate 2',
    # encoded event description
    'is_home', 'event_id',
    'Detail 1_code', 'Detail 2_code', 'Detail 3_code', 'Detail 4_code',
    # score and clock state
    'goal_diff', 'seconds_remaining',
    # displacement between the two coordinate pairs
    'diff_x', 'diff_y', 'distance_covered',
]

In [84]:
# Show every column now present on `womens` (raw columns plus the engineered ones).
womens.columns

Index(['game_date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2', 'game_id', 'is_home',
       'is_shot', 'is_goal', 'event_id', 'team_id', 'player_id',
       'Detail 1_code', 'Detail 2_code', 'Detail 3_code', 'Detail 4_code',
       'goal_diff', 'seconds_remaining', 'start_distance_to_goal',
       'end_distance_to_goal', 'diff_x', 'diff_y', 'distance_covered'],
      dtype='object')

In [85]:
# Preview the engineered distance and displacement columns.
womens[['start_distance_to_goal','end_distance_to_goal','diff_x','diff_y','distance_covered']]

Unnamed: 0,start_distance_to_goal,end_distance_to_goal,diff_x,diff_y,distance_covered
0,90.001389,,,,
1,101.986519,,,,
2,92.402651,,,,
3,92.402651,,,,
4,46.970736,,,,
...,...,...,...,...,...
50879,182.937831,,,,
50880,182.937831,165.774697,21.0,21.0,29.698485
50881,161.425679,,,,
50882,69.615013,,,,


In [86]:
# NOTE(review): `df_features` is not created in any cell visible in this part of
# the notebook, and the execution counters around here are out of order -- confirm
# the notebook still works after Restart Kernel -> Run All.
#
# For each of the three steps (0, 1, 2), flag whether that step's team id equals
# the team id at step 0. The step-0 flag compares the column with itself.
reference_team = df_features['team_id-0']
for lag in range(3):
    df_features[f'team-{lag}'] = reference_team.eq(df_features[f'team_id-{lag}'])

In [90]:
# Show the columns currently present on `df_features`.
df_features.columns

Index(['team_id-0', 'Period-0', 'Home Team Skaters-0', 'Away Team Skaters-0',
       'X Coordinate-0', 'Y Coordinate-0', 'X Coordinate 2-0',
       'Y Coordinate 2-0', 'is_home-0', 'event_id-0', 'Detail 1_code-0',
       'Detail 2_code-0', 'Detail 3_code-0', 'Detail 4_code-0', 'goal_diff-0',
       'seconds_remaining-0', 'diff_x-0', 'diff_y-0', 'distance_covered-0',
       'team_id-1', 'Period-1', 'Home Team Skaters-1', 'Away Team Skaters-1',
       'X Coordinate-1', 'Y Coordinate-1', 'X Coordinate 2-1',
       'Y Coordinate 2-1', 'is_home-1', 'event_id-1', 'Detail 1_code-1',
       'Detail 2_code-1', 'Detail 3_code-1', 'Detail 4_code-1', 'goal_diff-1',
       'seconds_remaining-1', 'diff_x-1', 'diff_y-1', 'distance_covered-1',
       'team_id-2', 'Period-2', 'Home Team Skaters-2', 'Away Team Skaters-2',
       'X Coordinate-2', 'Y Coordinate-2', 'X Coordinate 2-2',
       'Y Coordinate 2-2', 'is_home-2', 'event_id-2', 'Detail 1_code-2',
       'Detail 2_code-2', 'Detail 3_code-2', 'De

In [87]:
# For every step whose team differs from the step-0 team, mirror both coordinate
# pairs through the centre of the surface (x -> PITCH_LENGTH - x, y -> PITCH_WIDTH - y).
# NOTE: this rewrites the coordinate columns in place, so running the cell a
# second time mirrors the same rows back again.
mirrored_axes = (
    ('X Coordinate', PITCH_LENGTH),
    ('X Coordinate 2', PITCH_LENGTH),
    ('Y Coordinate', PITCH_WIDTH),
    ('Y Coordinate 2', PITCH_WIDTH),
)
for lag in range(3):
    other_team = ~df_features[f'team-{lag}']
    for base_name, extent in mirrored_axes:
        column = f'{base_name}-{lag}'
        df_features.loc[other_team, column] = extent - df_features[column]
    

In [88]:
# Recompute the geometry features for each step from the (possibly mirrored)
# coordinates: distance of both coordinate pairs to the goal position, the
# displacement between the pairs, and the length of that displacement.
for lag in range(3):
    x_start = df_features[f'X Coordinate-{lag}']
    y_start = df_features[f'Y Coordinate-{lag}']
    x_end = df_features[f'X Coordinate 2-{lag}']
    y_end = df_features[f'Y Coordinate 2-{lag}']

    to_goal_start = np.sqrt((GOAL_X - x_start) ** 2 + (GOAL_Y - y_start).abs() ** 2)
    to_goal_end = np.sqrt((GOAL_X - x_end) ** 2 + (GOAL_Y - y_end).abs() ** 2)
    df_features[f'start_distance_to_goal-{lag}'] = to_goal_start
    df_features[f'end_distance_to_goal-{lag}'] = to_goal_end

    delta_x = x_end - x_start
    delta_y = y_end - y_start
    df_features[f'diff_x-{lag}'] = delta_x
    df_features[f'diff_y-{lag}'] = delta_y
    df_features[f'distance_covered-{lag}'] = np.sqrt(delta_x ** 2 + delta_y ** 2)
        

In [91]:
# Change between step 2 and step 0 in start position (x and y) and in the clock.
sequence_deltas = (
    ('xdiff_sequenc_pre', 'X Coordinate'),
    ('ydiff_sequenc_pre', 'Y Coordinate'),
    ('time_sequence_pre', 'seconds_remaining'),
)
for delta_col, base_name in sequence_deltas:
    df_features[delta_col] = df_features[f'{base_name}-0'] - df_features[f'{base_name}-2']