# data_preprocess.ipynb
Reads NHL shot data from MoneyPuck's 2015-2023 shots CSV ([zip download](https://peter-tanner.com/moneypuck/downloads/shots_2015-2023.zip)) and transforms it into the input file for our expected-goals (xG) modelling.

In [10]:
# Importing Packages
import pandas as pd
import numpy as np

In [11]:
# Read the MoneyPuck shot CSV into a DataFrame.
# Source archive: https://peter-tanner.com/moneypuck/downloads/shots_2015-2023.zip
# TODO: Maybe download separate CSVs by season?
# TODO: Check that this CSV is here? Otherwise download it using requests library
SHOTS_CSV_PATH = 'data/shots_2015-2023.csv'
shot_data = pd.read_csv(SHOTS_CSV_PATH)

In [12]:
# TODO: Revisit column list, reference notes/MoneyPuck_Shot_Data_Dictionary.csv for data dictionary

# Keep only the columns listed below; the order here is the column order of the
# resulting frame.
selected_columns = [
    'isPlayoffGame',
    'timeSinceLastEvent',
    'period',
    'location',
    'event',
    'homeTeamGoals',
    'awayTeamGoals',
    'shotAngleAdjusted',
    'shotDistance',
    'shotType',
    'shotOnEmptyNet',
    'shotRebound',
    'homeSkatersOnIce',
    'awaySkatersOnIce',
    'isHomeTeam',
]
shot_data = shot_data.loc[:, selected_columns]

In [13]:
# TODO: Discuss whether or not we want to do transformations (or take moneypuck transformed features)
     # X and Y coordinates + angle

# TODO: Discuss whether or not we include MoneyPuck features
     # Rebound
     # Empty net
     # Off the rush
     # Speed from last event
     # Last event type + coordinates
     # Last event shot angle + distance
     # Last event category
     # Home empty net
     # Away empty net
     # Penalty length/time left?
     # Shooter handedness
     # TODO: Goalie handedness? Not listed here though so we would have to collect this and merge it into the dataset
     # Shooting team # of forwards or D on the ice?
     # Shift length of the shooter, and TOI for the shooter
     # Off Wing?
     # Time in game?

In [14]:
# Replace the categorical shotType column with integer 0/1 indicator columns
# named shotType_<value>, appended after the remaining columns.
one_hot_encoded = pd.get_dummies(shot_data['shotType'], prefix='shotType', dtype=int)
shot_data = pd.concat([shot_data.drop(columns='shotType'), one_hot_encoded], axis=1)

In [15]:
# TODO: Double check these 3 sets of transformations.

# Re-express the home/away columns from the shooting team's point of view.
# These are computed column-wise (boolean masks + np.where) rather than with
# DataFrame.apply(axis=1), which calls a Python lambda once per row.

# True where the shooting team is the home team. astype(bool) reproduces a plain
# truthiness test on isHomeTeam: any non-zero value counts as "home".
shooter_is_home = shot_data['isHomeTeam'].astype(bool)
shot_location = shot_data['location']

# Counting skaters on the shooting team and defending team instead of home/away
shot_data['shootingSkatersOnIce'] = np.where(
    shooter_is_home, shot_data['homeSkatersOnIce'], shot_data['awaySkatersOnIce']
)
shot_data['defendingSkatersOnIce'] = np.where(
    shooter_is_home, shot_data['awaySkatersOnIce'], shot_data['homeSkatersOnIce']
)

# Same transformation as above for score
shot_data['shootingTeamGoals'] = np.where(
    shooter_is_home, shot_data['homeTeamGoals'], shot_data['awayTeamGoals']
)
shot_data['defendingTeamGoals'] = np.where(
    shooter_is_home, shot_data['awayTeamGoals'], shot_data['homeTeamGoals']
)

# Setting 0/1 flag for if shooting team is in offensive zone: a home shooter in
# the away zone, or an away shooter in the home zone.
shot_data['inOffensiveZone'] = (
    (shooter_is_home & (shot_location == 'AWAYZONE'))
    | (~shooter_is_home & (shot_location == 'HOMEZONE'))
).astype(int)

# Setting 0/1 flag for if the shot was taken in the neutral zone
shot_data['inNeutralZone'] = (shot_location == 'Neu. Zone').astype(int)

# Drop old cols now that their shooter-relative replacements exist
shot_data = shot_data.drop(
    columns=['homeSkatersOnIce', 'awaySkatersOnIce', 'homeTeamGoals', 'awayTeamGoals', 'location']
)


In [16]:
# Encoding goals as 1 and every other event as 0 (case-insensitive match on 'goal').
# The .str accessor passes missing values through instead of raising, so a row
# with no event is encoded as 0 rather than failing on a non-string .lower() call.
shot_data['isGoal'] = shot_data['event'].str.lower().eq('goal').astype(int)

# The raw event label is no longer needed once the target column exists
shot_data = shot_data.drop(columns='event')

In [17]:
# Sanity check: list the columns of the processed frame.
print(shot_data.columns)
# TODO: DEFL vs TIP shot type? Investigate
# (both appear as separate one-hot columns, shotType_DEFL and shotType_TIP)

Index(['isPlayoffGame', 'timeSinceLastEvent', 'period', 'shotAngleAdjusted',
       'shotDistance', 'shotOnEmptyNet', 'shotRebound', 'isHomeTeam',
       'shotType_BACK', 'shotType_DEFL', 'shotType_SLAP', 'shotType_SNAP',
       'shotType_TIP', 'shotType_WRAP', 'shotType_WRIST',
       'shootingSkatersOnIce', 'defendingSkatersOnIce', 'shootingTeamGoals',
       'defendingTeamGoals', 'inOffensiveZone', 'inNeutralZone', 'isGoal'],
      dtype='object')


In [18]:
# Write the processed frame to CSV without the index column.
XG_INPUT_CSV_PATH = 'data/xg_shot_input_2015-2023.csv'
shot_data.to_csv(XG_INPUT_CSV_PATH, index=False)