In [40]:
import pandas as pd
import hockey_scraper
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [41]:
season_data = pd.read_csv('data/2022_2023/nhl_pbp_20222023.csv', low_memory=False)

In [42]:
flip_str_lambda = lambda input_str:''.join([input_str[2], input_str[1], input_str[0]])
season_data["Strength"] = np.where(season_data["Ev_Team"] == season_data["Home_Team"], season_data['Strength'], season_data["Strength"].apply(flip_str_lambda))
# Above changes the order of strength to make it the event team first rather than home team. In original data home team is allways listed first.

season_data["Score_For"] = np.where(season_data["Ev_Team"] == season_data["Home_Team"], season_data["Home_Score"], season_data["Away_Score"])
season_data["Score_Against"] = np.where(season_data["Ev_Team"] == season_data["Away_Team"], season_data["Home_Score"], season_data["Away_Score"])

In [43]:
strengths_to_encode = ['5x5', '5x4', '5x3', '5x6', '6x4', '6x3', '6x5', '4x4', '4x3', '4x5', '4x6' '3x3', '3x4', '3x5', '3x6']

season_data = season_data[season_data['Strength'].isin(strengths_to_encode)]

def encode_col(inputStr, season_data):
    one_hot_encoded = pd.get_dummies(season_data[inputStr], prefix=inputStr)
    one_hot_encoded = one_hot_encoded.astype(int)
    season_data = pd.concat([season_data, one_hot_encoded], axis=1)
    season_data = season_data.drop(columns= inputStr)
    return season_data


cols_to_enc = ['Strength', 'Type']
for i in cols_to_enc:
    season_data = encode_col(i, season_data)

encoded_cols = ['Strength_3x4','Strength_3x5','Strength_4x3','Strength_4x4','Strength_4x5','Strength_5x3','Strength_5x4','Strength_5x5',
                'Strength_5x6', 'Strength_6x5','Type_BACKHAND','Type_DEFLECTED','Type_SLAP SHOT','Type_SNAP SHOT','Type_TIP-IN','Type_WRAP-AROUND','Type_WRIST SHOT']

In [44]:
shot_data = season_data[season_data["Event"].isin(["SHOT", "GOAL", "MISS"])]

The above code finds the team for which the shot, miss, or goal occurs, and creates an attribute to show which team is which

In [45]:
away_players_on_ice = np.array(['awayPlayer1', 'awayPlayer2', 'awayPlayer3', 'awayPlayer4', 'awayPlayer5', 'awayPlayer6', 'Away_Goalie'])
home_players_on_ice = np.array(['homePlayer1', 'homePlayer2', 'homePlayer3', 'homePlayer4', 'homePlayer5', 'homePlayer6', 'Home_Goalie'])

ev_team_players = np.array(['evPlayer1', 'evPlayer2','evPlayer3', 'evPlayer4','evPlayer5', 'evPlayer6', 'evGoalie'])
against_team_players = np.array(['agPlayer1', 'agPlayer2', 'agPlayer3', 'agPlayer4', 'agPlayer5', 'agPlayer6', 'agGoalie'])

for i in range(0, 7):
    shot_data[ev_team_players[i]] = np.where(shot_data['Ev_Team'] == shot_data['Home_Team'], shot_data[home_players_on_ice[i]], shot_data[away_players_on_ice[i]])
    shot_data[against_team_players[i]] = np.where(shot_data['Ev_Team'] == shot_data['Home_Team'], shot_data[away_players_on_ice[i]], shot_data[home_players_on_ice[i]])

# Convert numpy arrays to lists
ev_team_players_list = ev_team_players.tolist()
against_team_players_list = against_team_players.tolist()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-

Above, changes the home and away team attributes to the event team and against team attributes. This will be important, to analyze the run of play while a player is on the ice. We can see the chances and goals for and against a players team when they are on the ice. The players involved will not be used in the training of the expected goals model, but will be used to create metrics from this model.

In [46]:
shot_data = shot_data[shot_data["Ev_Zone"] == "Off"]

# Define model columns including the converted lists
model_Cols = ["Event", "Period", "Seconds_Elapsed", "xC", "yC", "Score_For", "Score_Against", 'p1_name'] + encoded_cols + ev_team_players_list + against_team_players_list


smaller_df = shot_data[model_Cols]
larger_df = shot_data[model_Cols]

smaller_df = smaller_df.dropna(subset=['xC', 'yC'])

larger_df = larger_df[larger_df['xC'].isna() | larger_df['yC'].isna()]

larger_df['Event'] = np.where(larger_df['Event'] == 'GOAL', 1, 0)
smaller_df['Event'] = np.where(smaller_df['Event'] == 'GOAL', 1, 0)

larger_df.to_csv('nullLocationShots.csv', index = False)

Here we conduct some data cleaning. We will remove all shots that are not in the oponents zone, or are missing crutial data to determine the expected goals, due to a data quality issue. We must then assume these errors are random and not systemic since we are removing them from training sets. We have conducted some data analysis that showed, the preportion of goals is similiar in the misentered data as it is in other data. Note we also encode the events into 0 and 1, where 1 is a goal and 0 is not a goal. We will not consider the difference between a shot and a miss.

In [47]:
smaller_df["xC"] = np.abs(smaller_df['xC'])
# smaller_df["yC"] = smaller_df["yC"] + 42
#max width is 42, since the ice is 85 feet wide at its maximum. We have made all values positive for ease of minupulation

smaller_df['Distance'] = np.sqrt((np.power((90 - smaller_df['xC']), 2) + np.power((smaller_df['yC']), 2)))

smaller_df['Angle'] = np.arctan(np.abs(smaller_df['yC'])/(smaller_df['xC']))

smaller_df["Angle"] = smaller_df["Angle"].fillna(0)
smaller_df["Distance"] = smaller_df["Distance"].fillna(0)

smaller_df = smaller_df.drop(columns= ['xC', 'yC'])

Here we determine the distance and angle of each shot, this will be an important attribute for the model. The abseloute value of the x cordinate is to standardize the data to consider a single zone, this is due to the assumption we have above. Please note that shots from the non offensive zone will be counted as 0, and empty net goals are not considered. 

Here we encode the strength (5v5, 5v4, ect.) and the type of shot. These are both variables to be used; however, they are not ordinal, and so this is done using one hot encoding, creating an indicator variable for each type and strength. 

In [48]:
smaller_df.to_csv('Tempcleaned.csv', index = False)

Finally this is printed to a csv file, so that the final data can be inspected

Next I will return to the original dataframe, so that I can find the percentile and rank things like hits, assists and blocked shots to add to a graphic 

In [49]:
basicDataCols = ['Event', 'p1_name']

hit_data = season_data[season_data["Event"].eq("HIT")]
hit_data = hit_data[basicDataCols] #Here we shorten the dataframe to only include the players and the event for only hits. Then We can find the number of the event.

hit_data.to_csv('hit_data.csv', index=False) #print to csv to inspect and use in later analysis

#repeat process for blocked shots

block_data = season_data[season_data["Event"].eq("BLOCK")]
block_data = block_data[basicDataCols]

block_data.to_csv('block_data.csv', index=False)

Next we will address assists. To do this we first isolate goals, then fine the second and third players if they exist.

In [50]:
assistDataCols = ['Event', 'p2_name', 'p3_name'] + encoded_cols

assist_data = season_data[season_data["Event"].eq("GOAL")]
assist_data = assist_data.dropna(subset=['p2_name'])
assist_data = assist_data[assistDataCols]



assist_data.to_csv('assist_data.csv', index =False)


Finally, we will analyze takeaways and giveaways. The quality of this data and subjectivity has been called into quesiton; however with reliable data this could be a useful metric to analyze. Here while we agknowledge the limitations in the data, if the data errors are random, the trends in data still could make for a usefull metric. So we will track takaways and giveaways along with the zone they occur in. That said, when not at even strength giveaways may be a part of a teams system, and so these will be removed from the dataset, only even strength giveaways and takeaways will be considered.

In [51]:
takeGiveCols = ['Event', 'p1_name', 'Ev_Zone']

tg_data = season_data[season_data["Event"].isin(["TAKE", "GIVE"])]
tg_data.loc[:, 'Event'] = np.where(tg_data['Event'] == 'TAKE', 1, 0)  #make takeaways 1 and giveaways 0, then we can find the ratio and count of both very easily.

tg_data = tg_data[takeGiveCols] #only include essential coloumns

tg_data.to_csv('tg_data.csv', index=False)
