In [53]:
import SeasonData as sdata
import pandas as pd
import ipywidgets

In [54]:
# Load the play-by-play data of the 2022 regular season
data = sdata.NHLData()  # Data accessor from the project-local SeasonData module
data.fetch_regular_season(year=2022)  # Fetch the 2022 season (presumably fills data.regular_season -- confirm in SeasonData)
data_rs = data.regular_season  # Indexed by year and then by game position further down (data.regular_season[2022][0])

Data imported: 1312


In [55]:
def minutes_to_seconds(df: pd.DataFrame, column: str, period_seconds: int = 20 * 60) -> pd.DataFrame:
    """
    Convert an in-period "MM:SS" clock column into seconds elapsed since the start of the game.

    The result is ``minutes * 60 + seconds + period_seconds * (period - 1)``, where ``period``
    is the integer found before the "/" in ``df['currentPeriod']`` (e.g. "2/3" -> 2).

    ``df[column]`` is overwritten in place and the very same DataFrame object is returned.
    No other column is created, modified or removed, so columns that happen to be called
    'minutes', 'seconds' or 'numberPeriod' are left untouched.

    :param df: DataFrame holding ``column`` ("MM:SS" strings) and a 'currentPeriod' column
        ("<number>/<max>" strings)
    :param column: Name of the "MM:SS" column to convert
    :param period_seconds: Offset in seconds added for every period already played
        (default: 20 minutes)
    :return: The same DataFrame, with ``column`` holding integer seconds
    """
    # Split the clock only once; work on local Series instead of scratch columns on the caller's frame
    clock_parts = df[column].str.split(':')
    minutes = clock_parts.str[0].astype(int)
    seconds = clock_parts.str[1].astype(int)

    # 'currentPeriod' looks like "<number>/<max>": keep the number before the slash
    period_number = df['currentPeriod'].str.split('/').str[0].astype(int)

    # Total in seconds, shifted by the periods that came before the current one
    df[column] = minutes * 60 + seconds + period_seconds * (period_number - 1)

    return df

In [56]:
def convert_event_to_dataframe(game_nhl: dict) -> pd.DataFrame:
    """
    Convert the play-by-play data of one NHL game into a clean DataFrame of shots and goals.

    Only events whose 'typeDescKey' is 'shot-on-goal' or 'goal' are kept. The returned frame has
    one row per kept event and the columns, in order: 'idGame', 'periodType', 'currentPeriod',
    'timeInPeriod', 'situationCode', 'eventOwnerTeam', 'typeDescKey', 'iceCoord',
    'shootingPlayer', 'goaliePlayer', 'shotType', 'teamSide', 'emptyGoalNet'.

    Notes:
    - 'timeInPeriod' ends up as the game start time plus the elapsed game-clock time
      (computed by ``minutes_to_seconds``), not as the original "MM:SS" string.
    - 'shootingPlayer' is the shooter for shots and the scorer for goals.
    - 'goaliePlayer' is missing when the event has no 'goalieInNetId'.

    :param game_nhl: Dictionary containing the data of the NHL game; the keys read here are
        'id', 'startTimeUTC', 'plays', 'rosterSpots', 'homeTeam' and 'awayTeam'
    :return: A Pandas DataFrame containing the filtered and flattened shot/goal events
    """

    # ---- Lookup tables -------------------------------------------------------------------
    # Player data; .copy() so the column assignments below do not target a slice
    df_players = pd.DataFrame(game_nhl['rosterSpots'])[['playerId', 'firstName', 'lastName']].copy()

    # Names are dictionaries of spellings: keep the default one
    df_players['firstName'] = df_players['firstName'].apply(lambda name: name['default'])
    df_players['lastName'] = df_players['lastName'].apply(lambda name: name['default'])

    # A player listed twice would duplicate event rows in the merges below and misalign the
    # index-based column assignments at the end of the function
    df_players = df_players.drop_duplicates(subset='playerId')

    # The 'home' and 'away' teams
    home_team = {'teamId': game_nhl['homeTeam']['id'], 'teamName': game_nhl['homeTeam']['name']['default'],
                 'teamSide': 'home'}
    away_team = {'teamId': game_nhl['awayTeam']['id'], 'teamName': game_nhl['awayTeam']['name']['default'],
                 'teamSide': 'away'}
    df_team = pd.DataFrame([home_team, away_team])

    # ---- Event selection -----------------------------------------------------------------
    df_pbp = pd.DataFrame(game_nhl['plays'])
    new_df = df_pbp[['periodDescriptor', 'timeInPeriod', 'situationCode', 'typeDescKey', 'details']].copy()

    # Insert the game ID as the first column
    new_df.insert(0, 'idGame', game_nhl['id'])

    # Keep only events of type 'shot-on-goal' or 'goal'; the fresh 0..n-1 index is what lets
    # the helper frames built below line up with new_df row by row
    new_df = new_df[new_df['typeDescKey'].isin(['shot-on-goal', 'goal'])].reset_index(drop=True)

    # ---- Period information --------------------------------------------------------------
    # Decompose the period descriptor dictionaries into columns
    df_period = pd.DataFrame(new_df['periodDescriptor'].tolist())
    new_df = new_df.drop(columns='periodDescriptor')

    # 'currentPeriod' is rendered as "<number>/<maxRegulationPeriods>", e.g. "1/3"
    new_df.insert(1, 'periodType', df_period['periodType'])
    new_df.insert(2, 'currentPeriod',
                  df_period['number'].astype(str) + '/' + df_period['maxRegulationPeriods'].astype(str))

    # ---- Event time ----------------------------------------------------------------------
    # Convert the period clock to seconds since puck drop and add it to the game start time
    start_time = pd.to_datetime(game_nhl['startTimeUTC'])
    new_df = minutes_to_seconds(new_df, 'timeInPeriod')
    new_df['timeInPeriod'] = start_time + pd.to_timedelta(new_df['timeInPeriod'], unit='s')

    # ---- Event details -------------------------------------------------------------------
    # Decompose the details dictionaries into columns
    df_details = pd.DataFrame(new_df['details'].tolist())
    new_df = new_df.drop(columns='details')

    # A column only exists if at least one event carries the key (e.g. no goal in the game
    # means no 'scoringPlayerId'); create the missing ones so the lookups below cannot fail
    for optional in ('xCoord', 'yCoord', 'shotType', 'shootingPlayerId', 'scoringPlayerId', 'goalieInNetId'):
        if optional not in df_details.columns:
            df_details[optional] = float('nan')

    # Combine the x and y coordinates into a tuple
    df_details['iceCoord'] = df_details[['xCoord', 'yCoord']].apply(tuple, axis=1)

    # Merge 'shooting' and 'scoring' player into one column: take the shooter and fall back to
    # the scorer. Coalescing (instead of adding the two IDs) can never produce a summed,
    # meaningless ID; 0 remains the placeholder when neither is known.
    df_details['shootingPlayerId'] = (
        df_details['shootingPlayerId'].fillna(df_details['scoringPlayerId']).fillna(0).astype(int)
    )
    df_details['goalieInNetId'] = df_details['goalieInNetId'].astype('Int64')  # Int64: handling NaN values

    # Add the shooter names by merging IDs ('right' keeps every event row)
    df_details = pd.merge(df_players, df_details, left_on='playerId', right_on='shootingPlayerId',
                          how='right').drop(columns=['playerId'])
    df_details['shootingPlayer'] = df_details['firstName'] + ' ' + df_details['lastName']
    df_details = df_details.drop(columns=['firstName', 'lastName'])

    # Add the goalie names by merging IDs ('right' keeps the events without a goalie)
    df_details = pd.merge(df_players, df_details, left_on='playerId', right_on='goalieInNetId',
                          how='right').drop(columns=['playerId'])
    df_details['goaliePlayer'] = df_details['firstName'] + ' ' + df_details['lastName']
    df_details = df_details.drop(columns=['firstName', 'lastName'])

    # Add team data by merging IDs
    df_details = pd.merge(df_team, df_details, left_on='teamId', right_on='eventOwnerTeamId', how='right')

    # ---- Assemble the result -------------------------------------------------------------
    new_df['iceCoord'] = df_details['iceCoord']
    new_df['shootingPlayer'] = df_details['shootingPlayer']
    new_df['goaliePlayer'] = df_details['goaliePlayer']
    new_df['shotType'] = df_details['shotType']
    new_df.insert(5, 'eventOwnerTeam', df_details['teamName'])
    new_df['teamSide'] = df_details['teamSide']

    # Empty-net flag of the net being shot at: read the 4th character of 'situationCode' for an
    # away-team event and the 1st character otherwise; '0' means empty (True), '1' means not
    # empty (False).
    # NOTE(review): presumably the code is laid out as away goalie / away skaters /
    # home skaters / home goalie -- confirm against the NHL API documentation.
    situation = new_df['situationCode']
    goalie_flag = situation.str[3].where(new_df['teamSide'] == 'away', situation.str[0])
    new_df['emptyGoalNet'] = goalie_flag.map({'0': True, '1': False})

    # TODO: advantage / disadvantage
    # Get split information 0 | 65 | 1

    return new_df

In [57]:
# Test on one game (avoids converting the whole season)

# Fetch a separate copy of the 2022 season for this test
new_data = sdata.NHLData()
new_data.fetch_regular_season(year=2022)

# Read from the object fetched just above (the cell used to read from `data`,
# leaving `new_data` unused and making the cell depend on an earlier cell's state)
data_2022 = new_data.regular_season[2022]
game_one_2022 = data_2022[0]
df_game_one_2022 = convert_event_to_dataframe(game_one_2022)

# Show every row and column of the converted game
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df_game_one_2022)

Data imported: 1312


Unnamed: 0,idGame,periodType,currentPeriod,timeInPeriod,situationCode,eventOwnerTeam,typeDescKey,iceCoord,shootingPlayer,goaliePlayer,shotType,teamSide,emptyGoalNet
0,2022020001,REG,1/3,2022-10-07 18:00:23+00:00,1551,Sharks,shot-on-goal,"(44, 8)",Timo Meier,Juuse Saros,wrist,away,False
1,2022020001,REG,1/3,2022-10-07 18:00:59+00:00,1551,Predators,shot-on-goal,"(-33, 8)",Mattias Ekholm,James Reimer,slap,home,False
2,2022020001,REG,1/3,2022-10-07 18:01:01+00:00,1551,Predators,goal,"(-74, -5)",Kiefer Sherwood,James Reimer,wrist,home,False
3,2022020001,REG,1/3,2022-10-07 18:01:12+00:00,1551,Predators,shot-on-goal,"(-81, 15)",Colton Sissons,James Reimer,wrist,home,False
4,2022020001,REG,1/3,2022-10-07 18:02:42+00:00,1551,Sharks,shot-on-goal,"(72, 2)",Steven Lorentz,Juuse Saros,tip-in,away,False
5,2022020001,REG,1/3,2022-10-07 18:04:09+00:00,1551,Predators,shot-on-goal,"(-40, 22)",Dante Fabbro,James Reimer,snap,home,False
6,2022020001,REG,1/3,2022-10-07 18:04:19+00:00,1551,Sharks,shot-on-goal,"(49, 12)",Oskar Lindblom,Juuse Saros,wrist,away,False
7,2022020001,REG,1/3,2022-10-07 18:04:44+00:00,1551,Sharks,shot-on-goal,"(46, 18)",Kevin Labanc,Juuse Saros,slap,away,False
8,2022020001,REG,1/3,2022-10-07 18:05:45+00:00,1551,Predators,shot-on-goal,"(-40, 13)",Matt Duchene,James Reimer,wrist,home,False
9,2022020001,REG,1/3,2022-10-07 18:06:23+00:00,1551,Predators,shot-on-goal,"(-71, 21)",Eeli Tolvanen,James Reimer,wrist,home,False


In [58]:
# Conversion of every game of every fetched regular season -- currently disabled.
# The loop is wrapped in a string literal, so nothing is executed: the cell only echoes the string.
# If re-enabled, it would replace each game stored in `data_rs` by its converted DataFrame, in place.
"""
#Loop on all years in regular season
for year in data_rs:
    current_year = data_rs[year]  #Get year
    sdata_year = len(current_year)  #Get the number of games in the year

    #Loop on all game in the year
    for game in range(sdata_year):
        current_year[game] = convert_event_to_dataframe(game_nhl=current_year[game])  #Processing game data

data_rs[2022][0].head()
"""

'\n#Loop on all years in regular season\nfor year in data_rs:\n    current_year = data_rs[year]  #Get year\n    sdata_year = len(current_year)  #Get the number of games in the year\n\n    #Loop on all game in the year\n    for game in range(sdata_year):\n        current_year[game] = convert_event_to_dataframe(game_nhl=current_year[game])  #Processing game data\n\ndata_rs[2022][0].head()\n'