In [1]:
import json
import pandas as pd

from pathlib import Path
from datetime import datetime

In [2]:
def find_prior_stoppage(data, idx):
    """
    Collect the run of stoppage events that immediately precedes a challenge.

    Walks backwards from ``data[idx - 1]`` for as long as the events are of
    type STOP, PENALTY or GOAL, then also includes the first earlier event
    that is not one of those types.

    Parameters
    ----------
    data : list of dict
        Play events; each must expose ``event['result']['eventTypeId']``.
    idx : int
        Position of the challenge event inside ``data``.

    Returns
    -------
    list of dict
        The collected events in their original (chronological) order. Empty
        when the challenge is the first event in ``data``.
    """
    stoppage_types = ('STOP', 'PENALTY', 'GOAL')
    count = idx - 1
    events_prior_challenge = []

    # ``count >= 0`` matters: a negative index would silently wrap around to
    # the end of ``data`` and pull in events that happen after the challenge.
    while count >= 0 and data[count]['result']['eventTypeId'] in stoppage_types:
        events_prior_challenge.append(data[count])
        count -= 1

    # also keep the first non-stoppage event before the run, when one exists
    if count >= 0:
        events_prior_challenge.append(data[count])

    events_prior_challenge.reverse()
    return events_prior_challenge

def find_next_faceoff(data, idx):
    """
    Collect every event from the challenge up to and including the next faceoff.

    Parameters
    ----------
    data : list of dict
        Play events; each must expose ``event['result']['eventTypeId']``.
    idx : int
        Position of the challenge event inside ``data``.

    Returns
    -------
    list of dict
        ``data[idx]`` and the events after it, ending with the first FACEOFF
        event. When no FACEOFF follows, every remaining event is returned.
        Empty when ``idx`` is past the end of ``data``.
    """
    events_since_challenge = []
    for event in data[idx:]:
        events_since_challenge.append(event)
        # the faceoff itself is part of the result, so stop only after adding it;
        # this also keeps the final event of the feed, which used to be dropped
        if event['result']['eventTypeId'] == 'FACEOFF':
            break

    return events_since_challenge

In [3]:
def find_coaches_challenge(file):
    """
    Extract every challenge (with its surrounding events) and every goal
    from one live-feed JSON file.

    Parameters
    ----------
    file : pathlib.Path
        Path to the JSON file. ``file.stem`` is used as the game id.

    Returns
    -------
    dict or None
        ``None`` when the feed has no plays or no CHALLENGE event. Otherwise a
        dict with the keys:

        * ``"game_data"`` - game id, game type, team names and final score
        * ``"data"`` - one ``{"challenge_id", "events"}`` entry per challenge,
          numbered from 1 in order of appearance
        * ``"goals"`` - one ``{"goals": event}`` entry per GOAL event
    """
    with open(file, 'r', encoding='utf-8') as f:
        feed = json.load(f)

    meta = feed['gameData']
    data = feed['liveData']['plays']['allPlays']

    # a feed without plays has no challenge, and no last event to read the
    # final score from (``data[-1]`` would raise IndexError)
    if not data:
        return None

    game_meta_data = {
        "game_id": file.stem,
        "game_type": meta['game']['type'],
        "home_team": meta['teams']['home']['name'],
        "away_team": meta['teams']['away']['name'],
        # goal totals recorded on the last play of the feed
        "final_score": data[-1]['about']['goals']
    }

    all_goals = []
    all_events = []
    challenge_id = 1
    for idx, event in enumerate(data):
        event_type = event['result']['eventTypeId']

        if event_type == 'CHALLENGE':
            # events leading up to the challenge plus everything through the next faceoff
            events = find_prior_stoppage(data, idx) + find_next_faceoff(data, idx)
            all_events.append({
                "challenge_id": challenge_id,
                "events": events
            })
            challenge_id += 1

        if event_type == 'GOAL':
            all_goals.append({"goals": event})

    # games without a challenge are reported as None so callers can skip them
    if not all_events:
        return None

    return {"game_data": game_meta_data, "data": all_events, "goals": all_goals}

In [4]:
def assign_challenged_teams(df):
    """
    Label every row of a challenge with the team that initiated it.

    Adds two columns, filled per (game_id, data.challenge_id) group:

    * ``challenge_team`` - the ``team.name`` of the group's CHALLENGE event,
      or "League Challenge" when that value is not a string.
    * ``challenge_home_away`` - 'home', 'away' or 'league'.

    Rule 38.3 (League Initiated Challenge): in the final minute of the 3rd
    period and at any point in overtime, the NHL Situation Room initiates the
    review itself, so those challenges carry no team.

    The frame is modified in place and also returned.
    """
    for game_id in df['game_id'].unique().tolist():
        game_rows = df[df['game_id'] == game_id].copy()

        for challenge_id in game_rows['data.challenge_id'].unique().tolist():
            challenge_rows = game_rows[game_rows['data.challenge_id'] == challenge_id]
            is_challenge = challenge_rows['result.eventTypeId'] == 'CHALLENGE'
            team = challenge_rows[is_challenge]['team.name'].iloc[0]

            # every row of this challenge in the full frame
            target = (df['game_id'] == game_id) & (df['data.challenge_id'] == challenge_id)

            if challenge_rows['home_team'].iloc[0] == team:
                df.loc[target, 'challenge_home_away'] = 'home'
            if challenge_rows['away_team'].iloc[0] == team:
                df.loc[target, 'challenge_home_away'] = 'away'

            # a non-string team name is treated as a league-initiated review
            if type(team) is not str:
                df.loc[target, 'challenge_team'] = 'League Challenge'
                df.loc[target, 'challenge_home_away'] = 'league'
            else:
                df.loc[target, 'challenge_team'] = team

    return df

In [5]:
def challenge_outcome(df, failed_overrides=None):
    """
    Classify each challenge as 'failed', 'automatic' or 'success'.

    Rule 38.8 (Results of an Unsuccessful Coach's Challenge): a challenge that
    does not overturn the call on the ice costs the challenging team a minor
    penalty (2:00) for delaying the game, or (4:00) if it already failed one.

    Per (game_id, data.challenge_id) group, in order of precedence:

    1. a PENALTY event together with a description containing "delay"
       -> 'failed'
    2. ``challenge_team`` equal to "League Challenge" -> 'automatic'
    3. anything else -> 'success'; ``score_changed`` is True when the home
       or away goal count is not constant across the group's events.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows; must already contain ``challenge_team``.
    failed_overrides : iterable of (game_id, challenge_id), optional
        Challenges to force to 'failed' regardless of the rules above, for
        known cases where a failed challenge carries no penalty in the data.

    Returns
    -------
    pandas.DataFrame
        ``df`` (modified in place) with ``challenge_outcome`` and a boolean
        ``score_changed`` column.
    """
    # collected separately so the column always exists, even when no
    # challenge changed the score
    changed = pd.Series(False, index=df.index)

    for game_id in df['game_id'].unique().tolist():
        in_game = df['game_id'] == game_id
        for challenge_id in df.loc[in_game, 'data.challenge_id'].unique().tolist():
            rows = in_game & (df['data.challenge_id'] == challenge_id)
            event_df = df[rows]

            # missing descriptions are skipped instead of breaking the substring test
            descriptions = [
                str(text).lower()
                for text in event_df['result.description'].dropna().unique()
            ]
            has_penalty = 'PENALTY' in event_df['result.eventTypeId'].tolist()

            # if there was a penalty for delay of game, then challenge failed
            if has_penalty and any('delay' in text for text in descriptions):
                outcome = 'failed'

            # automatic challenge not counting as success/fail
            elif event_df['challenge_team'].iloc[0] == 'League Challenge':
                outcome = 'automatic'

            else:
                outcome = 'success'
                home_goals = event_df['about.goals.home']
                away_goals = event_df['about.goals.away']
                # check if score changed
                if (home_goals.min() != home_goals.max()
                        or away_goals.min() != away_goals.max()):
                    changed = changed | rows

            df.loc[rows, 'challenge_outcome'] = outcome

    # Explicit corrections. The previous hard-coded correction reused the loop
    # variable after the loop ended (so it targeted whichever game happened to
    # be last) and compared the id to the string '1'.
    for game_id, challenge_id in (failed_overrides or ()):
        target = (df['game_id'] == game_id) & (df['data.challenge_id'] == challenge_id)
        df.loc[target, 'challenge_outcome'] = 'failed'

    df['score_changed'] = changed

    return df

In [6]:
def score_diff(df):
    """
    Returns the goal difference from the challenging team's point of view.

    ``df`` is a single row (anything indexable by column name). The result is
    the challenging team's goals minus the opponent's goals: negative when
    the challenging team trails, positive when it leads. When the challenging
    team is neither the home nor the away team, None is returned.
    """
    challenger = df['challenge_team']

    if challenger == df['home_team']:
        # home challenged
        own_goals, opponent_goals = 'about.goals.home', 'about.goals.away'
    elif challenger == df['away_team']:
        # away challenged
        own_goals, opponent_goals = 'about.goals.away', 'about.goals.home'
    else:
        return None

    return df[own_goals] - df[opponent_goals]

        
def get_game_seconds(row):
    """
    input: pair [period, time] where time is a "MM:SS" string
    returns the total seconds elapsed in the game at the time of the event,
    counting 20 minutes for every period before the current one
    """
    period, time = row

    clock = time.split(':')
    elapsed_in_period = int(clock[0]) * 60 + int(clock[1])

    # periods before the first contribute nothing
    completed_periods = period - 1 if period > 1 else 0

    return elapsed_in_period + completed_periods * 20 * 60
    

In [7]:
def challenge_reason(df):
    """
    Label each challenge with the reason given for it.

    Coaches need to provide the exact reason for a challenge when one is
    initiated. Per (game_id, data.challenge_id) group the event descriptions
    are searched for 'off-side' and 'interference' (case-sensitive); these
    are the only 2 labeled reasons within the data. When both appear,
    'goalie interference' wins. Groups with neither get 'unknown'.

    Returns ``df`` (modified in place) with a ``challenge_reason`` column.
    """
    for game_id in df['game_id'].unique().tolist():
        in_game = df['game_id'] == game_id
        for challenge_id in df.loc[in_game, 'data.challenge_id'].unique().tolist():
            rows = in_game & (df['data.challenge_id'] == challenge_id)

            # keep only real strings: a missing description (NaN/None) would
            # make the substring test raise TypeError
            descriptions = [
                text for text in df.loc[rows, 'result.description'].tolist()
                if isinstance(text, str)
            ]

            reason = None
            if any('off-side' in text for text in descriptions):
                reason = 'off-side'
            if any('interference' in text for text in descriptions):
                reason = 'goalie interference'

            if reason is not None:
                df.loc[rows, 'challenge_reason'] = reason

    # the column only exists if at least one group matched above
    if 'challenge_reason' in df.columns:
        df['challenge_reason'] = df['challenge_reason'].fillna('unknown')
    else:
        df['challenge_reason'] = 'unknown'
    return df

def manual_overwrite(df, manual_path='../data/external/manual_overwrite.csv'):
    """
    Replaces missing/inaccurate information with data manually
    collected from news reports or video reviews.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with ``game_id``, ``data.challenge_id``,
        ``challenge_outcome`` and ``challenge_reason`` columns.
    manual_path : str or path-like, optional
        CSV with the columns ``game_id``, ``data.challenge_id``, ``outcome``
        and ``challenge_reason``. Defaults to the project's override file.

    Returns
    -------
    pandas.DataFrame
        ``df`` (modified in place). For every override row that matches a
        challenge in ``df``, all rows of that challenge receive the override's
        outcome and reason. Override rows that match nothing, and blank
        override cells, are ignored.
    """
    # game ids are compared as strings; column 0 is kept as str as before and
    # ``game_id`` is named explicitly so the intent does not depend on position
    manual = pd.read_csv(manual_path, dtype={0: str, 'game_id': str})

    overrides = (
        ('outcome', 'challenge_outcome'),
        ('challenge_reason', 'challenge_reason'),
    )
    for _, row in manual.iterrows():
        target = (
            (df['game_id'] == row['game_id'])
            & (df['data.challenge_id'] == row['data.challenge_id'])
        )
        # an override for a challenge that is not in this dataset is skipped
        # rather than raising IndexError
        if not target.any():
            continue

        for source_col, target_col in overrides:
            value = row[source_col]
            # a blank cell means "no correction", not "erase the value"
            if pd.notna(value):
                df.loc[target, target_col] = value

    return df

def goal_during_penalty(df1, df2, penalty_seconds=120):
    """
    Checks to see if a goal was scored during the bench minor given
    as a result of a failed coaches challenge.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Challenge event rows with ``game_id``, ``data.challenge_id``,
        ``result.eventTypeId``, ``result.description`` and ``game_seconds``.
    df2 : pandas.DataFrame
        Goal rows with ``game_id``, ``game_seconds``,
        ``goals.result.strength.code`` and ``goals.result.gameWinningGoal``.
    penalty_seconds : int, optional
        Length of the penalty window in seconds (default 120, a 2:00 minor).

    Returns
    -------
    pandas.DataFrame
        ``df1`` (modified in place) with two boolean columns set per challenge:
        ``penalty_goal`` - the first goal strictly inside the penalty window
        is a power-play goal ('PPG'); ``penalty_goal_gwg`` - that goal is
        also flagged as the game-winning goal.
    """
    # collected separately so both columns always exist, even when no
    # challenge led to a power-play goal
    ppg = pd.Series(False, index=df1.index)
    ppg_gwg = pd.Series(False, index=df1.index)

    for game_id in df1['game_id'].unique().tolist():
        goals = df2[df2['game_id'] == game_id]
        in_game = df1['game_id'] == game_id

        for challenge_id in df1.loc[in_game, 'data.challenge_id'].unique().tolist():
            rows = in_game & (df1['data.challenge_id'] == challenge_id)
            event_df = df1[rows]

            # the delay-of-game penalty row marks the start of the window;
            # astype(str) keeps missing descriptions from breaking the match
            is_penalty = event_df['result.eventTypeId'] == 'PENALTY'
            mentions_delay = (
                event_df['result.description']
                .astype(str)
                .str.lower()
                .str.contains('delay', regex=False)
            )
            delay_penalties = event_df[is_penalty & mentions_delay]
            if delay_penalties.empty:
                continue

            penalty_start = delay_penalties['game_seconds'].iloc[0]
            penalty_end = penalty_start + penalty_seconds

            in_window = goals[
                (goals['game_seconds'] > penalty_start)
                & (goals['game_seconds'] < penalty_end)
            ]
            if in_window.empty:
                continue

            # only the first goal listed in the window is considered
            first_goal = in_window.iloc[0]
            if first_goal['goals.result.strength.code'] == 'PPG':
                ppg = ppg | rows
                # explicit comparison: the flag may be missing (NaN) rather than False
                if first_goal['goals.result.gameWinningGoal'] == True:  # noqa: E712
                    ppg_gwg = ppg_gwg | rows

    df1['penalty_goal'] = ppg
    df1['penalty_goal_gwg'] = ppg_gwg
    return df1


In [8]:
# every live-feed JSON file of the 2020-2021 season
game_files = Path('../data/raw/live-feed/20202021/').glob('*.json')

all_challenges = []
all_goals = []
count = 0  # number of games that contained at least one challenge
for game in game_files:
    challenges = find_coaches_challenge(game)

    # games without a challenge come back as None and are skipped
    if challenges is None:
        continue

    all_challenges.append(challenges)
    count += 1

    # same record without the challenge events: game metadata plus goals
    goals = {key: value for key, value in challenges.items() if key != 'data'}
    all_goals.append(goals)

In [9]:
# event dataset: one row per event surrounding a challenge
df = pd.json_normalize(
    all_challenges,
    record_path=['data', 'events'],
    meta=[['game_data'], ['data', 'challenge_id']]
)

# expand the per-game metadata dict, then the final-score dict, into columns
meta_cols = list(df['game_data'][0])
# assumes the score dict expands to (away, home) in that order - TODO confirm
score_cols = ['away_final_score', 'home_final_score']

df[meta_cols] = df['game_data'].apply(pd.Series)
df[score_cols] = df['final_score'].apply(pd.Series)
df = df.drop(['game_data', 'final_score'], axis=1)

# labelling steps, in dependency order
df = (
    df
    .pipe(assign_challenged_teams)
    .pipe(challenge_outcome)
    .pipe(challenge_reason)
    .pipe(manual_overwrite)
)

df['score_diff'] = df.apply(score_diff, axis=1)
df['game_seconds'] = df[['about.period', 'about.periodTime']].apply(get_game_seconds, axis=1)
event_df = df.copy()

# goal dataset: one row per goal in every game that had a challenge
df = pd.json_normalize(
    all_goals,
    record_path=['goals'],
    meta=[['game_data']]
)

# expand the per-game metadata dict into columns
meta_cols = list(df['game_data'][0])
df[meta_cols] = df['game_data'].apply(pd.Series)

# drop the raw metadata and the columns not used downstream
unused_cols = ['game_data', 'final_score', 'goals.players', 'goals.result.secondaryType']
df = df.drop(unused_cols, axis=1)

df['game_seconds'] = df[['goals.about.period', 'goals.about.periodTime']].apply(get_game_seconds, axis=1)
goal_df = df.copy()

# combined: flag challenges whose penalty led to a power-play goal
df = goal_during_penalty(event_df, goal_df)

# save
# league-initiated reviews are set aside; only team challenges are exported
league = df[df['challenge_team']=='League Challenge']
# keep one row per team challenge (the CHALLENGE event itself); the explicit
# copy makes the column assignment below write to an independent frame
# instead of a slice of the combined data
df = df[(df['challenge_team'] != 'League Challenge')
        & (df['result.eventTypeId'] == 'CHALLENGE')].copy()

# add timestamp
df['last_updated'] = datetime.now()

# add teams with no challenges
teams = pd.read_csv('../data/external/icons/icons.csv')['espn_team_name'].tolist()
data_teams = df['team.name'].unique().tolist()
no_challenges = [x for x in teams if x not in data_teams]

# DataFrame.append was removed in pandas 2.0; build the placeholder rows
# once and concatenate them in a single step
if no_challenges:
    df = pd.concat(
        [df, pd.DataFrame({'challenge_team': no_challenges})],
        ignore_index=True
    )

df.to_csv('../data/clean/challenges_clean.csv', index=False)

In [10]:
# view a sample of the data (CBJ): identifying columns for Columbus challenges
cols = ['game_id', 'data.challenge_id', 'about.dateTime', 'away_team', 'home_team']
df.loc[df['challenge_team'] == 'Columbus Blue Jackets', cols]

Unnamed: 0,game_id,data.challenge_id,about.dateTime,away_team,home_team
19,2020020190,1,2021-02-07T21:40:20Z,Carolina Hurricanes,Columbus Blue Jackets
28,2020020255,1,2021-02-16T00:18:57Z,Columbus Blue Jackets,Carolina Hurricanes
29,2020020255,2,2021-02-16T02:28:01Z,Columbus Blue Jackets,Carolina Hurricanes
39,2020020385,1,2021-03-07T01:11:12Z,Columbus Blue Jackets,Dallas Stars
59,2020020544,1,2021-03-28T21:01:13Z,Columbus Blue Jackets,Detroit Red Wings
68,2020020630,1,2021-04-09T00:40:33Z,Tampa Bay Lightning,Columbus Blue Jackets
71,2020020643,1,2021-04-11T00:08:28Z,Chicago Blackhawks,Columbus Blue Jackets
81,2020020707,1,2021-04-20T00:28:41Z,Columbus Blue Jackets,Florida Panthers
