In [None]:
import pandas as pd
import datetime as dt
import time
import requests
import numpy as np

import joblib

import warnings
warnings.filterwarnings('ignore')

Getting the ID for the game of the day. When this notebook was created, only one game was being played per day because of the playoffs. Making this portion of the notebook more robust is a priority moving forward on the project.

In [3]:
# Fetch the schedule payload. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
schedule = requests.get('https://statsapi.web.nhl.com/api/v1/schedule', timeout=30)
# Fail here on an HTTP error instead of with a confusing JSON/KeyError in a later cell.
schedule.raise_for_status()

In [4]:
# Decode the response body. From here on `schedule` is the parsed JSON payload,
# not the Response object, so this cell cannot be re-run without re-fetching.
schedule = schedule.json()

In [5]:
# Relative live-feed path of the first game listed under the first date in the
# schedule payload; any further games that day are ignored.
game_link = schedule['dates'][0]['games'][0]['link']

In [6]:
# Away team name for that same first game of the first date.
away_team = schedule['dates'][0]['games'][0]['teams']['away']['team']['name']

In [7]:
# Home team name for that same first game of the first date.
home_team = schedule['dates'][0]['games'][0]['teams']['home']['team']['name']

In [8]:
# Host prefix that the relative `link` paths from the schedule are appended to.
base_url = 'https://statsapi.web.nhl.com'

In [9]:
game_link

'/api/v1/game/2020030321/feed/live'

In [14]:
# Pull the live feed for the selected game. The timeout bounds the wait on a
# stalled connection, and HTTP errors are raised before JSON decoding.
game_pull = requests.get(f'{base_url}{game_link}', timeout=30)
game_pull.raise_for_status()
game_pull = game_pull.json()

In [41]:
# Empty placeholder frames.
# NOTE(review): the same names are re-created in a later cell and are shadowed
# by locals inside the scrape functions, so these globals are never filled.
modeling_df = pd.DataFrame()
game_events = pd.DataFrame()

In [16]:
# function that updates a data frame to be the current list of all game events.
def update_game_events(current_pull):
    """Flatten the play-by-play of a live-feed payload into one row per play.

    Parameters
    ----------
    current_pull : dict
        Parsed live-feed JSON. Reads ``liveData.plays.allPlays`` and, when
        there is at least one play, ``gameData.teams.home.name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``event``, ``team``, ``x_coordinate``, ``y_coordinate``,
        ``period``, ``period_time`` and ``home_team``, one row per play in
        feed order. ``team`` and the coordinates are NaN for plays that do
        not carry them. An empty play list yields an empty frame that still
        has these columns.

    Raises
    ------
    KeyError
        If a play lacks ``result.event``, ``about.period`` or
        ``about.periodTime``, or the payload lacks the keys listed above.
    """
    columns = ['event', 'team', 'x_coordinate', 'y_coordinate',
               'period', 'period_time', 'home_team']

    def _optional(play, outer_key, inner_key):
        """Return play[outer_key][inner_key], or NaN when either level is absent."""
        try:
            return play[outer_key][inner_key]
        except (KeyError, TypeError):
            # Only "field not present" is treated as missing data; anything
            # else is a real error and should surface.
            return np.nan

    gameplay = current_pull['liveData']['plays']['allPlays']
    if not gameplay:
        return pd.DataFrame(columns=columns)

    # Same for every play, so look it up once instead of once per row.
    home_team = current_pull['gameData']['teams']['home']['name']

    # Collect plain records and build the frame once; concatenating a new
    # one-row frame per play is quadratic in the number of plays.
    records = [
        {
            'event': play['result']['event'],
            'team': _optional(play, 'team', 'name'),
            'x_coordinate': _optional(play, 'coordinates', 'x'),
            'y_coordinate': _optional(play, 'coordinates', 'y'),
            'period': play['about']['period'],
            'period_time': play['about']['periodTime'],
            'home_team': home_team,
        }
        for play in gameplay
    ]
    return pd.DataFrame(records, columns=columns)


In [17]:
#function that calculates the ice tilt using a data frame of all game events.
def ice_tilt(game_df):
    """Sum the x-coordinates of territorial events, mirroring period 2.

    Only rows whose ``event`` is ``Faceoff``, ``Shot``, ``Hit`` or
    ``Missed Shot`` contribute. x-coordinates from period 2 are multiplied
    by -1 before summing (presumably because the teams change ends for that
    period -- confirm). Missing coordinates are skipped by the sum.

    Parameters
    ----------
    game_df : pandas.DataFrame
        Needs ``event``, ``period`` and ``x_coordinate`` columns. It is not
        modified.

    Returns
    -------
    scalar
        The signed sum of the selected x-coordinates.
    """
    tilt_events = game_df[game_df['event'].isin(['Faceoff', 'Shot', 'Hit', 'Missed Shot'])]
    x_positions = tilt_events['x_coordinate']
    # Build the mirrored series explicitly. The former chained assignment
    # (frame[col][mask] = ...) wrote through a temporary, which raises
    # SettingWithCopyWarning and is a silent no-op under copy-on-write.
    signed_x = x_positions.where(tilt_events['period'] != 2, x_positions * -1)
    return signed_x.sum()

In [20]:
#function that updates the box score to the current live box scores at any point in the game.
def update_box_scores(current_pull):
    """Build a two-row box-score frame (away first, then home) from a live pull.

    Parameters
    ----------
    current_pull : dict
        Parsed live-feed JSON; reads
        ``liveData.boxscore.teams.<away|home>.teamStats.teamSkaterStats``.

    Returns
    -------
    pandas.DataFrame
        Row 0 is the away team and row 1 the home team. Columns are the ten
        skater stats followed by ``home_status`` (``'away'`` or ``'home'``).
        Values are taken as-is from the payload, with no type conversion.
    """
    # (output column, key in the payload), in output column order.
    stat_columns = (
        ('shots', 'shots'),
        ('blocked_shots', 'blocked'),
        ('penalty_minutes', 'pim'),
        ('power_play_percentage', 'powerPlayPercentage'),
        ('power_play_goals', 'powerPlayGoals'),
        ('power_play_chances', 'powerPlayOpportunities'),
        ('faceoff_percent', 'faceOffWinPercentage'),
        ('takeaways', 'takeaways'),
        ('giveaways', 'giveaways'),
        ('hits', 'hits'),
    )
    teams = current_pull['liveData']['boxscore']['teams']

    rows = []
    for side in ('away', 'home'):
        skater_stats = teams[side]['teamStats']['teamSkaterStats']
        row = {column: skater_stats[api_key] for column, api_key in stat_columns}
        row['home_status'] = side
        rows.append(row)
    return pd.DataFrame(rows)

In [21]:
# function that updates the differences df, that was used for modeling.
def update_differences_df(df):
    """Return a one-row frame of second-row minus first-row stat differences.

    With the frame produced by ``update_box_scores`` (row 0 away, row 1 home)
    this is home minus away for the first ten columns.

    Side effect: ``power_play_percentage`` and ``faceoff_percent`` in the
    passed-in frame are converted to numeric in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Two-row box-score frame whose first ten columns are the stats.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) holding the ten differences.
    """
    for percent_column in ('power_play_percentage', 'faceoff_percent'):
        df[percent_column] = pd.to_numeric(df[percent_column])

    first_row = df.iloc[0]
    second_row = df.iloc[1]
    # Slice each full row after extracting it, so the subtraction happens on
    # the rows exactly as pandas returns them.
    stat_gap = second_row.iloc[:10] - first_row.iloc[:10]
    return pd.DataFrame(stat_gap.to_dict(), index=[0])
    

In [22]:
#converts period time from being a string to being an integer.
def time_to_int(column):
    """Convert clock strings such as ``"19:21"`` to integers by dropping the colon.

    The result is the digits read as one number (``"19:21"`` -> ``1921``,
    ``"00:22"`` -> ``22``), not a count of seconds.

    Parameters
    ----------
    column : iterable of str
        Clock strings, e.g. a pandas Series of ``MM:SS`` values.

    Returns
    -------
    list of int
        One integer per input value, in input order.

    Raises
    ------
    ValueError
        If a value contains characters other than digits and colons.
    """
    # Strip the separator and parse directly. The earlier character-by-character
    # version only worked for strings with exactly four digits.
    return [int(clock.replace(':', '')) for clock in column]
        

In [35]:
# converts period time to full game time in order to filter for game time.
def period_time_to_full_time(df):
    """Shift per-period clock values onto one running game clock.

    Rows from period 1 are kept unchanged, period 2 gets ``+2000`` and
    period 3 gets ``+4000`` added to ``period_time``. Rows from any other
    period are not included in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``period`` column and a numeric ``period_time`` column.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Period 1 rows, then period 2, then period 3, with a fresh 0..n-1 index.
    """
    pieces = []
    for period_number, clock_offset in ((1, 0), (2, 2000), (3, 4000)):
        period_rows = df[df['period'] == period_number]
        if clock_offset:
            # assign() returns a new frame, so no write lands on a slice of df.
            period_rows = period_rows.assign(
                period_time=period_rows['period_time'] + clock_offset
            )
        pieces.append(period_rows)
    return pd.concat(pieces, axis=0, ignore_index=True)
    

In [24]:
# gets the data frame difference based on minutes into a game, used for the minute over minute modeling.
def diff_of_minutes_model_stats(df):
    """Return home-minus-away counts for a fixed set of event types.

    A row counts for the home side when ``team == home_team`` and for the
    away side otherwise; rows with no team recorded are not counted.

    The output always has exactly the nine tracked event columns. Event
    types outside that set are ignored, so the schema stays the same from
    one pull to the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Game events with ``event``, ``team`` and ``home_team`` columns.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with columns ``Goal``, ``Shot``,
        ``Blocked Shot``, ``Faceoff``, ``Takeaway``, ``Giveaway``, ``Hit``,
        ``Missed Shot`` and ``Penalty``.
    """
    tracked_events = ('Goal', 'Shot', 'Blocked Shot', 'Faceoff', 'Takeaway',
                      'Giveaway', 'Hit', 'Missed Shot', 'Penalty')

    def _tally(rows):
        """Count each tracked event among rows that have a team recorded."""
        counts = rows.loc[rows['team'].notna(), 'event'].value_counts()
        return {name: int(counts.get(name, 0)) for name in tracked_events}

    is_home = df['team'] == df['home_team']
    home_counts = _tally(df[is_home])
    away_counts = _tally(df[~is_home])

    gap = {name: home_counts[name] - away_counts[name] for name in tracked_events}
    return pd.DataFrame(gap, index=[0])

The model frames that will be updated during the live scrape, then exported and used.

In [179]:
# Start from empty frames for the live scrape.
# NOTE(review): the scrape functions below assign to locals with these same
# names, so these module-level frames are not the ones that get populated.
modeling_df = pd.DataFrame()
game_events = pd.DataFrame()
differences_df = pd.DataFrame()

In [27]:
# Load the saved classifier that game_scrape_v3 calls `.predict` on.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
rfc = joblib.load('./models/random_forest_classifier.pkl')

In [55]:
def game_scrape_v3(game_link):
    """Pull one live-feed snapshot and derive the three modeling frames.

    Uses the module-level ``rfc`` model and the helper functions defined
    earlier in this notebook.

    Parameters
    ----------
    game_link : str
        Relative feed path, e.g. ``'/api/v1/game/<id>/feed/live'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(differences_df, modeling_df, minutes_diff_df)``:
        the one-row box-score differences with ``ice_tilt``, snapshot
        metadata and ``rfc_prediction``; the two-row box score with snapshot
        metadata; and the one-row event-count differences.

    Raises
    ------
    requests.RequestException
        On a timeout, connection failure or HTTP error status.
    IndexError
        If the feed has no plays yet.
    """
    base_url = 'https://statsapi.web.nhl.com'
    # Bound the wait so one stalled request cannot freeze a polling loop, and
    # fail on HTTP errors before trying to decode the body.
    response = requests.get(f'{base_url}{game_link}', timeout=30)
    response.raise_for_status()
    current_pull = response.json()

    modeling_df = update_box_scores(current_pull)
    differences_df = update_differences_df(modeling_df)
    # Events lacking a team or coordinates are dropped before further use.
    game_events = update_game_events(current_pull).dropna()
    # The tilt is stored with its sign flipped.
    differences_df['ice_tilt'] = ice_tilt(game_events) * -1

    # Snapshot metadata: feed timestamp plus the clock of the latest play.
    time_stamp = current_pull['metaData']['timeStamp']
    latest_play = current_pull['liveData']['plays']['allPlays'][-1]['about']
    meta_columns = ['time_stamp', 'period', 'period_time']

    differences_df['time_stamp'] = time_stamp
    differences_df['period'] = latest_play['period']
    differences_df['period_time'] = latest_play['periodTime']
    # Predict on everything except the metadata columns.
    differences_df['rfc_prediction'] = rfc.predict(differences_df.drop(columns=meta_columns))
    #differences_df['rfc_prob_no'] = rfc.predict_proba(differences_df.drop(columns=meta_columns))[0]

    modeling_df['time_stamp'] = time_stamp
    modeling_df['period'] = latest_play['period']
    modeling_df['period_time'] = latest_play['periodTime']

    game_events['period_time'] = time_to_int(game_events['period_time'])
    game_events = period_time_to_full_time(game_events)
    minutes_diff_df = diff_of_minutes_model_stats(game_events)

    return differences_df, modeling_df, minutes_diff_df

In [222]:

# Load the saved model stored as log_reg.pkl.
# NOTE(review): not referenced again in the cells visible here; as with any
# joblib/pickle file, only load it from a trusted source.
log_reg = joblib.load('./models/log_reg.pkl')

In [None]:
# Pull the live feed of one specific game by id.
# Uses the imported `requests` module (the previous `request` name was
# undefined), with a timeout and an explicit HTTP status check.
game = requests.get('https://statsapi.web.nhl.com/api/v1/game/2020030236/feed/live', timeout=30)
game.raise_for_status()
game = game.json()

In [59]:
def get_game_status(game_link):
    """Return the game's abstract state string from the live feed.

    Parameters
    ----------
    game_link : str
        Relative feed path, e.g. ``'/api/v1/game/<id>/feed/live'``.

    Returns
    -------
    str
        ``gameData.status.abstractGameState`` from the payload; this notebook
        has observed ``'Live'`` and ``'Final'``.

    Raises
    ------
    requests.RequestException
        On a timeout, connection failure or HTTP error status.
    """
    base_url = 'https://statsapi.web.nhl.com'
    # This runs inside polling loops, so never wait on the server indefinitely.
    response = requests.get(f'{base_url}{game_link}', timeout=30)
    response.raise_for_status()
    return response.json()['gameData']['status']['abstractGameState']
    
    

In [240]:
get_game_status(game_link)

'Live'

First test of the live scrape. I was only able to capture the third period of the game that night because of the complications I ran into setting everything up.

In [None]:
# This is an old game scrape version, but it was used in some of these early live pulls that
# served as a proof of concept.
def game_scrape(game_link):
    """Pull one live-feed snapshot and return the difference and box-score frames.

    Unlike ``game_scrape_v3`` this version makes no model prediction,
    computes no event-count differences, and does not drop incomplete events
    before computing the ice tilt.

    Parameters
    ----------
    game_link : str
        Relative feed path, e.g. ``'/api/v1/game/<id>/feed/live'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(differences_df, modeling_df)``: the one-row box-score differences
        with ``ice_tilt`` and snapshot metadata, and the two-row box score
        with snapshot metadata.

    Raises
    ------
    requests.RequestException
        On a timeout, connection failure or HTTP error status.
    IndexError
        If the feed has no plays yet.
    """
    base_url = 'https://statsapi.web.nhl.com'
    # Bound the wait so a stalled request cannot freeze the polling loop, and
    # fail on HTTP errors before trying to decode the body.
    response = requests.get(f'{base_url}{game_link}', timeout=30)
    response.raise_for_status()
    current_pull = response.json()

    modeling_df = update_box_scores(current_pull)
    differences_df = update_differences_df(modeling_df)
    game_events = update_game_events(current_pull)
    # The tilt is stored with its sign flipped.
    differences_df['ice_tilt'] = ice_tilt(game_events) * -1

    # Snapshot metadata shared by both frames: feed timestamp plus the clock
    # of the latest play.
    time_stamp = current_pull['metaData']['timeStamp']
    latest_play = current_pull['liveData']['plays']['allPlays'][-1]['about']

    differences_df['time_stamp'] = time_stamp
    differences_df['period'] = latest_play['period']
    differences_df['period_time'] = latest_play['periodTime']
    modeling_df['time_stamp'] = time_stamp
    modeling_df['period'] = latest_play['period']
    modeling_df['period_time'] = latest_play['periodTime']

    return differences_df, modeling_df
    

In [261]:
# Poll the feed roughly once a minute while the game reports 'Live',
# accumulating one box-score snapshot and one differences row per pull.
game_status = get_game_status(game_link)
test_game_2_models = pd.DataFrame()
test_game_2_diffs = pd.DataFrame()
while game_status == 'Live':
    differences, models = game_scrape(game_link)
    # Append this pull to the running frames (two box-score rows, one diff row).
    test_game_2_models = pd.concat([test_game_2_models, models], axis=0, ignore_index=True)
    test_game_2_diffs = pd.concat([test_game_2_diffs, differences], axis=0, ignore_index=True)
    # Refresh the status before sleeping; the loop ends once it is no longer 'Live'.
    game_status = get_game_status(game_link)
    time.sleep(60)
    

In [262]:
game_status = get_game_status(game_link)

In [260]:
game_status

'Live'

In [263]:
test_game_2_models.head()

Unnamed: 0,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,home_status,time_stamp,period,period_time
0,22,7,0,100.0,1.0,1.0,50.0,1,1,31,away,20210611_030529,3,00:22
1,16,24,2,0.0,0.0,0.0,50.0,16,12,37,home,20210611_030529,3,00:22
2,22,7,0,100.0,1.0,1.0,48.8,1,1,31,away,20210611_030627,3,01:21
3,17,25,2,0.0,0.0,0.0,51.2,16,13,37,home,20210611_030627,3,01:21
4,22,7,0,100.0,1.0,1.0,48.8,1,1,31,away,20210611_030727,3,01:21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,23,28,2,0.0,0.0,1.0,50.0,20,15,45,home,20210611_033938,3,18:55
70,32,17,2,100.0,1.0,1.0,50.0,4,1,41,away,20210611_034038,3,19:21
71,23,28,2,0.0,0.0,1.0,50.0,20,15,45,home,20210611_034038,3,19:21
72,33,17,2,100.0,1.0,1.0,51.8,4,1,41,away,20210611_034136,3,20:00


In [264]:
test_game_2_diffs

Unnamed: 0,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,ice_tilt,time_stamp,period,period_time
0,-6,17,2,-100.0,-1.0,-1.0,0.0,15,11,6,-1343.0,20210611_030529,3,00:22
1,-5,18,2,-100.0,-1.0,-1.0,2.4,15,12,6,-1283.0,20210611_030627,3,01:21
2,-5,18,2,-100.0,-1.0,-1.0,2.4,15,12,6,-1214.0,20210611_030727,3,01:21
3,-5,18,2,-100.0,-1.0,-1.0,4.8,16,12,6,-1168.0,20210611_030825,3,02:58
4,-5,19,2,-100.0,-1.0,-1.0,4.8,16,12,6,-1226.0,20210611_030923,3,03:49
5,-5,19,2,-100.0,-1.0,-1.0,4.8,16,12,4,-1016.0,20210611_031023,3,04:41
6,-6,18,2,-100.0,-1.0,-1.0,4.8,15,12,2,-1263.0,20210611_031132,3,05:28
7,-7,18,2,-100.0,-1.0,-1.0,4.8,15,13,1,-1364.0,20210611_031230,3,05:33
8,-7,18,2,-100.0,-1.0,-1.0,4.6,15,13,1,-1364.0,20210611_031330,3,06:12
9,-10,18,2,-100.0,-1.0,-1.0,4.6,15,13,1,-1644.0,20210611_031428,3,06:37


In [265]:
# Batch-predict over every collected snapshot after removing the metadata
# columns (each label listed once).
rfc.predict(test_game_2_diffs.drop(columns=['time_stamp','period','period_time']))

array(['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no'], dtype=object)

In [267]:
# Persist the collected snapshots. The row index is written as well, so the
# files gain a leading unnamed column when read back.
# NOTE(review): to_csv does not create directories -- ./live_scrapes must exist.
test_game_2_diffs.to_csv('./live_scrapes/avs_vegas_third_diffs.csv')
test_game_2_models.to_csv('./live_scrapes/avs_vegas_third_box_scores.csv')

In [75]:
game_status = get_game_status(game_link)
game_status

'Live'

This is the one true live scrape I was able to do. It exports the main data frame at every pull number, and allows for the live prediction to slowly build out a graph using the several data frames. I realize in hindsight that this process could be greatly improved and automated. 

In [79]:
# Poll the feed roughly once a minute while the game reports 'Live'. Every pull
# appends to three running frames and re-exports all of them under a new,
# numbered file name, so each numbered file holds every pull up to that point.
game_status = get_game_status(game_link)
game_box = pd.DataFrame()
game_diff = pd.DataFrame()
game_minute_diffs = pd.DataFrame()
# Pull number used as the file-name suffix; starts at 1.
counter = 1
while game_status == 'Live':
    differences, models, minute_diffs = game_scrape_v3(game_link)
    game_box = pd.concat([game_box, models], axis=0, ignore_index=True)
    game_diff = pd.concat([game_diff, differences], axis=0, ignore_index=True)
    game_minute_diffs = pd.concat([game_minute_diffs, minute_diffs], axis=0, ignore_index=True)
    # The status is refreshed before the export, so the last pull is still written.
    game_status = get_game_status(game_link)
    # NOTE(review): to_csv does not create directories -- ./live_scrapes must exist.
    game_minute_diffs.to_csv(f'./live_scrapes/islanders_bruins_v3_minute_diffs_{counter}.csv')
    game_diff.to_csv(f'./live_scrapes/islanders_bruins_v3_diffs_{counter}.csv')
    game_box.to_csv(f'./live_scrapes/islanders_bruins_v3_box_{counter}.csv')
    counter += 1
    time.sleep(60)

In [80]:
game_status

'Final'

The actual live predicting is happening in the model results notebook, to allow for the live scrape and the model predictions to happen at the same time. Eventually this will be set up to allow for all the models to predict on data as it is brought in and exported. 

In [None]:
#Live