In [1]:
import pandas as pd
import datetime as dt
import time
import requests
import numpy as np

import joblib

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by the
# cells below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
def time_to_int(column):
    """Convert clock strings such as ``"01:03"`` into integers such as ``103``.

    Every ``":"`` is removed from a value and the remaining text is parsed as a
    single base-10 integer, so ``"19:07"`` becomes ``1907``. The result is the
    concatenated clock digits, not a number of seconds.

    Values are passed through ``str()`` first, so a column that has already
    been converted to integers is returned unchanged instead of raising.

    Parameters
    ----------
    column : iterable
        Clock values, e.g. a pandas Series of ``"MM:SS"`` strings.

    Returns
    -------
    list of int
        One integer per input value, in input order.

    Raises
    ------
    ValueError
        If what is left after removing ``":"`` cannot be parsed by ``int()``
        (for example an empty string or ``"nan"``).
    """
    # Strip the separator and parse in one step; unlike a fixed four-digit
    # collector this also handles clocks such as "1:23" or "100:00".
    return [int(str(value).replace(":", "")) for value in column]

In [3]:
def period_time_to_full_time(df, period_offset=2000):
    """Turn per-period clock values into one clock that runs across periods 1-3.

    Rows are regrouped as all period-1 rows, then period-2, then period-3
    (original order kept inside each period). ``period_offset`` is added to
    period-2 values and twice ``period_offset`` to period-3 values. Rows whose
    ``period`` is not 1, 2 or 3 are not included in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``period`` column and a numeric ``period_time`` column.
    period_offset : int, default 2000
        Amount added per completed period. The default matches the integer
        clock form produced by ``time_to_int`` (``"20:00"`` -> ``2000``).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; ``df`` itself is not modified.
    """
    shifted_periods = []
    for period_number in (1, 2, 3):
        # Explicit copy: the offset is written to an independent frame rather
        # than to a filtered slice of ``df`` (no chained assignment).
        period_rows = df[df['period'] == period_number].copy()
        offset = (period_number - 1) * period_offset
        if offset:
            period_rows['period_time'] = period_rows['period_time'] + offset
        shifted_periods.append(period_rows)
    return pd.concat(shifted_periods, axis=0, ignore_index=True)
    

In [4]:
# Load the scraped box-score differences and discard the saved index column
# and the scrape timestamp.
box_score_differences = (
    pd.read_csv('./live_scrapes/islanders_bruins_v3_diffs_137.csv')
    .drop(columns=['Unnamed: 0', 'time_stamp'])
)
# Convert the clock strings to integers, then shift periods 2 and 3 so the
# clock runs across the whole game.
box_score_differences = period_time_to_full_time(
    box_score_differences.assign(
        period_time=time_to_int(box_score_differences['period_time'])
    )
)
box_score_differences

Unnamed: 0,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,ice_tilt,period,period_time,rfc_prediction
0,0,0,0,0.0,0.0,0.0,100.0,0,0,0,-69.0,1,8,yes
1,0,0,0,0.0,0.0,0.0,100.0,0,0,0,-152.0,1,126,yes
2,0,1,0,0.0,0.0,0.0,100.0,1,0,-1,-154.0,1,211,yes
3,2,0,0,0.0,0.0,0.0,100.0,1,0,-2,-274.0,1,259,yes
4,2,0,0,0.0,0.0,0.0,0.0,1,0,-2,-385.0,1,259,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,-1,-4,2,0.0,0.0,-2.0,-22.8,2,6,-1,-674.0,3,5859,no
133,0,-5,2,50.0,1.0,-1.0,-22.8,2,6,-1,-677.0,3,5907,yes
134,0,-5,2,0.0,0.0,-1.0,-22.8,2,6,-1,-677.0,3,5907,no
135,0,-5,2,0.0,0.0,-1.0,-22.8,2,6,-1,-677.0,3,5937,no


In [5]:
# Load the per-scrape event-count differences, attach the game clock from
# box_score_differences, drop the saved index column and remove duplicate rows.
# The clock column is matched on index labels.
# NOTE(review): box_score_differences was re-indexed 0..n-1 after being
# regrouped by period, so this assumes both CSVs list scrapes in the same order.
minute_diffs = (
    pd.read_csv('./live_scrapes/islanders_bruins_v3_minute_diffs_137.csv')
    .assign(period_time=box_score_differences['period_time'])
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)
minute_diffs

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,period_time
0,0,0,0,2,0,0,0,0,0,8
1,0,0,1,2,1,0,-1,0,0,126
2,0,0,0,2,1,0,-2,0,0,211
3,0,2,0,1,1,0,-3,0,0,259
4,0,2,0,0,1,0,-3,1,0,259
...,...,...,...,...,...,...,...,...,...,...
131,-2,0,-4,-10,2,6,-1,3,1,5831
132,-2,1,-5,-10,2,6,-1,3,1,5859
133,-1,1,-5,-10,2,6,-1,3,1,5907
135,-1,1,-5,-11,2,6,-1,3,1,5937


In [6]:
# Score each scrape row with the model saved for its 200-unit clock window.
# A row belongs to window `counter` when counter - 200 < period_time <= counter.
#
# Results are written back by index label, so every prediction stays attached
# to the row it was computed from; rows that fall in no window (period_time of
# 0 or above 6000) are left as NaN instead of shifting later results or
# triggering a length-mismatch error.
# Bookkeeping columns are excluded from the features so the cell can be re-run
# after 'predictions' / 'probability_win' have been added.
non_feature_columns = ['period_time', 'predictions', 'probability_win']
feature_columns = [name for name in minute_diffs.columns if name not in non_feature_columns]

predicts = pd.Series(index=minute_diffs.index, dtype=object)
probabilities = pd.Series(index=minute_diffs.index, dtype=float)

counter = 200
while counter <= 6000:
    in_window = (minute_diffs['period_time'] <= counter) & (minute_diffs['period_time'] > (counter - 200))
    temp_df = minute_diffs.loc[in_window, feature_columns]
    # Skip empty windows explicitly; any other failure (missing model file,
    # feature mismatch) is allowed to surface instead of being swallowed.
    if not temp_df.empty:
        # joblib.load unpickles the file: only load model files you trust.
        model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
        predicts.loc[temp_df.index] = model.predict(temp_df)
        # Column 1 of predict_proba is used as the win probability.
        # NOTE(review): presumably the model's second class is 'yes' - confirm via model.classes_.
        probabilities.loc[temp_df.index] = model.predict_proba(temp_df)[:, 1]
    counter += 200

minute_diffs['predictions'] = predicts
minute_diffs['probability_win'] = probabilities


In [7]:
# Remove exact duplicate rows from the box-score differences.
box_score_differences = box_score_differences.drop_duplicates()

In [8]:
# Display minute_diffs with the 'predictions' and 'probability_win' columns added by the scoring cell.
minute_diffs

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,period_time,predictions,probability_win
0,0,0,0,2,0,0,0,0,0,8,no,0.261900
1,0,0,1,2,1,0,-1,0,0,126,no,0.307217
2,0,0,0,2,1,0,-2,0,0,211,no,0.258354
3,0,2,0,1,1,0,-3,0,0,259,no,0.290854
4,0,2,0,0,1,0,-3,1,0,259,no,0.275854
...,...,...,...,...,...,...,...,...,...,...,...,...
131,-2,0,-4,-10,2,6,-1,3,1,5831,no,0.000000
132,-2,1,-5,-10,2,6,-1,3,1,5859,no,0.000000
133,-1,1,-5,-10,2,6,-1,3,1,5907,no,0.000000
135,-1,1,-5,-11,2,6,-1,3,1,5937,no,0.045455


In [9]:
# Count how many rows received each predicted label.
minute_diffs['predictions'].value_counts()

no    96
Name: predictions, dtype: int64

Artificial scrape for a more exciting game

In [65]:
# Download the live feed for game 2020030113 and keep its play-by-play list.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error here rather than as a JSON/KeyError on the next line.
game_stats = requests.get('https://statsapi.web.nhl.com/api/v1/game/2020030113/feed/live', timeout=30)
game_stats.raise_for_status()
test_1 = game_stats.json()['liveData']['plays']['allPlays']

In [66]:
# Flatten the play-by-play list into one row per play.
# The response body is parsed once (not once per play) and the rows are
# collected in a list so the frame is built in a single step.
home_team = game_stats.json()['gameData']['teams']['home']['name']
game_id = 2020030113

event_records = []
# The first two and the last two plays are skipped, matching the original
# loop bounds range(2, len(test_1) - 2).
for play in test_1[2:-2]:
    # Plays without a team or without coordinates get NaN for those fields;
    # such rows are removed by the dropna() cell further down.
    coordinates = play.get('coordinates', {})
    event_records.append({
        'event': play['result']['event'],
        'team': play.get('team', {}).get('name', np.nan),
        'x_coordinate': coordinates.get('x', np.nan),
        'y_coordinate': coordinates.get('y', np.nan),
        'period': play['about']['period'],
        'period_time': play['about']['periodTime'],
        'game_id': game_id,
        'home_team': home_team,
    })

game_events = pd.DataFrame(
    event_records,
    columns=['event', 'team', 'x_coordinate', 'y_coordinate',
             'period', 'period_time', 'game_id', 'home_team'],
)
    

In [67]:
def minutes_into_game_df(df, minute_interval = 200):
    """Count events per team for all plays up to a clock cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Play rows with at least the columns ``event``, ``team``, ``home_team``
        and ``period_time``.
    minute_interval : int, default 200
        Inclusive cutoff compared directly against ``period_time`` (so it is
        in whatever units that column uses).

    Returns
    -------
    pandas.DataFrame
        Two rows: row 0 holds the counts for plays whose ``team`` differs from
        ``home_team`` (away), row 1 the counts for the home team. The first
        nine columns are always Goal, Shot, Blocked Shot, Faceoff, Takeaway,
        Giveaway, Hit, Missed Shot and Penalty (0 when absent); any other
        event type present in the data is appended as an extra column.
    """
    tracked_events = ['Goal', 'Shot', 'Blocked Shot', 'Faceoff', 'Takeaway',
                      'Giveaway', 'Hit', 'Missed Shot', 'Penalty']
    events = df[df['period_time'] <= minute_interval]

    def _event_counts(team_events):
        # Start every tracked event at 0, then overlay the observed counts
        # (number of non-null 'team' values per event type).
        counts = dict.fromkeys(tracked_events, 0)
        counts.update(team_events.groupby('event')['team'].count().to_dict())
        return counts

    away_events = events[events['team'] != events['home_team']]
    home_events = events[events['team'] == events['home_team']]

    away_row = pd.DataFrame(_event_counts(away_events), index=[0])
    home_row = pd.DataFrame(_event_counts(home_events), index=[1])
    return pd.concat([away_row, home_row], axis=0, ignore_index=True)

In [68]:
# Keep only plays where every column has a value.
game_events = game_events.dropna()

In [69]:
# Display the plays that remain after rows with missing values were dropped.
game_events

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
1,Faceoff,New York Islanders,0.0,0.0,1,00:00,2020030113,New York Islanders
2,Hit,Pittsburgh Penguins,-56.0,39.0,1,00:16,2020030113,New York Islanders
5,Faceoff,New York Islanders,-69.0,22.0,1,01:03,2020030113,New York Islanders
6,Hit,New York Islanders,97.0,-4.0,1,01:46,2020030113,New York Islanders
7,Hit,Pittsburgh Penguins,-94.0,22.0,1,01:50,2020030113,New York Islanders
...,...,...,...,...,...,...,...,...
327,Missed Shot,New York Islanders,44.0,12.0,3,18:26,2020030113,New York Islanders
328,Missed Shot,New York Islanders,78.0,3.0,3,18:45,2020030113,New York Islanders
330,Faceoff,New York Islanders,69.0,-22.0,3,18:55,2020030113,New York Islanders
331,Blocked Shot,Pittsburgh Penguins,72.0,-7.0,3,19:07,2020030113,New York Islanders


In [70]:
# Replace the clock strings with integers (e.g. 01:03 -> 103).
game_events['period_time'] = time_to_int(game_events['period_time'])

In [71]:
# Shift period 2 and period 3 clock values by 2000 and 4000 so the clock runs
# across the whole game. Not idempotent: re-running this cell adds the offsets again.
game_events = period_time_to_full_time(game_events)

In [72]:
# Preview the first rows to check the converted clock values.
game_events.head(12)

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
0,Faceoff,New York Islanders,0.0,0.0,1,0,2020030113,New York Islanders
1,Hit,Pittsburgh Penguins,-56.0,39.0,1,16,2020030113,New York Islanders
2,Faceoff,New York Islanders,-69.0,22.0,1,103,2020030113,New York Islanders
3,Hit,New York Islanders,97.0,-4.0,1,146,2020030113,New York Islanders
4,Hit,Pittsburgh Penguins,-94.0,22.0,1,150,2020030113,New York Islanders
5,Giveaway,New York Islanders,-92.0,30.0,1,152,2020030113,New York Islanders
6,Shot,Pittsburgh Penguins,-86.0,34.0,1,152,2020030113,New York Islanders
7,Goal,Pittsburgh Penguins,-33.0,30.0,1,201,2020030113,New York Islanders
8,Faceoff,Pittsburgh Penguins,0.0,0.0,1,201,2020030113,New York Islanders
9,Penalty,New York Islanders,84.0,1.0,1,237,2020030113,New York Islanders


In [73]:
# Event counts for the default cutoff (period_time <= 200): row 0 is the away team, row 1 the home team.
minutes_into_game_df(game_events)

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty
0,0,1,0,0,0,0,2,0,0
1,0,0,0,2,0,1,1,0,0


In [77]:
# Build one row per 200-unit cutoff (200, 400, ..., 6000) holding the
# home-minus-away difference in event counts for all plays up to that cutoff.
# Rows are collected in a list and turned into a frame once, instead of
# re-concatenating the growing frame on every iteration.
difference_records = []
for counter in range(200, 6001, 200):
    df = minutes_into_game_df(game_events, counter)
    # Row 1 is the home team and row 0 the away team; only the first nine
    # (always-present) event columns are used.
    differences = df.iloc[1].iloc[:9] - df.iloc[0].iloc[:9]
    difference_records.append(differences.to_dict())
artifical_scrape = pd.DataFrame(difference_records)

In [94]:
# Remember the current column names of artifical_scrape; the prediction cells
# below select exactly these columns as model input. Must run before the
# 'predictions' / 'probability_win' columns are added to the frame.
columns = artifical_scrape.columns.to_list()

In [121]:
# Score row i of artifical_scrape with the model saved for cutoff 200 * (i + 1).
# No try/except here: a failure for any window would leave the result lists
# shorter than the frame and the column assignment below would fail with an
# unrelated length error, so the original exception is allowed to surface.
predicts = []
probabilities = []
for i, counter in enumerate(range(200, 6001, 200)):
    # joblib.load unpickles the file: only load model files you trust.
    model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
    temp_df = artifical_scrape.loc[[i], columns]
    predicts.extend(model.predict(temp_df).tolist())
    # Column 1 of predict_proba is used as the win probability.
    # NOTE(review): presumably the model's second class is 'yes' - confirm via model.classes_.
    probabilities.append(model.predict_proba(temp_df)[0][1])
artifical_scrape['predictions'] = predicts
artifical_scrape['probability_win'] = probabilities


In [141]:
# Display the per-cutoff differences together with the predicted label and win probability.
artifical_scrape

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,predictions,probability_win
0,0,-1,0,2,0,1,-1,0,0,no,0.247342
1,-1,-3,1,2,0,1,1,0,1,no,0.005556
2,-1,0,2,2,0,1,2,0,1,no,0.062321
3,-1,-2,3,7,0,2,2,0,1,no,0.049982
4,-1,-3,3,9,0,3,3,0,1,no,0.049982
5,-1,-3,3,9,0,3,2,-1,1,no,0.039527
6,-1,-1,2,10,0,3,2,-1,1,no,0.039527
7,-1,-3,2,9,0,3,3,-1,1,no,0.039527
8,-1,-2,-1,7,0,3,2,-1,1,no,0.013889
9,-1,-1,-3,8,0,3,3,0,0,no,0.013889


A second artificial scrape, this time for a game where the home team wins:

In [124]:
# Download the live feed for game 2020030224 and keep its play-by-play list.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error here rather than as a JSON/KeyError on the next line.
game_stats = requests.get('https://statsapi.web.nhl.com/api/v1/game/2020030224/feed/live', timeout=30)
game_stats.raise_for_status()
test_1 = game_stats.json()['liveData']['plays']['allPlays']

In [128]:
# Flatten the second game's play-by-play list into one row per play.
# Fix: the rows are accumulated for THIS game only. The previous loop
# concatenated each play onto `game_events` (the first game's frame), so
# game_events_2 ended up as the first game plus a single play from this one.
# The response body is parsed once and the frame is built in a single step.
home_team = game_stats.json()['gameData']['teams']['home']['name']
game_id = 2020030224

event_records = []
# The first two and the last two plays are skipped, matching the original
# loop bounds range(2, len(test_1) - 2).
for play in test_1[2:-2]:
    # Plays without a team or without coordinates get NaN for those fields;
    # such rows are removed by the dropna() cell further down.
    coordinates = play.get('coordinates', {})
    event_records.append({
        'event': play['result']['event'],
        'team': play.get('team', {}).get('name', np.nan),
        'x_coordinate': coordinates.get('x', np.nan),
        'y_coordinate': coordinates.get('y', np.nan),
        'period': play['about']['period'],
        'period_time': play['about']['periodTime'],
        'game_id': game_id,
        'home_team': home_team,
    })

game_events_2 = pd.DataFrame(
    event_records,
    columns=['event', 'team', 'x_coordinate', 'y_coordinate',
             'period', 'period_time', 'game_id', 'home_team'],
)

In [130]:
# Keep only plays where every column has a value.
game_events_2 = game_events_2.dropna()

In [131]:
# Replace the clock strings with integers (colons removed, digits parsed as one number).
game_events_2['period_time'] = time_to_int(game_events_2['period_time'])

In [132]:
# Shift period 2 and period 3 clock values by 2000 and 4000 so the clock runs
# across the whole game. Not idempotent: re-running this cell adds the offsets again.
game_events_2 = period_time_to_full_time(game_events_2)

In [133]:
# Display the cleaned play table for the second game.
game_events_2

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
0,Penalty,Tampa Bay Lightning,0.0,-14.0,1,0,2020030224,Tampa Bay Lightning
1,Penalty,Carolina Hurricanes,0.0,-16.0,1,0,2020030224,Tampa Bay Lightning
2,Faceoff,Carolina Hurricanes,0.0,0.0,1,0,2020030224,Tampa Bay Lightning
3,Hit,Carolina Hurricanes,15.0,35.0,1,20,2020030224,Tampa Bay Lightning
4,Faceoff,Tampa Bay Lightning,69.0,22.0,1,41,2020030224,Tampa Bay Lightning
...,...,...,...,...,...,...,...,...
461,Blocked Shot,Tampa Bay Lightning,-76.0,-1.0,3,5937,2020030224,Tampa Bay Lightning
462,Faceoff,Carolina Hurricanes,-69.0,-22.0,3,5939,2020030224,Tampa Bay Lightning
463,Blocked Shot,Tampa Bay Lightning,-71.0,-18.0,3,5953,2020030224,Tampa Bay Lightning
464,Blocked Shot,Tampa Bay Lightning,-82.0,-5.0,3,5955,2020030224,Tampa Bay Lightning


In [137]:
# Build one row per 200-unit cutoff (200, 400, ..., 6000) holding the
# home-minus-away difference in event counts for all plays up to that cutoff.
# Rows are collected in a list and turned into a frame once, instead of
# re-concatenating the growing frame on every iteration.
difference_records = []
for counter in range(200, 6001, 200):
    df = minutes_into_game_df(game_events_2, counter)
    # Row 1 is the home team and row 0 the away team; only the first nine
    # (always-present) event columns are used.
    differences = df.iloc[1].iloc[:9] - df.iloc[0].iloc[:9]
    difference_records.append(differences.to_dict())
artifical_scrape_2 = pd.DataFrame(difference_records)

In [139]:
# Score row i of artifical_scrape_2 with the model saved for cutoff 200 * (i + 1).
# No try/except here: a failure for any window would leave the result lists
# shorter than the frame and the column assignment below would fail with an
# unrelated length error, so the original exception is allowed to surface.
predicts = []
probabilities = []
for i, counter in enumerate(range(200, 6001, 200)):
    # joblib.load unpickles the file: only load model files you trust.
    model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
    temp_df = artifical_scrape_2.loc[[i], columns]
    predicts.extend(model.predict(temp_df).tolist())
    # Column 1 of predict_proba is used as the win probability.
    # NOTE(review): presumably the model's second class is 'yes' - confirm via model.classes_.
    probabilities.append(model.predict_proba(temp_df)[0][1])
artifical_scrape_2['predictions'] = predicts
artifical_scrape_2['probability_win'] = probabilities


In [140]:
# Display the second game's per-cutoff differences with the predicted label and win probability.
artifical_scrape_2

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,predictions,probability_win
0,0,2,0,0,0,0,-2,0,0,no,0.268453
1,0,-2,-2,-4,0,2,-2,-4,0,no,0.248931
2,0,4,-4,-4,4,2,0,-2,0,no,0.376402
3,0,4,-4,-4,4,2,-2,-2,0,no,0.360152
4,0,8,-6,-4,4,2,-2,4,0,no,0.41595
5,0,2,-6,-4,4,2,4,2,-2,no,0.471565
6,0,4,-6,-4,2,4,4,4,-2,no,0.482988
7,2,2,-4,-4,0,4,0,0,-2,yes,0.995833
8,2,6,-4,-4,0,4,0,-2,-4,yes,0.995833
9,2,8,-4,-2,-2,4,-6,0,-4,yes,0.995833
