In [1]:
import pandas as pd
import time
import requests
import numpy as np

import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
def time_to_int(column):
    """Convert clock strings such as ``"MM:SS"`` to integers by removing the colon.

    The result is the clock's digits read as a single number, not a count of
    seconds: ``"01:26"`` becomes ``126`` and ``"20:00"`` becomes ``2000``.

    Parameters
    ----------
    column : iterable
        Clock values, e.g. a pandas Series of ``"MM:SS"`` strings. Values that
        are already integers pass through unchanged.

    Returns
    -------
    list of int
        One integer per input value, in input order.

    Raises
    ------
    ValueError
        If a value, once its colons are removed, is not a valid integer.
    """
    # str() makes the conversion safe to repeat on values that are already ints;
    # any number of digits on either side of the colon is accepted.
    return [int(str(value).replace(":", "")) for value in column]

In [3]:
def period_time_to_full_time(df):
    """Shift per-period clock values so ``period_time`` keeps increasing across periods.

    Rows of period 2 get 2000 added to ``period_time`` and rows of period 3
    get 4000; period 1 is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``period`` column and a numeric ``period_time`` column.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Period 1 rows, then period 2 rows, then period 3 rows, each group in
        its original order, with a fresh 0..n-1 index. Rows whose ``period``
        is not 1, 2 or 3 are not included.
    """
    period_offsets = {1: 0, 2: 2000, 3: 4000}
    shifted_periods = []
    for period, offset in period_offsets.items():
        # Work on an explicit copy so the assignment below never writes into
        # (or warns about) a slice of the caller's frame.
        period_rows = df[df['period'] == period].copy()
        if offset:
            period_rows['period_time'] = period_rows['period_time'] + offset
        shifted_periods.append(period_rows)
    return pd.concat(shifted_periods, axis=0, ignore_index=True)
    

In [4]:
# Load the scraped box-score differences without the two columns that are not
# used, convert the clock strings to integers, then shift later periods so
# period_time keeps increasing across the game.
box_score_differences = pd.read_csv(
    './live_scrapes/islanders_bruins_v3_diffs_137.csv'
).drop(columns=['Unnamed: 0', 'time_stamp'])
box_score_differences = box_score_differences.assign(
    period_time=time_to_int(box_score_differences['period_time'])
)
box_score_differences = period_time_to_full_time(box_score_differences)
box_score_differences.head()

Unnamed: 0,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,ice_tilt,period,period_time,rfc_prediction
0,0,0,0,0.0,0.0,0.0,100.0,0,0,0,-69.0,1,8,yes
1,0,0,0,0.0,0.0,0.0,100.0,0,0,0,-152.0,1,126,yes
2,0,1,0,0.0,0.0,0.0,100.0,1,0,-1,-154.0,1,211,yes
3,2,0,0,0.0,0.0,0.0,100.0,1,0,-2,-274.0,1,259,yes
4,2,0,0,0.0,0.0,0.0,0.0,1,0,-2,-385.0,1,259,no


In [5]:
# Load the per-window event differences, attach the game clock, and drop the
# saved index column and any duplicate rows.
# NOTE(review): period_time is matched to rows by index label, and
# box_score_differences was re-indexed by period_time_to_full_time -- confirm
# the two frames still line up row for row.
minute_diffs = (
    pd.read_csv('./live_scrapes/islanders_bruins_v3_minute_diffs_137.csv')
    .assign(period_time=box_score_differences['period_time'])
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)
minute_diffs.head()

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,period_time
0,0,0,0,2,0,0,0,0,0,8
1,0,0,1,2,1,0,-1,0,0,126
2,0,0,0,2,1,0,-2,0,0,211
3,0,2,0,1,1,0,-3,0,0,259
4,0,2,0,0,1,0,-3,1,0,259


In [6]:
# Score every 200-unit window of period_time (200, 400, ... 6000) with the
# model saved for that window. period_time is the colon-stripped clock
# produced by time_to_int, shifted per period by period_time_to_full_time.
predicts = []
probabilities = []
scored_rows = []  # index labels of the scored rows, in the same order as the two lists
for counter in range(200, 6001, 200):
    in_window = (minute_diffs['period_time'] <= counter) & (minute_diffs['period_time'] > (counter - 200))
    temp_df = minute_diffs[in_window]
    if temp_df.empty:
        # No rows fall in this window, so there is nothing to score and no
        # reason to load its model.
        continue
    # NOTE: joblib.load unpickles the file, which can run arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
    # Also drop the two result columns (if present) so that re-running this
    # cell does not feed earlier predictions back in as features.
    features = temp_df.drop(columns=['period_time', 'predictions', 'probability_win'], errors='ignore')
    # Prediction errors are allowed to propagate: a silently skipped window
    # would leave rows without results and hide the real cause.
    temp_list = model.predict(features).tolist()
    probs = model.predict_proba(features)
    predicts.extend(temp_list)
    probabilities.extend(probs[:, 1].tolist())  # column 1 of predict_proba
    scored_rows.extend(temp_df.index)
# Attach by index label so each result lands on the row it was computed from;
# rows that fall in no window get NaN instead of raising a length mismatch.
minute_diffs['predictions'] = pd.Series(predicts, index=scored_rows)
minute_diffs['probability_win'] = pd.Series(probabilities, index=scored_rows)


In [7]:
# Keep only the first occurrence of each fully identical row.
box_score_differences = box_score_differences.drop_duplicates()

In [8]:
# Preview the first rows, now including the predictions and probability_win columns.
minute_diffs.head()

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,period_time,predictions,probability_win
0,0,0,0,2,0,0,0,0,0,8,no,0.2619
1,0,0,1,2,1,0,-1,0,0,126,no,0.307217
2,0,0,0,2,1,0,-2,0,0,211,no,0.258354
3,0,2,0,1,1,0,-3,0,0,259,no,0.290854
4,0,2,0,0,1,0,-3,1,0,259,no,0.275854


In [9]:
# Count how many rows received each predicted label.
minute_diffs['predictions'].value_counts()

no    96
Name: predictions, dtype: int64

In [10]:
# Write both frames to CSV without the row index.
for frame, destination in (
    (box_score_differences, './data_frames/bruins_isles_live_scrape_preds.csv'),
    (minute_diffs, './data_frames/bruins_isles_live_scrape_minutes_preds.csv'),
):
    frame.to_csv(destination, index=False)

Artificial scrape for a more exciting game

In [11]:
# Download the live feed for game 2020030113 and keep its list of plays.
# The timeout stops the cell from hanging indefinitely, and raise_for_status()
# reports a failed request directly instead of as a KeyError on the JSON below.
game_stats = requests.get(
    'https://statsapi.web.nhl.com/api/v1/game/2020030113/feed/live',
    timeout=30,
)
game_stats.raise_for_status()
test_1 = game_stats.json()['liveData']['plays']['allPlays']

In [12]:
# Flatten the play list into one row per play. The loop starts at index 2 and
# stops two short of the end, so the first two and last two plays are left out.
home_team = game_stats.json()['gameData']['teams']['home']['name']  # same for every play, so read once
game_id = 2020030113
game_event_rows = []
for i in range(2, len(test_1) - 2):
    play = test_1[i]
    # Some plays have no team or no coordinates; those fields become NaN.
    coordinates = play.get('coordinates') or {}
    game_event_rows.append({
        'event': play['result']['event'],
        'team': (play.get('team') or {}).get('name', np.nan),
        'x_coordinate': coordinates.get('x', np.nan),
        'y_coordinate': coordinates.get('y', np.nan),
        'period': play['about']['period'],
        'period_time': play['about']['periodTime'],
        'game_id': game_id,
        'home_team': home_team,
    })
# Build the frame in one step rather than concatenating a one-row frame per play.
game_events = pd.DataFrame(
    game_event_rows,
    columns=['event', 'team', 'x_coordinate', 'y_coordinate',
             'period', 'period_time', 'game_id', 'home_team'],
)
    

In [13]:
def minutes_into_game_df(df, minute_interval = 200):
    """Count events per type for each side up to a ``period_time`` cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Play rows with at least the columns ``event``, ``team``,
        ``home_team`` and ``period_time``.
    minute_interval : int, default 200
        Inclusive cutoff: only rows with ``period_time <= minute_interval``
        are counted, so the counts are cumulative from the start of ``df``.

    Returns
    -------
    pandas.DataFrame
        Two rows. Row 0 counts the plays whose ``team`` differs from
        ``home_team``; row 1 counts the plays whose ``team`` equals
        ``home_team``. The first nine columns are always Goal, Shot,
        Blocked Shot, Faceoff, Takeaway, Giveaway, Hit, Missed Shot and
        Penalty (0 when no such event occurred); any other event type found
        in the data is added as a further column.
    """
    tracked_events = (
        'Goal', 'Shot', 'Blocked Shot',
        'Faceoff', 'Takeaway', 'Giveaway',
        'Hit', 'Missed Shot', 'Penalty',
    )

    def _event_counts(side_events):
        # Start every tracked event at zero, then overwrite with observed counts.
        counts = dict.fromkeys(tracked_events, 0)
        counts.update(side_events.groupby('event').count()['team'].to_dict())
        return counts

    events = df[df['period_time'] <= minute_interval]
    home_counts = _event_counts(events[events['team'] == events['home_team']])
    away_counts = _event_counts(events[events['team'] != events['home_team']])

    away_df = pd.DataFrame(away_counts, index=[0])
    home_df = pd.DataFrame(home_counts, index=[1])
    return pd.concat([away_df, home_df], axis=0, ignore_index=True)

In [14]:
# Discard every play that has a missing value in any column.
game_events = game_events.dropna()

In [15]:
# Preview the first rows after dropping plays with missing values.
game_events.head()

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
1,Faceoff,New York Islanders,0.0,0.0,1,00:00,2020030113,New York Islanders
2,Hit,Pittsburgh Penguins,-56.0,39.0,1,00:16,2020030113,New York Islanders
5,Faceoff,New York Islanders,-69.0,22.0,1,01:03,2020030113,New York Islanders
6,Hit,New York Islanders,97.0,-4.0,1,01:46,2020030113,New York Islanders
7,Hit,Pittsburgh Penguins,-94.0,22.0,1,01:50,2020030113,New York Islanders


In [16]:
# Replace the clock strings in period_time with their integer form.
game_events = game_events.assign(
    period_time=time_to_int(game_events['period_time'])
)

In [17]:
# Add 2000 / 4000 to period 2 / 3 times so period_time keeps increasing across
# the game; the helper keeps only periods 1-3 and resets the index.
game_events = period_time_to_full_time(game_events)

In [18]:
# Preview the first rows after the time conversion and period shift.
game_events.head()

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
0,Faceoff,New York Islanders,0.0,0.0,1,0,2020030113,New York Islanders
1,Hit,Pittsburgh Penguins,-56.0,39.0,1,16,2020030113,New York Islanders
2,Faceoff,New York Islanders,-69.0,22.0,1,103,2020030113,New York Islanders
3,Hit,New York Islanders,97.0,-4.0,1,146,2020030113,New York Islanders
4,Hit,Pittsburgh Penguins,-94.0,22.0,1,150,2020030113,New York Islanders


In [19]:
# Build one row of home-minus-away event counts for every 200-unit step of
# period_time from 200 to 6000.
difference_rows = []
for counter in range(200, 6001, 200):
    df = minutes_into_game_df(game_events, counter)
    # Row 1 holds the home-team counts and row 0 the away-team counts; only
    # the first nine columns (the always-present event types) are compared.
    differences = df.iloc[1][:9] - df.iloc[0][:9]
    difference_rows.append(differences.to_dict())
# Build the frame in one step rather than concatenating a one-row frame per step.
artifical_scrape = pd.DataFrame(difference_rows)

In [20]:
# Column names of the artificial scrape at this point; used below to select
# the model inputs.
columns = artifical_scrape.columns.to_list()

In [21]:
# Score each row of the artificial scrape with the model saved for its time
# step: row 0 with the 200 model, row 1 with the 400 model, and so on.
predicts = []
probabilities = []
for i, counter in enumerate(range(200, 6001, 200)):
    # NOTE: joblib.load unpickles the file, which can run arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
    temp_df = artifical_scrape.loc[[i], columns]
    # Prediction errors are allowed to propagate: a silently skipped row would
    # leave the lists shorter than the frame, and the column assignment below
    # would then fail with a length mismatch that hides the real cause.
    temp_list = model.predict(temp_df).tolist()
    probs = model.predict_proba(temp_df)
    predicts.extend(temp_list)
    probabilities.append(probs[0][1])  # column 1 of predict_proba for the single row
artifical_scrape['predictions'] = predicts
artifical_scrape['probability_win'] = probabilities


In [22]:
# Preview the first rows of the artificial scrape with its predictions.
artifical_scrape.head()

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,predictions,probability_win
0,0,-1,0,2,0,1,-1,0,0,no,0.247342
1,-1,-3,1,2,0,1,1,0,1,no,0.005556
2,-1,0,2,2,0,1,2,0,1,no,0.062321
3,-1,-2,3,7,0,2,2,0,1,no,0.049982
4,-1,-3,3,9,0,3,3,0,1,no,0.049982


In [23]:
# Save the artificial scrape, including its predictions, without the row index.
artifical_scrape.to_csv('./data_frames/penguins_isles_artifical_live_scrape_minutes.csv', index=False)

A game where the home team wins:

In [24]:
# Download the live feed for game 2020030224 and keep its list of plays.
# The timeout stops the cell from hanging indefinitely, and raise_for_status()
# reports a failed request directly instead of as a KeyError on the JSON below.
game_stats = requests.get(
    'https://statsapi.web.nhl.com/api/v1/game/2020030224/feed/live',
    timeout=30,
)
game_stats.raise_for_status()
test_1 = game_stats.json()['liveData']['plays']['allPlays']

In [25]:
# Flatten the play list into one row per play. The loop starts at index 2 and
# stops two short of the end, so the first two and last two plays are left out.
home_team = game_stats.json()['gameData']['teams']['home']['name']  # same for every play, so read once
game_id = 2020030224
game_event_rows = []
for i in range(2, len(test_1) - 2):
    play = test_1[i]
    # Some plays have no team or no coordinates; those fields become NaN.
    coordinates = play.get('coordinates') or {}
    game_event_rows.append({
        'event': play['result']['event'],
        'team': (play.get('team') or {}).get('name', np.nan),
        'x_coordinate': coordinates.get('x', np.nan),
        'y_coordinate': coordinates.get('y', np.nan),
        'period': play['about']['period'],
        'period_time': play['about']['periodTime'],
        'game_id': game_id,
        'home_team': home_team,
    })
# Build the frame in one step rather than concatenating a one-row frame per play.
game_events_2 = pd.DataFrame(
    game_event_rows,
    columns=['event', 'team', 'x_coordinate', 'y_coordinate',
             'period', 'period_time', 'game_id', 'home_team'],
)

In [26]:
# Discard every play that has a missing value in any column.
game_events_2 = game_events_2.dropna()

In [27]:
# Replace the clock strings in period_time with their integer form.
game_events_2 = game_events_2.assign(
    period_time=time_to_int(game_events_2['period_time'])
)

In [28]:
# Add 2000 / 4000 to period 2 / 3 times so period_time keeps increasing across
# the game; the helper keeps only periods 1-3 and resets the index.
game_events_2 = period_time_to_full_time(game_events_2)

In [29]:
# Preview the first rows after the time conversion and period shift.
game_events_2.head()

Unnamed: 0,event,team,x_coordinate,y_coordinate,period,period_time,game_id,home_team
0,Penalty,Tampa Bay Lightning,0.0,-14.0,1,0,2020030224,Tampa Bay Lightning
1,Penalty,Carolina Hurricanes,0.0,-16.0,1,0,2020030224,Tampa Bay Lightning
2,Faceoff,Carolina Hurricanes,0.0,0.0,1,0,2020030224,Tampa Bay Lightning
3,Hit,Carolina Hurricanes,15.0,35.0,1,20,2020030224,Tampa Bay Lightning
4,Faceoff,Tampa Bay Lightning,69.0,22.0,1,41,2020030224,Tampa Bay Lightning


In [30]:
# Build one row of home-minus-away event counts for every 200-unit step of
# period_time from 200 to 6000.
difference_rows = []
for counter in range(200, 6001, 200):
    df = minutes_into_game_df(game_events_2, counter)
    # Row 1 holds the home-team counts and row 0 the away-team counts; only
    # the first nine columns (the always-present event types) are compared.
    differences = df.iloc[1][:9] - df.iloc[0][:9]
    difference_rows.append(differences.to_dict())
# Build the frame in one step rather than concatenating a one-row frame per step.
artifical_scrape_2 = pd.DataFrame(difference_rows)

In [31]:
# Score each row of the second artificial scrape with the model saved for its
# time step: row 0 with the 200 model, row 1 with the 400 model, and so on.
predicts = []
probabilities = []
for i, counter in enumerate(range(200, 6001, 200)):
    # NOTE: joblib.load unpickles the file, which can run arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(f'./models/{counter}_logreg_search_diffs_2020.pkl')
    # `columns` is the feature-column list taken from artifical_scrape earlier
    # in the notebook.
    temp_df = artifical_scrape_2.loc[[i], columns]
    # Prediction errors are allowed to propagate: a silently skipped row would
    # leave the lists shorter than the frame, and the column assignment below
    # would then fail with a length mismatch that hides the real cause.
    temp_list = model.predict(temp_df).tolist()
    probs = model.predict_proba(temp_df)
    predicts.extend(temp_list)
    probabilities.append(probs[0][1])  # column 1 of predict_proba for the single row
artifical_scrape_2['predictions'] = predicts
artifical_scrape_2['probability_win'] = probabilities


In [32]:
# Preview the first rows of the second artificial scrape with its predictions.
artifical_scrape_2.head()

Unnamed: 0,Goal,Shot,Blocked Shot,Faceoff,Takeaway,Giveaway,Hit,Missed Shot,Penalty,predictions,probability_win
0,0,1,0,0,0,0,-1,0,0,no,0.25887
1,0,-1,-1,-2,0,1,-1,-2,0,no,0.220365
2,0,2,-2,-2,2,1,0,-1,0,no,0.330102
3,0,2,-2,-2,2,1,-1,-1,0,no,0.343738
4,0,4,-3,-2,2,1,-1,2,0,no,0.393556


In [33]:
# Save the second artificial scrape, including its predictions, without the row index.
artifical_scrape_2.to_csv('./data_frames/lighting_canes_artifical_scrape_minutes.csv', index=False)