In [1]:
# Gather data to determine how often the winner wins by OVER 1.5 goals

import pandas as pd
import numpy as np

url = "https://www.hockey-reference.com/leagues/NHL_2023_games.html"
nhlHTML=pd.read_html(url)

df = nhlHTML[0]
df.columns = df.columns.str.replace("."," ", regex=False)
df.rename(columns = {'G':'Visitor G', 'G 1':'Home G', 'Unnamed: 5': 'OT/SO'}, inplace = True)
df["OT/SO"] = df["OT/SO"].fillna('No')
df

Unnamed: 0,Date,Visitor,Visitor G,Home,Home G,OT/SO,Att,LOG,Notes
0,2022-10-07,San Jose Sharks,1.0,Nashville Predators,4.0,No,16648.0,2:43,"at (Prague, CZ)"
1,2022-10-08,Nashville Predators,3.0,San Jose Sharks,2.0,No,17023.0,2:33,"at (Prague, CZ)"
2,2022-10-11,Vegas Golden Knights,4.0,Los Angeles Kings,3.0,No,18230.0,2:31,
3,2022-10-11,Tampa Bay Lightning,1.0,New York Rangers,3.0,No,18006.0,2:21,
4,2022-10-12,Seattle Kraken,4.0,Anaheim Ducks,5.0,OT,17530.0,2:28,
...,...,...,...,...,...,...,...,...,...
1307,2023-04-13,Vegas Golden Knights,,Seattle Kraken,,No,,,
1308,2023-04-13,Detroit Red Wings,,Tampa Bay Lightning,,No,,,
1309,2023-04-13,New Jersey Devils,,Washington Capitals,,No,,,
1310,2023-04-14,Buffalo Sabres,,Columbus Blue Jackets,,No,,,


In [3]:
# Clean data down to only necessary data

nhlData = df[["Date", "Visitor", "Visitor G", "Home", "Home G", "OT/SO"]]
nhlData = nhlData.dropna()
nhlData

Unnamed: 0,Date,Visitor,Visitor G,Home,Home G,OT/SO
0,2022-10-07,San Jose Sharks,1.0,Nashville Predators,4.0,No
1,2022-10-08,Nashville Predators,3.0,San Jose Sharks,2.0,No
2,2022-10-11,Vegas Golden Knights,4.0,Los Angeles Kings,3.0,No
3,2022-10-11,Tampa Bay Lightning,1.0,New York Rangers,3.0,No
4,2022-10-12,Seattle Kraken,4.0,Anaheim Ducks,5.0,OT
...,...,...,...,...,...,...
1127,2023-03-21,Vegas Golden Knights,4.0,Vancouver Canucks,3.0,No
1128,2023-03-21,Arizona Coyotes,1.0,Winnipeg Jets,2.0,No
1129,2023-03-21,Columbus Blue Jackets,7.0,Washington Capitals,6.0,OT
1130,2023-03-22,Pittsburgh Penguins,5.0,Colorado Avalanche,2.0,No


In [4]:
# Gather the goal differential

nhlData['Diff'] = nhlData['Visitor G'] - nhlData['Home G']
nhlData['Diff'] = nhlData['Diff'].abs()
nhlData

Unnamed: 0,Date,Visitor,Visitor G,Home,Home G,OT/SO,Diff
0,2022-10-07,San Jose Sharks,1.0,Nashville Predators,4.0,No,3.0
1,2022-10-08,Nashville Predators,3.0,San Jose Sharks,2.0,No,1.0
2,2022-10-11,Vegas Golden Knights,4.0,Los Angeles Kings,3.0,No,1.0
3,2022-10-11,Tampa Bay Lightning,1.0,New York Rangers,3.0,No,2.0
4,2022-10-12,Seattle Kraken,4.0,Anaheim Ducks,5.0,OT,1.0
...,...,...,...,...,...,...,...
1127,2023-03-21,Vegas Golden Knights,4.0,Vancouver Canucks,3.0,No,1.0
1128,2023-03-21,Arizona Coyotes,1.0,Winnipeg Jets,2.0,No,1.0
1129,2023-03-21,Columbus Blue Jackets,7.0,Washington Capitals,6.0,OT,1.0
1130,2023-03-22,Pittsburgh Penguins,5.0,Colorado Avalanche,2.0,No,3.0


In [53]:
# Bring in historical odds data

oddsDf = pd.read_csv('oddsData.csv')
oddsDf = oddsDf.drop(columns=['Unnamed: 0', 'Visitor G', 'Home G'])

# oddsDf['Visitor Open'] = pd.to_numeric(oddsDf['Visitor Open'], errors='coerce')
# oddsDf['Home Open'] = pd.to_numeric(oddsDf['Home Open'], errors='coerce')
# oddsDf = oddsDf.dropna(subset=['Visitor Open'])
# oddsDf = oddsDf.dropna(subset=['Home Open'])
# oddsDf['Visitor Open'] = oddsDf['Visitor Open'].astype(int)
# oddsDf['Home Open'] = oddsDf['Home Open'].astype(int)

oddsDf

Unnamed: 0,Date,Visitor,Home,Visitor Open,Visitor Best,Home Open,Home Best
0,2022-10-07,San Jose Sharks,Nashville Predators,145,+170,-177,-200
1,2022-10-07,Buffalo Sabres,Pittsburgh Penguins,165,+255,-227,-325
2,2022-10-07,Toronto Maple Leafs,Detroit Red Wings,130,+195,-165,-245
3,2022-10-07,Seattle Kraken,Edmonton Oilers,140,+175,-189,-215
4,2022-10-07,Winnipeg Jets,Calgary Flames,145,+180,-200,-220
...,...,...,...,...,...,...,...
1137,2023-03-21,Seattle Kraken,Dallas Stars,135,+145,-160,-162
1138,2023-03-21,Calgary Flames,Anaheim Ducks,-200,-215,165,+195
1139,2023-03-21,Vegas Golden Knights,Vancouver Canucks,-145,-130,125,+118
1140,2023-03-22,Pittsburgh Penguins,Colorado Avalanche,145,+138,-170,-145


In [57]:
# Combine odds data to results data

mergedData = pd.merge(oddsDf, nhlData, on=['Date','Visitor', 'Home'], how='left')
mergedData['Visitor G'] = pd.to_numeric(mergedData['Visitor G'], errors='coerce')
mergedData['Home G'] = pd.to_numeric(mergedData['Home G'], errors='coerce')
mergedData = mergedData.dropna(subset=['Visitor G'])
mergedData = mergedData.dropna(subset=['Home G'])
mergedData['Visitor G'] = mergedData['Visitor G'].astype(int)
mergedData['Home G'] = mergedData['Home G'].astype(int)
nhlData = mergedData
nhlData


Unnamed: 0,Date,Visitor,Home,Visitor Open,Visitor Best,Home Open,Home Best,Visitor G,Home G,OT/SO,Diff
0,2022-10-07,San Jose Sharks,Nashville Predators,145,+170,-177,-200,1,4,No,3.0
6,2022-10-08,Nashville Predators,San Jose Sharks,-177,-175,142,+150,3,2,No,1.0
17,2022-10-11,Tampa Bay Lightning,New York Rangers,-102,+110,-120,-120,1,3,No,2.0
18,2022-10-11,Vegas Golden Knights,Los Angeles Kings,-102,+100,-120,-115,4,3,No,1.0
19,2022-10-12,Boston Bruins,Washington Capitals,113,+122,-137,-135,5,2,No,3.0
...,...,...,...,...,...,...,...,...,...,...,...
1137,2023-03-21,Seattle Kraken,Dallas Stars,135,+145,-160,-162,5,4,OT,1.0
1138,2023-03-21,Calgary Flames,Anaheim Ducks,-200,-215,165,+195,5,1,No,4.0
1139,2023-03-21,Vegas Golden Knights,Vancouver Canucks,-145,-130,125,+118,4,3,No,1.0
1140,2023-03-22,Pittsburgh Penguins,Colorado Avalanche,145,+138,-170,-145,5,2,No,3.0


In [58]:
# RESULT # 

# Gather data to determine how often the winner wins by OVER 1.5 goals

result = (nhlData['Diff']>1).sum()
percent = (result / len(nhlData.index)) * 100
print("Percent of nhl games where the winner won by OVER 1.5 goals: " + str(percent))

Percent of nhl games where the winner won by OVER 1.5 goals: 59.372026641294006


In [59]:
# Percent of games that did not go into OT that won by OVER 1.5 goals:

noOT = nhlData.loc[nhlData['OT/SO'] == 'No']

result = (noOT['Diff']>1).sum()
percent = (result / len(noOT.index)) * 100
print("Percent of nhl games that did not go into OT where the winner won by OVER 1.5 goals: " + str(percent))

Percent of nhl games that did not go into OT where the winner won by OVER 1.5 goals: 77.32342007434944


In [56]:
# Percentage of Favorites to win

visitorFavW = 0
visitorFav = 0
homeFavW = 0
homeFav = 0
favW = 0

for index, row in mergedData.iterrows():
    homeG = row['Home G']
    visG = row['Visitor G']
    homeOdds = row['Home Best']
    visOdds = row['Visitor Best']
    home = True
    if (int(homeOdds) < int(visOdds)):
        homeFav += 1
        home = True
    elif (int(visOdds) < int(homeOdds)):
        visitorFav += 1
        home = False
    if (homeG > visG and home):
        homeFavW += 1
        favW += 1
    elif (visG > homeG and not home):
        visitorFavW += 1
        favW += 1

visFavWPercent = (visitorFavW / visitorFav) * 100
homeFavWPercent = (homeFavW / homeFav) * 100
favWPercent = (favW / len(mergedData.index)) * 100

print(visFavWPercent)
print(homeFavWPercent)
print(favWPercent)

62.5
60.810810810810814
60.418648905804
