## Quantifying Home Advantage in the NBA
Measuring how much of a boost teams get from playing on their home court.

In [2]:
# Import packages
import pandas as pd # type: ignore
import numpy as np # type: ignore
pd.set_option('future.no_silent_downcasting', True)
pd.set_option('display.max_columns', None) # Set the maximum number of columns to display to the maximum possible
import pickle

import plotly.graph_objects as go # type: ignore
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

import seaborn as sns
from matplotlib import pyplot as plt

## Fetch dictionaries for lookups

In [3]:
# Build the team -> conference and team -> division lookup tables and
# persist each of them as a pickle under pkl/.
# W: Western conference, E: Eastern conference
import os
from collections import Counter

nba_team_conf = {
    'ATL': 'E','BOS': 'E','BRK': 'E','CHO': 'E','CHI': 'E','CLE': 'E','DAL': 'W','DEN': 'W','DET': 'E','GSW': 'W','HOU': 'W','IND': 'E','LAC': 'W','LAL': 'W','MEM': 'W',
    'MIA': 'E','MIL': 'E','MIN': 'W','NOP': 'W','NYK': 'E','OKC': 'W','ORL': 'E','PHI': 'E','PHO': 'W','POR': 'W','SAC': 'W','SAS': 'W','TOR': 'E','UTA': 'W','WAS': 'E'
}
# Sanity check: each conference should contain 15 teams
print('Conference', Counter(nba_team_conf.values()))

# Make sure the output directory exists so the cell also runs on a fresh checkout
os.makedirs('pkl', exist_ok=True)

# Write file to a pkl
with open('pkl/nba_team_conf.pkl', 'wb') as f:
    pickle.dump(nba_team_conf, f)

# Team -> division lookup. The keys must be the same team codes as in
# nba_team_conf (Phoenix is 'PHO', not 'PHX'); otherwise a join on the team
# code silently leaves that team without a division.
nba_team_division = {
    'ATL': 'Southeast', 'BOS': 'Atlantic','BRK': 'Atlantic','CHO': 'Southeast','CHI': 'Central','CLE': 'Central','DAL': 'Southwest',
    'DEN': 'Northwest','DET': 'Central','GSW': 'Pacific','HOU': 'Southwest','IND': 'Central','LAC': 'Pacific','LAL': 'Pacific','MEM': 'Southwest',
    'MIA': 'Southeast','MIL': 'Central','MIN': 'Northwest','NOP': 'Southwest','NYK': 'Atlantic','OKC': 'Northwest','ORL': 'Southeast','PHI': 'Atlantic','PHO': 'Pacific',
    'POR': 'Northwest','SAC': 'Pacific','SAS': 'Southwest','TOR': 'Atlantic','UTA': 'Northwest', 'WAS': 'Southeast'
}

# Fail loudly if the two lookups ever drift apart again
assert nba_team_division.keys() == nba_team_conf.keys(), (
    'Team codes differ between lookups: '
    f'{sorted(set(nba_team_division) ^ set(nba_team_conf))}'
)

# Count the occurrences of values in the dictionary (expect 5 teams per division)
print('Division', Counter(nba_team_division.values()))

with open('pkl/nba_team_division.pkl', 'wb') as f:
    pickle.dump(nba_team_division, f)

Conference Counter({'E': 15, 'W': 15})
Division Counter({'Southeast': 5, 'Atlantic': 5, 'Central': 5, 'Southwest': 5, 'Northwest': 5, 'Pacific': 5})


In [49]:
# Two-column lookup frame (Team, Division) built from the division dictionary,
# one row per dictionary entry, so it can be joined onto other tables by team code.
df_nba_team_division = pd.DataFrame(
    list(nba_team_division.items()),
    columns=['Team', 'Division'],
)

In [6]:
# Load the game-level data from the local pickle.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
nba_df = pd.read_pickle('pkl/nba_df.pkl')

# Parse dates; values that do not match YYYY-MM-DD become NaT instead of raising
nba_df['Date'] = pd.to_datetime(nba_df['Date'], format='%Y-%m-%d', errors='coerce')

# Replace values of W/L with 1 and 0.
# 'future.no_silent_downcasting' is enabled in the imports cell, so replace() on its own
# leaves the column as object dtype; infer_objects() converts it to a numeric
# dtype so the later sums and ratios operate on real numbers.
nba_df['W/L'] = nba_df['W/L'].replace({'L': 0, 'W': 1}).infer_objects()
nba_df.head(5)

Unnamed: 0,Season,Team,G,Date,Home,Opp,W/L,Tm_Pts,Opp_Pts,Tm_FG,Tm_FGA,Tm_FG%,Tm_3P,Tm_3PA,Tm_3P%,Tm_FT,Tm_FTA,Tm_FT%,Tm_ORB,Tm_TRB,Tm_AST,Tm_STL,Tm_BLK,Tm_TOV,Tm_PF,Opp_FG,Opp_FGA,Opp_FG%,Opp_3P,Opp_3PA,Opp_3P%,Opp_FT,Opp_FTA,Opp_FT%,Opp_ORB,Opp_TRB,Opp_AST,Opp_STL,Opp_BLK,Opp_TOV,Opp_PF
0,2000,ATL,1,1999-11-02,0,WAS,0,87,94,31,78,0.397,2,6,0.333,23,30,0.767,16,50,15,5,5,23,22,39,88,0.443,3,10,0.3,13,16,0.813,12,42,23,5,5,15,30
1,2000,ATL,2,1999-11-04,1,MIL,0,109,119,41,83,0.494,6,14,0.429,21,30,0.7,17,46,22,3,5,26,26,41,90,0.456,4,11,0.364,33,36,0.917,12,38,24,15,6,11,25
2,2000,ATL,3,1999-11-06,1,CHI,1,113,97,44,81,0.543,3,8,0.375,22,34,0.647,13,42,21,10,3,11,30,35,80,0.438,3,12,0.25,24,35,0.686,17,39,14,6,6,14,26
3,2000,ATL,4,1999-11-08,0,DEN,0,100,115,39,82,0.476,0,7,0.0,22,30,0.733,13,39,21,2,6,12,22,45,96,0.469,6,19,0.316,19,24,0.792,22,49,28,6,15,7,23
4,2000,ATL,5,1999-11-10,0,VAN,0,97,102,39,92,0.424,1,7,0.143,18,24,0.75,10,41,17,9,7,14,18,44,92,0.478,1,6,0.167,13,16,0.813,15,49,27,9,10,18,24


## Home Wins Percentage

It is well known that teams hold an advantage on their home court. This section looks at how large that advantage is.

In [7]:
# Home/away record per team and season.
# A single aggregation over (Season, Team, Home) yields both the number of wins
# (sum of the 0/1 W/L flag) and the number of games (count of G), so no second
# groupby and merge is needed.
nba_home = (
    nba_df
    .groupby(['Season', 'Team', 'Home'])
    .agg({'W/L': 'sum', 'G': 'count'})
)
nba_home['Win%'] = nba_home['W/L'] / nba_home['G']

# Turn the group keys back into ordinary columns (reassign instead of inplace
# so that re-running the cell always starts from a freshly built frame)
nba_home = nba_home.reset_index()

# Persist the per-season table for reuse outside the notebook
nba_home.to_csv('pkl/nba_home_seasons.csv', index = False)
nba_home.head(10)

Unnamed: 0,Season,Team,Home,W/L,G,Win%
0,2000,ATL,0,7,41,0.170732
1,2000,ATL,1,21,41,0.512195
2,2000,BOS,0,9,41,0.219512
3,2000,BOS,1,26,41,0.634146
4,2000,CHI,0,5,41,0.121951
5,2000,CHI,1,12,41,0.292683
6,2000,CLE,0,10,41,0.243902
7,2000,CLE,1,22,41,0.536585
8,2000,DAL,0,18,41,0.439024
9,2000,DAL,1,22,41,0.536585


In [38]:
# Home/away record per team pooled over all seasons in nba_home
nba_home_overall = (
    nba_home
    .groupby(['Team', 'Home'])
    .agg({'W/L': 'sum', 'G': 'sum'})
)
nba_home_overall['Win%'] = nba_home_overall['W/L'] / nba_home_overall['G']
nba_home_overall = nba_home_overall.reset_index()

# Attach each team's division; teams absent from the lookup keep NaN (left join)
nba_home_overall = nba_home_overall.merge(df_nba_team_division, how='left', on='Team')

# Split into the home rows (Home == 1) and the away rows (Home == 0)
played_at_home = nba_home_overall['Home'] == 1
played_away = nba_home_overall['Home'] == 0
home_df = nba_home_overall[played_at_home]
away_df = nba_home_overall[played_away]

# One bar series per venue, teams along the x axis
home_trace = go.Bar(
    x=home_df['Team'],
    y=home_df['Win%'],
    name='Home',
    marker_color='#93CBDD',
)
away_trace = go.Bar(
    x=away_df['Team'],
    y=away_df['Win%'],
    name='Away',
    marker_color='#FF8BA1',
)
fig = go.Figure(data=[home_trace, away_trace])

# League-wide win rates (total wins / total games), rounded to two decimals,
# drawn as dashed reference lines in the colour of the matching bars
home_win_pt = np.round(home_df['W/L'].sum() / home_df['G'].sum(), 2)
away_win_pt = np.round(away_df['W/L'].sum() / away_df['G'].sum(), 2)
fig.add_hline(y=home_win_pt, line_dash="dash", line_color="#93CBDD")  # Home avg
fig.add_hline(y=away_win_pt, line_dash="dash", line_color="#FF8BA1")  # Away avg

# Titles, axis labels and figure size in one layout update
fig.update_layout(
    title='Win Percentage by Team at Home and Away',
    xaxis_title='Team',
    yaxis_title='Win Percentage',
    width=1400,
    height=400,
)
fig.show()

## Case for better scheduling
Hypothesis: teams with more widely spaced schedules (more rest between games) are more likely to win.

## Date differences between games
Hypothesis: fewer days between games lead to worse outcomes.

In [48]:
# Number of non-null W/L entries per (Season, Team)
df_window = nba_df.groupby(['Season', 'Team']).agg({'W/L': 'count'}).reset_index()

# Days elapsed since the same team's previous row within the same season.
# diff() follows the row order inside each group, so this assumes the rows of
# each team-season are in chronological order — TODO confirm.
# The first row of every group has no predecessor and gets NaN.
SECONDS_PER_DAY = 3600 * 24
datediff = (
    nba_df
    .groupby(['Season', 'Team'])['Date']
    .diff()
    .dt.total_seconds()
    .div(SECONDS_PER_DAY)
)

# DataFrame.insert raises if the column already exists, which happens when the
# cell is re-run. Handle that case explicitly instead of swallowing every
# exception, so genuine errors (e.g. a non-datetime Date column) still surface.
if 'Datediff' in nba_df.columns:
    nba_df['Datediff'] = datediff
else:
    nba_df.insert(loc = 4, column = 'Datediff', value = datediff)

# Value counts Date Difference Between Games (gaps shorter than 10 days only)
nba_df[nba_df['Datediff'] < 10].value_counts('Datediff').sort_values(ascending = False)

Datediff
2.0    30223
1.0    11923
3.0     7884
4.0     1827
5.0      360
6.0      314
8.0      141
7.0      124
9.0      116
Name: count, dtype: int64

In [93]:
# TODO: visit library
# TODO: Visualize home win %
# Check other stats that contribute to this
# Start scraping