In [19]:
import zipfile, io, json, os
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import numpy as np

In [2]:
files = [f for f in os.listdir('.') if os.path.isfile(f)]
if 'basketball.zip' not in files and 'basketball.sqlite' not in files:
        api = KaggleApi()
        #config_dir = os.getcwd()
        #config_file = 'kaggle.json'
        #api.config = os.path.join(config_dir, config_file)
        api.authenticate()
        api.dataset_download_files('wyattowalsh/basketball') #downloads to cwd
elif 'basketball.zip' in files and 'basketball.sqlite' not in files:
    with zipfile.ZipFile('basketball.zip', 'r') as zip_ref:
        zip_ref.extractall()
    os.remove('daily_execution_pipeline.yml')
    os.remove('monthly_execution_pipeline.yml')

if 'basketball.sqlite' in files:
    %load_ext sql
    %sql sqlite:///basketball.sqlite

In [73]:
%%sql nba_game_stats << SELECT 
SEASON,
GAME_DATE,
TEAM_ID_HOME,
FGM_HOME,
FGA_HOME,
FG3M_HOME,
FTM_HOME,
FTA_HOME,
OREB_HOME,
DREB_HOME,
AST_HOME,
STL_HOME,
TOV_HOME,
PF_HOME,
PTS_HOME,
CASE
    WHEN WL_HOME = 'W'
        THEN 1
        ELSE 0
END AS home_win,
TEAM_ID_AWAY,
FGM_AWAY,
FGA_AWAY,
FG3M_AWAY,
FTM_AWAY,
FTA_AWAY,
OREB_AWAY,
DREB_AWAY,
AST_AWAY,
STL_AWAY,
TOV_AWAY,
PF_AWAY,
PTS_AWAY,
CASE
    WHEN WL_HOME = 'W'
        THEN 0
        ELSE 1
END AS away_win
FROM Game 
WHERE SEASON > 2001 AND SEASON < 2019

 * sqlite:///basketball.sqlite
Done.
Returning data to local variable nba_game_stats


In [74]:
# Convert to Dataframe and ensure columns are int not object
nba_game_df = nba_game_stats.DataFrame()
nba_game_df = nba_game_df.astype({'SEASON':'int'})
nba_game_df = nba_game_df.astype({'FGA_HOME':'int'})
nba_game_df = nba_game_df.astype({'FG3M_HOME':'int'})
nba_game_df = nba_game_df.astype({'OREB_HOME':'int'})
nba_game_df = nba_game_df.astype({'DREB_HOME':'int'})
nba_game_df = nba_game_df.astype({'AST_HOME':'int'})
nba_game_df = nba_game_df.astype({'STL_HOME':'int'})
nba_game_df = nba_game_df.astype({'TOV_HOME':'int'})
nba_game_df = nba_game_df.astype({'FGA_AWAY':'int'})
nba_game_df = nba_game_df.astype({'FG3M_AWAY':'int'})
nba_game_df = nba_game_df.astype({'OREB_AWAY':'int'})
nba_game_df = nba_game_df.astype({'DREB_AWAY':'int'})
nba_game_df = nba_game_df.astype({'AST_AWAY':'int'})
nba_game_df = nba_game_df.astype({'STL_AWAY':'int'})
nba_game_df = nba_game_df.astype({'TOV_AWAY':'int'})
nba_game_df['GAME_DATE'] = pd.to_datetime(nba_game_df['GAME_DATE'])

In [76]:
# Call outlier detection here to get general game data
nba_game_df.columns
nba_game_df.dtypes

SEASON                   int64
GAME_DATE       datetime64[ns]
TEAM_ID_HOME            object
FGM_HOME               float64
FGA_HOME                 int64
FG3M_HOME                int64
FTM_HOME               float64
FTA_HOME               float64
OREB_HOME                int64
DREB_HOME                int64
AST_HOME                 int64
STL_HOME                 int64
TOV_HOME                 int64
PF_HOME                float64
PTS_HOME                 int64
home_win                 int64
TEAM_ID_AWAY            object
FGM_AWAY               float64
FGA_AWAY                 int64
FG3M_AWAY                int64
FTM_AWAY               float64
FTA_AWAY               float64
OREB_AWAY                int64
DREB_AWAY                int64
AST_AWAY                 int64
STL_AWAY                 int64
TOV_AWAY                 int64
PF_AWAY                float64
PTS_AWAY                 int64
away_win                 int64
dtype: object

In [77]:
# calculate differentials HOME
nba_game_df["FGM_HOME_DIFF"] = nba_game_df["FGM_HOME"] - nba_game_df["FGM_AWAY"]
nba_game_df["FGA_HOME_DIFF"] = nba_game_df["FGA_HOME"] - nba_game_df["FGA_AWAY"]
nba_game_df["FG3M_HOME_DIFF"] = nba_game_df["FG3M_HOME"] - nba_game_df["FG3M_AWAY"]
nba_game_df["FTM_HOME_DIFF"] = nba_game_df["FTM_HOME"] - nba_game_df["FTM_AWAY"]
nba_game_df["FTA_HOME_DIFF"] = nba_game_df["FTA_HOME"] - nba_game_df["FTA_AWAY"]
nba_game_df["OREB_HOME_DIFF"] = nba_game_df["OREB_HOME"] - nba_game_df["OREB_AWAY"]
nba_game_df["DREB_HOME_DIFF"] = nba_game_df["DREB_HOME"] - nba_game_df["DREB_AWAY"]
nba_game_df["AST_HOME_DIFF"] = nba_game_df["AST_HOME"] - nba_game_df["AST_AWAY"]
nba_game_df["STL_HOME_DIFF"] = nba_game_df["STL_HOME"] - nba_game_df["STL_AWAY"]
nba_game_df["TOV_HOME_DIFF"] = nba_game_df["TOV_HOME"] - nba_game_df["TOV_AWAY"]
nba_game_df["PF_HOME_DIFF"] = nba_game_df["PF_HOME"] - nba_game_df["PF_AWAY"]
nba_game_df["PTS_HOME_DIFF"] = nba_game_df["PTS_HOME"] - nba_game_df["PTS_AWAY"]
# calculate differentials AWAY
nba_game_df["FGM_AWAY_DIFF"] = nba_game_df["FGM_AWAY"] - nba_game_df["FGM_HOME"]
nba_game_df["FGA_AWAY_DIFF"] = nba_game_df["FGA_AWAY"] - nba_game_df["FGA_HOME"]
nba_game_df["FG3M_AWAY_DIFF"] = nba_game_df["FG3M_AWAY"] - nba_game_df["FG3M_HOME"]
nba_game_df["FTM_AWAY_DIFF"] = nba_game_df["FTM_AWAY"] - nba_game_df["FTM_HOME"]
nba_game_df["FTA_AWAY_DIFF"] = nba_game_df["FTA_AWAY"] - nba_game_df["FTA_HOME"]
nba_game_df["OREB_AWAY_DIFF"] = nba_game_df["OREB_AWAY"] - nba_game_df["OREB_HOME"]
nba_game_df["DREB_AWAY_DIFF"] = nba_game_df["DREB_AWAY"] - nba_game_df["DREB_HOME"]
nba_game_df["AST_AWAY_DIFF"] = nba_game_df["AST_AWAY"] - nba_game_df["AST_HOME"]
nba_game_df["STL_AWAY_DIFF"] = nba_game_df["STL_AWAY"] - nba_game_df["STL_HOME"]
nba_game_df["TOV_AWAY_DIFF"] = nba_game_df["TOV_AWAY"] - nba_game_df["TOV_HOME"]
nba_game_df["PF_AWAY_DIFF"] = nba_game_df["PF_AWAY"] - nba_game_df["PF_HOME"]
nba_game_df["PTS_AWAY_DIFF"] = nba_game_df["PTS_AWAY"] - nba_game_df["PTS_HOME"]

In [78]:
# split into home and away dfs pulling only differential stats
nba_home_df = nba_game_df[["SEASON", "GAME_DATE", "TEAM_ID_HOME", "FGM_HOME_DIFF", "FGA_HOME_DIFF", "FG3M_HOME_DIFF", 
                           "FTM_HOME_DIFF", "FTA_HOME_DIFF", "OREB_HOME_DIFF", "DREB_HOME_DIFF", "AST_HOME_DIFF",
                          "STL_HOME_DIFF", "TOV_HOME_DIFF", "PF_HOME_DIFF", "PTS_HOME_DIFF", "home_win"]]
nba_away_df = nba_game_df[["SEASON", "GAME_DATE", "TEAM_ID_AWAY", "FGM_AWAY_DIFF", "FGA_AWAY_DIFF", "FG3M_AWAY_DIFF", 
                           "FTM_AWAY_DIFF", "FTA_AWAY_DIFF", "OREB_AWAY_DIFF", "DREB_AWAY_DIFF", "AST_AWAY_DIFF",
                          "STL_AWAY_DIFF", "TOV_AWAY_DIFF", "PF_AWAY_DIFF", "PTS_AWAY_DIFF", "away_win"]]

In [79]:
# add column to specify home or away game for each df
nba_home_df["HOME_GAME"] = 1
nba_away_df["HOME_GAME"] = 0

# rename win column to be the same in both dfs
nba_home_df.rename(columns = {"TEAM_ID_HOME": "TEAM_ID", "FGM_HOME_DIFF": "FGM_DIFF", "FGA_HOME_DIFF": "FGA_DIFF", 
                              "FG3M_HOME_DIFF":"FG3M_DIFF", "FTM_HOME_DIFF":"FTM_DIFF", "FTA_HOME_DIFF":"FTA_DIFF",
                              "OREB_HOME_DIFF":"OREB_DIFF", "DREB_HOME_DIFF":"DREB_DIFF", "AST_HOME_DIFF":"AST_DIFF",
                              "STL_HOME_DIFF":"STL_DIFF", "TOV_HOME_DIFF":"TOV_DIFF", "PF_HOME_DIFF":"PF_DIFF", 
                              "PTS_HOME_DIFF":"PTS_DIFF", "home_win":"WIN"}, inplace = True)
nba_away_df.rename(columns = {"TEAM_ID_AWAY": "TEAM_ID", "FGM_AWAY_DIFF": "FGM_DIFF", "FGA_AWAY_DIFF": "FGA_DIFF", 
                              "FG3M_AWAY_DIFF":"FG3M_DIFF", "FTM_AWAY_DIFF":"FTM_DIFF", "FTA_AWAY_DIFF":"FTA_DIFF",
                              "OREB_AWAY_DIFF":"OREB_DIFF", "DREB_AWAY_DIFF":"DREB_DIFF", "AST_AWAY_DIFF":"AST_DIFF",
                              "STL_AWAY_DIFF":"STL_DIFF", "TOV_AWAY_DIFF":"TOV_DIFF", "PF_AWAY_DIFF":"PF_DIFF", 
                              "PTS_AWAY_DIFF":"PTS_DIFF", "away_win":"WIN"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_home_df["HOME_GAME"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_away_df["HOME_GAME"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [80]:
# combine both dfs into a df with differential stats for each team and their win or loss
nba_combined_df = pd.concat([nba_home_df, nba_away_df])
print(nba_home_df.shape)
print(nba_away_df.shape)
print(nba_combined_df.shape)

(20651, 17)
(20651, 17)
(41302, 17)


In [84]:
# sort combined df by team and date
print(nba_combined_df.head())
nba_combined_df.sort_values(by=['TEAM_ID', 'GAME_DATE'], inplace=True)
print(nba_combined_df.head())

   SEASON  GAME_DATE     TEAM_ID  FGM_DIFF  FGA_DIFF  FG3M_DIFF  FTM_DIFF  \
0    2002 2002-10-29  1610612747       1.0        14          1      -8.0   
1    2002 2002-10-29  1610612747       1.0        14          1      -8.0   
2    2002 2002-10-29  1610612753       5.0         3          4      -7.0   
3    2002 2002-10-29  1610612758       9.0        -3          5       4.0   
4    2002 2002-10-30  1610612754       6.0         4          1      -4.0   

   FTA_DIFF  OREB_DIFF  DREB_DIFF  AST_DIFF  STL_DIFF  TOV_DIFF  PF_DIFF  \
0     -16.0          6        -10         1        -5        -2      8.0   
1     -16.0          6        -10         1        -5        -2      8.0   
2     -10.0         -3         -4         5        -6        -5      4.0   
3       3.0         -2          4         7         5        -4     -1.0   
4       1.0          2         -2         9         5        -1      6.0   

   PTS_DIFF  WIN  HOME_GAME  
0        -5    0          1  
1        -5    0    

In [86]:
# save data to csv file
# only used seasons 2002-2018 to take out the bubble season
nba_combined_df.to_csv("../../data/LSTMGames.csv", index=False)