<a href="https://colab.research.google.com/github/WillKWL/Project-NBASeason/blob/master/1_load_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Does performance in regular season predict playoff ranking? 
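- supervised offline classification task (originally framed as multi-class playoff ranking; the label built in this notebook is binary: champion = 1, every other team = 0)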
- performance measure: confusion matrix
- data source: https://www.nba.com/stats/teams/traditional/?sort=W&dir=-1&Season=2021-22&SeasonType=Regular%20Season 

In [None]:
!pip install nba_api

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nba_api
  Downloading nba_api-1.1.11.tar.gz (125 kB)
[K     |████████████████████████████████| 125 kB 9.1 MB/s 
Building wheels for collected packages: nba-api
  Building wheel for nba-api (setup.py) ... [?25l[?25hdone
  Created wheel for nba-api: filename=nba_api-1.1.11-py3-none-any.whl size=251504 sha256=dd41d8619f61ace374de5a5bba4944885a791e78c6b812e9ed026863303f99bd
  Stored in directory: /root/.cache/pip/wheels/e3/f4/46/996128675381abc175e306642bae575d372dcf7efc4683e551
Successfully built nba-api
Installing collected packages: nba-api
Successfully installed nba-api-1.1.11


In [None]:
import pandas as pd
from nba_api.stats.endpoints import leaguedashteamstats
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
import os
from tqdm import tqdm
import numpy as np
from zlib import crc32

# Gather data with [nba api](https://github.com/swar/nba_api)
- stats adjusted per 100 possessions, instead of per game, to account for the different pace each team plays at
- [glossary](https://www.nba.com/stats/help/glossary/) for column names  
- extra data: [advanced team stats](https://www.nba.com/stats/teams/advanced/?sort=W&dir=-1) 



In [None]:
per_mode = 'Per100Possessions'

def get_team_table(measure_type, season, season_type):
  """Request one LeagueDashTeamStats table and return it as a DataFrame.

  Args:
    measure_type: stat family to request; this notebook passes "Base" or "Opponent".
    season: season label such as "1996-97".
    season_type: "Regular Season" or "Playoffs" in this notebook.

  Returns:
    The first data frame of the endpoint response. Stats are requested with
    the module-level ``per_mode`` setting.
  """
  # documentation: https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/endpoints/leaguedashteamstats.md
  endpoint = leaguedashteamstats.LeagueDashTeamStats(
      season=season,
      season_type_all_star=season_type,
      measure_type_detailed_defense=measure_type,
      per_mode_detailed=per_mode)
  tables = endpoint.get_data_frames()
  return tables[0]

In [None]:
def season_vs_playoff(start_year, end_year):
  """Build a table of regular-season team stats labelled with the playoff outcome.

  For every season starting in ``range(start_year, end_year)`` (``end_year`` is
  exclusive) the "Base" and "Opponent" regular-season tables are joined on
  TEAM_ID, and the team with the most playoff wins is labelled as champion.

  Args:
    start_year: start year of the first season, e.g. 1996 for "1996-97".
    end_year: season start year to stop before.

  Returns:
    DataFrame with one row per team and season: the regular-season columns
    (Opponent duplicates carry a "_y" suffix), PLAYOFF_RANKING (int, 1 for the
    champion and 0 otherwise) and SEASON. Rows are ordered champion-first within
    a season and the index restarts at 0 for every season. Returns None when the
    year range is empty.
  """
  season_list = [str(i) + '-' + str(i+1)[2:] for i in range(start_year, end_year)]
  season_frames = []  # collected per season and concatenated once at the end

  for season in tqdm(season_list):
    season_df_base = get_team_table("Base", season, "Regular Season")
    season_df_opponent = get_team_table("Opponent", season, "Regular Season")
    season_df = season_df_base.merge(season_df_opponent, on = "TEAM_ID", how = "inner", suffixes = ("", "_y"))

    # Only the playoff win totals are needed for the label, so the "Base"
    # playoff table is enough (no need to request the "Opponent" one as well).
    playoff_df = get_team_table("Base", season, "Playoffs")

    # winning championship = 1, else = 0
    # The champion is the team with the most playoff wins. A finer multi-class
    # label (rounds won) would have to account for the best-of-5 first round
    # used up to 2001-02, where 15 rather than 16 wins took the title.
    is_champion = playoff_df['W'] == playoff_df['W'].max()
    # .assign builds a new frame, so nothing is written into a slice of playoff_df
    champion_df = playoff_df.loc[is_champion, ['TEAM_ID']].assign(PLAYOFF_RANKING=1)

    labelled_df = pd.merge(season_df, champion_df, on='TEAM_ID', how = 'outer')
    # teams which are not the champion have no match and get 0 as PLAYOFF_RANKING;
    # only the label column is filled so missing stats are not silently turned into 0
    labelled_df['PLAYOFF_RANKING'] = labelled_df['PLAYOFF_RANKING'].fillna(0).astype(int)
    labelled_df['SEASON'] = season

    season_frames.append(
        labelled_df.sort_values(by='PLAYOFF_RANKING', ascending = False).reset_index(drop=True))

  if not season_frames:
    return None
  return pd.concat(season_frames, axis = 0)

In [None]:
# seasons 1996-97 through 2021-22 (the end year is exclusive)
df = season_vs_playoff(1996, 2022)
df

100%|██████████| 26/26 [01:16<00:00,  2.93s/it]


Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,GP_RANK,W_RANK,L_RANK,W_PCT_RANK,MIN_RANK,FGM_RANK,FGA_RANK,FG_PCT_RANK,FG3M_RANK,FG3A_RANK,FG3_PCT_RANK,FTM_RANK,FTA_RANK,FT_PCT_RANK,OREB_RANK,DREB_RANK,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,CFID,CFPARAMS,TEAM_NAME_y,GP_y,W_y,L_y,W_PCT_y,MIN_y,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS,PLUS_MINUS_y,GP_RANK_y,W_RANK_y,L_RANK_y,W_PCT_RANK_y,MIN_RANK_y,OPP_FGM_RANK,OPP_FGA_RANK,OPP_FG_PCT_RANK,OPP_FG3M_RANK,OPP_FG3A_RANK,OPP_FG3_PCT_RANK,OPP_FTM_RANK,OPP_FTA_RANK,OPP_FT_PCT_RANK,OPP_OREB_RANK,OPP_DREB_RANK,OPP_REB_RANK,OPP_AST_RANK,OPP_TOV_RANK,OPP_STL_RANK,OPP_BLK_RANK,OPP_BLKA_RANK,OPP_PF_RANK,OPP_PFD_RANK,OPP_PTS_RANK,PLUS_MINUS_RANK_y,CFID_y,CFPARAMS_y,PLAYOFF_RANKING,SEASON
0,1610612741,Chicago Bulls,82,69,13,0.841,52.5,43.6,92.0,0.473,7.0,18.7,0.373,18.4,24.6,0.747,16.4,32.7,49.1,28.5,14.7,9.5,4.4,3.8,21.5,0.2,112.4,11.8,1,1,1,1,13,1,1,3,11,10,6,26,28,12,2,3,1,2,2,6,26,1,2,7,1,1,10,Chicago Bulls,Chicago Bulls,82,69,13,0.841,52.5,38.5,88.4,0.436,6.3,18.7,0.335,17.3,23.5,0.737,14.5,29.3,43.8,21.5,17.2,8.3,3.8,4.4,23.0,21.5,100.7,-11.8,1,1,1,1,13,10,25,4,11,17,1,4,4,15,24,5,10,5,14,6,1,26,9,28,5,1,10,Chicago Bulls,1,1996-97
1,1610612737,Atlanta Hawks,82,56,26,0.683,54.2,38.5,86.3,0.446,9.0,25.1,0.360,20.4,26.8,0.763,14.0,32.2,46.2,21.3,16.8,9.6,5.8,4.5,21.8,0.1,106.4,6.1,1,7,7,7,4,18,11,20,2,2,15,15,17,4,11,7,4,28,13,5,9,6,4,27,8,4,10,Atlanta Hawks,Atlanta Hawks,82,56,26,0.683,54.2,38.4,88.3,0.435,6.6,19.2,0.347,16.9,22.9,0.737,14.3,30.4,44.8,22.8,17.3,8.3,4.5,5.8,23.9,21.8,100.3,-6.1,1,7,7,7,4,9,24,3,17,21,6,1,1,13,21,12,14,8,12,5,6,9,15,26,3,4,10,Atlanta Hawks,0,1996-97
2,1610612750,Minnesota Timberwolves,82,40,42,0.488,52.1,38.7,84.8,0.456,4.9,14.4,0.339,21.6,28.7,0.751,12.6,30.3,42.9,24.7,16.4,8.1,7.3,5.9,24.0,0.2,103.9,-1.6,1,15,15,15,16,16,21,12,26,25,25,6,7,9,24,17,27,9,7,24,2,24,17,18,16,17,10,Minnesota Timberwolves,Minnesota Timberwolves,82,40,42,0.488,52.1,38.8,86.2,0.450,6.2,17.0,0.362,21.7,28.8,0.754,14.1,31.1,45.2,23.5,16.9,8.6,5.9,7.3,24.1,24.0,105.5,1.6,1,15,15,15,16,12,16,13,7,7,15,22,20,29,20,17,19,12,17,11,24,2,17,13,15,17,10,Minnesota Timberwolves,0,1996-97
3,1610612763,Vancouver Grizzlies,82,14,68,0.171,53.6,38.2,87.4,0.437,6.0,17.3,0.349,16.7,23.5,0.709,13.9,29.2,43.1,25.2,17.6,8.9,6.3,6.2,23.8,0.1,99.1,-11.4,1,29,29,29,5,21,4,26,19,19,23,29,29,27,13,26,26,7,20,13,4,27,13,28,29,29,10,Vancouver Grizzlies,Vancouver Grizzlies,82,14,68,0.171,53.6,41.9,88.8,0.472,6.8,18.6,0.365,19.9,27.3,0.727,16.3,32.9,49.2,26.7,17.0,9.8,6.2,6.3,21.6,23.8,110.5,11.4,1,29,29,29,5,27,26,26,21,16,19,13,14,4,29,29,29,27,16,27,27,4,1,17,28,29,10,Vancouver Grizzlies,0,1996-97
4,1610612762,Utah Jazz,82,64,18,0.780,52.7,41.6,82.7,0.504,4.4,12.0,0.370,24.7,32.1,0.769,11.8,32.0,43.9,29.2,16.7,9.9,5.6,5.0,26.3,0.2,112.4,9.6,1,2,2,2,12,2,28,1,27,29,8,1,1,3,28,8,20,1,10,3,12,9,26,6,2,2,10,Utah Jazz,Utah Jazz,82,64,18,0.780,52.7,36.0,82.3,0.438,6.9,19.7,0.352,23.9,31.8,0.750,12.7,27.9,40.7,21.2,17.7,8.5,5.0,5.6,27.2,26.3,102.8,-9.6,1,2,2,2,12,3,2,6,22,25,10,29,29,25,5,1,1,3,6,9,9,12,29,4,11,2,10,Utah Jazz,0,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,1610612739,Cleveland Cavaliers,82,44,38,0.537,49.5,40.9,87.1,0.469,12.0,33.7,0.355,17.3,22.7,0.760,10.5,35.0,45.5,25.9,14.8,7.3,4.3,4.8,18.0,20.6,111.0,2.2,1,14,14,14,6,17,26,9,19,21,15,14,8,21,13,12,10,7,27,20,23,16,2,8,20,13,10,Cleveland Cavaliers,Cleveland Cavaliers,82,44,38,0.537,49.5,40.6,89.8,0.452,12.1,34.2,0.353,15.5,20.0,0.774,10.8,33.4,44.1,24.8,13.8,8.7,4.8,4.3,20.6,18.0,108.8,-2.2,1,14,14,14,6,10,22,8,12,10,16,5,5,15,22,9,10,14,13,29,16,23,23,29,6,13,10,Cleveland Cavaliers,0,2021-22
26,1610612741,Chicago Bulls,82,46,36,0.561,48.6,42.1,87.7,0.480,10.7,29.1,0.369,17.7,21.7,0.813,8.8,34.0,42.7,24.1,13.0,7.2,4.1,5.0,19.0,18.3,112.7,-0.4,1,12,12,12,18,5,21,3,29,30,4,8,20,3,30,19,28,19,6,23,25,24,9,27,13,20,10,Chicago Bulls,Chicago Bulls,82,46,36,0.561,48.6,41.8,88.3,0.474,11.8,32.3,0.366,17.6,22.1,0.795,9.4,34.2,43.6,25.2,13.2,7.2,5.0,4.1,18.3,19.0,113.0,0.4,1,12,12,12,18,24,11,26,6,1,27,22,15,28,1,15,8,18,23,7,24,25,4,22,22,20,10,Chicago Bulls,0,2021-22
27,1610612766,Charlotte Hornets,82,43,39,0.524,47.8,42.2,90.1,0.468,13.7,37.6,0.365,15.6,21.1,0.740,10.7,33.2,43.9,27.7,13.1,8.5,4.8,4.5,19.6,19.3,113.6,0.4,1,16,16,16,26,4,4,11,6,9,6,27,24,27,11,23,21,2,9,6,13,10,13,22,8,16,10,Charlotte Hornets,Charlotte Hornets,82,43,39,0.524,47.8,41.4,88.8,0.466,13.8,38.1,0.362,16.6,21.8,0.763,11.2,35.1,46.3,26.1,14.8,7.4,4.5,4.8,19.3,19.6,113.2,-0.4,1,16,16,16,26,19,16,20,28,26,23,12,13,7,27,23,26,28,5,12,10,13,9,18,23,16,10,Charlotte Hornets,0,2021-22
28,1610612751,Brooklyn Nets,82,44,38,0.537,48.3,42.1,88.7,0.475,11.5,31.8,0.361,17.5,21.8,0.805,10.3,34.2,44.5,25.3,14.1,7.1,5.5,4.9,20.4,19.8,113.2,0.8,1,14,14,14,19,6,15,4,23,26,10,11,19,6,15,18,17,12,18,27,5,22,22,15,10,15,10,Brooklyn Nets,Brooklyn Nets,82,44,38,0.537,48.3,40.8,90.4,0.452,12.5,36.4,0.345,18.2,23.1,0.788,11.3,32.9,44.2,23.9,13.3,7.9,4.9,5.5,19.8,20.4,112.4,-0.8,1,14,14,14,19,13,27,7,17,23,8,25,24,25,29,3,11,6,21,20,22,5,16,9,20,15,10,Brooklyn Nets,0,2021-22


In [None]:
# save to google drive
from google.colab import drive, files
# the chdir below fails on a fresh runtime unless Drive has been mounted first
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks')
df.to_csv('nba_season_vs_playoff.csv', index = False)
# download local copy
files.download("nba_season_vs_playoff.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# reload the raw dataset from the copy stored in the GitHub repository
raw_csv_url = 'https://raw.githubusercontent.com/WillKWL/Project-NBASeason/master/data/raw/nba_season_vs_playoff.csv'
df = pd.read_csv(raw_csv_url)
df

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,GP_RANK,W_RANK,L_RANK,W_PCT_RANK,MIN_RANK,FGM_RANK,FGA_RANK,FG_PCT_RANK,FG3M_RANK,FG3A_RANK,FG3_PCT_RANK,FTM_RANK,FTA_RANK,FT_PCT_RANK,OREB_RANK,DREB_RANK,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,CFID,CFPARAMS,TEAM_NAME_y,GP_y,W_y,L_y,W_PCT_y,MIN_y,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS,PLUS_MINUS_y,GP_RANK_y,W_RANK_y,L_RANK_y,W_PCT_RANK_y,MIN_RANK_y,OPP_FGM_RANK,OPP_FGA_RANK,OPP_FG_PCT_RANK,OPP_FG3M_RANK,OPP_FG3A_RANK,OPP_FG3_PCT_RANK,OPP_FTM_RANK,OPP_FTA_RANK,OPP_FT_PCT_RANK,OPP_OREB_RANK,OPP_DREB_RANK,OPP_REB_RANK,OPP_AST_RANK,OPP_TOV_RANK,OPP_STL_RANK,OPP_BLK_RANK,OPP_BLKA_RANK,OPP_PF_RANK,OPP_PFD_RANK,OPP_PTS_RANK,PLUS_MINUS_RANK_y,CFID_y,CFPARAMS_y,PLAYOFF_RANKING,SEASON
0,1610612741,Chicago Bulls,82,69,13,0.841,52.5,43.6,92.0,0.473,7.0,18.7,0.373,18.4,24.6,0.747,16.4,32.7,49.1,28.5,14.7,9.5,4.4,3.8,21.5,0.2,112.4,11.8,1,1,1,1,13,1,1,3,11,10,6,26,28,12,2,3,1,2,2,6,26,1,2,7,1,1,10,Chicago Bulls,Chicago Bulls,82,69,13,0.841,52.5,38.5,88.4,0.436,6.3,18.7,0.335,17.3,23.5,0.737,14.5,29.3,43.8,21.5,17.2,8.3,3.8,4.4,23.0,21.5,100.7,-11.8,1,1,1,1,13,10,25,4,11,17,1,4,4,15,24,5,10,5,14,6,1,26,9,28,5,1,10,Chicago Bulls,1,1996-97
1,1610612737,Atlanta Hawks,82,56,26,0.683,54.2,38.5,86.3,0.446,9.0,25.1,0.360,20.4,26.8,0.763,14.0,32.2,46.2,21.3,16.8,9.6,5.8,4.5,21.8,0.1,106.4,6.1,1,7,7,7,4,18,11,20,2,2,15,15,17,4,11,7,4,28,13,5,9,6,4,27,8,4,10,Atlanta Hawks,Atlanta Hawks,82,56,26,0.683,54.2,38.4,88.3,0.435,6.6,19.2,0.347,16.9,22.9,0.737,14.3,30.4,44.8,22.8,17.3,8.3,4.5,5.8,23.9,21.8,100.3,-6.1,1,7,7,7,4,9,24,3,17,21,6,1,1,13,21,12,14,8,12,5,6,9,15,26,3,4,10,Atlanta Hawks,0,1996-97
2,1610612750,Minnesota Timberwolves,82,40,42,0.488,52.1,38.7,84.8,0.456,4.9,14.4,0.339,21.6,28.7,0.751,12.6,30.3,42.9,24.7,16.4,8.1,7.3,5.9,24.0,0.2,103.9,-1.6,1,15,15,15,16,16,21,12,26,25,25,6,7,9,24,17,27,9,7,24,2,24,17,18,16,17,10,Minnesota Timberwolves,Minnesota Timberwolves,82,40,42,0.488,52.1,38.8,86.2,0.450,6.2,17.0,0.362,21.7,28.8,0.754,14.1,31.1,45.2,23.5,16.9,8.6,5.9,7.3,24.1,24.0,105.5,1.6,1,15,15,15,16,12,16,13,7,7,15,22,20,29,20,17,19,12,17,11,24,2,17,13,15,17,10,Minnesota Timberwolves,0,1996-97
3,1610612763,Vancouver Grizzlies,82,14,68,0.171,53.6,38.2,87.4,0.437,6.0,17.3,0.349,16.7,23.5,0.709,13.9,29.2,43.1,25.2,17.6,8.9,6.3,6.2,23.8,0.1,99.1,-11.4,1,29,29,29,5,21,4,26,19,19,23,29,29,27,13,26,26,7,20,13,4,27,13,28,29,29,10,Vancouver Grizzlies,Vancouver Grizzlies,82,14,68,0.171,53.6,41.9,88.8,0.472,6.8,18.6,0.365,19.9,27.3,0.727,16.3,32.9,49.2,26.7,17.0,9.8,6.2,6.3,21.6,23.8,110.5,11.4,1,29,29,29,5,27,26,26,21,16,19,13,14,4,29,29,29,27,16,27,27,4,1,17,28,29,10,Vancouver Grizzlies,0,1996-97
4,1610612762,Utah Jazz,82,64,18,0.780,52.7,41.6,82.7,0.504,4.4,12.0,0.370,24.7,32.1,0.769,11.8,32.0,43.9,29.2,16.7,9.9,5.6,5.0,26.3,0.2,112.4,9.6,1,2,2,2,12,2,28,1,27,29,8,1,1,3,28,8,20,1,10,3,12,9,26,6,2,2,10,Utah Jazz,Utah Jazz,82,64,18,0.780,52.7,36.0,82.3,0.438,6.9,19.7,0.352,23.9,31.8,0.750,12.7,27.9,40.7,21.2,17.7,8.5,5.0,5.6,27.2,26.3,102.8,-9.6,1,2,2,2,12,3,2,6,22,25,10,29,29,25,5,1,1,3,6,9,9,12,29,4,11,2,10,Utah Jazz,0,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,1610612739,Cleveland Cavaliers,82,44,38,0.537,49.5,40.9,87.1,0.469,12.0,33.7,0.355,17.3,22.7,0.760,10.5,35.0,45.5,25.9,14.8,7.3,4.3,4.8,18.0,20.6,111.0,2.2,1,14,14,14,6,17,26,9,19,21,15,14,8,21,13,12,10,7,27,20,23,16,2,8,20,13,10,Cleveland Cavaliers,Cleveland Cavaliers,82,44,38,0.537,49.5,40.6,89.8,0.452,12.1,34.2,0.353,15.5,20.0,0.774,10.8,33.4,44.1,24.8,13.8,8.7,4.8,4.3,20.6,18.0,108.8,-2.2,1,14,14,14,6,10,22,8,12,10,16,5,5,15,22,9,10,14,13,29,16,23,23,29,6,13,10,Cleveland Cavaliers,0,2021-22
768,1610612741,Chicago Bulls,82,46,36,0.561,48.6,42.1,87.7,0.480,10.7,29.1,0.369,17.7,21.7,0.813,8.8,34.0,42.7,24.1,13.0,7.2,4.1,5.0,19.0,18.3,112.7,-0.4,1,12,12,12,18,5,21,3,29,30,4,8,20,3,30,19,28,19,6,23,25,24,9,27,13,20,10,Chicago Bulls,Chicago Bulls,82,46,36,0.561,48.6,41.8,88.3,0.474,11.8,32.3,0.366,17.6,22.1,0.795,9.4,34.2,43.6,25.2,13.2,7.2,5.0,4.1,18.3,19.0,113.0,0.4,1,12,12,12,18,24,11,26,6,1,27,22,15,28,1,15,8,18,23,7,24,25,4,22,22,20,10,Chicago Bulls,0,2021-22
769,1610612766,Charlotte Hornets,82,43,39,0.524,47.8,42.2,90.1,0.468,13.7,37.6,0.365,15.6,21.1,0.740,10.7,33.2,43.9,27.7,13.1,8.5,4.8,4.5,19.6,19.3,113.6,0.4,1,16,16,16,26,4,4,11,6,9,6,27,24,27,11,23,21,2,9,6,13,10,13,22,8,16,10,Charlotte Hornets,Charlotte Hornets,82,43,39,0.524,47.8,41.4,88.8,0.466,13.8,38.1,0.362,16.6,21.8,0.763,11.2,35.1,46.3,26.1,14.8,7.4,4.5,4.8,19.3,19.6,113.2,-0.4,1,16,16,16,26,19,16,20,28,26,23,12,13,7,27,23,26,28,5,12,10,13,9,18,23,16,10,Charlotte Hornets,0,2021-22
770,1610612751,Brooklyn Nets,82,44,38,0.537,48.3,42.1,88.7,0.475,11.5,31.8,0.361,17.5,21.8,0.805,10.3,34.2,44.5,25.3,14.1,7.1,5.5,4.9,20.4,19.8,113.2,0.8,1,14,14,14,19,6,15,4,23,26,10,11,19,6,15,18,17,12,18,27,5,22,22,15,10,15,10,Brooklyn Nets,Brooklyn Nets,82,44,38,0.537,48.3,40.8,90.4,0.452,12.5,36.4,0.345,18.2,23.1,0.788,11.3,32.9,44.2,23.9,13.3,7.9,4.9,5.5,19.8,20.4,112.4,-0.8,1,14,14,14,19,13,27,7,17,23,8,25,24,25,29,3,11,6,21,20,22,5,16,9,20,15,10,Brooklyn Nets,0,2021-22


In [None]:
# one-row overview of the dtype of every column
df.dtypes.to_frame().T

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,GP_RANK,W_RANK,L_RANK,W_PCT_RANK,MIN_RANK,FGM_RANK,FGA_RANK,FG_PCT_RANK,FG3M_RANK,FG3A_RANK,FG3_PCT_RANK,FTM_RANK,FTA_RANK,FT_PCT_RANK,OREB_RANK,DREB_RANK,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,CFID,CFPARAMS,TEAM_NAME_y,GP_y,W_y,L_y,W_PCT_y,MIN_y,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS,PLUS_MINUS_y,GP_RANK_y,W_RANK_y,L_RANK_y,W_PCT_RANK_y,MIN_RANK_y,OPP_FGM_RANK,OPP_FGA_RANK,OPP_FG_PCT_RANK,OPP_FG3M_RANK,OPP_FG3A_RANK,OPP_FG3_PCT_RANK,OPP_FTM_RANK,OPP_FTA_RANK,OPP_FT_PCT_RANK,OPP_OREB_RANK,OPP_DREB_RANK,OPP_REB_RANK,OPP_AST_RANK,OPP_TOV_RANK,OPP_STL_RANK,OPP_BLK_RANK,OPP_BLKA_RANK,OPP_PF_RANK,OPP_PFD_RANK,OPP_PTS_RANK,PLUS_MINUS_RANK_y,CFID_y,CFPARAMS_y,PLAYOFF_RANKING,SEASON
0,int64,object,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,object,object,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,object,int64,object


In [None]:
# remove duplicate columns and ranking columns since we will clean the features later on
# "_y" columns are the duplicates from the Opponent merge and "*_RANK" columns are
# league ranks; PLAYOFF_RANKING also contains "RANK" but is the label, so it is kept
keep_columns = [col for col in df.columns
                if 'RANKING' in col or not ('RANK' in col or '_y' in col)]
# .copy() gives an independent frame, so assigning columns in later cells
# no longer raises SettingWithCopyWarning
df = df.loc[:, keep_columns].copy()
df

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,CFID,CFPARAMS,OPP_FGM,OPP_FGA,OPP_FG_PCT,OPP_FG3M,OPP_FG3A,OPP_FG3_PCT,OPP_FTM,OPP_FTA,OPP_FT_PCT,OPP_OREB,OPP_DREB,OPP_REB,OPP_AST,OPP_TOV,OPP_STL,OPP_BLK,OPP_BLKA,OPP_PF,OPP_PFD,OPP_PTS,PLAYOFF_RANKING,SEASON
0,1610612741,Chicago Bulls,82,69,13,0.841,52.5,43.6,92.0,0.473,7.0,18.7,0.373,18.4,24.6,0.747,16.4,32.7,49.1,28.5,14.7,9.5,4.4,3.8,21.5,0.2,112.4,11.8,10,Chicago Bulls,38.5,88.4,0.436,6.3,18.7,0.335,17.3,23.5,0.737,14.5,29.3,43.8,21.5,17.2,8.3,3.8,4.4,23.0,21.5,100.7,1,1996-97
1,1610612737,Atlanta Hawks,82,56,26,0.683,54.2,38.5,86.3,0.446,9.0,25.1,0.360,20.4,26.8,0.763,14.0,32.2,46.2,21.3,16.8,9.6,5.8,4.5,21.8,0.1,106.4,6.1,10,Atlanta Hawks,38.4,88.3,0.435,6.6,19.2,0.347,16.9,22.9,0.737,14.3,30.4,44.8,22.8,17.3,8.3,4.5,5.8,23.9,21.8,100.3,0,1996-97
2,1610612750,Minnesota Timberwolves,82,40,42,0.488,52.1,38.7,84.8,0.456,4.9,14.4,0.339,21.6,28.7,0.751,12.6,30.3,42.9,24.7,16.4,8.1,7.3,5.9,24.0,0.2,103.9,-1.6,10,Minnesota Timberwolves,38.8,86.2,0.450,6.2,17.0,0.362,21.7,28.8,0.754,14.1,31.1,45.2,23.5,16.9,8.6,5.9,7.3,24.1,24.0,105.5,0,1996-97
3,1610612763,Vancouver Grizzlies,82,14,68,0.171,53.6,38.2,87.4,0.437,6.0,17.3,0.349,16.7,23.5,0.709,13.9,29.2,43.1,25.2,17.6,8.9,6.3,6.2,23.8,0.1,99.1,-11.4,10,Vancouver Grizzlies,41.9,88.8,0.472,6.8,18.6,0.365,19.9,27.3,0.727,16.3,32.9,49.2,26.7,17.0,9.8,6.2,6.3,21.6,23.8,110.5,0,1996-97
4,1610612762,Utah Jazz,82,64,18,0.780,52.7,41.6,82.7,0.504,4.4,12.0,0.370,24.7,32.1,0.769,11.8,32.0,43.9,29.2,16.7,9.9,5.6,5.0,26.3,0.2,112.4,9.6,10,Utah Jazz,36.0,82.3,0.438,6.9,19.7,0.352,23.9,31.8,0.750,12.7,27.9,40.7,21.2,17.7,8.5,5.0,5.6,27.2,26.3,102.8,0,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,1610612739,Cleveland Cavaliers,82,44,38,0.537,49.5,40.9,87.1,0.469,12.0,33.7,0.355,17.3,22.7,0.760,10.5,35.0,45.5,25.9,14.8,7.3,4.3,4.8,18.0,20.6,111.0,2.2,10,Cleveland Cavaliers,40.6,89.8,0.452,12.1,34.2,0.353,15.5,20.0,0.774,10.8,33.4,44.1,24.8,13.8,8.7,4.8,4.3,20.6,18.0,108.8,0,2021-22
768,1610612741,Chicago Bulls,82,46,36,0.561,48.6,42.1,87.7,0.480,10.7,29.1,0.369,17.7,21.7,0.813,8.8,34.0,42.7,24.1,13.0,7.2,4.1,5.0,19.0,18.3,112.7,-0.4,10,Chicago Bulls,41.8,88.3,0.474,11.8,32.3,0.366,17.6,22.1,0.795,9.4,34.2,43.6,25.2,13.2,7.2,5.0,4.1,18.3,19.0,113.0,0,2021-22
769,1610612766,Charlotte Hornets,82,43,39,0.524,47.8,42.2,90.1,0.468,13.7,37.6,0.365,15.6,21.1,0.740,10.7,33.2,43.9,27.7,13.1,8.5,4.8,4.5,19.6,19.3,113.6,0.4,10,Charlotte Hornets,41.4,88.8,0.466,13.8,38.1,0.362,16.6,21.8,0.763,11.2,35.1,46.3,26.1,14.8,7.4,4.5,4.8,19.3,19.6,113.2,0,2021-22
770,1610612751,Brooklyn Nets,82,44,38,0.537,48.3,42.1,88.7,0.475,11.5,31.8,0.361,17.5,21.8,0.805,10.3,34.2,44.5,25.3,14.1,7.1,5.5,4.9,20.4,19.8,113.2,0.8,10,Brooklyn Nets,40.8,90.4,0.452,12.5,36.4,0.345,18.2,23.1,0.788,11.3,32.9,44.2,23.9,13.3,7.9,4.9,5.5,19.8,20.4,112.4,0,2021-22


In [None]:
# fix datatypes
# one astype call returns a new frame; assigning the columns one by one into df
# (which comes from a .loc selection) raised SettingWithCopyWarning
df = df.astype({
    'TEAM_ID': str,
    'CFID': str,
    'PLAYOFF_RANKING': 'category',
    'SEASON': 'category',
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

# Train test split with unique identifier = TEAM_ID + SEASON
- hashing unique identifier
- alternative: stratified sampling by season

In [None]:
# unique id = team id + season, e.g. "1610612741" + "199697"
# assign returns a new frame instead of writing into df, which avoids SettingWithCopyWarning;
# regex=False because '-' is meant as a literal character
df = df.assign(UNIQUE_ID = df['TEAM_ID'].astype(str) + df['SEASON'].str.replace('-', '', regex=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# instead of train-test_split by random shuffling, hash the unique identifier for train-test split
# slight modification to the technique from Hands On Machine Learning by Aurélien Géron
def split_data_with_id_hash(data, test_ratio, id_column):
  """Split ``data`` into train and test sets by hashing an identifier column.

  Each identifier is converted with ``np.int64`` and hashed with ``crc32``; rows
  whose hash is strictly below the ``test_ratio`` quantile of all hashes go to
  the test set.

  Note: the threshold is a quantile of the hashes of the rows passed in, so it
  gives a test share close to ``test_ratio`` on this data, but it moves when
  rows are added and can then reassign existing rows between the two sets.

  Args:
    data: DataFrame to split.
    test_ratio: target fraction of rows for the test set (between 0 and 1).
    id_column: name of the column holding identifiers accepted by ``np.int64``.

  Returns:
    Tuple ``(train_set, test_set)``; both are empty when ``data`` is empty.
  """
  if len(data) == 0:
    # a quantile of zero hashes is undefined, so there is nothing to split
    return data.iloc[:0], data.iloc[:0]
  # hash every id once and reuse the result for both the threshold and the mask
  hashes = data[id_column].apply(lambda x: crc32(np.int64(x)))
  threshold = np.quantile(hashes, test_ratio) # instead of using test_ratio * 2**32 as the threshold, this is more accurate
  in_test_set = hashes < threshold
  return data.loc[~in_test_set], data.loc[in_test_set]

In [None]:
# hold out roughly 20% of the team-seasons as the test set
train_set, test_set = split_data_with_id_hash(data=df, test_ratio=0.2, id_column="UNIQUE_ID")

In [None]:
# share of rows that ended up in each set
train_share = len(train_set) / len(df)
test_share = len(test_set) / len(df)
print(f"size of full dataset: {df.shape}")
print(f"train-test ratio: {train_share:0.2} / {test_share:0.2}")

size of full dataset: (772, 53)
train-test ratio: 0.8 / 0.2


In [None]:
# save to google drive
from google.colab import drive, files
# the chdir below fails on a fresh runtime unless Drive has been mounted first
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks')
# pickle keeps the column dtypes (e.g. category) that a CSV round trip would lose
train_set.to_pickle('nba_season_vs_playoff_train.pkl')
test_set.to_pickle('nba_season_vs_playoff_test.pkl')
# download local copy
files.download("nba_season_vs_playoff_train.pkl")
files.download("nba_season_vs_playoff_test.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>