In [2]:
from nba_api.stats.static import teams

nba_teams = teams.get_teams()
# Select the dictionary for the Celtics, which contains their team ID
celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]
celtics_id = celtics['id']

In [3]:
from nba_api.stats.endpoints import leaguegamefinder

In [4]:
# Query for games where the Celtics were playing
gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable='2023-24') # first season to select data from
# The first DataFrame of those returned is what we want.
games = gamefinder.get_data_frames()[0]

# generate some info about games: 

print("Shape: {}".format(games.shape)) # Number of games played

print("Stats tracked: {}".format(games.columns)) # Stats tracked

Shape: (2893, 28)
Stats tracked: Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')


In [5]:
print(games.isna().sum()) # Where gaps in data exist

SEASON_ID            0
TEAM_ID              0
TEAM_ABBREVIATION    0
TEAM_NAME            0
GAME_ID              0
GAME_DATE            0
MATCHUP              0
WL                   1
MIN                  0
PTS                  0
FGM                  0
FGA                  0
FG_PCT               1
FG3M                 0
FG3A                 0
FG3_PCT              1
FTM                  0
FTA                  0
FT_PCT               2
OREB                 0
DREB                 0
REB                  0
AST                  0
STL                  0
BLK                  0
TOV                  0
PF                   0
PLUS_MINUS           0
dtype: int64


In [6]:
print(games['SEASON_ID'].value_counts()) # Number of seasons + subseasons in the data

SEASON_ID
22023    2234
52023     498
12023     151
32023       8
62023       2
Name: count, dtype: int64


In [7]:
games.head() # First few rows of the data; games are mirrored from each team's perspective

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,32023,1610616847,PAU,Team Pau,32300005,2024-02-16,PAU vs. DLF,L,61,36,...,1.0,4,10,14,10,4,3,6,3,2.0
1,32023,1610616850,TAM,Team Tamika,32300004,2024-02-16,TAM vs. JAL,L,60,35,...,0.667,3,8,11,8,7,1,3,3,-12.0
2,32023,1610616848,DLF,Team Detlef,32300006,2024-02-16,DLF @ JAL,L,35,13,...,,3,2,5,5,1,1,5,2,-7.0
3,32023,1610616849,JAL,Team Jalen,32300004,2024-02-16,JAL @ TAM,W,60,40,...,0.429,3,16,19,7,1,0,9,3,12.0
4,32023,1610616840,STA,Team Stephen A,32300003,2024-02-16,STA vs. SHN,L,241,91,...,0.615,14,26,40,12,9,4,12,9,-9.0


In [8]:
games['MATCHUP'] # Determines whether the team is at home or not via the '@' or 'vs.' symbol 

0       PAU vs. DLF
1       TAM vs. JAL
2         DLF @ JAL
3         JAL @ TAM
4       STA vs. SHN
           ...     
2888    GLI vs. PER
2889      PER @ GLI
2890    GLI vs. PER
2891    USA vs. PUR
2892      PUR @ USA
Name: MATCHUP, Length: 2893, dtype: object

In [9]:
nba_teams = teams.get_teams()

nba_teams = teams.get_teams()
# Select the dictionary for the Celtics, which contains their team ID
spurs = [team for team in nba_teams if team['abbreviation'] == 'SAS'][0]
spurs_id = spurs['id']

In [10]:
gamefinder_spurs = leaguegamefinder.LeagueGameFinder(team_id_nullable = [spurs_id])
games_spurs = gamefinder_spurs.get_data_frames()[0]
print(games_spurs.shape)
games_spurs.head()

(3832, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22023,1610612759,SAS,San Antonio Spurs,22300785,2024-02-14,SAS @ DAL,L,239,93,...,0.733,13,31,44,29,8,7,11,9,-23.0
1,22023,1610612759,SAS,San Antonio Spurs,22300764,2024-02-12,SAS @ TOR,W,239,122,...,0.826,5,41,46,37,12,13,17,14,23.0
2,22023,1610612759,SAS,San Antonio Spurs,22300750,2024-02-10,SAS @ BKN,L,240,103,...,0.952,15,26,41,20,5,3,14,11,-20.0
3,22023,1610612759,SAS,San Antonio Spurs,22300734,2024-02-08,SAS @ ORL,L,239,111,...,0.941,4,23,27,29,7,5,14,16,-16.0
4,22023,1610612759,SAS,San Antonio Spurs,22300730,2024-02-07,SAS @ MIA,L,240,104,...,0.588,6,40,46,30,6,2,14,21,-12.0


In [11]:
games_spurs['SEASON_ID'].value_counts() # select seasons that start with '2'

SEASON_ID
22015    92
22016    90
22018    90
22021    90
22017    90
         ..
41987     3
41985     3
12011     2
52020     1
52021     1
Name: count, Length: 95, dtype: int64

In [12]:
games_spurs = games_spurs[games_spurs['SEASON_ID'] == '22015'].head(82) # take regular season games
games_spurs.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
742,22015,1610612759,SAS,San Antonio Spurs,21501223,2016-04-13,SAS @ DAL,W,240,96,...,0.769,8,34,42,24,8,3,14,19,5.0
743,22015,1610612759,SAS,San Antonio Spurs,21501215,2016-04-12,SAS vs. OKC,W,264,102,...,0.75,11,31,42,15,12,8,11,22,4.0
744,22015,1610612759,SAS,San Antonio Spurs,21501201,2016-04-10,SAS vs. GSW,L,239,86,...,0.789,18,35,53,21,10,5,11,18,-6.0
745,22015,1610612759,SAS,San Antonio Spurs,21501186,2016-04-08,SAS @ DEN,L,240,98,...,0.821,8,37,45,20,6,8,10,12,-4.0
746,22015,1610612759,SAS,San Antonio Spurs,21501177,2016-04-07,SAS @ GSW,L,241,101,...,0.696,5,27,32,28,10,2,12,11,-11.0


In [13]:
import numpy as np
import pandas as pd

In [14]:
raw_df = pd.read_csv('raw/raw.csv')

In [15]:
print(raw_df.shape)
raw_df.head()

(92430, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,21983,1610612744,GOS,Golden State Warriors,28300938,1984-04-15,GOS vs. DAL,W,240,98,...,0.571,21.0,29.0,50,20,14.0,4,24,19,
1,21983,1610612759,SAN,San Antonio Spurs,28300943,1984-04-15,SAN vs. DEN,W,240,157,...,0.735,10.0,38.0,48,50,8.0,8,18,21,
2,21983,1610612738,BOS,Boston Celtics,28300941,1984-04-15,BOS vs. NJN,W,240,118,...,0.84,13.0,37.0,50,31,8.0,6,17,17,
3,21983,1610612742,DAL,Dallas Mavericks,28300938,1984-04-15,DAL @ GOS,L,240,96,...,0.625,19.0,24.0,43,17,8.0,0,24,18,
4,21983,1610612751,NJN,New Jersey Nets,28300941,1984-04-15,NJN @ BOS,L,240,111,...,0.632,13.0,25.0,38,26,11.0,3,14,22,


In [16]:
print(raw_df[raw_df['SEASON_ID'].astype(str).str.startswith('2')].shape)
raw_df[raw_df['SEASON_ID'].astype(str).str.startswith('2')]

(92430, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,21983,1610612744,GOS,Golden State Warriors,28300938,1984-04-15,GOS vs. DAL,W,240,98,...,0.571,21.0,29.0,50,20,14.0,4,24,19,
1,21983,1610612759,SAN,San Antonio Spurs,28300943,1984-04-15,SAN vs. DEN,W,240,157,...,0.735,10.0,38.0,48,50,8.0,8,18,21,
2,21983,1610612738,BOS,Boston Celtics,28300941,1984-04-15,BOS vs. NJN,W,240,118,...,0.840,13.0,37.0,50,31,8.0,6,17,17,
3,21983,1610612742,DAL,Dallas Mavericks,28300938,1984-04-15,DAL @ GOS,L,240,96,...,0.625,19.0,24.0,43,17,8.0,0,24,18,
4,21983,1610612751,NJN,New Jersey Nets,28300941,1984-04-15,NJN @ BOS,L,240,111,...,0.632,13.0,25.0,38,26,11.0,3,14,22,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92425,22023,1610612741,CHI,Chicago Bulls,22300070,2023-10-25,CHI vs. OKC,L,242,104,...,0.778,13.0,29.0,42,26,10.0,3,12,16,-20.0
92426,22023,1610612744,GSW,Golden State Warriors,22300062,2023-10-24,GSW vs. PHX,L,240,104,...,0.786,18.0,31.0,49,19,11.0,6,11,23,-4.0
92427,22023,1610612747,LAL,Los Angeles Lakers,22300061,2023-10-24,LAL @ DEN,L,239,107,...,0.750,13.0,31.0,44,23,5.0,4,11,18,-12.0
92428,22023,1610612743,DEN,Denver Nuggets,22300061,2023-10-24,DEN vs. LAL,W,240,119,...,0.750,9.0,33.0,42,29,9.0,6,11,15,12.0


In [17]:
raw_df[raw_df['SEASON_ID'] == 22022].value_counts('TEAM_ID') # assert regular season

TEAM_ID
1610612737    82
1610612738    82
1610612765    82
1610612764    82
1610612763    82
1610612762    82
1610612761    82
1610612760    82
1610612759    82
1610612758    82
1610612757    82
1610612756    82
1610612755    82
1610612754    82
1610612753    82
1610612752    82
1610612751    82
1610612750    82
1610612749    82
1610612748    82
1610612747    82
1610612746    82
1610612745    82
1610612744    82
1610612743    82
1610612742    82
1610612741    82
1610612740    82
1610612739    82
1610612766    82
Name: count, dtype: int64

In [18]:
raw_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

Are redundant columns. Keeping them for time being.

CHECKING THAT DATA WAS CLEANED WELL

In [19]:
clean_df = pd.read_csv('processed/scaled/clean.csv')
clean_df.head()

Unnamed: 0,ID,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,IS_HOME
0,40000002,22000,1610612752,NYK,New York Knicks,20000001,2000-10-31,NYK vs. PHI,L,0.012658,...,0.40625,0.257143,0.319149,0.212121,0.333333,0.173913,0.678571,0.65625,0.177778,1
1,40000003,22000,1610612755,PHI,Philadelphia 76ers,20000001,2000-10-31,PHI @ NYK,W,0.037975,...,0.21875,0.428571,0.319149,0.606061,0.555556,0.217391,0.357143,0.46875,0.822222,0
2,40000004,22000,1610612751,NJN,New Jersey Nets,20000002,2000-10-31,NJN vs. CLE,L,0.025316,...,0.34375,0.6,0.531915,0.515152,0.5,0.347826,0.321429,0.6875,0.455556,1
3,40000005,22000,1610612739,CLE,Cleveland Cavaliers,20000002,2000-10-31,CLE @ NJN,W,0.037975,...,0.3125,0.771429,0.638298,0.272727,0.277778,0.347826,0.571429,0.5625,0.544444,0
4,40000006,22000,1610612753,ORL,Orlando Magic,20000003,2000-10-31,ORL vs. WAS,W,0.037975,...,0.21875,0.428571,0.319149,0.393939,0.555556,0.391304,0.428571,0.46875,0.622222,1


In [20]:
print(raw_df.shape)
print(clean_df.shape)

(92430, 28)
(92430, 30)


In [21]:
print(raw_df.dropna().shape)
print(clean_df.dropna().shape)

(65287, 28)
(91415, 30)


Above comparison shows that using cleaned data allows us to make use of far more data points. 

STREAKING

In [22]:
streak_df = pd.read_csv('processed/unscaled/streak.csv')
streak_df_scaled = pd.read_csv('processed/scaled/streak.csv')
print(streak_df.shape)
print(streak_df_scaled.shape)

(92430, 209)
(92430, 209)


In [23]:
streak_df.columns

Index(['Unnamed: 0', 'SEASON_ID_A', 'GAME_DATE_A', 'TEAM_ABBREVIATION_for_A',
       'TEAM_ABBREVIATION_against_A', 'TEAM_NAME_for_A', 'TEAM_NAME_against_A',
       'MATCHUP_for_A', 'MATCHUP_against_A', 'ID_A',
       ...
       'DREB_against_prev_0_B', 'REB_against_prev_0_B', 'AST_against_prev_0_B',
       'STL_against_prev_0_B', 'BLK_against_prev_0_B', 'TOV_against_prev_0_B',
       'PF_against_prev_0_B', 'PLUS_MINUS_against_prev_0_B',
       'IS_HOME_against_prev_0_B', 'GAME_ID_prev_0_B'],
      dtype='object', length=209)

In [24]:
streak_df = streak_df.dropna(subset=['TEAM_ID_for_prev_0_A', 'TEAM_ID_for_prev_0_B'])
streak_df_scaled = streak_df_scaled.dropna(subset=['TEAM_ID_for_prev_0_A', 'TEAM_ID_for_prev_0_B'])
print(streak_df.shape)
print(streak_df_scaled.shape)


(91128, 209)
(91128, 209)


In [25]:
streak_df.head()

Unnamed: 0.1,Unnamed: 0,SEASON_ID_A,GAME_DATE_A,TEAM_ABBREVIATION_for_A,TEAM_ABBREVIATION_against_A,TEAM_NAME_for_A,TEAM_NAME_against_A,MATCHUP_for_A,MATCHUP_against_A,ID_A,...,DREB_against_prev_0_B,REB_against_prev_0_B,AST_against_prev_0_B,STL_against_prev_0_B,BLK_against_prev_0_B,TOV_against_prev_0_B,PF_against_prev_0_B,PLUS_MINUS_against_prev_0_B,IS_HOME_against_prev_0_B,GAME_ID_prev_0_B
28,28,22000,2000-11-01,PHI,TOR,Philadelphia 76ers,Toronto Raptors,PHI vs. TOR,TOR @ PHI,40000030,...,34.0,45.0,21.0,7.0,6.0,12.0,27.0,9.0,0.0,20000005.0
29,29,22000,2000-11-01,TOR,PHI,Toronto Raptors,Philadelphia 76ers,TOR @ PHI,PHI vs. TOR,40000031,...,23.0,37.0,14.0,6.0,4.0,22.0,30.0,-29.0,1.0,20000001.0
32,32,22000,2000-11-01,CHH,WAS,Charlotte Hornets,Washington Wizards,CHH vs. WAS,WAS @ CHH,40000034,...,29.0,37.0,20.0,10.0,9.0,15.0,24.0,11.0,1.0,20000003.0
33,33,22000,2000-11-01,WAS,CHH,Washington Wizards,Charlotte Hornets,WAS @ CHH,CHH vs. WAS,40000035,...,18.0,29.0,14.0,9.0,2.0,13.0,31.0,-24.0,1.0,20000004.0
34,34,22000,2000-11-01,CLE,SAC,Cleveland Cavaliers,Sacramento Kings,CLE vs. SAC,SAC @ CLE,40000036,...,22.0,29.0,19.0,8.0,4.0,19.0,20.0,-19.0,1.0,20000006.0


In [26]:
streak_df_scaled.head()

Unnamed: 0.1,Unnamed: 0,SEASON_ID_A,GAME_DATE_A,TEAM_ABBREVIATION_for_A,TEAM_ABBREVIATION_against_A,TEAM_NAME_for_A,TEAM_NAME_against_A,MATCHUP_for_A,MATCHUP_against_A,ID_A,...,DREB_against_prev_0_B,REB_against_prev_0_B,AST_against_prev_0_B,STL_against_prev_0_B,BLK_against_prev_0_B,TOV_against_prev_0_B,PF_against_prev_0_B,PLUS_MINUS_against_prev_0_B,IS_HOME_against_prev_0_B,GAME_ID_prev_0_B
28,28,22000,2000-11-01,PHI,TOR,Philadelphia 76ers,Toronto Raptors,PHI vs. TOR,TOR @ PHI,40000030,...,0.571429,0.489362,0.424242,0.388889,0.26087,0.321429,0.5625,0.6,0.0,20000005.0
29,29,22000,2000-11-01,TOR,PHI,Toronto Raptors,Philadelphia 76ers,TOR @ PHI,PHI vs. TOR,40000031,...,0.257143,0.319149,0.212121,0.333333,0.173913,0.678571,0.65625,0.177778,1.0,20000001.0
32,32,22000,2000-11-01,CHH,WAS,Charlotte Hornets,Washington Wizards,CHH vs. WAS,WAS @ CHH,40000034,...,0.428571,0.319149,0.393939,0.555556,0.391304,0.428571,0.46875,0.622222,1.0,20000003.0
33,33,22000,2000-11-01,WAS,CHH,Washington Wizards,Charlotte Hornets,WAS @ CHH,CHH vs. WAS,40000035,...,0.114286,0.148936,0.212121,0.5,0.086957,0.357143,0.6875,0.233333,1.0,20000004.0
34,34,22000,2000-11-01,CLE,SAC,Cleveland Cavaliers,Sacramento Kings,CLE vs. SAC,SAC @ CLE,40000036,...,0.228571,0.148936,0.363636,0.444444,0.173913,0.571429,0.34375,0.288889,1.0,20000006.0


In [27]:
# Data sanity check
idx = 2000
# Assert first team's stats are in line
print(streak_df.iloc[idx]['WL_for_prev_0_A'])
print(streak_df.iloc[idx+1]['WL_for_prev_0_B'])
print(streak_df.iloc[idx]['WL_against_prev_0_A'])
print(streak_df.iloc[idx+1]['WL_against_prev_0_B'])

# Assert second team's stats are in line
print(streak_df.iloc[idx]['WL_for_prev_0_B'])
print(streak_df.iloc[idx+1]['WL_for_prev_0_A'])
print(streak_df.iloc[idx]['WL_against_prev_0_B'])
print(streak_df.iloc[idx+1]['WL_against_prev_0_A'])


0.5277777777777778
0.5277777777777778
0.4722222222222222
0.4722222222222222
0.3768115942028985
0.3768115942028985
0.6231884057971014
0.6231884057971014


In [28]:
# To revisit via streak.py
non_streak_features = ['SEASON_ID_A', 'SEASON_ID_B',
                       'GAME_DATE_A', 'GAME_DATE_B',
                       'TEAM_ABBREVIATION_for_A', 'TEAM_ABBREVIATION_against_A',
                       'TEAM_ABBREVIATION_for_B', 'TEAM_ABBREVIATION_against_B',
                       'TEAM_NAME_for_A', 'TEAM_NAME_against_A',
                        'TEAM_NAME_for_B', 'TEAM_NAME_against_B',
                        'MATCHUP_for_A', 'MATCHUP_against_A',
                        'MATCHUP_for_B', 'MATCHUP_against_B',
                        'ID_A', 'ID_B']

leave_out_cols = non_streak_features

leave_out_cols

['SEASON_ID_A',
 'SEASON_ID_B',
 'GAME_DATE_A',
 'GAME_DATE_B',
 'TEAM_ABBREVIATION_for_A',
 'TEAM_ABBREVIATION_against_A',
 'TEAM_ABBREVIATION_for_B',
 'TEAM_ABBREVIATION_against_B',
 'TEAM_NAME_for_A',
 'TEAM_NAME_against_A',
 'TEAM_NAME_for_B',
 'TEAM_NAME_against_B',
 'MATCHUP_for_A',
 'MATCHUP_against_A',
 'MATCHUP_for_B',
 'MATCHUP_against_B',
 'ID_A',
 'ID_B']

In [29]:
corrs = streak_df.drop(columns=leave_out_cols).corr()

In [30]:
cols_0 = [col for col in streak_df.columns if "_0" in col]
corrs.loc[cols_0, "WL_for_A"].sort_values(ascending=False)

PLUS_MINUS_for_prev_0_A        0.231095
PLUS_MINUS_against_prev_0_B    0.231085
WL_for_prev_0_A                0.219506
WL_against_prev_0_B            0.219506
FG_PCT_for_prev_0_A            0.122331
                                 ...   
FG_PCT_for_prev_0_B           -0.122331
WL_against_prev_0_A           -0.219506
WL_for_prev_0_B               -0.219506
PLUS_MINUS_against_prev_0_A   -0.231085
PLUS_MINUS_for_prev_0_B       -0.231095
Name: WL_for_A, Length: 96, dtype: float64

PREVIOUSLY USED FEATURES

In [31]:
# All features being used

# Old features
WL = ['WL_for_prev_0_A', 'WL_for_prev_0_B']
AST = ['AST_for_prev_0_A', 'AST_against_prev_0_A', 'AST_for_prev_0_B', 'AST_against_prev_0_B']
REB = ['REB_for_prev_0_A', 'REB_against_prev_0_A', 'REB_for_prev_0_B', 'REB_against_prev_0_B']
FG_PCT = ['FG_PCT_for_prev_0_A', 'FG_PCT_against_prev_0_A', 'FG_PCT_for_prev_0_B', 'FG_PCT_against_prev_0_B']
FG3_PCT = ['FG3_PCT_for_prev_0_A', 'FG3_PCT_against_prev_0_A', 'FG3_PCT_for_prev_0_B', 'FG3_PCT_against_prev_0_B']
PTS = ['PTS_for_prev_0_A', 'PTS_against_prev_0_A', 'PTS_for_prev_0_B', 'PTS_against_prev_0_B']
old_features = WL + AST + REB + FG_PCT + FG3_PCT + PTS


# New features
PLUS_MINUS = ['PLUS_MINUS_for_prev_0_A', 'PLUS_MINUS_for_prev_0_B']
BLK = ['BLK_for_prev_0_A', 'BLK_against_prev_0_A', 'BLK_for_prev_0_B', 'BLK_against_prev_0_B']
STL = ['STL_for_prev_0_A', 'STL_against_prev_0_A', 'STL_for_prev_0_B', 'STL_against_prev_0_B']
TOV = ['TOV_for_prev_0_A', 'TOV_against_prev_0_A', 'TOV_for_prev_0_B', 'TOV_against_prev_0_B']
PF = ['PF_for_prev_0_A', 'PF_against_prev_0_A', 'PF_for_prev_0_B', 'PF_against_prev_0_B']
MIN = ['MIN_for_prev_0_A', 'MIN_for_prev_0_B']
OREB = ['OREB_for_prev_0_A', 'OREB_against_prev_0_A', 'OREB_for_prev_0_B', 'OREB_against_prev_0_B']
DREB = ['DREB_for_prev_0_A', 'DREB_against_prev_0_A', 'DREB_for_prev_0_B', 'DREB_against_prev_0_B']
new_features1 = PLUS_MINUS + BLK + STL + TOV + PF + MIN + OREB + DREB


FGM = ['FGM_for_prev_0_A', 'FGM_against_prev_0_A', 'FGM_for_prev_0_B', 'FGM_against_prev_0_B']
FGA = ['FGA_for_prev_0_A', 'FGA_against_prev_0_A', 'FGA_for_prev_0_B', 'FGA_against_prev_0_B']
FG3M = ['FG3M_for_prev_0_A', 'FG3M_against_prev_0_A', 'FG3M_for_prev_0_B', 'FG3M_against_prev_0_B']
FG3A = ['FG3A_for_prev_0_A', 'FG3A_against_prev_0_A', 'FG3A_for_prev_0_B', 'FG3A_against_prev_0_B']
FTM = ['FTM_for_prev_0_A', 'FTM_against_prev_0_A', 'FTM_for_prev_0_B', 'FTM_against_prev_0_B']
FTA = ['FTA_for_prev_0_A', 'FTA_against_prev_0_A', 'FTA_for_prev_0_B', 'FTA_against_prev_0_B']
FT_PCT = ['FT_PCT_for_prev_0_A', 'FT_PCT_against_prev_0_A', 'FT_PCT_for_prev_0_B', 'FT_PCT_against_prev_0_B']
new_features2 = FGM + FGA + FG3M + FG3A + FTM + FTA + FT_PCT

IS_HOME = ['IS_HOME_for_prev_0_A', 'IS_HOME_for_prev_0_B',
                    'IS_HOME_against_prev_0_A', 'IS_HOME_against_prev_0_B',
                    'IS_HOME_for_A', 'IS_HOME_for_B',
                    'IS_HOME_against_A', 'IS_HOME_against_B']

In [32]:
# WL, FG_PCT, FG3_PCT, AST, REB
corrs.loc[old_features, "WL_for_A"].sort_values(ascending=False)

WL_for_prev_0_A             0.219506
FG_PCT_for_prev_0_A         0.122331
FG_PCT_against_prev_0_B     0.107509
REB_against_prev_0_B        0.088052
AST_for_prev_0_A            0.085865
PTS_for_prev_0_A            0.080828
AST_against_prev_0_B        0.079891
PTS_against_prev_0_B        0.075950
FG3_PCT_for_prev_0_A        0.058911
REB_for_prev_0_A            0.053213
FG3_PCT_against_prev_0_B    0.026421
FG3_PCT_against_prev_0_A   -0.026421
REB_for_prev_0_B           -0.053213
FG3_PCT_for_prev_0_B       -0.058911
PTS_against_prev_0_A       -0.075950
AST_against_prev_0_A       -0.079891
PTS_for_prev_0_B           -0.080828
AST_for_prev_0_B           -0.085865
REB_against_prev_0_A       -0.088052
FG_PCT_against_prev_0_A    -0.107509
FG_PCT_for_prev_0_B        -0.122331
WL_for_prev_0_B            -0.219506
Name: WL_for_A, dtype: float64

NEWLY USED FEATURES

In [33]:
# STL, BLK, TOV, PF, MIN, PLUS_MINUS
corrs.loc[new_features1, "WL_for_A"].sort_values(ascending=False)

PLUS_MINUS_for_prev_0_A    0.231095
BLK_against_prev_0_B       0.111880
DREB_for_prev_0_A          0.074332
DREB_against_prev_0_B      0.073900
STL_against_prev_0_B       0.062594
TOV_for_prev_0_B           0.053749
BLK_for_prev_0_A           0.044660
PF_for_prev_0_B            0.044241
PF_against_prev_0_A        0.028661
OREB_for_prev_0_B          0.028579
STL_for_prev_0_A           0.027306
OREB_against_prev_0_B      0.018926
TOV_against_prev_0_B       0.004373
MIN_for_prev_0_A           0.001513
MIN_for_prev_0_B          -0.001513
TOV_against_prev_0_A      -0.004373
OREB_against_prev_0_A     -0.018926
STL_for_prev_0_B          -0.027306
OREB_for_prev_0_A         -0.028579
PF_against_prev_0_B       -0.028661
PF_for_prev_0_A           -0.044241
BLK_for_prev_0_B          -0.044660
TOV_for_prev_0_A          -0.053749
STL_against_prev_0_A      -0.062594
DREB_against_prev_0_A     -0.073900
DREB_for_prev_0_B         -0.074332
BLK_against_prev_0_A      -0.111880
PLUS_MINUS_for_prev_0_B   -0

In [34]:
# FGM, FGA, FG3M, FG3A, FTM, FTA, FT_PCT, OREB, DREB
# These are the features that aren't being used well
corrs.loc[new_features2, "WL_for_A"].sort_values(ascending=False)

FGM_for_prev_0_A           0.060940
FGM_against_prev_0_B       0.060626
FTM_against_prev_0_B       0.053936
FTA_against_prev_0_B       0.052222
FT_PCT_for_prev_0_A        0.045329
FTM_for_prev_0_A           0.042716
FG3M_for_prev_0_A          0.031765
FTA_for_prev_0_A           0.029587
FGA_for_prev_0_B           0.020347
FG3A_for_prev_0_A          0.020093
FT_PCT_against_prev_0_B    0.008465
FG3M_against_prev_0_B      0.007391
FG3A_against_prev_0_A      0.000091
FGA_against_prev_0_B       0.000022
FGA_against_prev_0_A      -0.000022
FG3A_against_prev_0_B     -0.000091
FG3M_against_prev_0_A     -0.007391
FT_PCT_against_prev_0_A   -0.008465
FG3A_for_prev_0_B         -0.020093
FGA_for_prev_0_A          -0.020347
FTA_for_prev_0_B          -0.029587
FG3M_for_prev_0_B         -0.031765
FTM_for_prev_0_B          -0.042716
FT_PCT_for_prev_0_B       -0.045329
FTA_against_prev_0_A      -0.052222
FTM_against_prev_0_A      -0.053936
FGM_against_prev_0_A      -0.060626
FGM_for_prev_0_B          -0

In [35]:
# IS_HOME features
corrs.loc[IS_HOME, "WL_for_A"].sort_values(ascending=False)

IS_HOME_for_A               0.209815
IS_HOME_against_B           0.209815
IS_HOME_against_prev_0_A    0.025750
IS_HOME_for_prev_0_B        0.025750
IS_HOME_against_prev_0_B   -0.025750
IS_HOME_for_prev_0_A       -0.025750
IS_HOME_for_B              -0.209815
IS_HOME_against_A          -0.209815
Name: WL_for_A, dtype: float64

Sick correlations ... features with highest correlations:
* IS_HOME (.20)
* WL (.22)
* PLUS_MINUS (.23)
* FG_PCT (.13)
* BLK (.1)

In [36]:
best_features = IS_HOME + PLUS_MINUS + WL + FG_PCT + BLK

Finding best features via random forest (?)

BUILDING MODEL

In [37]:
# imports 
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [38]:
# features being used
features = IS_HOME + PLUS_MINUS
# features = best_features

print("Features used:")

# Use either streak_df or streak_df_scaled
data_used = streak_df_scaled.dropna().copy()

X_raw = data_used[features].copy() # need to figure out how to handle NaN's
print(X_raw.columns)
y = data_used["WL_for_A"].copy()

Features used:
Index(['IS_HOME_for_prev_0_A', 'IS_HOME_for_prev_0_B',
       'IS_HOME_against_prev_0_A', 'IS_HOME_against_prev_0_B', 'IS_HOME_for_A',
       'IS_HOME_for_B', 'IS_HOME_against_A', 'IS_HOME_against_B',
       'PLUS_MINUS_for_prev_0_A', 'PLUS_MINUS_for_prev_0_B'],
      dtype='object')


In [39]:
# Changing to np array
X = np.array(X_raw)
y = np.array(y)
X.shape, y.shape

((89198, 10), (89198,))

In [40]:
for state in range(0, 10):

    # Splitting data into training and testing data
    print(' * Splitting data ... ')
    X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2, random_state=state)

    # Model training
    print(' * Training model ... ')
    lr_model = LogisticRegression(max_iter=10000, random_state=state)
    lr_model.fit(X_train, y_train)

    # Get training accuracy
    print("Accuracy run {}: ".format(state+1))
    print("Train {:.10f}".format(accuracy_score(lr_model.predict(X_train), y_train)))
    
    # Get test accuracy
    y_logits = lr_model.predict(X_test)
    y_pred = y_logits.round()
    print("Test {:.10f}".format(accuracy_score(y_pred, y_true)))

 * Splitting data ... 
 * Training model ... 


Accuracy run 1: 
Train 0.6687687435
Test 0.6717488789
 * Splitting data ... 
 * Training model ... 
Accuracy run 2: 
Train 0.6688388127
Test 0.6711883408
 * Splitting data ... 
 * Training model ... 
Accuracy run 3: 
Train 0.6691331035
Test 0.6696748879
 * Splitting data ... 
 * Training model ... 
Accuracy run 4: 
Train 0.6686426189
Test 0.6720291480
 * Splitting data ... 
 * Training model ... 
Accuracy run 5: 
Train 0.6694694358
Test 0.6688901345
 * Splitting data ... 
 * Training model ... 
Accuracy run 6: 
Train 0.6688668404
Test 0.6705156951
 * Splitting data ... 
 * Training model ... 
Accuracy run 7: 
Train 0.6687126881
Test 0.6707959641
 * Splitting data ... 
 * Training model ... 
Accuracy run 8: 
Train 0.6687967712
Test 0.6721412556
 * Splitting data ... 
 * Training model ... 
Accuracy run 9: 
Train 0.6697777404
Test 0.6676008969
 * Splitting data ... 
 * Training model ... 
Accuracy run 10: 
Train 0.6692872558
Test 0.6695627803


In [41]:
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print("Precision: {}".format(precision)) # TP / (TP + FP)
print("Recall: {}".format(recall)) # TP / (TP + FN)
print("F1 Score: {}".format(f1)) # 2 * (precision * recall) / (precision + recall)

Precision: 0.6763881099270892
Recall: 0.6670353982300885
F1 Score: 0.6716791979949874


RANDOM FOREST

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [44]:
# Evaluate the model
# Predict on the test set
y_pred = rf_classifier.predict(X_test).round()

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6477017937219731


GRADINTBOOSTINGMACHINE

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_classifier.predict(X_test).round()

# Evaluate the model
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6715807174887892


SVM

In [51]:
from sklearn.svm import SVC

Building a LR model such that:
* training on everything but 2023-24 season data
* predicting 2023-24 season data

In [47]:
# features being used
features = IS_HOME + PLUS_MINUS
# features = best_features
features = features + ['SEASON_ID_A', 'GAME_DATE_A']

print("Features used:")

# Use either streak_df or streak_df_scaled
data_used = streak_df_scaled.dropna().copy()
data_used['GAME_DATE_A'] = pd.to_datetime(data_used['GAME_DATE_A'])

X_raw = data_used[features].copy() # need to figure out how to handle NaN's
print(X_raw.columns)
y = data_used["WL_for_A"].copy()

Features used:
Index(['IS_HOME_for_prev_0_A', 'IS_HOME_for_prev_0_B',
       'IS_HOME_against_prev_0_A', 'IS_HOME_against_prev_0_B', 'IS_HOME_for_A',
       'IS_HOME_for_B', 'IS_HOME_against_A', 'IS_HOME_against_B',
       'PLUS_MINUS_for_prev_0_A', 'PLUS_MINUS_for_prev_0_B', 'SEASON_ID_A',
       'GAME_DATE_A'],
      dtype='object')


In [48]:
from sklearn.svm import SVC

In [49]:
seasons_arr = np.arange(2015, 2024, 1)
res = []

for season in seasons_arr:

    # Season to train for
    season += 20000
    recency_bias = 10
    recency_weight = 1

    print(' * Training model data before {} season '.format(season - 20000))
    # Splitting data into training and testing data
    print(' * Splitting data ... ')
    X_train = X_raw[X_raw['SEASON_ID_A'] < season]
    y_train = y[X_raw['SEASON_ID_A'] < season]


    # Identify data in training set
    #cutoff_date = pd.to_datetime('2023-12-31')
    test_cond = (X_raw['SEASON_ID_A'] == season) # & (X_raw['GAME_DATE_A'] > cutoff_date)
    X_test = X_raw[test_cond]
    y_true = y[test_cond]

    print(' * Identifying recent data (past {} seasons)'.format(recency_bias))
    print(' * Counting recent data {} times more in training data'.format(recency_weight))
    # Identify recent data; doesn't select current season
    X_recent = X_train[X_train['SEASON_ID_A'] > season - recency_bias]
    y_recent = y_train[X_train['SEASON_ID_A'] > season - recency_bias]

    # count recent data more in training data
    for i in range(1, recency_weight):
        X_train = pd.concat([X_train, X_recent], axis=0)
        y_train = pd.concat([y_train, y_recent], axis=0)

    X_train = X_train.drop(columns=['SEASON_ID_A', 'GAME_DATE_A'])
    X_test = X_test.drop(columns=['SEASON_ID_A', 'GAME_DATE_A'])



    state = 100

    # Model training
    lr_model = LogisticRegression(max_iter=10000, random_state=state)
    lr_model.fit(X_train, y_train)
    
    #gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    #gb_classifier.fit(X_train, y_train)
    
    #svm = SVC(kernel='linear', random_state=state)
    #svm.fit(X_train, y_train)

    # Get training accuracy
    print("Train {:.10f}".format(accuracy_score(lr_model.predict(X_train), y_train)))
        
    # Get test accuracy
    y_logits = lr_model.predict(X_test)
    y_pred = y_logits.round()
    print("Test {:.10f}".format(accuracy_score(y_pred, y_true)))
    res.append(accuracy_score(y_pred, y_true))

 * Training model data before 2015 season 
 * Splitting data ... 
 * Identifying recent data (past 10 seasons)
 * Counting recent data 1 times more in training data
Train 0.6787083285
Test 0.6727122836
 * Training model data before 2016 season 
 * Splitting data ... 
 * Identifying recent data (past 10 seasons)
 * Counting recent data 1 times more in training data
Train 0.6786866392
Test 0.6252059308
 * Training model data before 2017 season 
 * Splitting data ... 
 * Identifying recent data (past 10 seasons)
 * Counting recent data 1 times more in training data
Train 0.6767761712
Test 0.6392092257
 * Training model data before 2018 season 
 * Splitting data ... 
 * Identifying recent data (past 10 seasons)
 * Counting recent data 1 times more in training data
Train 0.6762087349
Test 0.6532125206
 * Training model data before 2019 season 
 * Splitting data ... 
 * Identifying recent data (past 10 seasons)
 * Counting recent data 1 times more in training data
Train 0.6753454990
Test 0.6

In [50]:
print(seasons_arr)
res = np.array(res).round(2)
print(res)
print(res.mean())

[2015 2016 2017 2018 2019 2020 2021 2022 2023]
[0.67 0.63 0.64 0.65 0.65 0.61 0.6  0.62 0.65]
0.6355555555555557


Above tests prove we need to overtrain on more recent data, since we want the model to predict current game outcomes. Prediction accuracy sees a falloff after 2015. 

Also worth identifying features that have started to correlate more heavily with wins. 

PLAYING AROUND WITH ELO