In [1]:
import pandas as pd

In [2]:
#Read out csv that we saved in parse_data
df = pd.read_csv("nba_games.csv", index_col=0)

In [3]:
df = df.sort_values("date")

In [4]:
df = df.reset_index(drop=True)

In [5]:
#Delete extra columns
del df["index_opp"]

In [6]:
df["team"]

0        DET
1        ATL
2        CLE
3        CHI
4        NOP
        ... 
24423    MIL
24424    ATL
24425    HOU
24426    UTA
24427    POR
Name: team, Length: 24428, dtype: object

In [7]:
def add_target(team):
    team["target"] = team["won"].shift(-1) #Target will indicate if team won their next game
    return team

#Group dataframe by team, then apply the function to that team's next game (instead of some other random team by date)
#df = df.groupby("team", group_keys=False).apply(add_target, include_groups=False)'
df = df.copy()
df['target'] = df.groupby('team')['won'].shift(-1) #Target will be whether or not team won next game


In [None]:
df[df["team"] == "CHI"] #Testing

In [9]:
#Replace our missing targets with 2, so now 2 will signify that the game has yet to be played
df.loc[pd.isnull(df["target"]), "target"] = 2
#Also convert our true/false to 1/0, and set null ft%'s to 0 when team shoots 0 fts
df["target"] = df["target"].astype(int, errors="ignore") 
df.fillna({'ft%': 0, 'ft%_max': 0, 'ft%_opp': 0, 'ft%_max_opp': 0}, inplace=True)
df.fillna({'+/-_max': 0, '+/-_max_opp': 0}, inplace=True)

In [10]:
df["won"].value_counts()
df["target"].value_counts()

target
1    12200
0    12198
2       30
Name: count, dtype: int64

In [11]:
nulls = pd.isnull(df)

In [12]:
nulls = nulls.sum() #Find number of nulls in each column

In [13]:
nulls = nulls[nulls > 0]

In [14]:
#Get only the columns that are not in our nulls list from above
nulls.index
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=141)

In [15]:
df = df[valid_columns].copy()

In [16]:
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
1,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,7.0,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
3,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,44.0,87.0,0.506,11.0,35.0,0.314,13.0,18.0,0.722,9.0,...,35.2,250.0,123.0,POR,125,1,2025,2025-01-28,False,2
24424,36.0,89.0,0.404,11.0,47.0,0.234,13.0,16.0,0.813,9.0,...,42.1,200.0,104.0,HOU,100,0,2025,2025-01-28,False,2
24425,37.0,79.0,0.468,7.0,30.0,0.233,19.0,20.0,0.950,6.0,...,32.0,143.0,108.0,ATL,96,1,2025,2025-01-28,True,2
24426,39.0,76.0,0.513,8.0,26.0,0.308,17.0,24.0,0.708,5.0,...,28.3,154.0,115.0,GSW,114,1,2025,2025-01-28,False,2


In [17]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
# FOr
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split) # Forward Direction means start at 0 features, and select best feature until 30, can test backwards too

In [18]:
# Grab columns we can't scale since they're just context
removed_columns = ["season", "date", "won", "target", "team", "team_opp"] 

In [19]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [20]:
# Scales our values to fall between 0 and 1 to make ridge regression better
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [21]:
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.454545,0.406250,0.769,0.793103,...,0.151282,0.800948,0.500000,ATL,0.267857,1.0,2016,2015-10-27,True,1
1,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.272727,0.234375,0.800,0.241379,...,0.020513,0.203791,0.306818,DET,0.375000,0.0,2016,2015-10-27,False,1
2,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.227273,0.265625,0.588,0.379310,...,0.161538,0.345972,0.306818,CHI,0.294643,1.0,2016,2015-10-27,False,1
3,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.363636,0.359375,0.696,0.241379,...,0.089744,0.232227,0.318182,CLE,0.276786,0.0,2016,2015-10-27,True,1
4,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.431818,0.421875,0.704,0.275862,...,0.278205,0.554502,0.306818,GSW,0.419643,1.0,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.543478,0.397059,0.566986,0.379310,0.469697,0.372922,0.295455,0.281250,0.722,0.310345,...,0.169231,0.763033,0.522727,POR,0.544643,1.0,2025,2025-01-28,False,2
24424,0.369565,0.426471,0.322967,0.379310,0.651515,0.277910,0.295455,0.250000,0.813,0.310345,...,0.257692,0.526066,0.306818,HOU,0.321429,0.0,2025,2025-01-28,False,2
24425,0.391304,0.279412,0.476077,0.241379,0.393939,0.276722,0.431818,0.312500,0.950,0.206897,...,0.128205,0.255924,0.352273,ATL,0.285714,1.0,2025,2025-01-28,True,2
24426,0.434783,0.235294,0.583732,0.275862,0.333333,0.365796,0.386364,0.375000,0.708,0.172414,...,0.080769,0.308057,0.431818,GSW,0.446429,1.0,2025,2025-01-28,False,2


In [22]:
sfs.fit(df[selected_columns], df["target"])

In [23]:
# Pick 30 best features
predictors = list(selected_columns[sfs.get_support()])
predictors

['ft%',
 'orb',
 'efg%',
 'orb%',
 'trb%',
 'usg%',
 'pts_max',
 '+/-_max',
 'ftr_max',
 'drb%_max',
 'blk%_max',
 'home',
 'mp_opp',
 'ft%_opp',
 'drb_opp',
 'stl_opp',
 'drb%_opp',
 'trb%_opp',
 'blk%_opp',
 'usg%_opp',
 '3p_max_opp',
 'ft%_max_opp',
 'trb_max_opp',
 'efg%_max_opp',
 'drb%_max_opp',
 'ast%_max_opp',
 'stl%_max_opp',
 'usg%_max_opp',
 'drtg_max_opp',
 'home_opp']

In [24]:
#Split data up by season and use past seasons to predict future ones
def backtest(df, model, predictors, start=2, step=1):
    all_predictions = [] #List of df's for each season
    
    #Get list of all unique seasons
    seasons = sorted(df["season"].unique())
    #Loop through each season, starting at 2 (so 2018)
    for i in range(start, len(seasons), step):
        season = seasons[i]
        #Grab data before current season for training and current for testing
        train = df[df["season"] < season]
        test = df[df["season"] == season]
        #Fit model 
        model.fit(train[predictors], train["target"])
        #After fitting, perform prediction using test set and predictors, then convert into pandas series
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index) 
        #Combine actual results (the target of test set) with predictions
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        #Save actual/predictions for this season
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [25]:
predictions = backtest(df, rr, predictors)

In [26]:
predictions

Unnamed: 0,actual,prediction
5250,0,0
5251,1,0
5252,1,1
5253,1,0
5254,0,0
...,...,...
24423,2,0
24424,2,0
24425,2,1
24426,2,0


In [27]:
#predictions["correct"] = predictions["actual"] == predictions["prediction"]
#predictions["correct"].sum() / len(predictions)
#del predictions["correct"]

In [28]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2] # Remove the 2s 
accuracy_score(predictions["actual"], predictions["prediction"])

0.541048673490704

In [29]:
#Group by home, and then calculate percentage of time team won when they were home (number of won rows versus total number of rows)
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

  df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])


home
0.0    0.431308
1.0    0.568692
dtype: float64

In [30]:
#To beat this baseline percentage, we will use a team's last 10 games instead of 1 game
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [31]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.454545,0.406250,0.769,0.793103,...,0.071,0.550314,0.151282,0.800948,0.500000,0.267857,1.0,True,DET,2016
1,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.272727,0.234375,0.800,0.241379,...,0.047,0.300839,0.020513,0.203791,0.306818,0.375000,0.0,False,ATL,2016
2,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.227273,0.265625,0.588,0.379310,...,0.140,0.509434,0.161538,0.345972,0.306818,0.294643,1.0,False,CLE,2016
3,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.363636,0.359375,0.696,0.241379,...,0.185,0.270440,0.089744,0.232227,0.318182,0.276786,0.0,True,CHI,2016
4,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.431818,0.421875,0.704,0.275862,...,0.079,0.679245,0.278205,0.554502,0.306818,0.419643,1.0,False,NOP,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.543478,0.397059,0.566986,0.379310,0.469697,0.372922,0.295455,0.281250,0.722,0.310345,...,0.087,0.300839,0.169231,0.763033,0.522727,0.544643,1.0,False,MIL,2025
24424,0.369565,0.426471,0.322967,0.379310,0.651515,0.277910,0.295455,0.250000,0.813,0.310345,...,0.070,0.401468,0.257692,0.526066,0.306818,0.321429,0.0,False,ATL,2025
24425,0.391304,0.279412,0.476077,0.241379,0.393939,0.276722,0.431818,0.312500,0.950,0.206897,...,0.049,0.401468,0.128205,0.255924,0.352273,0.285714,1.0,True,HOU,2025
24426,0.434783,0.235294,0.583732,0.275862,0.333333,0.365796,0.386364,0.375000,0.708,0.172414,...,0.080,0.222222,0.080769,0.308057,0.431818,0.446429,1.0,False,UTA,2025


In [32]:
def find_team_averages(team):
    #Rolling method groups row with previous 10, and then we take mean of each stat (and exclude won, team, and season)
    rolling = team[selected_columns].rolling(10).mean()
    return rolling
    
#Group by team to ensure last 10 are that team's games, and group season to ensure that we don't go back to previous season
df_rolling = df_rolling.groupby(["team", "season"], group_keys = False).apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys = False).apply(find_team_averages)


In [33]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.552174,0.391176,0.589952,0.427586,0.436364,0.463183,0.425000,0.395313,0.7199,0.289655,...,0.153289,0.431307,0.0498,0.0971,0.354088,0.183077,0.492891,0.567045,0.440179,0.5
24424,0.443478,0.457353,0.393541,0.413793,0.522727,0.377791,0.393182,0.359375,0.7575,0.434483,...,0.140022,0.276835,0.0572,0.1013,0.386164,0.145641,0.405213,0.422727,0.429464,0.5
24425,0.517391,0.455882,0.486124,0.465517,0.453030,0.474584,0.413636,0.401562,0.7213,0.506897,...,0.118311,0.266399,0.0583,0.0869,0.510377,0.226538,0.443128,0.560227,0.437500,0.7
24426,0.465217,0.458824,0.421531,0.444828,0.565152,0.372090,0.356818,0.326562,0.7499,0.448276,...,0.148026,0.306307,0.0679,0.1304,0.420860,0.146923,0.400000,0.475000,0.486607,0.6


In [34]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis = 1)

In [35]:
df = df.dropna()
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp_10,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10
243,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.272727,0.250000,0.750,0.275862,...,0.152851,0.311927,0.0628,0.0679,0.413522,0.125256,0.361611,0.434091,0.322321,0.4
249,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.340909,0.250000,0.938,0.206897,...,0.166667,0.412271,0.0613,0.0772,0.469497,0.220641,0.394787,0.513636,0.300893,0.5
254,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.454545,0.406250,0.769,0.517241,...,0.201974,0.331537,0.0657,0.1032,0.437212,0.126026,0.404739,0.394318,0.398214,0.2
255,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.522727,0.421875,0.852,0.448276,...,0.132675,0.458257,0.0699,0.1072,0.380294,0.274359,0.270616,0.462500,0.286607,0.6
256,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.636364,0.484375,0.903,0.344828,...,0.135965,0.398280,0.0747,0.0742,0.303564,0.131667,0.387678,0.396591,0.325893,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.543478,0.397059,0.566986,0.379310,0.469697,0.372922,0.295455,0.281250,0.722,0.310345,...,0.153289,0.431307,0.0498,0.0971,0.354088,0.183077,0.492891,0.567045,0.440179,0.5
24424,0.369565,0.426471,0.322967,0.379310,0.651515,0.277910,0.295455,0.250000,0.813,0.310345,...,0.140022,0.276835,0.0572,0.1013,0.386164,0.145641,0.405213,0.422727,0.429464,0.5
24425,0.391304,0.279412,0.476077,0.241379,0.393939,0.276722,0.431818,0.312500,0.950,0.206897,...,0.118311,0.266399,0.0583,0.0869,0.510377,0.226538,0.443128,0.560227,0.437500,0.7
24426,0.434783,0.235294,0.583732,0.275862,0.333333,0.365796,0.386364,0.375000,0.708,0.172414,...,0.148026,0.306307,0.0679,0.1304,0.420860,0.146923,0.400000,0.475000,0.486607,0.6


In [36]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col
    
def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

#Add columns to indicate if team will be home, who they'll face, and date
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")


  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [37]:
df[df["team"] == "CHI"][["team", "season", "team_opp_next", "team_opp"]]

Unnamed: 0,team,season,team_opp_next,team_opp
311,CHI,2016,PHO,IND
331,CHI,2016,GSW,PHO
372,CHI,2016,POR,GSW
420,CHI,2016,IND,POR
472,CHI,2016,SAS,IND
...,...,...,...,...
24289,CHI,2025,LAC,POR
24307,CHI,2025,GSW,LAC
24354,CHI,2025,PHI,GSW
24387,CHI,2025,DEN,PHI


In [38]:
df = df.copy()

In [39]:
#Opponent stats will be _y 
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"]
)

In [40]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,DEN,NOP,NOP,DEN,2015-11-17
3,ORL,MIN,MIN,ORL,2015-11-18
4,PHI,DAL,DAL,PHI,2015-11-16
...,...,...,...,...,...
21585,MIL,POR,POR,MIL,2025-01-28
21586,UTA,GSW,GSW,UTA,2025-01-28
21587,ATL,HOU,HOU,ATL,2025-01-28
21588,LAL,PHI,PHI,LAL,2025-01-28


In [41]:
#Now, we prepare to select features again with these new rolling averages
removed_columns = list(full.columns[full.dtypes=="object"]) + removed_columns

In [42]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [43]:
#Get all columns that aren't removed
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [44]:
sfs.fit(full[selected_columns], full["target"])

In [45]:
#Get Support function returns array of bools for wehther or not feature was included, so index by trues
predictors = list(selected_columns[sfs.get_support()])

In [46]:
predictors

['fg%',
 'usg%',
 'fg_max',
 'fga_max',
 'orb_max',
 'pts_max',
 'ft%_opp',
 'stl%_opp',
 'usg%_opp',
 'ftr_max_opp',
 'drtg_max_opp',
 'usg%_10_x',
 'pts_max_10_x',
 '+/-_max_10_x',
 'blk%_opp_10_x',
 'usg%_opp_10_x',
 'ft%_max_opp_10_x',
 'ast%_max_opp_10_x',
 'home_next',
 'usg%_10_y',
 'fga_max_10_y',
 'pts_max_10_y',
 '+/-_max_10_y',
 'ft%_opp_10_y',
 'tov_opp_10_y',
 'blk%_opp_10_y',
 'usg%_opp_10_y',
 'efg%_max_opp_10_y',
 'ftr_max_opp_10_y',
 'ast%_max_opp_10_y']

In [47]:
predictions = backtest(full, rr, predictors)

In [48]:
#Get new accuracy score
accuracy_score(predictions["actual"], predictions["prediction"])

0.6331224501862472