In [249]:
import pandas as pd
from datetime import datetime

In [250]:
#Read out csv that we saved in parse_data
#Csv has two rows for each game, since it has one for perspective of each team
#(i.e. one where team is ATL, opp_team is DET and next where team is DET, opp_team is ATL)
df = pd.read_csv("nba_games.csv", index_col=0)

In [251]:
df = df.sort_values("date")

In [252]:
df = df.reset_index(drop=True)

In [253]:
#Delete extra columns
del df["index_opp"]

In [254]:
df["team"]

0        DET
1        ATL
2        CLE
3        CHI
4        NOP
        ... 
24423    MIL
24424    ATL
24425    HOU
24426    UTA
24427    POR
Name: team, Length: 24428, dtype: object

In [255]:
df = df.copy()
# Group dataframe by team, then apply the function to that team's next game (instead of some other random team by date)
df['target'] = df.groupby('team')['won'].shift(-1) # Target will be be false if team lost next game, true if they won

In [256]:
# Last row (most recent game for that team) now has NaN since there's no next game to read
df[df["team"] == "WAS"] 

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
8,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,15.0,...,25.2,134.0,98.0,ORL,87,1,2016,2015-10-28,True,True
51,35.0,68.0,0.515,12.0,21.0,0.571,36.0,44.0,0.818,8.0,...,32.4,138.0,122.0,MIL,113,1,2016,2015-10-30,True,False
65,38.0,90.0,0.422,6.0,22.0,0.273,28.0,33.0,0.848,11.0,...,28.7,153.0,108.0,NYK,117,0,2016,2015-10-31,False,True
118,42.0,87.0,0.483,8.0,23.0,0.348,10.0,14.0,0.714,4.0,...,30.0,160.0,109.0,SAS,99,0,2016,2015-11-04,True,False
166,36.0,88.0,0.409,8.0,25.0,0.320,18.0,23.0,0.783,10.0,...,41.6,146.0,103.0,BOS,118,1,2016,2015-11-06,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24287,32.0,89.0,0.360,10.0,42.0,0.238,26.0,32.0,0.813,10.0,...,34.9,226.0,106.0,SAC,123,1,2025,2025-01-19,False,False
24325,34.0,95.0,0.358,11.0,43.0,0.256,9.0,13.0,0.692,9.0,...,31.8,186.0,97.0,LAL,111,1,2025,2025-01-21,False,False
24359,32.0,94.0,0.340,11.0,46.0,0.239,18.0,21.0,0.857,14.0,...,33.7,233.0,105.0,LAC,110,1,2025,2025-01-23,False,False
24374,42.0,90.0,0.467,18.0,41.0,0.439,7.0,9.0,0.778,5.0,...,43.8,228.0,119.0,PHO,119,1,2025,2025-01-25,False,False


In [257]:
# The most recent game on record for each team has no "next game", so shift(-1) left its target as NaN.
# Mark those rows with the sentinel 2 so they can be recognised later as "result not known yet".
df.loc[df["target"].isna(), "target"] = 2
# Convert True/False to 1/0 so the possible values are 0, 1, 2.
# No errors="ignore" here: if the conversion ever fails we want to see it in this cell
# instead of silently carrying an object-dtype target into the model.
df["target"] = df["target"].astype(int)
# Null ft% values show up when a team shoots 0 free throws (rare case); the +/- max columns
# get the same treatment. Fill all of them with 0 in a single, non-inplace call.
df = df.fillna({
    'ft%': 0, 'ft%_max': 0, 'ft%_opp': 0, 'ft%_max_opp': 0,
    '+/-_max': 0, '+/-_max_opp': 0,
})

In [258]:
df["won"].value_counts() # Even because always a winner

won
True     12214
False    12214
Name: count, dtype: int64

In [259]:
df["target"].value_counts() # Thirty 2s because each team has a most recent game

target
1    12200
0    12198
2       30
Name: count, dtype: int64

In [260]:
# Now, we will handle null columns since those cannot be included in the model
nulls = pd.isnull(df)

In [261]:
nulls = nulls.sum() #Find number of nulls in each column
nulls

fg          0
fga         0
fg%         0
3p          0
3pa         0
           ..
home_opp    0
season      0
date        0
won         0
target      0
Length: 145, dtype: int64

In [262]:
nulls = nulls[nulls > 0]
nulls

+/-           24428
mp_max        24428
+/-_opp       24428
mp_max_opp    24428
dtype: int64

In [263]:
#Get only the columns that are not in our nulls list from above
nulls.index
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=141)

In [264]:
nulls.index

Index(['+/-', 'mp_max', '+/-_opp', 'mp_max_opp'], dtype='object')

In [265]:
df = df[valid_columns].copy()

In [266]:
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
1,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,7.0,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
3,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,44.0,87.0,0.506,11.0,35.0,0.314,13.0,18.0,0.722,9.0,...,35.2,250.0,123.0,POR,125,1,2025,2025-01-28,False,2
24424,36.0,89.0,0.404,11.0,47.0,0.234,13.0,16.0,0.813,9.0,...,42.1,200.0,104.0,HOU,100,0,2025,2025-01-28,False,2
24425,37.0,79.0,0.468,7.0,30.0,0.233,19.0,20.0,0.950,6.0,...,32.0,143.0,108.0,ATL,96,1,2025,2025-01-28,True,2
24426,39.0,76.0,0.513,8.0,26.0,0.308,17.0,24.0,0.708,5.0,...,28.3,154.0,115.0,GSW,114,1,2025,2025-01-28,False,2


In [267]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
# Forward Direction means start at 0 features, and select best feature until 30, can test backwards too
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split) 

In [268]:
# Grab columns we can't scale since they're just context
removed_columns = ["season", "date", "won", "target", "team", "team_opp"] 

In [269]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [270]:
# Scales our values to fall between 0 and 1 to make ridge regression better
# NOTE(review): the scaler is fit on every row of df, including the seasons that backtest()
# later holds out as test data, so min/max information from those seasons is visible during
# training. Consider fitting on training seasons only — TODO confirm whether this matters here.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [271]:
df
print(list(df.columns))

['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'ts%', 'efg%', '3par', 'ftr', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%', 'usg%', 'ortg', 'drtg', 'fg_max', 'fga_max', 'fg%_max', '3p_max', '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max', 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max', 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max', 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max', 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max', 'team', 'total', 'home', 'mp_opp', 'fg_opp', 'fga_opp', 'fg%_opp', '3p_opp', '3pa_opp', '3p%_opp', 'ft_opp', 'fta_opp', 'ft%_opp', 'orb_opp', 'drb_opp', 'trb_opp', 'ast_opp', 'stl_opp', 'blk_opp', 'tov_opp', 'pf_opp', 'pts_opp', 'ts%_opp', 'efg%_opp', '3par_opp', 'ftr_opp', 'orb%_opp', 'drb%_opp', 'trb%_opp', 'ast%_opp', 'stl%_opp', 'blk%_opp', 'tov%_opp', 'usg%_opp', 'ortg_opp', 'drtg_opp', '

In [272]:
#NOTE: Going to eventually test and see if we can remove/comment out this middle part, but for now just run it
sfs.fit(df[selected_columns], df["target"])

In [273]:
# Pick 30 best features
predictors = list(selected_columns[sfs.get_support()])
predictors

['ft%',
 'orb',
 'efg%',
 'orb%',
 'trb%',
 'usg%',
 'pts_max',
 '+/-_max',
 'ftr_max',
 'drb%_max',
 'blk%_max',
 'home',
 'mp_opp',
 'ft%_opp',
 'drb_opp',
 'stl_opp',
 'drb%_opp',
 'trb%_opp',
 'blk%_opp',
 'usg%_opp',
 '3p_max_opp',
 'ft%_max_opp',
 'trb_max_opp',
 'efg%_max_opp',
 'drb%_max_opp',
 'ast%_max_opp',
 'stl%_max_opp',
 'usg%_max_opp',
 'drtg_max_opp',
 'home_opp']

In [308]:
def backtest(df, model, predictors, detailed, start=2, step=1, unknown_target=2):
    """Season-by-season walk-forward evaluation of `model`.

    For each season from position `start` onward (in sorted order), the model is fit on
    all earlier seasons and asked to predict that season's rows (i.e. using 2016 and 2017
    to predict 2018's games, then 2016-2018 to predict 2019, and so on).

    Parameters
    ----------
    df : DataFrame with a "season" column, a "target" column and every column in `predictors`.
        When `detailed` is true it must also have "team_x", "team_y", "date" and "date_next".
    model : object exposing fit(X, y) and predict(X); it is re-fit once per predicted season.
    predictors : list of feature column names.
    detailed : if true, include both teams and the two dates next to each prediction.
    start : index (into the sorted unique seasons) of the first season to predict.
    step : stride between predicted seasons.
    unknown_target : label that marks rows whose outcome is not known yet. Such rows are
        excluded from training but are still predicted when they fall in a test season.
        Pass None to train on every row.

    Returns
    -------
    DataFrame indexed like the predicted rows with columns "actual" and "prediction"
    (plus "team1", "team2", "last played date", "predict game date" when `detailed`).

    Raises
    ------
    ValueError if there is no season at or after position `start` to predict.
    """
    all_predictions = []  # One actual-vs-prediction frame per predicted season

    seasons = sorted(df["season"].unique())
    for i in range(start, len(seasons), step):
        season = seasons[i]
        # Everything strictly before the predicted season is training data
        train = df[df["season"] < season]
        if unknown_target is not None:
            # Placeholder rows carry no real outcome; fitting on them would teach the
            # model a third, meaningless class.
            train = train[train["target"] != unknown_target]
        # The predicted season itself is the test set
        test = df[df["season"] == season]

        model.fit(train[predictors], train["target"])
        # Label each prediction with the DataFrame index of the row it belongs to
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        # Put the true target next to the prediction and rename for clarity
        if detailed:
            combined = pd.concat([test["target"], preds, test["team_x"], test["team_y"], test["date"], test["date_next"]], axis=1)
            combined.columns = ["actual", "prediction", "team1", "team2", "last played date", "predict game date"]
        else:
            combined = pd.concat([test["target"], preds], axis=1)
            combined.columns = ["actual", "prediction"]
        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat([]) would raise a ValueError with an unhelpful message; say what went wrong
        raise ValueError(
            f"backtest needs more than {start} seasons to predict anything; got {len(seasons)}"
        )
    # Stack every predicted season into one frame
    return pd.concat(all_predictions)

In [277]:
# Pass in ridge classifier as our model along with dataframe and predictors
predictions = backtest(df, rr, predictors, detailed = False)

In [278]:
predictions

Unnamed: 0,actual,prediction
5250,0,0
5251,1,0
5252,1,1
5253,1,0
5254,0,0
...,...,...
24423,2,0
24424,2,0
24425,2,1
24426,2,0


In [279]:
from sklearn.metrics import accuracy_score

In [280]:
predictions = predictions[predictions["actual"] != 2] # Remove the 2s 
accuracy_score(predictions["actual"], predictions["prediction"])

0.541048673490704

In [281]:
#Group by home and calculate the share of games won for each value of home.
#"won" is boolean, so its mean within a group is exactly (won rows / total rows).
#Selecting the column before aggregating also avoids groupby.apply running over the grouping column, which pandas has deprecated.
df.groupby("home")["won"].mean()

  df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])


home
0.0    0.431308
1.0    0.568692
dtype: float64

In [282]:
#We see how our prediction was worse than just guessing by home/away
#To beat this baseline percentage, we will use a team's last 10 games instead of 1 game, and then run prediction again to improve %
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [283]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.454545,0.406250,0.769,0.793103,...,0.071,0.550314,0.151282,0.800948,0.500000,0.267857,1.0,True,DET,2016
1,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.272727,0.234375,0.800,0.241379,...,0.047,0.300839,0.020513,0.203791,0.306818,0.375000,0.0,False,ATL,2016
2,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.227273,0.265625,0.588,0.379310,...,0.140,0.509434,0.161538,0.345972,0.306818,0.294643,1.0,False,CLE,2016
3,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.363636,0.359375,0.696,0.241379,...,0.185,0.270440,0.089744,0.232227,0.318182,0.276786,0.0,True,CHI,2016
4,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.431818,0.421875,0.704,0.275862,...,0.079,0.679245,0.278205,0.554502,0.306818,0.419643,1.0,False,NOP,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.543478,0.397059,0.566986,0.379310,0.469697,0.372922,0.295455,0.281250,0.722,0.310345,...,0.087,0.300839,0.169231,0.763033,0.522727,0.544643,1.0,False,MIL,2025
24424,0.369565,0.426471,0.322967,0.379310,0.651515,0.277910,0.295455,0.250000,0.813,0.310345,...,0.070,0.401468,0.257692,0.526066,0.306818,0.321429,0.0,False,ATL,2025
24425,0.391304,0.279412,0.476077,0.241379,0.393939,0.276722,0.431818,0.312500,0.950,0.206897,...,0.049,0.401468,0.128205,0.255924,0.352273,0.285714,1.0,True,HOU,2025
24426,0.434783,0.235294,0.583732,0.275862,0.333333,0.365796,0.386364,0.375000,0.708,0.172414,...,0.080,0.222222,0.080769,0.308057,0.431818,0.446429,1.0,False,UTA,2025


In [284]:
def find_team_averages(team):
    """Return 10-game rolling means of the stat columns for one team/season group.

    `team` is the sub-set of the rolling frame for a single team in a single season
    (i.e. ATL 2015, ATL 2016, OKC 2021). Only the columns listed in `selected_columns`
    are averaged, which leaves won, team and season out of the result.
    """
    stat_window = team[selected_columns].rolling(10)
    return stat_window.mean()
    
#Group by team to ensure last 10 are that team's games, and group season to ensure that we don't go back to previous season
#Select the stat columns before apply so the function is never handed the grouping columns
#(pandas has deprecated groupby.apply operating on them and emits a warning otherwise)
df_rolling = df_rolling.groupby(["team", "season"], group_keys = False)[list(selected_columns)].apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys = False).apply(find_team_averages)


In [285]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.552174,0.391176,0.589952,0.427586,0.436364,0.463183,0.425000,0.395313,0.7199,0.289655,...,0.153289,0.431307,0.0498,0.0971,0.354088,0.183077,0.492891,0.567045,0.440179,0.5
24424,0.443478,0.457353,0.393541,0.413793,0.522727,0.377791,0.393182,0.359375,0.7575,0.434483,...,0.140022,0.276835,0.0572,0.1013,0.386164,0.145641,0.405213,0.422727,0.429464,0.5
24425,0.517391,0.455882,0.486124,0.465517,0.453030,0.474584,0.413636,0.401562,0.7213,0.506897,...,0.118311,0.266399,0.0583,0.0869,0.510377,0.226538,0.443128,0.560227,0.437500,0.7
24426,0.465217,0.458824,0.421531,0.444828,0.565152,0.372090,0.356818,0.326562,0.7499,0.448276,...,0.148026,0.306307,0.0679,0.1304,0.420860,0.146923,0.400000,0.475000,0.486607,0.6


In [286]:
#Re-name all rolling stat columns for indication, and then concat the two dataframes together, side-by-side
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis = 1)

In [287]:
# rolling(10) only produces a value once a team has 10 games in the season, so the first 9 games
# of every team/season carry NaN rolling stats. dropna() removes every row with a NaN in any
# column — presumably only those warm-up rows at this point; verify if new columns are added upstream.
df = df.dropna()
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp_10,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10
243,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.272727,0.250000,0.750,0.275862,...,0.152851,0.311927,0.0628,0.0679,0.413522,0.125256,0.361611,0.434091,0.322321,0.4
249,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.340909,0.250000,0.938,0.206897,...,0.166667,0.412271,0.0613,0.0772,0.469497,0.220641,0.394787,0.513636,0.300893,0.5
254,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.454545,0.406250,0.769,0.517241,...,0.201974,0.331537,0.0657,0.1032,0.437212,0.126026,0.404739,0.394318,0.398214,0.2
255,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.522727,0.421875,0.852,0.448276,...,0.132675,0.458257,0.0699,0.1072,0.380294,0.274359,0.270616,0.462500,0.286607,0.6
256,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.636364,0.484375,0.903,0.344828,...,0.135965,0.398280,0.0747,0.0742,0.303564,0.131667,0.387678,0.396591,0.325893,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,0.543478,0.397059,0.566986,0.379310,0.469697,0.372922,0.295455,0.281250,0.722,0.310345,...,0.153289,0.431307,0.0498,0.0971,0.354088,0.183077,0.492891,0.567045,0.440179,0.5
24424,0.369565,0.426471,0.322967,0.379310,0.651515,0.277910,0.295455,0.250000,0.813,0.310345,...,0.140022,0.276835,0.0572,0.1013,0.386164,0.145641,0.405213,0.422727,0.429464,0.5
24425,0.391304,0.279412,0.476077,0.241379,0.393939,0.276722,0.431818,0.312500,0.950,0.206897,...,0.118311,0.266399,0.0583,0.0869,0.510377,0.226538,0.443128,0.560227,0.437500,0.7
24426,0.434783,0.235294,0.583732,0.275862,0.333333,0.365796,0.386364,0.375000,0.708,0.172414,...,0.148026,0.306307,0.0679,0.1304,0.420860,0.146923,0.400000,0.475000,0.486607,0.6


In [288]:
def shift_col(team, col_name):
    """Return `team[col_name]` moved up by one row.

    Each position ends up holding the value of the row that follows it; the final
    position has no following row and becomes NaN.
    """
    return team[col_name].shift(-1)
    
def add_col(df, col_name):
    """Return, for every row, the value `col_name` takes in the same team's following row.

    Rows are grouped by "team" and the column is shifted up by one within each group, so
    the result is aligned to df's index and is NaN on each team's last row.

    This uses the built-in grouped shift (the same form used to build "target") rather than
    groupby.apply with a lambda: it yields the same values without handing the grouping
    column to a user function, which pandas has deprecated.
    """
    return df.groupby("team")[col_name].shift(-1)

#Add columns to indicate if team will be home, who they'll face, and date
#These will be NaN on the most recent game that team played, so we'll have to manually fill it in using scraped schedule
# NOTE(review): df has already had its NaN rolling rows dropped, so "next" here means the next row
# that survived that drop, while "target" was derived by shifting "won" before the drop. For a
# team's last game of a season the two may therefore describe different games — TODO confirm,
# and if so compute these columns before dropna().
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")


  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [289]:
#Find rows where team is CHI and list out columns (last 3 are what we will have to manually fill using schedule of next game)
df[df["team"] == "CHI"][["date", "target", "team", "season", "team_opp", "home_next", "team_opp_next", "date_next"]]

Unnamed: 0,date,target,team,season,team_opp,home_next,team_opp_next,date_next
311,2015-11-16,1,CHI,2016,IND,0.0,PHO,2015-11-18
331,2015-11-18,0,CHI,2016,PHO,0.0,GSW,2015-11-20
372,2015-11-20,1,CHI,2016,GSW,0.0,POR,2015-11-24
420,2015-11-24,0,CHI,2016,POR,0.0,IND,2015-11-27
472,2015-11-27,1,CHI,2016,IND,1.0,SAS,2015-11-30
...,...,...,...,...,...,...,...,...
24289,2025-01-19,1,CHI,2025,POR,0.0,LAC,2025-01-20
24307,2025-01-20,0,CHI,2025,LAC,0.0,GSW,2025-01-23
24354,2025-01-23,0,CHI,2025,GSW,1.0,PHI,2025-01-25
24387,2025-01-25,1,CHI,2025,PHI,1.0,DEN,2025-01-27


In [290]:
# Now, we will use our future csv with future games to fill in home_next, team_opp_next, and date_next
future = pd.read_csv("future_test_02_01.csv")
future[future["away_abbrev"] == "MEM"]

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral,home_abbrev,away_abbrev
10,2025-02-02,8:30p,Memphis Grizzlies,Milwaukee Bucks,MIL,MEM
31,2025-02-05,7:30p,Memphis Grizzlies,Toronto Raptors,TOR,MEM
82,2025-02-11,10:00p,Memphis Grizzlies,Phoenix Suns,PHO,MEM
86,2025-02-12,10:30p,Memphis Grizzlies,Los Angeles Clippers,LAC,MEM
112,2025-02-20,7:00p,Memphis Grizzlies,Indiana Pacers,IND,MEM
114,2025-02-21,7:00p,Memphis Grizzlies,Orlando Magic,ORL,MEM
135,2025-02-23,7:30p,Memphis Grizzlies,Cleveland Cavaliers,CLE,MEM
221,2025-03-07,7:30p,Memphis Grizzlies,Dallas Mavericks,DAL,MEM
237,2025-03-09,7:00p,Memphis Grizzlies,New Orleans Pelicans,NOP,MEM
305,2025-03-17,10:00p,Memphis Grizzlies,Sacramento Kings,SAC,MEM


In [291]:
def nearest_date(items, pivot):
    """Return the element of `items` with the smallest absolute distance to `pivot`.

    Distance is `abs(item - pivot)`, so earlier and later items are treated alike; on a
    tie the first such item wins.
    """
    def gap(candidate):
        return abs(candidate - pivot)

    return min(items, key=gap)

def parse_date(date_str):
    """Turn a "YYYY-MM-DD" string into a `datetime.date`."""
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()

def add_next_stat(target_game, stat, schedule=None):
    """Look up one fact about a team's next scheduled game.

    Parameters
    ----------
    target_game : row with "team" (abbreviation) and "date" ("YYYY-MM-DD" string of the
        team's last played game).
    stat : which value to return — "home_next", "team_opp_next" or "date_next".
    schedule : optional DataFrame with "Date" ("YYYY-MM-DD"), "home_abbrev" and
        "away_abbrev" columns. Defaults to the notebook-level `future` frame.

    Returns
    -------
    "home_next": 1 if the team is the home side of its next game, else 0.
    "team_opp_next": abbreviation of the opponent in that game.
    "date_next": the game's "Date" value.
    Any other `stat`: None.

    Raises
    ------
    ValueError if the schedule holds no game for the team after its last played date.
    """
    if schedule is None:
        schedule = future  # notebook-level schedule frame

    # Get current team abbrev and date of their last played game
    team_abbrev = target_game["team"]
    last_date = pd.to_datetime(target_game["date"], format="%Y-%m-%d")

    # All scheduled games involving this team, home or away
    team_games = schedule[
        (schedule["home_abbrev"] == team_abbrev) | (schedule["away_abbrev"] == team_abbrev)
    ]
    game_dates = pd.to_datetime(team_games["Date"], format="%Y-%m-%d")

    # The next game is the earliest one strictly after the last played date. Picking the
    # date closest in absolute terms could select an already-played or same-day entry
    # whenever the schedule still lists past games.
    upcoming = game_dates > last_date
    if not upcoming.any():
        raise ValueError(
            f"No scheduled game after {target_game['date']} found for team {team_abbrev}"
        )
    next_game = team_games[upcoming].iloc[game_dates[upcoming].to_numpy().argmin()]

    # Now that we have the row for the next game, return whichever value was asked for
    is_home = next_game["home_abbrev"] == team_abbrev
    if stat == "home_next":
        return 1 if is_home else 0
    if stat == "date_next":
        return next_game["Date"]
    if stat == "team_opp_next":
        return next_game["away_abbrev"] if is_home else next_game["home_abbrev"]
    return None  # Unrecognised stat name

#TODO: For some reason, filling in the 30 rows of N/A's causes like 2000 extra rows to be added in the full
# Rows whose target is the placeholder 2 are the ones still missing their next-game columns;
# look each column up from the schedule and write it back for just those rows.
needs_next = df["target"] == 2
for next_stat in ("home_next", "team_opp_next", "date_next"):
    df.loc[needs_next, next_stat] = df[needs_next].apply(add_next_stat, stat=next_stat, axis = 1)

In [292]:
df[df["target"] == 2][["target", "date", "team", "home_next", "team_opp_next", "date_next"]]

Unnamed: 0,target,date,team,home_next,team_opp_next,date_next
24376,2,2025-01-25,IND,1.0,ATL,2025-02-01
24377,2,2025-01-25,SAS,1.0,MIA,2025-02-01
24395,2,2025-01-26,OKC,1.0,SAC,2025-02-01
24397,2,2025-01-27,TOR,1.0,LAC,2025-02-02
24398,2,2025-01-27,NOP,0.0,DEN,2025-02-03
24399,2,2025-01-27,PHO,0.0,POR,2025-02-01
24400,2,2025-01-27,LAC,0.0,TOR,2025-02-02
24402,2,2025-01-27,NYK,1.0,LAL,2025-02-01
24403,2,2025-01-27,MEM,0.0,MIL,2025-02-02
24404,2,2025-01-27,MIN,1.0,WAS,2025-02-01


In [293]:
df = df.copy() #Copy just to prevent errors

In [294]:
# Now, we attach the upcoming opponent's rolling averages to every row by merging df with itself.
# The right side contributes only the rolling (_10) columns plus the keys. A left row keyed by
# (team, date_next) matches the right row whose team_opp_next is that team on the same date_next,
# i.e. the opponent's row leading into the same game.
#Note: Opponent stats will be {name}_y after running this (this team's overlapping columns get _x)
# merge() defaults to an inner join, so rows without a matching opponent row are dropped.
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"]
)

In [296]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,DEN,NOP,NOP,DEN,2015-11-17
3,ORL,MIN,MIN,ORL,2015-11-18
4,PHI,DAL,DAL,PHI,2015-11-16
...,...,...,...,...,...
21613,MIL,MEM,MEM,MIL,2025-02-02
21614,ATL,IND,IND,ATL,2025-02-01
21615,HOU,BRK,BRK,HOU,2025-02-01
21616,UTA,ORL,ORL,UTA,2025-02-01


In [297]:
#Now, we prepare to select features again with these new rolling averages by also excluding
#every string (object-dtype) column of the merged frame, on top of the earlier context columns.
#dict.fromkeys drops repeated names while keeping their order, so re-running this cell
#no longer keeps growing the list with duplicates.
removed_columns = list(dict.fromkeys(list(full.columns[full.dtypes=="object"]) + removed_columns))

In [298]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [299]:
#Get all columns that aren't removed
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [300]:
#Use feature selector to get 30 best again
sfs.fit(full[selected_columns], full["target"])

In [301]:
#Get Support function returns array of bools for wehther or not feature was included, so index by trues
predictors = list(selected_columns[sfs.get_support()])

In [302]:
predictors

['fga',
 'ft%',
 'tov%',
 'usg%',
 'usg%_max',
 'fg_opp',
 'stl%_opp',
 'usg%_opp',
 'pts_max_opp',
 'stl%_10_x',
 'usg%_10_x',
 'fg_max_10_x',
 'fta_max_10_x',
 'ft%_max_10_x',
 'pf_max_10_x',
 'pts_max_10_x',
 '+/-_max_10_x',
 '3p%_opp_10_x',
 'usg%_opp_10_x',
 'blk_max_opp_10_x',
 'home_next',
 'ft%_10_y',
 'usg%_10_y',
 '+/-_max_10_y',
 'orb%_max_10_y',
 'orb_opp_10_y',
 'usg%_opp_10_y',
 'fga_max_opp_10_y',
 'orb_max_opp_10_y',
 'drtg_max_opp_10_y']

In [309]:
predictions = backtest(full, rr, predictors, detailed=True)

In [310]:
real_predictions = predictions[predictions["actual"] == 2]
#Get new accuracy score
predictions = predictions[predictions["actual"] != 2] # Remove the 2s for accuracy score
accuracy_score(predictions["actual"], predictions["prediction"])

0.6316442972861113

In [311]:
# 2/1/25:
#To predict new games, will need to manually fill in "home_next", "team_opp_next", "date_next" columns for 30 teams
#since they will be NaN for their most recent game. 
# We can do this using the future_games.csv file: load into dataframe, grab date, home_abbrev, away_abbrev cols
#Then re-run the full = merge line and all the predictions
#Then, to look at games that haven't happened yet, look at any rows where actual/target column was 2 and take note of predictions:
# predictions[predictions["actual"] == 2]

In [312]:
#TODO: ANALYZE
real_predictions

Unnamed: 0,actual,prediction,team1,team2,last played date,predict game date
21568,2,1,IND,ATL,2025-01-25,2025-02-01
21569,2,1,SAS,MIA,2025-01-25,2025-02-01
21587,2,1,OKC,SAC,2025-01-26,2025-02-01
21589,2,0,TOR,LAC,2025-01-27,2025-02-02
21590,2,1,PHO,POR,2025-01-27,2025-02-01
21591,2,0,LAC,TOR,2025-01-27,2025-02-02
21593,2,1,NYK,LAL,2025-01-27,2025-02-01
21594,2,0,MEM,MIL,2025-01-27,2025-02-02
21595,2,1,MIN,WAS,2025-01-27,2025-02-01
21597,2,0,MIA,SAS,2025-01-27,2025-02-01
