In [604]:
import pandas as pd
from datetime import datetime

In [605]:
#Read out csv that we saved in parse_data
#Csv has two rows for each game, since it has one for perspective of each team
#(i.e. one where team is ATL, opp_team is DET and next where team is DET, opp_team is ATL)
df = pd.read_csv("nba_games.csv", index_col=0)

In [606]:
df = df.sort_values("date")

In [607]:
df = df.reset_index(drop=True)

In [608]:
#Delete extra columns
del df["index_opp"]

In [609]:
df["team"]

0        DET
1        ATL
2        CLE
3        CHI
4        NOP
        ... 
24487    CHO
24488    DEN
24489    ORL
24490    MIN
24491    UTA
Name: team, Length: 24492, dtype: object

In [610]:
df = df.copy()
# Group by team so shift(-1) pulls the 'won' value from that same team's next row
# (instead of whichever team happens to come next by date).
# The grouping is by team only (not by season), so a season's last game is labeled with the next season's first game.
df['target'] = df.groupby('team')['won'].shift(-1) # Target will be False if team lost next game, True if they won

In [611]:
# Last row (most recent game for that team) now has NaN since there's no next game to read
df[df["team"] == "WAS"] 

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
8,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,15.0,...,25.2,134.0,98.0,ORL,87,1,2016,2015-10-28,True,True
51,35.0,68.0,0.515,12.0,21.0,0.571,36.0,44.0,0.818,8.0,...,32.4,138.0,122.0,MIL,113,1,2016,2015-10-30,True,False
65,38.0,90.0,0.422,6.0,22.0,0.273,28.0,33.0,0.848,11.0,...,28.7,153.0,108.0,NYK,117,0,2016,2015-10-31,False,True
118,42.0,87.0,0.483,8.0,23.0,0.348,10.0,14.0,0.714,4.0,...,30.0,160.0,109.0,SAS,99,0,2016,2015-11-04,True,False
166,36.0,88.0,0.409,8.0,25.0,0.320,18.0,23.0,0.783,10.0,...,41.6,146.0,103.0,BOS,118,1,2016,2015-11-06,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24369,42.0,90.0,0.467,18.0,41.0,0.439,7.0,9.0,0.778,5.0,...,43.8,228.0,119.0,PHO,119,1,2025,2025-01-25,False,False
24408,42.0,101.0,0.416,11.0,37.0,0.297,13.0,17.0,0.765,14.0,...,32.3,224.0,115.0,DAL,130,1,2025,2025-01-27,False,False
24436,28.0,72.0,0.389,5.0,30.0,0.167,21.0,29.0,0.724,10.0,...,32.5,150.0,89.0,TOR,106,0,2025,2025-01-29,False,False
24450,29.0,91.0,0.319,10.0,44.0,0.227,28.0,34.0,0.824,12.0,...,34.6,211.0,106.0,LAL,134,0,2025,2025-01-30,False,True


In [612]:
# The last row per team (its most recent game) has no next game, so its target is NaN.
# Replace those nulls with the sentinel 2 = "most recent box score, outcome not known yet".
df.loc[pd.isnull(df["target"]), "target"] = 2
# Convert True/False to 1/0 so the possible values are 0, 1, 2.
# No errors="ignore" here: if the cast fails we want to hear about it immediately instead of
# silently keeping an object column that the model would choke on later.
df["target"] = df["target"].astype(int)
# Finally, set null ft%'s to 0 when a team shoots 0 fts (rare case), and do the same for +/- maxima.
# A single non-inplace fillna keeps the cell safe to re-run.
df = df.fillna({
    'ft%': 0, 'ft%_max': 0, 'ft%_opp': 0, 'ft%_max_opp': 0,
    '+/-_max': 0, '+/-_max_opp': 0,
})

In [613]:
df["won"].value_counts() # Even because always a winner

won
True     12246
False    12246
Name: count, dtype: int64

In [614]:
df["target"].value_counts() # Thirty 2s because each team has a most recent game

target
1    12232
0    12230
2       30
Name: count, dtype: int64

In [615]:
# Now, we will handle null columns since those cannot be included in the model
nulls = pd.isnull(df)

In [616]:
nulls = nulls.sum() #Find number of nulls in each column
nulls

fg          0
fga         0
fg%         0
3p          0
3pa         0
           ..
home_opp    0
season      0
date        0
won         0
target      0
Length: 145, dtype: int64

In [617]:
nulls = nulls[nulls > 0]
nulls

+/-           24492
mp_max        24492
+/-_opp       24492
mp_max_opp    24492
dtype: int64

In [618]:
#Get only the columns that are not in our nulls list from above
nulls.index
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=141)

In [619]:
nulls.index

Index(['+/-', 'mp_max', '+/-_opp', 'mp_max_opp'], dtype='object')

In [620]:
df = df[valid_columns].copy()

In [621]:
df
print(list(df.columns))

['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'ts%', 'efg%', '3par', 'ftr', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%', 'usg%', 'ortg', 'drtg', 'fg_max', 'fga_max', 'fg%_max', '3p_max', '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max', 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max', 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max', 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max', 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max', 'team', 'total', 'home', 'mp_opp', 'fg_opp', 'fga_opp', 'fg%_opp', '3p_opp', '3pa_opp', '3p%_opp', 'ft_opp', 'fta_opp', 'ft%_opp', 'orb_opp', 'drb_opp', 'trb_opp', 'ast_opp', 'stl_opp', 'blk_opp', 'tov_opp', 'pf_opp', 'pts_opp', 'ts%_opp', 'efg%_opp', '3par_opp', 'ftr_opp', 'orb%_opp', 'drb%_opp', 'trb%_opp', 'ast%_opp', 'stl%_opp', 'blk%_opp', 'tov%_opp', 'usg%_opp', 'ortg_opp', 'drtg_opp', '

In [622]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
# Forward Direction means start at 0 features, and select best feature until 30, can test backwards too
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split) 

In [623]:
# Grab columns we can't scale since they're just context
removed_columns = ["season", "date", "won", "target", "team", "team_opp"] 

In [624]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [625]:
# Scales our values to fall between 0 and 1 to make ridge regression better
# NOTE(review): the scaler is fit on every row of df, so each column's min/max comes from all seasons,
# including the seasons backtest() later holds out as test data. Consider fitting on training seasons only.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [626]:
df
print(list(df.columns))

['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'ts%', 'efg%', '3par', 'ftr', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%', 'usg%', 'ortg', 'drtg', 'fg_max', 'fga_max', 'fg%_max', '3p_max', '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max', 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max', 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max', 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max', 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max', 'team', 'total', 'home', 'mp_opp', 'fg_opp', 'fga_opp', 'fg%_opp', '3p_opp', '3pa_opp', '3p%_opp', 'ft_opp', 'fta_opp', 'ft%_opp', 'orb_opp', 'drb_opp', 'trb_opp', 'ast_opp', 'stl_opp', 'blk_opp', 'tov_opp', 'pf_opp', 'pts_opp', 'ts%_opp', 'efg%_opp', '3par_opp', 'ftr_opp', 'orb%_opp', 'drb%_opp', 'trb%_opp', 'ast%_opp', 'stl%_opp', 'blk%_opp', 'tov%_opp', 'usg%_opp', 'ortg_opp', 'drtg_opp', '

In [627]:
#NOTE: Don't need to do anymore since predicting this way without rolling is poor (will also comment out next few lines)
#sfs.fit(df[selected_columns], df["target"])

In [628]:
# Pick 30 best features
#predictors = list(selected_columns[sfs.get_support()])
#predictors

In [629]:
# Backtesting will let us test our model's accuracy by splitting data up by season and use past seasons to predict another one
# (i.e. using 2016,2017 to predict 2018's games and seeing if results are accurate to what really happened in 2018)
def backtest(df, model, predictors, detailed, start=2, step=1):
    """Walk forward through the seasons, training on the past and predicting each later season.

    For every season at position ``start``, ``start + step``, ... in the sorted list of
    unique seasons, the model is re-fit on all rows from strictly earlier seasons and then
    asked to predict that season's rows.

    Parameters
    ----------
    df : DataFrame with a "season" column, a "target" column and the ``predictors`` columns
        (plus "team_x", "team_y", "date", "date_next" when ``detailed`` is truthy).
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    predictors : list of feature column names passed to the model.
    detailed : when truthy, include both teams and the two dates next to each prediction.
    start : position (in the sorted season list) of the first season to predict.
    step : stride between predicted seasons.

    Returns
    -------
    DataFrame indexed like the predicted rows, with columns "actual" and "prediction"
    (and "team1", "team2", "last played date", "predict game date" when detailed).
    """
    ordered_seasons = sorted(df["season"].unique())
    per_season_results = []  # one actual-vs-prediction frame per predicted season

    for position in range(start, len(ordered_seasons), step):
        current_season = ordered_seasons[position]

        # Everything before the current season is training data; the season itself is held out
        history = df[df["season"] < current_season]
        holdout = df[df["season"] == current_season]

        model.fit(history[predictors], history["target"])
        # Re-attach the holdout index so each prediction lines up with its game row
        predicted = pd.Series(model.predict(holdout[predictors]), index=holdout.index)

        pieces = [holdout["target"], predicted]
        labels = ["actual", "prediction"]
        if detailed:
            pieces += [holdout[name] for name in ("team_x", "team_y", "date", "date_next")]
            labels += ["team1", "team2", "last played date", "predict game date"]

        season_result = pd.concat(pieces, axis=1)
        season_result.columns = labels
        per_season_results.append(season_result)

    # Stack every predicted season into one frame
    return pd.concat(per_season_results)

In [630]:
# Pass in ridge classifier as our model along with dataframe and predictors
#predictions = backtest(df, rr, predictors, detailed = False)

In [631]:
#predictions

In [632]:
from sklearn.metrics import accuracy_score

In [633]:
#predictions = predictions[predictions["actual"] != 2] # Remove the 2s 
#accuracy_score(predictions["actual"], predictions["prediction"])

In [634]:
#Group by home, and then calculate % of time team won when they were home (number of won rows versus total number of rows)
#df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

In [635]:
#We see how our prediction was worse than just guessing by home/away
#To beat this baseline percentage, we will use a team's last 10 games instead of 1 game, and then run prediction again to improve %
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [636]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.454545,0.406250,0.769,0.793103,...,0.071,0.550314,0.151282,0.800948,0.500000,0.267857,1.0,True,DET,2016
1,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.272727,0.234375,0.800,0.241379,...,0.047,0.300839,0.020513,0.203791,0.306818,0.375000,0.0,False,ATL,2016
2,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.227273,0.265625,0.588,0.379310,...,0.140,0.509434,0.161538,0.345972,0.306818,0.294643,1.0,False,CLE,2016
3,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.363636,0.359375,0.696,0.241379,...,0.185,0.270440,0.089744,0.232227,0.318182,0.276786,0.0,True,CHI,2016
4,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.431818,0.421875,0.704,0.275862,...,0.079,0.679245,0.278205,0.554502,0.306818,0.419643,1.0,False,NOP,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24487,0.478261,0.514706,0.389952,0.310345,0.333333,0.410926,0.295455,0.265625,0.765,0.448276,...,0.157,0.119497,0.107692,0.260664,0.454545,0.383929,0.0,False,CHO,2025
24488,0.478261,0.411765,0.471292,0.241379,0.500000,0.224466,0.409091,0.312500,0.900,0.275862,...,0.089,0.344864,0.125641,0.279621,0.443182,0.357143,1.0,True,DEN,2025
24489,0.239130,0.455882,0.145933,0.310345,0.560606,0.261283,0.681818,0.546875,0.857,0.379310,...,0.088,0.262055,0.107692,1.000000,0.352273,0.437500,1.0,False,ORL,2025
24490,0.413043,0.485294,0.334928,0.413793,0.545455,0.356295,0.340909,0.281250,0.833,0.379310,...,0.052,0.237945,0.160256,0.450237,0.397727,0.366071,0.0,False,MIN,2025


In [637]:
#Team parameter is sub-set of entire rolling_df for each team/each season (i.e. ATL 2015, ATL 2016, OKC 2021)
def find_team_averages(team, window=10):
    """Return the trailing rolling mean of every stat column for one team-season group.

    Parameters
    ----------
    team : DataFrame holding one team's games for one season, in chronological order.
    window : number of consecutive games averaged per row (defaults to 10, the value
        this notebook uses). Each window ends at, and includes, the row's own game.

    Returns
    -------
    DataFrame with the columns listed in the notebook-level ``selected_columns`` and the
    same index as ``team``; the first ``window - 1`` rows are NaN because they do not
    have a full window behind them.
    """
    # Only the stat columns are averaged (won, team and season are excluded)
    return team[selected_columns].rolling(window).mean()
    
#Group by team to ensure last 10 are that team's games, and group season to ensure that we don't go back to previous season
# Select the stat columns on the GroupBy *before* apply(): the grouping columns are then never handed to
# find_team_averages, which avoids the pandas DeprecationWarning about apply operating on grouping columns.
df_rolling = (
    df_rolling
    .groupby(["team", "season"], group_keys=False)[list(selected_columns)]
    .apply(find_team_averages)
)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys = False).apply(find_team_averages)


In [638]:
df_rolling

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24487,0.452174,0.433824,0.419856,0.424138,0.471212,0.409145,0.388636,0.342187,0.7802,0.451724,...,0.180044,0.324197,0.0607,0.1455,0.318973,0.177436,0.369194,0.507955,0.411607,0.3
24488,0.591304,0.400000,0.627751,0.386207,0.396970,0.445249,0.397727,0.331250,0.8290,0.386207,...,0.183004,0.330963,0.0512,0.0792,0.233857,0.124615,0.473934,0.613636,0.495536,0.7
24489,0.356522,0.401471,0.329665,0.324138,0.460606,0.320546,0.431818,0.412500,0.7225,0.417241,...,0.244298,0.484060,0.0639,0.1531,0.545912,0.196923,0.530806,0.396591,0.441964,0.6
24490,0.482609,0.382353,0.508612,0.520690,0.490909,0.496081,0.429545,0.368750,0.8009,0.406897,...,0.234101,0.310550,0.0499,0.1002,0.296226,0.156282,0.429384,0.597727,0.400000,0.5


In [639]:
#Re-name all rolling stat columns for indication, and then concat the two dataframes together, side-by-side
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis = 1)

In [640]:
df = df.dropna() #Drop NaN stats from the first 10 games since they didn't have 10 games to go back of
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,trb%_max_opp_10,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10
243,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.272727,0.250000,0.750,0.275862,...,0.152851,0.311927,0.0628,0.0679,0.413522,0.125256,0.361611,0.434091,0.322321,0.4
249,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.340909,0.250000,0.938,0.206897,...,0.166667,0.412271,0.0613,0.0772,0.469497,0.220641,0.394787,0.513636,0.300893,0.5
254,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.454545,0.406250,0.769,0.517241,...,0.201974,0.331537,0.0657,0.1032,0.437212,0.126026,0.404739,0.394318,0.398214,0.2
255,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.522727,0.421875,0.852,0.448276,...,0.132675,0.458257,0.0699,0.1072,0.380294,0.274359,0.270616,0.462500,0.286607,0.6
256,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.636364,0.484375,0.903,0.344828,...,0.135965,0.398280,0.0747,0.0742,0.303564,0.131667,0.387678,0.396591,0.325893,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24487,0.478261,0.514706,0.389952,0.310345,0.333333,0.410926,0.295455,0.265625,0.765,0.448276,...,0.180044,0.324197,0.0607,0.1455,0.318973,0.177436,0.369194,0.507955,0.411607,0.3
24488,0.478261,0.411765,0.471292,0.241379,0.500000,0.224466,0.409091,0.312500,0.900,0.275862,...,0.183004,0.330963,0.0512,0.0792,0.233857,0.124615,0.473934,0.613636,0.495536,0.7
24489,0.239130,0.455882,0.145933,0.310345,0.560606,0.261283,0.681818,0.546875,0.857,0.379310,...,0.244298,0.484060,0.0639,0.1531,0.545912,0.196923,0.530806,0.396591,0.441964,0.6
24490,0.413043,0.485294,0.334928,0.413793,0.545455,0.356295,0.340909,0.281250,0.833,0.379310,...,0.234101,0.310550,0.0499,0.1002,0.296226,0.156282,0.429384,0.597727,0.400000,0.5


In [641]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up one row, so each row holds the following row's value.

    The final row has nothing after it and therefore becomes NaN.
    """
    return team[col_name].shift(periods=-1)
    
def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    Parameters
    ----------
    df : DataFrame with a "team" column and the column ``col_name``; rows are expected
        to be in chronological order so that "next row" means "next game".
    col_name : name of the column to look ahead in.

    Returns
    -------
    Series aligned with ``df.index`` (same order). The last row of each team is NaN
    because that team has no later row to read from.
    """
    # GroupBy.shift does the per-team look-ahead directly; unlike groupby.apply with a lambda
    # it does not operate on the grouping column (no DeprecationWarning) and it keeps the
    # original row order.
    return df.groupby("team")[col_name].shift(-1)

#Add columns to indicate if team will be home, who they'll face, and date
#These will be NaN on the most recent game that team played, so we'll have to manually fill it in using scraped schedule
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")


  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [642]:
#Find rows where team is LAL and list out columns (last 3 are what we will have to manually fill using schedule of next game)
df[df["team"] == "LAL"][["date", "target", "team", "season", "team_opp", "home_next", "team_opp_next", "date_next"]]

Unnamed: 0,date,target,team,season,team_opp,home_next,team_opp_next,date_next
295,2015-11-15,0,LAL,2016,DET,0.0,PHO,2015-11-16
305,2015-11-16,0,LAL,2016,PHO,1.0,TOR,2015-11-20
363,2015-11-20,0,LAL,2016,TOR,1.0,POR,2015-11-22
395,2015-11-22,0,LAL,2016,POR,0.0,GSW,2015-11-24
423,2015-11-24,0,LAL,2016,GSW,0.0,POR,2015-11-28
...,...,...,...,...,...,...,...,...
24380,2025-01-25,1,LAL,2025,GSW,0.0,CHO,2025-01-27
24412,2025-01-27,0,LAL,2025,CHO,0.0,PHI,2025-01-28
24423,2025-01-28,1,LAL,2025,PHI,0.0,WAS,2025-01-30
24451,2025-01-30,1,LAL,2025,WAS,0.0,NYK,2025-02-01


In [643]:
# Now, we will use our future csv with future games to fill in home_next, team_opp_next, and date_next
future = pd.read_csv("future_2025.csv")
#future[future["away_abbrev"] == "MEM"]
future

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral,home_abbrev,away_abbrev
0,2025-02-01,8:30p,Los Angeles Lakers,New York Knicks,NYK,LAL
1,2025-02-01,10:00p,Phoenix Suns,Portland Trail Blazers,POR,PHO
2,2025-02-01,8:30p,Miami Heat,San Antonio Spurs,SAS,MIA
3,2025-02-01,8:00p,Sacramento Kings,Oklahoma City Thunder,OKC,SAC
4,2025-02-01,5:00p,Atlanta Hawks,Indiana Pacers,IND,ATL
...,...,...,...,...,...,...
514,2025-04-13,1:00p,New York Knicks,Brooklyn Nets,BRK,NYK
515,2025-04-13,1:00p,Charlotte Hornets,Boston Celtics,BOS,CHO
516,2025-04-13,1:00p,Orlando Magic,Atlanta Hawks,ATL,ORL
517,2025-04-13,3:30p,Los Angeles Clippers,Golden State Warriors,GSW,LAC


In [644]:
# Before using the future dataframe, we will drop all the games that have already been played using today's date
def parse_date(date_str):
    """Parse a "YYYY-MM-DD" string into a ``datetime.date``."""
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    # Keep only the calendar date so comparisons are never affected by a time of day
    return parsed.date()
    
def remove_old_games(df, curr_date):
    """Return the schedule rows dated on or after ``curr_date`` (today's games are kept).

    Parameters
    ----------
    df : DataFrame with a "Date" column of "YYYY-MM-DD" strings.
    curr_date : ``datetime`` or ``date`` marking the cutoff; any time-of-day part is ignored.

    Returns
    -------
    A new DataFrame with the surviving rows and their original index labels; ``df`` itself
    is not modified. Rows whose "Date" is missing are dropped as well.
    """
    game_days = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    # Normalize to midnight so the time of day never excludes games scheduled for today
    cutoff = pd.Timestamp(curr_date).normalize()
    # Filter with a boolean mask rather than dropping by index label: label-based drop removes
    # every row sharing a label, which loses future games when the index is not unique.
    return df[game_days >= cutoff]

curr_date = datetime.today()
future = remove_old_games(future, curr_date)
future

Unnamed: 0,Date,Start (ET),Visitor/Neutral,Home/Neutral,home_abbrev,away_abbrev
9,2025-02-02,6:00p,Boston Celtics,Philadelphia 76ers,PHI,BOS
10,2025-02-02,8:30p,Memphis Grizzlies,Milwaukee Bucks,MIL,MEM
11,2025-02-02,3:30p,Dallas Mavericks,Cleveland Cavaliers,CLE,DAL
12,2025-02-02,3:30p,Los Angeles Clippers,Toronto Raptors,TOR,LAC
13,2025-02-02,3:00p,Chicago Bulls,Detroit Pistons,DET,CHI
...,...,...,...,...,...,...
514,2025-04-13,1:00p,New York Knicks,Brooklyn Nets,BRK,NYK
515,2025-04-13,1:00p,Charlotte Hornets,Boston Celtics,BOS,CHO
516,2025-04-13,1:00p,Orlando Magic,Atlanta Hawks,ATL,ORL
517,2025-04-13,3:30p,Los Angeles Clippers,Golden State Warriors,GSW,LAC


In [645]:
# Now that future contains only games that have yet to be played (either today or later), we can use to fill NaN
def nearest_date(items, pivot):
    """Return the element of ``items`` with the smallest absolute distance to ``pivot``.

    When several elements are equally close, the first one encountered wins.
    """
    def gap(candidate):
        return abs(candidate - pivot)

    return min(items, key=gap)

# Same helper as the parse_date defined in the previous cell; this definition shadows that one.
def parse_date(date_str):
    """Convert a "YYYY-MM-DD" string to a ``datetime.date`` (no time component)."""
    as_datetime = datetime.strptime(date_str, "%Y-%m-%d")
    return as_datetime.date()

# Takes in a row for game that requires filling in a next stat (home, team_opp, date)
def add_next_stat(target_game, stat):
    """Look up one detail of a team's next scheduled game in the notebook-level ``future`` schedule.

    Parameters
    ----------
    target_game : row (Series-like) with a "team" abbreviation and a "date" string
        ("YYYY-MM-DD") for the team's last played game.
    stat : which detail to return, one of
        "home_next"     -> 1 if the team hosts its next game, otherwise 0
        "team_opp_next" -> abbreviation of the next opponent
        "date_next"     -> "Date" string of the next game

    Returns
    -------
    The requested value, or None when ``future`` holds no game for the team after its
    last played date (for example once the schedule is exhausted).

    Raises
    ------
    ValueError
        If ``stat`` is not one of the three supported names.
    """
    if stat not in ("home_next", "team_opp_next", "date_next"):
        # Fail loudly on a typo instead of quietly filling the column with None
        raise ValueError(f"Unsupported stat: {stat!r}")

    # Get current team abbrev and date of their last played game
    team_abbrev = target_game["team"]
    last_date = parse_date(target_game["date"])

    # All scheduled games involving the team, whether at home or away
    plays_home = future["home_abbrev"] == team_abbrev
    plays_away = future["away_abbrev"] == team_abbrev
    team_games = future[plays_home | plays_away]
    if team_games.empty:
        return None

    # Only games strictly after the last played one can be "next". Picking the closest date by
    # absolute distance could re-select a game on the same day as (or before) the last played game.
    game_dates = team_games["Date"].apply(parse_date)
    after_last = game_dates > last_date
    if not after_last.any():
        return None
    first_date = min(game_dates[after_last])
    # iloc[0] takes the first (normally only) game on that date
    next_game = team_games[game_dates == first_date].iloc[0]

    is_home = next_game["home_abbrev"] == team_abbrev
    if stat == "home_next":
        return 1 if is_home else 0
    if stat == "date_next":
        return next_game["Date"]
    # stat == "team_opp_next": the opponent is whichever side the team is not on
    return next_game["away_abbrev"] if is_home else next_game["home_abbrev"]

#Use our add_next_stat function to fill the three missing stats on every row with target == 2.
# add_next_stat only reads "team" and "date", so the row mask can be computed once up front.
awaiting_result = df["target"] == 2
for stat_name in ("home_next", "team_opp_next", "date_next"):
    df.loc[awaiting_result, stat_name] = df[awaiting_result].apply(add_next_stat, stat=stat_name, axis = 1)

In [646]:
#Testing new method (either way seems to work)
# df_copy = df.copy()
# dfnulls = df.index[pd.isnull(df["team_opp_next"])].tolist()

# for i in dfnulls:
#     print(i, df['team'][i], df['team_opp_next'][i])
#     #if df['team'][i] == 'CHI' or df['team'][i] == 'CHO':
#         #continue
#     firstIndxVisitor = future.index[(future['away_abbrev'] == df['team'][i])].tolist()[0]
#     firstIndxHome = future.index[(future['home_abbrev'] == df['team'][i])].tolist()[0]
#     firstIndx = min(firstIndxHome, firstIndxVisitor)
    
#     if firstIndx == firstIndxHome:
#         df.loc[i, 'team_opp_next'] = future.loc[firstIndx, 'away_abbrev']
#         df.loc[i, 'home_next'] = 1  # Home game
#     else:
#         df.loc[i, 'team_opp_next'] = future.loc[firstIndx, 'home_abbrev']
#         df.loc[i, 'home_next'] = 0  # Away game
        
#     df.loc[i, 'date_next'] = future.loc[firstIndx, 'Date']

In [647]:
# Issue: the merge below only keeps a row when the opponent's row points back at the same game.
# E.g. there may be matching TOR-LAC and LAC-TOR rows plus an extra LAL-LAC row; LAL-LAC will be dropped
# since there's no LAC-LAL row that corresponds to it in the merge. So we go from 30 rows to ~24 rows and don't predict some games.
df[df["target"] == 2][["target", "date", "team", "home_next", "team_opp_next", "date_next"]]

Unnamed: 0,target,date,team,home_next,team_opp_next,date_next
24455,2,2025-01-30,MEM,0.0,MIL,2025-02-02
24457,2,2025-01-30,CLE,1.0,DAL,2025-02-02
24460,2,2025-01-31,TOR,1.0,LAC,2025-02-02
24461,2,2025-01-31,CHI,0.0,DET,2025-02-02
24463,2,2025-01-31,MIL,1.0,MEM,2025-02-02
24464,2,2025-01-31,PHI,1.0,BOS,2025-02-02
24466,2,2025-01-31,NOP,0.0,DEN,2025-02-03
24467,2,2025-01-31,GSW,1.0,ORL,2025-02-03
24469,2,2025-01-31,DET,1.0,CHI,2025-02-02
24470,2,2025-01-31,DAL,0.0,CLE,2025-02-02


In [648]:
df = df.copy() #Copy just to prevent errors
#print(list(df.columns))

In [649]:
# Self-merge: pair each row (left side, suffix _x) with the row of its next opponent (right side, suffix _y).
# A pair matches when the opponent's next game is against this team on the same date, i.e.
#   left.team == right.team_opp_next  and  left.date_next == right.date_next
# The right side contributes the opponent's rolling averages ({name}_10_y) plus team_y and team_opp_next_y.
# date_next is a join key on both sides, so it stays a single un-suffixed column.
# No `how` is passed, so this is pandas' default inner join: rows without a matching opponent row are dropped.
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"]
)

In [650]:
# Here, team_y is team_x's (the one we're analyzing) NEXT opponent, and so team_y's next opponent should match team_x
full[full["target"] == 2][["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
21647,MEM,MIL,MIL,MEM,2025-02-02
21649,CLE,DAL,DAL,CLE,2025-02-02
21652,TOR,LAC,LAC,TOR,2025-02-02
21653,CHI,DET,DET,CHI,2025-02-02
21655,MIL,MEM,MEM,MIL,2025-02-02
21656,PHI,BOS,BOS,PHI,2025-02-02
21658,NOP,DEN,DEN,NOP,2025-02-03
21659,GSW,ORL,ORL,GSW,2025-02-03
21661,DET,CHI,CHI,DET,2025-02-02
21662,DAL,CLE,CLE,DAL,2025-02-02


In [651]:
#testing problem where games are dropped
before_merge = df[df["target"] == 2][["team", "team_opp_next", "date_next"]]
after_merge = full[full["target"] == 2][["team_x", "team_opp_next_x", "date_next"]]

missing_games = before_merge[~before_merge["team"].isin(after_merge["team_x"])]
print("NOTE: The following games will not have predictions because their opponents have yet to play") 
print(missing_games)

NOTE: The following games will not have predictions because their opponents have yet to play
      team team_opp_next   date_next
24474  SAS           MEM  2025-02-03
24475  MIA           CHI  2025-02-04
24478  OKC           MIL  2025-02-03
24481  LAL           LAC  2025-02-04
24484  ATL           DET  2025-02-03
24486  BRK           HOU  2025-02-04


In [652]:
#Now, we prepare to select features again with these new rolling averages by selecting new string columns
removed_columns = ["season", "date", "won", "target", "team", "team_opp"] 
removed_columns = list(full.columns[full.dtypes=="object"]) + removed_columns

In [653]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [654]:
#Get all columns that aren't removed (includes our new _x and _y merge cols)
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [655]:
#Use feature selector to get 30 best again
# Rows with target == 2 are games that have not been played yet (placeholder label, not an outcome).
# Leave them out so the selector scores features on the real win/loss labels only instead of
# treating the placeholder as a third class.
labeled = full[full["target"] != 2]
sfs.fit(labeled[selected_columns], labeled["target"])

In [662]:
#get_support() returns an array of bools for whether or not each feature was selected, so index by the Trues
predictors = list(selected_columns[sfs.get_support()])

In [663]:
predictors

['ft%',
 'orb',
 'usg%',
 'ftr_max',
 'blk%_opp',
 'usg%_opp',
 'ft_max_opp',
 'ft%_max_opp',
 'trb%_max_opp',
 'usg%_10_x',
 'ft%_max_10_x',
 'pts_max_10_x',
 '+/-_max_10_x',
 'blk%_opp_10_x',
 'usg%_opp_10_x',
 'ast%_max_opp_10_x',
 'tov%_max_opp_10_x',
 'home_next',
 'tov%_10_y',
 'usg%_10_y',
 'ast_max_10_y',
 'pts_max_10_y',
 '+/-_max_10_y',
 'mp_opp_10_y',
 'ast%_opp_10_y',
 'blk%_opp_10_y',
 'usg%_opp_10_y',
 'drb_max_opp_10_y',
 'ast%_max_opp_10_y',
 'blk%_max_opp_10_y']

In [664]:
predictions = backtest(full, rr, predictors, detailed=True)

In [665]:
real_predictions = predictions[predictions["actual"] == 2]
#Get new accuracy score
predictions = predictions[predictions["actual"] != 2] # Remove the 2s for accuracy score
accuracy_score(predictions["actual"], predictions["prediction"])

0.632915120457089

In [666]:
# 2/1/25:
#To predict new games, will need to manually fill in "home_next", "team_opp_next", "date_next" columns for 30 teams
#since they will be NaN for their most recent game. 
# We can do this using the future_games.csv file: load into dataframe, grab date, home_abbrev, away_abbrev cols
#Then re-run the full = merge line and all the predictions
#Then, to look at games that haven't happened yet, look at any rows where actual/target column was 2 and take note of predictions:
# predictions[predictions["actual"] == 2]

In [671]:
#TODO: ANALYZE
real_predictions

Unnamed: 0,actual,prediction,team1,team2,last played date,predict game date
21647,2,0,MEM,MIL,2025-01-30,2025-02-02
21649,2,1,CLE,DAL,2025-01-30,2025-02-02
21652,2,0,TOR,LAC,2025-01-31,2025-02-02
21653,2,0,CHI,DET,2025-01-31,2025-02-02
21655,2,1,MIL,MEM,2025-01-31,2025-02-02
21656,2,1,PHI,BOS,2025-01-31,2025-02-02
21658,2,0,NOP,DEN,2025-01-31,2025-02-03
21659,2,1,GSW,ORL,2025-01-31,2025-02-03
21661,2,1,DET,CHI,2025-01-31,2025-02-02
21662,2,0,DAL,CLE,2025-01-31,2025-02-02
