In [40]:
# Import Libraries
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [41]:
# Read in data file
df = pd.read_csv("C:/Users/fishm/Documents/NBA Project/Data Files/nba_games.csv", index_col=0)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
1,240.0,240.0,36.0,100.0,0.360,7.0,31.0,0.226,16.0,19.0,...,50.0,32.6,152.0,111.0,ATL,98,0,2016,2015-12-09,False
2,240.0,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,...,20.0,30.9,148.0,116.0,SAS,107,1,2018,2017-10-18,False
3,240.0,240.0,41.0,89.0,0.461,8.0,21.0,0.381,17.0,19.0,...,28.6,30.9,138.0,118.0,MIN,99,0,2018,2017-10-18,True
4,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,30.9,157.0,90.0,MEM,92,1,2021,2021-04-30,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,...,34.2,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True
17768,240.0,240.0,37.0,74.0,0.500,13.0,25.0,0.520,26.0,37.0,...,25.0,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True
17769,240.0,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
17770,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True


In [42]:
# Sort rows by 'date' and reset the index
df = df.sort_values("date")
df = df.reset_index(drop=True)

# Remove unneccesary columns 
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

# Function for assigning the 'target' variable
# target = whether the team won or lost there next game
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

# Split the dataframe up by team (so we get the specific teams next game)
# Then apply the 'add_target' funtion to each team
df = df.groupby("team", group_keys=False).apply(add_target)

In [43]:
# Visualize and test the target values for a single team
df[df["team"] == "PHI"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
28,240.0,34.0,83.0,0.410,7.0,22.0,0.318,20.0,23.0,0.870,...,42.9,239.0,107.0,BOS,112,1,2016,2015-10-28,False,False
52,240.0,19.0,63.0,0.302,6.0,15.0,0.400,27.0,37.0,0.730,...,33.1,200.0,94.0,UTA,99,0,2016,2015-10-30,False,False
92,240.0,38.0,85.0,0.447,7.0,24.0,0.292,17.0,23.0,0.739,...,34.2,171.0,112.0,CLE,107,0,2016,2015-11-02,False,False
130,240.0,34.0,81.0,0.420,6.0,28.0,0.214,13.0,15.0,0.867,...,42.0,110.0,105.0,MIL,91,1,2016,2015-11-04,False,False
157,240.0,41.0,79.0,0.519,10.0,23.0,0.435,10.0,15.0,0.667,...,31.9,177.0,113.0,CLE,108,1,2016,2015-11-06,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17697,240.0,38.0,84.0,0.452,8.0,30.0,0.267,19.0,22.0,0.864,...,33.4,200.0,114.0,MIA,119,1,2022,2022-05-04,False,True
17700,240.0,32.0,67.0,0.478,16.0,33.0,0.485,19.0,22.0,0.864,...,37.1,121.0,127.0,MIA,79,0,2022,2022-05-06,True,True
17711,240.0,37.0,68.0,0.544,16.0,33.0,0.485,26.0,34.0,0.765,...,28.6,155.0,129.0,MIA,108,0,2022,2022-05-08,True,False
17716,240.0,31.0,85.0,0.365,9.0,32.0,0.281,14.0,15.0,0.933,...,34.6,284.0,102.0,MIA,120,1,2022,2022-05-10,False,False


In [44]:
# There are NaN values in the target column for games that don't have a next game (i.e. last game of the season)
# So we convert those NaN values to a 2 (win = 1, loss = 0, neither = 2)
df["target"][pd.isnull(df["target"])] = 2

# Convert the target column from True/False to numbers (boolean to integer datatype)
df["target"] = df["target"].astype(int, errors="ignore")

# Test to make sure the 'won' column is balances (every game has a winner or losser)
df["won"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


False    8886
True     8886
Name: won, dtype: int64

In [45]:
# Test the balance of the 'target' column (unbalanced due to the '2' values)
df["target"].value_counts()

1    8872
0    8870
2      30
Name: target, dtype: int64

In [46]:
# Have to remove or fill in columns with null values
# Check the count of how many columns have null values
nulls = pd.isnull(df).sum()
# Get only the columns that have instances of null values (columns where the null sum > 0)
nulls = nulls[nulls > 0]
# For those columns without any null values, assign it to a 'valid_columns' variable
valid_columns = df.columns[~df.columns.isin(nulls.index)]

valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [47]:
# Reassign the df dataframe to select only those valid columns
# You can get a "copy" warning if you assign slices of a df to itself (hence the copy() method) 
df = df[valid_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


In [48]:
# Initialize ridge classifier to help classify if the team will win or lose there next game
rr = RidgeClassifier(alpha=1)

# Splits the data to ensure that we will use past data to predict the future not the other way around
# (use data from 2015 to predict 2022 games but not the other way around)
split = TimeSeriesSplit(n_splits=3)

# Set up our sequential feature selector
# passes our ML model
# selector trains the model based on different features
# select 'n' amount of features to return back
# sequential feature selection means you can go forward or backwards, 'forward' means it starts with 0 features and selects features that help the model the most until we get to 30
# pass in our time series split for cross-validation (cv)
sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

# Define meta-data columns that do not need to be scaled
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

# Define columns that we want to scale, between 0-1
selected_columns = df.columns[~df.columns.isin(removed_columns)]

# Ridge Regression works best on scaled data
# define the scaler
scaler = MinMaxScaler()
# Run the scaler on the selected columns in the df
df[selected_columns] = scaler.fit_transform(df[selected_columns])

df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [49]:
# Fit our sequential feature selector
# pass in our selected_columns and the 'target' column to return the 30 best features to help predict the target
sfs.fit(df[selected_columns], df["target"])

In [50]:
# Get the list of our predictor columns out of the sequential feature selector
# sfs.get_support() = True if the feature selector has picked that column we should be using 
predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 'fg%',
 '3p%',
 'orb',
 'ts%',
 'usg%',
 '3p%_max',
 'ft_max',
 'fta_max',
 '+/-_max',
 'drb%_max',
 'trb%_max',
 'tov%_max',
 'usg%_max',
 'mp_opp',
 'fg_opp',
 '3p_opp',
 'ft%_opp',
 'blk_opp',
 'usg%_opp',
 'fga_max_opp',
 '3p_max_opp',
 'ft_max_opp',
 'ft%_max_opp',
 'blk_max_opp',
 'pf_max_opp',
 'pts_max_opp',
 'drb%_max_opp',
 'blk%_max_opp',
 'usg%_max_opp']

In [51]:
# Backtest function 
# splits the data up by season and use the past seasons to predict future seasons
# start = 2 means we need at least 2 seasons of data before we start making predictions (i.e. using 2016&2017 seasons to predict 2018, then using 2016&2017&2018 to predict 2019, and so on)
# step = 1 means we make predictions (progress) for 1 team, if we wanted to make predictions for 2 seasons at a time step = 2
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = [] # a list of dataframes where each df is the predictions for a single season
    
    seasons = sorted(data["season"].unique()) # create a list of all seasons that exist in the data
    
    # loop through each season
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]  # training data, all of the data that comes before the current season (seasons[i])
        test = data[data["season"] == season]  # testing data, the current season data
        
        model.fit(train[predictors], train["target"]) # fit our training data into the model with our target value
        
        preds = model.predict(test[predictors]) # generate our predictions based off the test set
        preds = pd.Series(preds, index=test.index) # convert to a pd series because numpy arrays are harder to deal with
        combined = pd.concat([test["target"], preds], axis=1) # merge the actual values (test["target"]) and the predictions into a df
        combined.columns = ["actual", "prediction"] # rename the columns for simplicity 
        
        all_predictions.append(combined) # append the combined df's to the all_predictions list, should have 5 dfs for the 5 seasons in the data
    return pd.concat(all_predictions)

In [52]:
# Run the backtest function by passing in our data, rr model, and predictors
predictions = backtest(df, rr, predictors)
predictions

Unnamed: 0,actual,prediction
5250,1,1
5251,1,1
5252,0,0
5253,1,0
5254,0,1
...,...,...
17767,0,0
17768,1,1
17769,0,1
17770,2,1


In [53]:
# Remove target = 2 rows from the accuracy score for more precision 
predictions = predictions[predictions["actual"] != 2]

# Print the accuracy score using this sklearn function (here we can predict who wins a game 54.7% of the time)
accuracy_score(predictions["actual"], predictions["prediction"])

0.5485110470701249

In [23]:
# Now we want to improve our model to get a better accuract score
# the first step in doing so is to set a baseline for what good accuracy is
# so what this does is split the data in to 2 groups (home and away) and find the winning percentage for each group
df.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])
# as we can see, predicting the home team to win is more accurate than our model. We want to beat 57% !

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [24]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [25]:
# Create a rolling averages df to help us get more info on team data (i.e. last 5 to 10 game averages)
# a team can have uncharacteristically really good or bad games, so we want to add more metrics for recent-game statistics
# define a new df with our selected columns plus additional columns to help us compute our rolling averages
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

# Create a quick function to find the team average over x amount of games
# for every game we'll find the average (mean) performance of the team from there last 10 games
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

# Before calling the function, group by team so that the avg performance in previous games is specific to the team in question
# group by season because teams change a lot from szn to szn, so we want to make sure we use rolling averages only for that season in question
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)
df_rolling
# we get a lot of missing (NaN) values because in the beginning of a szn team may not have 10 games played yet

  rolling = team.rolling(10).mean()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.381818,0.292647,0.428230,0.468966,0.477273,0.448100,0.434884,0.373016,0.764177,...,0.0570,0.1113,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0
17768,0.0,0.502273,0.364706,0.517703,0.455172,0.481818,0.440736,0.320930,0.282540,0.757993,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17769,0.0,0.354545,0.279412,0.404545,0.437931,0.465152,0.429572,0.434884,0.385714,0.736639,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17770,0.0,0.354545,0.294118,0.389952,0.434483,0.459091,0.431710,0.406977,0.357143,0.754142,...,0.0572,0.1111,0.483229,0.172144,0.460190,0.472941,0.344231,0.5,0.5,2022.0


In [26]:
# Now we want to rename the rolling columns and merge into our main df
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Merge our original df with our rolling avgs df
df = pd.concat([df, df_rolling], axis=1)

# Remove any rows that have NaN values (first 10 games in the season)
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
243,0.0,0.522727,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.0628,0.0679,0.413522,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0
251,0.0,0.659091,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.0613,0.0772,0.469497,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0
252,0.0,0.386364,0.382353,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.0625,0.1145,0.437841,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0
253,0.0,0.500000,0.382353,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.0646,0.0759,0.512159,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0
256,0.0,0.318182,0.132353,0.500000,0.275862,0.272727,0.432304,0.581395,0.444444,0.879813,...,0.0741,0.0982,0.313312,0.179974,0.500000,0.471765,0.380769,0.5,0.4,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.0570,0.1113,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.0572,0.1111,0.483229,0.172144,0.460190,0.472941,0.344231,0.5,0.5,2022.0


In [27]:
# Next thing to improve the accuaracy is to give the model more information (like if the next game will be home or away and who the opp is)
# create a function that takes the value from the next game and shifts it back one row
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Create a function to add a column based off the shift_col function
def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# Use those functions ^ to assign new columns ("home_next") to the next row/game value for that column
# so now we'll have 3 new columns for are they home next, who there next opp is, and what the next game data is
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
243,0.0,0.522727,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0,0.0,BOS,2015-11-13
251,0.0,0.659091,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0,1.0,BRK,2015-11-14
252,0.0,0.386364,0.382353,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0,0.0,MIN,2015-11-15
253,0.0,0.500000,0.382353,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0,0.0,CHI,2015-11-16
256,0.0,0.318182,0.132353,0.500000,0.275862,0.272727,0.432304,0.581395,0.444444,0.879813,...,0.179974,0.500000,0.471765,0.380769,0.5,0.4,2016.0,0.0,CHO,2015-11-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0,0.0,GSW,2022-06-13
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0,0.0,BOS,2022-06-16
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0,1.0,GSW,2022-06-16
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.172144,0.460190,0.472941,0.344231,0.5,0.5,2022.0,,,


In [28]:
# Now we want to pull in stats about the opponent with the same rolling columns
# using merge vs concat to join properly
# basically the left_on is the team we're trying to predict for and right_on finds there next opp and merges the opp info with our teams info
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])
# anything with _y are the columns from the opponent team df and _x or anything else if from the original team
full

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.00,0.477273,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.380294,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0,SAC,TOR
1,0.00,0.340909,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.437212,0.124904,0.404739,0.408235,0.428846,0.2,0.3,2016.0,TOR,SAC
2,0.50,0.409091,0.455882,0.330144,0.482759,0.515152,0.437055,0.372093,0.412698,0.568261,...,0.504403,0.153273,0.344076,0.384706,0.319231,0.7,0.5,2016.0,CLE,DET
3,0.25,0.545455,0.544118,0.416268,0.413793,0.454545,0.419240,0.186047,0.142857,0.883314,...,0.467505,0.276508,0.352607,0.482353,0.316346,0.7,0.6,2016.0,GSW,TOR
4,0.00,0.340909,0.558824,0.186603,0.206897,0.469697,0.203088,0.139535,0.111111,0.854142,...,0.413732,0.156739,0.470142,0.391765,0.436538,0.6,0.1,2016.0,DEN,NOP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15769,0.00,0.545455,0.426471,0.511962,0.448276,0.469697,0.440618,0.372093,0.365079,0.659277,...,0.457128,0.235173,0.562085,0.552941,0.429808,0.4,0.6,2022.0,BOS,GSW
15770,0.00,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0,GSW,BOS
15771,0.00,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.431761,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0,BOS,GSW
15772,0.00,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0,GSW,BOS


In [29]:
# Lets take a look at a sample size of our data now
# here we can see that in each row we show both teams (x and y) in a game and who they play next and on what date
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]
# notice the row count drops a little due to missing values when there is no 'next game'

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,CLE,DET,DET,CLE,2015-11-17
3,GSW,TOR,TOR,GSW,2015-11-17
4,DEN,NOP,NOP,DEN,2015-11-17
...,...,...,...,...,...
15769,BOS,GSW,GSW,BOS,2022-06-10
15770,GSW,BOS,BOS,GSW,2022-06-13
15771,BOS,GSW,GSW,BOS,2022-06-13
15772,GSW,BOS,BOS,GSW,2022-06-16


In [30]:
# We need to run the sequential feature selector again since we added these new metrics
# so we must add to the removed_columns variable new columns of type 'object' datatypes
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [31]:
# Update the selected columns variable with newly removed columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Then fit our seuqntial feature selector like we did earlier
sfs.fit(full[selected_columns], full["target"])

In [32]:
# Get the selected columns out of the feature selector also like we did earlier
predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 'orb',
 'ast',
 'tov',
 'usg%',
 'pf_max',
 'trb%_max',
 'stl%_max',
 'mp_opp',
 'usg%_opp',
 'usg%_10_x',
 'ft%_max_10_x',
 '3par_max_10_x',
 'usg%_opp_10_x',
 'stl_max_opp_10_x',
 'won_10_x',
 'home_next',
 'drb_10_y',
 'trb%_10_y',
 'usg%_10_y',
 'ft_max_10_y',
 'efg%_max_10_y',
 'tov%_max_10_y',
 'trb%_opp_10_y',
 'usg%_opp_10_y',
 'fga_max_opp_10_y',
 'fta_max_opp_10_y',
 'ft%_max_opp_10_y',
 'orb%_max_opp_10_y',
 'won_10_y']

In [33]:
# Run the backtest function again with our new predictors and metrics
predictions = backtest(full, rr, predictors)

# Accuracy score shows improvements! 63% is greater than our threshold of 57%
accuracy_score(predictions["actual"], predictions["prediction"])

0.6296296296296297

In [55]:
# Future updates to improve the accuracy and to actually predict future games:
# Accuracy:
# 1. To improve accuracy, we can try using a more powerful model than Ridge Regression Model (like XG Boost or random forest classifier)
# 2. Try out different numbers of features for the model (currently we have 30 features)
# 3. Could also try "backwards" direction for feature selection
# 4. Could try different ratios for rolling stats (i.e. try last 5 games vs 10 etc.)
#
# Predictions:
# 1. Look at the df on line 27, what we'd need to do is refresh the data to get the most recent games played in the season
#    and manually fill in those missing values (NaN or None) for both the team and oppenet with there actual next game info

###### Credit:
# https://www.youtube.com/watch?v=egTylm6C2is
# https://github.com/dataquestio/project-walkthroughs/tree/master/nba_games