## Training the Model
There are several steps involved in training a machine learning model to predict NBA games:

 - Retrieving and storing historical data from the web scraper as a pandas data frame  
 - Processing the data to make it suitable for machine learning  
    - Converting categorical data into numerical data
    - Calculating stat averages from past games
    - Combining team and opponent stats to measure a team's offensive and defensive capability  
 - Creating a ridge classification model to predict NBA games  
    - Creating testing and training data sets  
    - Training a model and testing its accuracy  
    - Saving the model to predict future games without retraining  

Model Training:
 - https://www.youtube.com/watch?v=egTylm6C2is&t=3034s
 - https://www.youtube.com/watch?v=2Bp8bytUN24&t=534s

In [107]:
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import joblib

# Read the stored game history from disk; the CSV's first column becomes the row index.
historic_nba_data = pd.read_csv(
    '../Data/Historic NBA Game Data.csv',
    index_col=0,
)
# Preview the last rows of the loaded frame.
historic_nba_data.tail()

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,FTA_OPP,FT_PCT_OPP,ORB_OPP,DRB_OPP,TRB_OPP,AST_OPP,STL_OPP,BLK_OPP,TOV_OPP,PF_OPP
15157,2023,2023-06-07,HOME,MIA,94,DEN,109,LOSS,34,92,...,27,0.815,13,45,58,28,3,5,13,18
15158,2023,2023-06-09,AWAY,DEN,108,MIA,95,WIN,39,79,...,20,0.85,8,29,37,23,2,3,14,19
15159,2023,2023-06-09,HOME,MIA,95,DEN,108,LOSS,35,78,...,21,0.762,5,29,34,26,11,7,6,18
15160,2023,2023-06-12,AWAY,MIA,89,DEN,94,LOSS,33,96,...,23,0.565,11,46,57,21,6,7,14,13
15161,2023,2023-06-12,HOME,DEN,94,MIA,89,WIN,38,84,...,16,0.875,11,33,44,18,9,7,8,21


In [108]:
# Replace the two text-valued columns with their integer category codes.
# In the preview below this yields AWAY -> 0 / HOME -> 1 and LOSS -> 0 / WIN -> 1.
for label_column in ('HOME/AWAY', 'WIN/LOSS'):
    as_category = historic_nba_data[label_column].astype('category')
    historic_nba_data[label_column] = as_category.cat.codes

historic_nba_data.tail()

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,FTA_OPP,FT_PCT_OPP,ORB_OPP,DRB_OPP,TRB_OPP,AST_OPP,STL_OPP,BLK_OPP,TOV_OPP,PF_OPP
15157,2023,2023-06-07,1,MIA,94,DEN,109,0,34,92,...,27,0.815,13,45,58,28,3,5,13,18
15158,2023,2023-06-09,0,DEN,108,MIA,95,1,39,79,...,20,0.85,8,29,37,23,2,3,14,19
15159,2023,2023-06-09,1,MIA,95,DEN,108,0,35,78,...,21,0.762,5,29,34,26,11,7,6,18
15160,2023,2023-06-12,0,MIA,89,DEN,94,0,33,96,...,23,0.565,11,46,57,21,6,7,14,13
15161,2023,2023-06-12,1,DEN,94,MIA,89,1,38,84,...,16,0.875,11,33,44,18,9,7,8,21


In [109]:
# Columns that are excluded from the rolling averages.
nonnumeric_columns = ['SEASON', 'DATE', 'TEAM', 'TEAM_OPP', 'HOME/AWAY']
# Every other column of the frame is treated as a per-game stat (original column order kept).
stats_column = [
    column_name
    for column_name in historic_nba_data.columns
    if column_name not in nonnumeric_columns
]

# finds the average of all team stats over their most recent games (ten by default)
def find_avg_game_data(team, window=10):
    """Return the rolling mean of one team's per-game stats.

    Args:
        team: DataFrame holding the games of a single team. Rows are averaged
            in the order they appear, so they are assumed to already be in
            chronological order (TODO confirm against the data file).
        window: Number of consecutive games in each average. Defaults to 10,
            the value previously hard-coded here.

    Returns:
        DataFrame with the same index as ``team`` and one column per entry of
        the module-level ``stats_column`` list. Each row is the mean over that
        game and the ``window - 1`` games before it; rows with fewer than
        ``window`` games available are NaN.
    """
    return team[stats_column].rolling(window).mean()

# Compute the rolling averages separately for every team.
# group_keys=False leaves the row index as it was, so the result lines up
# with historic_nba_data in the column-wise concat below.
avg_game_data = (
    historic_nba_data
    .groupby('TEAM', group_keys=False)
    .apply(find_avg_game_data)
)

# Mark the averaged columns with an _AVG suffix and attach them to the per-game rows.
avg_game_data = avg_game_data.add_suffix('_AVG')
historic_nba_data = pd.concat([historic_nba_data, avg_game_data], axis=1)

historic_nba_data.tail()

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,FTA_OPP_AVG,FT_PCT_OPP_AVG,ORB_OPP_AVG,DRB_OPP_AVG,TRB_OPP_AVG,AST_OPP_AVG,STL_OPP_AVG,BLK_OPP_AVG,TOV_OPP_AVG,PF_OPP_AVG
15157,2023,2023-06-07,1,MIA,94,DEN,109,0,34,92,...,22.3,0.7981,10.5,32.5,43.0,23.6,5.6,5.0,12.7,17.7
15158,2023,2023-06-09,0,DEN,108,MIA,95,1,39,79,...,21.2,0.8311,7.5,29.3,36.8,24.2,5.2,4.3,9.8,19.9
15159,2023,2023-06-09,1,MIA,95,DEN,108,0,35,78,...,21.5,0.7984,10.1,32.9,43.0,24.0,6.1,5.4,11.8,17.7
15160,2023,2023-06-12,0,MIA,89,DEN,94,0,33,96,...,21.4,0.7674,10.6,34.6,45.2,23.9,6.5,5.2,11.7,17.0
15161,2023,2023-06-12,1,DEN,94,MIA,89,1,38,84,...,20.4,0.8478,7.8,29.2,37.0,23.8,5.5,4.3,9.6,19.8


In [110]:
# adds information about the next game (the game whose results we wish to predict)
def add_next_game_data(team):
    """Return a copy of one team's games with details of each following game.

    Args:
        team: DataFrame holding the games of a single team, with at least the
            columns 'WIN/LOSS', 'DATE', 'HOME/AWAY' and 'TEAM_OPP'. "Next"
            means the next row, so rows are assumed to be in chronological
            order (TODO confirm against the data file).

    Returns:
        A new DataFrame with four extra columns, each taken from the row below:
        'NEXT_WIN/LOSS' (the value to predict), 'NEXT_DATE', 'NEXT_HOME/AWAY'
        and 'NEXT_TEAM_OPP'. The last row has no following game, so its four
        new values are missing.
    """
    # Work on a copy: groupby.apply does not support functions that mutate the
    # group they are handed, and callers keep their own frame unchanged.
    team = team.copy()

    # target value to predict
    team['NEXT_WIN/LOSS'] = team['WIN/LOSS'].shift(-1)

    # data that is already known about the next game before it is played
    team['NEXT_DATE'] = team['DATE'].shift(-1)
    team['NEXT_HOME/AWAY'] = team['HOME/AWAY'].shift(-1)
    team['NEXT_TEAM_OPP'] = team['TEAM_OPP'].shift(-1)

    return team

# Run the next-game lookup one team at a time so that a row's "next game"
# always comes from the same team's own rows.
games_by_team = historic_nba_data.groupby('TEAM', group_keys=False)
historic_nba_data = games_by_team.apply(add_next_game_data)

historic_nba_data.tail()

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,TRB_OPP_AVG,AST_OPP_AVG,STL_OPP_AVG,BLK_OPP_AVG,TOV_OPP_AVG,PF_OPP_AVG,NEXT_WIN/LOSS,NEXT_DATE,NEXT_HOME/AWAY,NEXT_TEAM_OPP
15157,2023,2023-06-07,1,MIA,94,DEN,109,0,34,92,...,43.0,23.6,5.6,5.0,12.7,17.7,0.0,2023-06-09,1.0,DEN
15158,2023,2023-06-09,0,DEN,108,MIA,95,1,39,79,...,36.8,24.2,5.2,4.3,9.8,19.9,1.0,2023-06-12,1.0,MIA
15159,2023,2023-06-09,1,MIA,95,DEN,108,0,35,78,...,43.0,24.0,6.1,5.4,11.8,17.7,0.0,2023-06-12,0.0,DEN
15160,2023,2023-06-12,0,MIA,89,DEN,94,0,33,96,...,45.2,23.9,6.5,5.2,11.7,17.0,,,,
15161,2023,2023-06-12,1,DEN,94,MIA,89,1,38,84,...,37.0,23.8,5.5,4.3,9.6,19.8,,,,


In [111]:
# Pair every row with the row of the team it plays next: the matching row is
# the one whose NEXT_TEAM_OPP equals this row's TEAM and has the same NEXT_DATE.
avg_columns = [stat + '_AVG' for stat in stats_column]
opponent_columns = stats_column + avg_columns + ['NEXT_TEAM_OPP', 'NEXT_DATE', 'TEAM', 'HOME/AWAY']

# Columns present on both sides get '_A' for this row's team and '_H' for the
# upcoming opponent. The letters do not track the HOME/AWAY flag: the preview
# below shows both 0 and 1 under HOME/AWAY_A.
historic_nba_data = pd.merge(
    historic_nba_data,
    historic_nba_data[opponent_columns],
    left_on=['TEAM', 'NEXT_DATE'],
    right_on=['NEXT_TEAM_OPP', 'NEXT_DATE'],
    suffixes=('_A', '_H'),
)

historic_nba_data.tail()

Unnamed: 0,SEASON,DATE,HOME/AWAY_A,TEAM_A,PTS_A,TEAM_OPP,PTS_OPP_A,WIN/LOSS_A,FG_A,FGA_A,...,DRB_OPP_AVG_H,TRB_OPP_AVG_H,AST_OPP_AVG_H,STL_OPP_AVG_H,BLK_OPP_AVG_H,TOV_OPP_AVG_H,PF_OPP_AVG_H,NEXT_TEAM_OPP_H,TEAM_H,HOME/AWAY_H
15125,2023,2023-06-04,1,DEN,108,MIA,111,0,39,75,...,31.3,41.3,22.1,5.5,5.4,12.6,18.1,DEN,MIA,0
15126,2023,2023-06-07,0,DEN,109,MIA,94,1,41,80,...,32.5,43.0,23.6,5.6,5.0,12.7,17.7,DEN,MIA,1
15127,2023,2023-06-07,1,MIA,94,DEN,109,0,34,92,...,28.9,36.8,24.7,5.6,4.4,9.7,20.4,MIA,DEN,0
15128,2023,2023-06-09,0,DEN,108,MIA,95,1,39,79,...,32.9,43.0,24.0,6.1,5.4,11.8,17.7,DEN,MIA,1
15129,2023,2023-06-09,1,MIA,95,DEN,108,0,35,78,...,29.3,36.8,24.2,5.2,4.3,9.8,19.9,MIA,DEN,0


In [112]:

# The column the model predicts, and the feature columns it is trained on:
# everything except object-dtype columns, SEASON and the target itself.
target = 'NEXT_WIN/LOSS'
is_object_column = historic_nba_data.dtypes == 'object'
non_numeric_columns = historic_nba_data.columns[is_object_column].tolist() + ['SEASON']
training_values = [
    column_name
    for column_name in historic_nba_data.columns
    if column_name != target and column_name not in non_numeric_columns
]

# Rescale every feature column to values between 0 and 1 so all features are
# on a common scale before the ridge classifier is fitted.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made later in the notebook - confirm this is intended.
scaler = MinMaxScaler()
historic_nba_data[training_values] = scaler.fit_transform(historic_nba_data[training_values])

# Discard every row that has a missing value in any column.
historic_nba_data = historic_nba_data.dropna()
historic_nba_data.head()


Unnamed: 0,SEASON,DATE,HOME/AWAY_A,TEAM_A,PTS_A,TEAM_OPP,PTS_OPP_A,WIN/LOSS_A,FG_A,FGA_A,...,DRB_OPP_AVG_H,TRB_OPP_AVG_H,AST_OPP_AVG_H,STL_OPP_AVG_H,BLK_OPP_AVG_H,TOV_OPP_AVG_H,PF_OPP_AVG_H,NEXT_TEAM_OPP_H,TEAM_H,HOME/AWAY_H
261,2018,2017-11-04,1.0,DET,0.37037,SAC,0.287037,1.0,0.431818,0.262295,...,0.398773,0.50495,0.468354,0.370787,0.69697,0.541284,0.380952,DET,IND,1.0
262,2018,2017-11-04,0.0,NOP,0.259259,CHI,0.203704,1.0,0.318182,0.393443,...,0.398773,0.50495,0.462025,0.382022,0.651515,0.614679,0.436508,NOP,IND,0.0
267,2018,2017-11-04,1.0,DEN,0.37037,GSW,0.546296,0.0,0.431818,0.393443,...,0.760736,0.806931,0.367089,0.393258,0.575758,0.605505,0.769841,DEN,BRK,0.0
268,2018,2017-11-05,0.0,ATL,0.453704,CLE,0.435185,1.0,0.454545,0.377049,...,0.386503,0.405941,0.259494,0.224719,0.393939,0.568807,0.484127,ATL,BOS,0.0
272,2018,2017-11-05,0.0,BOS,0.333333,ORL,0.185185,1.0,0.409091,0.42623,...,0.601227,0.722772,0.607595,0.494382,0.409091,0.605505,0.5,BOS,ATL,0.0


In [113]:
# Split the rows into training and testing subsets: 25% are held out for
# testing, with a fixed seed so the split is repeatable.
# NOTE(review): the split is random rather than by date - confirm that is acceptable here.
features = historic_nba_data[training_values]
labels = historic_nba_data[target]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=678,
)

# Fit the ridge classifier on the training subset.
rr = RidgeClassifier(alpha=10)
rr.fit(x_train, y_train)

# Predict the held-out games and report the share predicted correctly.
predictions = rr.predict(x_test)
test_accuracy = accuracy_score(y_test, predictions)
print(f"{test_accuracy:.3%}")

# Save the scaler, model, feature list and target name together in one pkl file
# so predictions can be made later without retraining.
joblib.dump(value=[scaler, rr, training_values, target], filename='../Model/Ridge Classifier Model.pkl')

63.587%


['../Model/Ridge Classifier Model.pkl']