In [1]:
pip install nba_api pandas numpy scikit-learn xgboost matplotlib seaborn joblib pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


# Fetching Data From NBA API

In [3]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

pd.set_option('display.max_columns', None)

# Fetch NBA game data from the API using the LeagueGameFinder method
def fetch_nba_data(season='2024-25'):
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season)
    games = gamefinder.get_data_frames()[0]
    return games

# Fetching multiple seasons - 2016 all the way to the 2024 NBA season
season17 = fetch_nba_data('2016-17')
season18 = fetch_nba_data('2017-18')
season19 = fetch_nba_data('2018-19')
season20 = fetch_nba_data('2019-20')
season21 = fetch_nba_data('2020-21')
season22 = fetch_nba_data('2021-22')
season23 = fetch_nba_data('2022-23')
season24 = fetch_nba_data('2023-24')

# combine all dataframes into one large dataset, shown below
df = pd.concat([season17, season18, season19, season20, season21, season22, season23, season24])

df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42016,1610612744,GSW,Golden State Warriors,0041600405,2017-06-12,GSW vs. CLE,W,241,129,46,90,0.511,14,38,0.368,23,28,0.821,13,29,42,27,8,2,13,24,9.0
1,42016,1610612739,CLE,Cleveland Cavaliers,0041600405,2017-06-12,CLE @ GSW,L,240,120,47,88,0.534,11,24,0.458,15,23,0.652,12,28,40,22,6,5,14,22,-9.0
2,42016,1610612739,CLE,Cleveland Cavaliers,0041600404,2017-06-09,CLE vs. GSW,W,240,137,46,87,0.529,24,45,0.533,21,31,0.677,11,30,41,27,6,3,11,24,19.0
3,42016,1610612744,GSW,Golden State Warriors,0041600404,2017-06-09,GSW @ CLE,L,241,116,39,87,0.448,11,39,0.282,27,36,0.750,16,24,40,26,5,6,12,27,-19.0
4,42016,1610612744,GSW,Golden State Warriors,0041600403,2017-06-07,GSW @ CLE,W,241,118,40,83,0.482,16,33,0.485,22,24,0.917,8,36,44,29,8,4,18,28,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,12023,1610612750,MIN,Minnesota Timberwolves,0012300001,2023-10-05,MIN vs. DAL,W,239,111,42,95,0.442,11,35,0.314,16,22,0.727,11,45,56,28,7,12,16,20,12.0
4379,12023,104,PER,Perth Wildcats,2012300002,2023-09-08,PER @ GLI,W,238,127,39,77,0.506,15,29,0.517,34,39,0.872,10,29,39,25,10,8,21,23,15.0
4380,12023,1612709930,GLI,G League Ignite,2012300002,2023-09-08,GLI vs. PER,L,240,112,37,90,0.411,9,29,0.310,29,35,0.829,17,25,42,24,12,10,17,29,-15.0
4381,12023,104,PER,Perth Wildcats,2012300001,2023-09-06,PER @ GLI,L,239,105,36,91,0.396,11,36,0.306,22,26,0.846,21,19,40,27,13,8,21,21,-19.0


# Preparing the Data for Machine Learning

In [5]:
# sort the dataframe by date
df = df.sort_values("GAME_DATE")
df = df.reset_index(drop=True)

df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,12016,1610612742,DAL,Dallas Mavericks,0011600002,2016-10-01,DAL @ NOP,L,239,102,36,84,0.429,12,28,0.429,18,25,0.720,13,23,36,23,9,5,9,20,-14.0
1,12016,1610612744,GSW,Golden State Warriors,0011600001,2016-10-01,GSW @ TOR,L,241,93,33,84,0.393,12,36,0.333,15,24,0.625,9,33,42,19,15,7,21,24,-4.0
2,12016,1610612740,NOP,New Orleans Pelicans,0011600002,2016-10-01,NOP vs. DAL,W,241,116,47,82,0.573,12,21,0.571,10,14,0.714,9,35,44,29,3,6,16,23,14.0
3,12016,1610612761,TOR,Toronto Raptors,0011600001,2016-10-01,TOR vs. GSW,W,240,97,37,86,0.430,6,23,0.261,17,29,0.586,16,42,58,14,14,7,23,25,4.0
4,12016,12329,SDS,Shanghai Shanghai Sharks,0011600003,2016-10-02,SDS @ HOU,L,238,94,36,85,0.424,4,23,0.174,18,24,0.750,6,35,41,18,4,2,20,20,-37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31594,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,W,240,106,38,82,0.463,17,46,0.370,13,14,0.929,6,30,36,26,4,6,9,19,7.0
31595,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,L,239,84,29,80,0.363,14,41,0.341,12,13,0.923,4,27,31,18,2,5,13,19,-38.0
31596,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,W,240,122,46,91,0.505,15,37,0.405,15,22,0.682,13,39,52,21,7,2,8,17,38.0
31597,42023,1610612738,BOS,Boston Celtics,0042300405,2024-06-17,BOS vs. DAL,W,240,106,38,89,0.427,13,39,0.333,17,20,0.850,15,36,51,25,9,2,7,15,18.0


In [6]:
# setting up TARGET to find out how team/group did in their next game
def add_TARGET(group):
    group["TARGET"] = group["WL"].shift(-1)
    return group

df = df.groupby("TEAM_ABBREVIATION", group_keys=False).apply(add_TARGET)

  df = df.groupby("TEAM_ABBREVIATION", group_keys=False).apply(add_TARGET)


In [7]:
df[df["TEAM_ABBREVIATION"] == "DAL"] # Testing on the Dallas Mavericks (DAL) for example

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,TARGET
0,12016,1610612742,DAL,Dallas Mavericks,0011600002,2016-10-01,DAL @ NOP,L,239,102,36,84,0.429,12,28,0.429,18,25,0.720,13,23,36,23,9,5,9,20,-14.0,W
9,12016,1610612742,DAL,Dallas Mavericks,0011600009,2016-10-03,DAL vs. CHA,W,239,95,38,90,0.422,13,33,0.394,6,10,0.600,9,40,49,21,7,3,12,27,7.0,L
63,12016,1610612742,DAL,Dallas Mavericks,0011600034,2016-10-08,DAL @ MIL,L,239,74,29,86,0.337,6,32,0.188,10,15,0.667,17,33,50,16,8,5,18,18,-14.0,W
86,12016,1610612742,DAL,Dallas Mavericks,0011600046,2016-10-11,DAL vs. OKC,W,241,114,40,86,0.465,9,28,0.321,25,31,0.806,12,40,52,25,7,4,15,19,5.0,L
125,12016,1610612742,DAL,Dallas Mavericks,0011600064,2016-10-14,DAL @ PHX,L,240,107,39,91,0.429,8,16,0.500,21,28,0.750,15,41,56,16,6,8,18,30,-5.0,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31590,42023,1610612742,DAL,Dallas Mavericks,0042300401,2024-06-06,DAL @ BOS,L,239,89,35,84,0.417,7,27,0.259,12,19,0.632,10,33,43,9,8,1,11,16,-18.0,L
31591,42023,1610612742,DAL,Dallas Mavericks,0042300402,2024-06-09,DAL @ BOS,L,240,98,38,80,0.475,6,26,0.231,16,24,0.667,9,34,43,21,5,3,15,17,-7.0,L
31593,42023,1610612742,DAL,Dallas Mavericks,0042300403,2024-06-12,DAL vs. BOS,L,240,99,38,86,0.442,9,25,0.360,14,16,0.875,7,36,43,15,5,1,8,17,-7.0,W
31596,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,W,240,122,46,91,0.505,15,37,0.405,15,22,0.682,13,39,52,21,7,2,8,17,38.0,L


In [8]:
# Changing TARGET col vals to 0 (L), 1 (W), 2 (No data for next game; meaning that there is no next game that season for that team)

df["TARGET"][pd.isnull(df["TARGET"])] = 2

df.loc[df["TARGET"] == 'L', 'TARGET'] = 0
df.loc[df["TARGET"] == 'W', 'TARGET'] = 1

df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["TARGET"][pd.isnull(df["TARGET"])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TARGET"][pd.isnul

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,TARGET
0,12016,1610612742,DAL,Dallas Mavericks,0011600002,2016-10-01,DAL @ NOP,L,239,102,36,84,0.429,12,28,0.429,18,25,0.720,13,23,36,23,9,5,9,20,-14.0,1
1,12016,1610612744,GSW,Golden State Warriors,0011600001,2016-10-01,GSW @ TOR,L,241,93,33,84,0.393,12,36,0.333,15,24,0.625,9,33,42,19,15,7,21,24,-4.0,1
2,12016,1610612740,NOP,New Orleans Pelicans,0011600002,2016-10-01,NOP vs. DAL,W,241,116,47,82,0.573,12,21,0.571,10,14,0.714,9,35,44,29,3,6,16,23,14.0,0
3,12016,1610612761,TOR,Toronto Raptors,0011600001,2016-10-01,TOR vs. GSW,W,240,97,37,86,0.430,6,23,0.261,17,29,0.586,16,42,58,14,14,7,23,25,4.0,0
4,12016,12329,SDS,Shanghai Shanghai Sharks,0011600003,2016-10-02,SDS @ HOU,L,238,94,36,85,0.424,4,23,0.174,18,24,0.750,6,35,41,18,4,2,20,20,-37.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31594,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,W,240,106,38,82,0.463,17,46,0.370,13,14,0.929,6,30,36,26,4,6,9,19,7.0,0
31595,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,L,239,84,29,80,0.363,14,41,0.341,12,13,0.923,4,27,31,18,2,5,13,19,-38.0,1
31596,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,W,240,122,46,91,0.505,15,37,0.405,15,22,0.682,13,39,52,21,7,2,8,17,38.0,0
31597,42023,1610612738,BOS,Boston Celtics,0042300405,2024-06-17,BOS vs. DAL,W,240,106,38,89,0.427,13,39,0.333,17,20,0.850,15,36,51,25,9,2,7,15,18.0,2


In [9]:
# Checking Data Consistency, W/L should be equal
df["WL"].value_counts()

# If you check the TARGET col, it will not be consistent, because there will be some games where the season for that team ends (for example,
# they lose in the playoffs) so there will be some of the games with the value 2 (mentioned in comment above)

WL
L    15797
W    15797
Name: count, dtype: int64

In [10]:
# removing any and all nulls from the df

nulls = pd.isnull(df).sum()

In [11]:
nulls = nulls[nulls > 0]
nulls

WL         5
FG_PCT     2
FG3_PCT    3
FT_PCT     9
dtype: int64

In [12]:
cols_wo_nulls = df.columns[~df.columns.isin(nulls.index)]
cols_wo_nulls

# Error fix: keeps removing WL col
wl_col = df['WL'].copy()

In [13]:
df = df[cols_wo_nulls].copy()

df['WL'] = wl_col
df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,MIN,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,TARGET,WL
0,12016,1610612742,DAL,Dallas Mavericks,0011600002,2016-10-01,DAL @ NOP,239,102,36,84,12,28,18,25,13,23,36,23,9,5,9,20,-14.0,1,L
1,12016,1610612744,GSW,Golden State Warriors,0011600001,2016-10-01,GSW @ TOR,241,93,33,84,12,36,15,24,9,33,42,19,15,7,21,24,-4.0,1,L
2,12016,1610612740,NOP,New Orleans Pelicans,0011600002,2016-10-01,NOP vs. DAL,241,116,47,82,12,21,10,14,9,35,44,29,3,6,16,23,14.0,0,W
3,12016,1610612761,TOR,Toronto Raptors,0011600001,2016-10-01,TOR vs. GSW,240,97,37,86,6,23,17,29,16,42,58,14,14,7,23,25,4.0,0,W
4,12016,12329,SDS,Shanghai Shanghai Sharks,0011600003,2016-10-02,SDS @ HOU,238,94,36,85,4,23,18,24,6,35,41,18,4,2,20,20,-37.0,0,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31594,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,240,106,38,82,17,46,13,14,6,30,36,26,4,6,9,19,7.0,0,W
31595,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,239,84,29,80,14,41,12,13,4,27,31,18,2,5,13,19,-38.0,1,L
31596,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,240,122,46,91,15,37,15,22,13,39,52,21,7,2,8,17,38.0,0,W
31597,42023,1610612738,BOS,Boston Celtics,0042300405,2024-06-17,BOS vs. DAL,240,106,38,89,13,39,17,20,15,36,51,25,9,2,7,15,18.0,2,W


# Feature Selection and Scaling/Standardizing

In [15]:
# Since our data is time-series related data, we need to split our data up correctly (use past data to predict the future, not the other way
# around), hence the TimeSeriesSplit from sklearn. 

# Split - we do not use the train_test_split as we have used in class, as we do not want to leak future stats into past ones if the data
# is shuffled. Hence, why we are opting for the TimeSeriesSplit instead

ridge_c = RidgeClassifier(alpha=1)
log_model = LogisticRegression()
split = TimeSeriesSplit(n_splits=3)

sfs_rc = SequentialFeatureSelector(log_model, 
                                n_features_to_select="auto", 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

sfs_log = SequentialFeatureSelector(ridge_c, 
                                n_features_to_select="auto", 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [16]:
removed_cols = ["SEASON_ID", "TEAM_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "GAME_DATE", "GAME_ID", "MATCHUP", "TARGET", "WL"]
selected_cols = df.columns[~df.columns.isin(removed_cols)]

In [17]:
selected_cols

Index(['MIN', 'PTS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [18]:
scaler = StandardScaler()
df[selected_cols] = scaler.fit_transform(df[selected_cols])

In [19]:
# everything we didn't want to scale has not been scaled

df

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,MIN,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,TARGET,WL
0,12016,1610612742,DAL,Dallas Mavericks,0011600002,2016-10-01,DAL @ NOP,-0.232551,-0.654554,-0.857675,-0.518622,0.078296,-0.645945,0.271930,0.484803,0.616255,-1.940210,-1.230555,-0.234227,0.336423,0.022064,-1.237555,-0.056451,-0.949394,1,L
1,12016,1610612744,GSW,Golden State Warriors,0011600001,2016-10-01,GSW @ TOR,-0.007472,-1.333474,-1.404782,-0.518622,0.078296,0.407130,-0.203559,0.354372,-0.403744,-0.150227,-0.353943,-0.987446,2.290642,0.808082,1.602109,0.838237,-0.270715,1,L
2,12016,1610612740,NOP,New Orleans Pelicans,0011600002,2016-10-01,NOP vs. DAL,-0.007472,0.401545,1.148384,-0.777585,0.078296,-1.567386,-0.996040,-0.949942,-0.403744,0.207769,-0.061739,0.895603,-1.617796,0.415073,0.418916,0.614565,0.950908,0,W
3,12016,1610612761,TOR,Toronto Raptors,0011600001,2016-10-01,TOR vs. GSW,-0.120011,-1.031732,-0.675306,-0.259659,-1.445619,-1.304117,0.113434,1.006529,1.381255,1.460757,1.983689,-1.928970,1.964939,0.808082,2.075387,1.061909,0.272229,0,W
4,12016,12329,SDS,Shanghai Shanghai Sharks,0011600003,2016-10-02,SDS @ HOU,-0.345090,-1.258039,-0.857675,-0.389141,-1.953591,-1.304117,0.271930,0.354372,-1.168743,0.207769,-0.500045,-1.175751,-1.292093,-1.156963,1.365471,-0.056451,-2.510356,0,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31594,42023,1610612738,BOS,Boston Celtics,0042300403,2024-06-12,BOS @ DAL,-0.120011,-0.352811,-0.492937,-0.777585,1.348225,1.723473,-0.520551,-0.949942,-1.168743,-0.687222,-1.230555,0.330688,-1.292093,0.415073,-1.237555,-0.280123,0.475832,0,W
31595,42023,1610612738,BOS,Boston Celtics,0042300404,2024-06-14,BOS @ DAL,-0.232551,-2.012395,-2.134258,-1.036547,0.586268,1.065301,-0.679047,-1.080374,-1.678742,-1.224217,-1.961065,-1.175751,-1.943499,0.022064,-0.291000,-0.280123,-2.578224,1,L
31596,42023,1610612742,DAL,Dallas Mavericks,0042300404,2024-06-14,DAL vs. BOS,-0.120011,0.854159,0.966015,0.387747,0.840253,0.538764,-0.203559,0.093509,0.616255,0.923762,1.107077,-0.610836,-0.314983,-1.156963,-1.474193,-0.727466,2.579738,0,W
31597,42023,1610612738,BOS,Boston Celtics,0042300405,2024-06-17,BOS vs. DAL,-0.120011,-0.352811,-0.492937,0.128785,0.332282,0.802033,0.113434,-0.167354,1.126255,0.386767,0.960975,0.142383,0.336423,-1.156963,-1.710832,-1.174810,1.222379,2,W


# Using the Model (Logistic Regression and Ridge Classifier)

In [21]:
df["TARGET"] = df["TARGET"].astype(int) # Ensuring TARGETs are all ints (recieved error based on them not being ints before)

sfs_log.fit(df[selected_cols], df["TARGET"]) # Running logistic regression model

In [22]:
sfs_rc.fit(df[selected_cols], df["TARGET"]) # Running ridge classifier

In [23]:
# Will give us the list of cols we should be using 
predictors_ridge = list(selected_cols[sfs_rc.get_support()])
predictors_log = list(selected_cols[sfs_log.get_support()])

In [24]:
predictors_ridge

['MIN', 'FGA', 'FTM', 'FTA', 'OREB', 'AST', 'BLK', 'PLUS_MINUS']

In [25]:
predictors_log

['MIN', 'FTM', 'REB', 'AST', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

In [26]:
# This function will make the predicitons; will use the past seasons to predict the future seasons
# start: Will take in x (default being 2) number of seasons to make predictions for the following season(s)
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["SEASON_ID"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Split our train and test data:
        train = data[data["SEASON_ID"] < season]
        test = data[data["SEASON_ID"] == season]
        
        model.fit(train[predictors], train["TARGET"])
        
        predictions = model.predict(test[predictors])
        predictions = pd.Series(predictions, index=test.index)
        combined = pd.concat([test["TARGET"], predictions], axis=1) # axis 1: treat the 2 pd Series as seperate cols
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [27]:
predictions_log = backtest(df, log_model, predictors_log)
predictions_ridge = backtest(df, ridge_c, predictors_ridge)

In [28]:
# There are cases where we are right and where we are wrong as shown below, when comparing the actual and prediction cols (0 and 1)

predictions_log

Unnamed: 0,actual,prediction
8102,1,1
8103,0,0
8104,0,1
8105,1,1
8106,1,1
...,...,...
21832,1,0
21874,0,1
21884,0,0
22004,0,1


In [29]:
predictions_ridge

Unnamed: 0,actual,prediction
8102,1,1
8103,0,0
8104,0,1
8105,1,1
8106,1,0
...,...,...
21832,1,0
21874,0,1
21884,0,0
22004,0,1


In [30]:
# Using accuracy_score from sklearn.metrics to determine how accurate the logistic regression model is
accuracy_score(predictions_log["actual"], predictions_log["prediction"])

0.5340850845720143

In [31]:
# Now for the ridge classifier model
accuracy_score(predictions_ridge["actual"], predictions_ridge["prediction"])

0.5337006663249616

### They both seem to run at an almost identical accuracy — the Logistic Regression and Ridge Classifier models show comparable performance on the NBA game predictor. This suggests that regularization from Ridge did not significantly impact the results, likely because the data is already well-conditioned or not overly prone to overfitting. Given their similar accuracy, the choice between them may come down to model interpretability or specific use-case preferences, with Logistic Regression offering more straightforward coefficient analysis.



In [33]:
joblib.dump(log_model, 'nba_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Saving both the model and scaler as pkl files, for possible external use if needed

['scaler.pkl']

In [34]:
with open('nba_model.pkl', 'rb') as model:
    nba_model = joblib.load(model)

with open('scaler.pkl', 'rb') as sca:
    scaler = joblib.load(sca)