Checking The Dataset Out

In [2]:
import pandas as pd 
from sklearn.linear_model import RidgeClassifier 
from sklearn.feature_selection import SequentialFeatureSelector 
from sklearn.model_selection import TimeSeriesSplit  
from sklearn.preprocessing import MinMaxScaler 
import plotly.express as px  
from sklearn.metrics import accuracy_score 

In [None]:
# Load the game log, put it in chronological order and renumber the rows
nba_df = (
    pd.read_csv("nba_games.csv", index_col=0)
    .sort_values(by="date")    # oldest game first
    .reset_index(drop=True)    # drop=True discards the old index instead of turning it into a column
)
nba_df

In [None]:
# Helper that labels each game with the result of the following game
def add_targetvar(team):
    """Add a ``target`` column holding the ``won`` value of the next row.

    ``team`` is modified in place and the same object is returned. The last
    row has no following row, so its ``target`` is left missing (NaN).
    """
    next_result = team["won"].shift(periods=-1)  # pull each value up by one row
    team["target"] = next_result
    return team

# Label each team's games separately so the shift never pulls a result from another team
games_by_team = nba_df.groupby("team", group_keys=False)
nba_df = games_by_team.apply(add_targetvar)
nba_df

In [None]:
# Spot-check the rows of a single team (TOR, the author's favourite)
nba_df.loc[nba_df["team"] == "TOR"]

In [None]:
# Replace the missing targets with the sentinel 2 and store the column as integers.
# The result of fillna is assigned back instead of calling fillna(..., inplace=True) on
# nba_df["target"]: that chained in-place call operates on an intermediate object, emits a
# FutureWarning in pandas 2.x and leaves nba_df unchanged once Copy-on-Write is enabled.
nba_df["target"] = nba_df["target"].fillna(2)  # 2 = no next game recorded for this row
# errors="ignore" is kept from the original: if the cast fails the column is left as it is
nba_df["target"] = nba_df["target"].astype(int, errors="ignore")  # 0 = loss, 1 = win
nba_df

In [None]:
# Count the missing values per column and keep only the columns that have at least one
# (author's note: this revealed columns that are null in every row)
null_counts = nba_df.isnull().sum()
nulls = null_counts[null_counts > 0]
nulls

In [8]:
# Removing the columns that contain null values
columns_with_nulls = [
    "mp.1", "mp_opp.1", "index_opp", "+/-", "mp_max",
    "mp_max.1", "+/-_opp", "mp_max_opp", "mp_max_opp.1",
]
nba_df = nba_df.drop(columns=columns_with_nulls)  # author's note: down to 147 columns


Data Visualization 

In [None]:
# fg% of the current game, split by the value of target (the result of the next game)
# Observation: higher fg% equates to a higher chance of winning the next game
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="fg%",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# 3p% of the current game, split by the value of target (the result of the next game)
# Observation: higher 3-point fg% equates to a higher chance of winning the next game
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="3p%",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# orb of the current game, split by the value of target (the result of the next game)
# Observation: does not affect winning much
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="orb",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# ts% of the current game, split by the value of target (the result of the next game)
# Observation: higher ts% does help with winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="ts%",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# pts_max_opp of the current game, split by the value of target (the result of the next game)
# Observation: does not have an impact on winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="pts_max_opp",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# 3p_opp of the current game, split by the value of target (the result of the next game)
# Observation: slightly helps with winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="3p_opp",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# blk_opp of the current game, split by the value of target (the result of the next game)
# Observation: helps with winning; fewer blocks from opponents -> more chance of winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="blk_opp",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# fg_opp of the current game, split by the value of target (the result of the next game)
# Observation: slightly increases the chance of winning; lower fg -> higher chance of winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="fg_opp",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# tov_opp of the current game, split by the value of target (the result of the next game)
# Observation: does help with winning; more tov from opp -> more opportunities to score -> higher chance of winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="tov_opp",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

In [None]:
# tov%_max of the current game, split by the value of target (the result of the next game)
# Observation: does not really impact winning
fig = px.violin(
    data_frame=nba_df,
    x="target",
    y="tov%_max",
    box=True,
    points="all",
    hover_data=nba_df.columns,
)
fig.show()

Getting The Best Features 

In [19]:
# Classifier, time-ordered cross-validation splitter, and forward feature selector
ridge = RidgeClassifier(alpha=1)
tscv = TimeSeriesSplit(n_splits=3)  # splits the dataset based on time (row order)
sfs = SequentialFeatureSelector(
    estimator=ridge,
    n_features_to_select=30,
    direction="forward",
    cv=tscv,
    n_jobs=1,
)

In [None]:
# Columns kept out of scaling and feature selection (categorical / identifier / label columns)
unwanted_columns = ["season", "date", "won", "target", "team", "team_opp"]
is_wanted = ~nba_df.columns.isin(unwanted_columns)
wanted_columns = nba_df.columns[is_wanted]  # stays a pandas Index of column names

# Rescale every remaining column with MinMaxScaler and write the values back
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(nba_df[wanted_columns])
nba_df[wanted_columns] = scaled_values
nba_df

In [None]:
# Run the feature search: scaled columns as inputs, next-game result as the target
sfs.fit(X=nba_df[wanted_columns], y=nba_df["target"])

In [None]:
# Names of the columns the selector kept (get_support() is a boolean mask over wanted_columns)
best_columns = wanted_columns[sfs.get_support()].tolist()
best_columns

Testing the Initial Model 

In [23]:
def test(data, model,predictors,start=2, step=1): 

    all_preds = [] # List for predictions for all seasons 

    seasons = sorted(data["season"].unique()) # Sorted list of all seasons in the dataframe  

    for i in range(start, len(seasons), step): 

        season = seasons[i] 
        train = data[data["season"] < season] # Train set should include data from previous seasons 
        test = data[data["season"] == season] 

        model.fit(train[predictors], train["target"]) 

        preds = model.predict(test[predictors]) #Will be a numpy arrary; hard to work with; convert to dataframe 
        preds = pd.Series(preds, index=test.index)  # Convert numpy arrary to pandas Series 

        combined_preds = pd.concat([test["target"], preds], axis=1) # Combining target columns from test data and the predictions for side-by-side comparison 
        combined_preds.columns = ["Actual", "Predicted"] # Renaming columns 

        all_preds.append(combined_preds) # Adding predictions for all seasons to an empty list
        
    return pd.concat(all_preds) # Combining all predictions from the list 

In [None]:
# Walk-forward predictions of the ridge model using the columns picked by the selector
initial_predictions = test(data=nba_df, model=ridge, predictors=best_columns)
initial_predictions

In [None]:
# Share of rows where the prediction equals the recorded target.
# TODO (author's open question): restricting the rows to target != 2 lowers the accuracy; find out why.
# A baseline is needed to judge this number; one candidate is the chance of winning home games.
# If the model is more accurate than blindly guessing that every team wins its home games,
# it has produced a useful prediction. Next step: beating Vegas odds.
test_accuracy = accuracy_score(
    y_true=initial_predictions["Actual"],
    y_pred=initial_predictions["Predicted"],
)
test_accuracy

In [None]:
# Fraction of games won for each value of "home"
# (author's note: 57.2% chance of winning home games; this is the accuracy to beat)
nba_df.groupby(["home"]).apply(lambda games: (games["won"] == 1).sum() / len(games))

Generating Rolling Averages

In [27]:
# Rolling averages allow us to determine if the team is going to win or not based on multiple previous games rather than relying on a single previous game  

# Create a df for rolling averages 
# Create a function that calculates rolling averages and apply it to the df created in step 1  
# Remove null values 
# Rename the rolling avg columns to avoid overlap with the pre-existing df(nba_df) 
# Concatenate the dataframes 

In [None]:
# "home" is an encoded column, so it is left out of the rolling averages
wanted_columns = [name for name in wanted_columns if name != "home"]
# New frame: the numeric columns to average plus the three columns that are carried along as they are
rolling_source_columns = [*wanted_columns, "won", "team", "season"]
nba_df_rolling = nba_df[rolling_source_columns]

def find_team_averages(team):
    """Return 10-row rolling means of the stat columns for one group of games.

    The columns named in the notebook-level ``wanted_columns`` are averaged over a
    window of 10 rows (the current row and the 9 before it). "won", "team" and
    "season" are copied across unchanged rather than averaged.
    """
    return (
        team[wanted_columns]
        .rolling(10)
        .mean()
        .assign(won=team["won"], team=team["team"], season=team["season"])
    )

# Group by team and season so every window only covers one team's games within one season;
# without the grouping the averages would mix whichever rows happen to be adjacent in the frame
per_team_season = nba_df_rolling.groupby(["team", "season"], group_keys=False)
nba_df_rolling = per_team_season.apply(find_team_averages)
nba_df_rolling

In [29]:
# Suffix every rolling-average column with "_10" so the names do not clash with nba_df's columns
rolling_cols_list = [f"{col}_10" for col in nba_df_rolling.columns]
nba_df_rolling.columns = rolling_cols_list
# Join the rolling averages onto the original frame; axis=1 places them side by side as columns
nba_df = pd.concat([nba_df, nba_df_rolling], axis=1)

In [30]:
# Drop every row that has a missing value in any column
# (author's note: "2 rows dropped" — TODO confirm against the actual row counts)
nba_df = nba_df.dropna(axis=0, how="any")

In [None]:
# Display the frame to inspect it after the rows with missing values were dropped
nba_df

Adding information regarding opponents

In [None]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up by one row, so each row holds the value of the following row."""
    return team[col_name].shift(-1)


def add_col(df, col_name):
    """Return, for every row of ``df``, the next-row value of ``col_name`` within its ``team_10`` group."""
    def _next_within_team(team_games):
        return shift_col(team_games, col_name)

    return df.groupby("team_10", group_keys=False).apply(_next_within_team)

# For every game, record where, against whom and when the same team plays its next game
for next_col, source_col in (
    ("home_next", "home"),
    ("team_opp_next", "team_opp"),
    ("date_next", "date"),
):
    nba_df[next_col] = add_col(nba_df, source_col)



In [None]:
# Display the frame to inspect the newly added *_next columns
nba_df

In [None]:
# Display the team_10 column (the team name carried over from the rolling-average frame)
nba_df["team_10"]

In [None]:
# Build a frame holding a team's rolling averages next to the rolling averages of the
# opponent it faces in its next game. A left row (team, date_next) is matched with the right
# row whose team_opp_next is that team and whose date_next is the same date.
# The row count goes down because rows without a match are not kept by the merge;
# author's note: 108 rows were dropped, these being last games with no next-game data available.
opponent_view = nba_df[rolling_cols_list + ["team_opp_next", "date_next", "team"]]
full_df = nba_df.merge(
    opponent_view,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)
full_df

In [None]:
# Check the pairing: the *_x columns are the team's data, the *_y columns are the opponent's data
pairing_columns = ["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date"]
full_df[pairing_columns]

In [None]:
# Display the target column of the merged frame
full_df["target"]

Training and Testing A More Accurate Model 

In [None]:
# Put every object-dtype column of full_df in front of the existing exclusion list
object_columns = full_df.columns[full_df.dtypes == "object"].tolist()
unwanted_columns = object_columns + unwanted_columns
unwanted_columns

In [None]:
# Candidate inputs: every column of full_df that is not on the exclusion list
is_candidate = ~full_df.columns.isin(unwanted_columns)
selected_columns = full_df.columns[is_candidate]
sfs.fit(X=full_df[selected_columns], y=full_df["target"])

In [None]:
# Best predictors according to the refitted selector (boolean mask over selected_columns)
best_columns = selected_columns[sfs.get_support()].tolist()
best_columns

In [None]:
# Walk-forward predictions on the merged frame, obtained by calling the test function
predictions = test(data=full_df, model=ridge, predictors=best_columns)
predictions

In [None]:
# Accuracy score of the new predictions.
# Author's note: 63.2% accuracy; an 8.5% increase over the previous model and better than
# the naive model as well; still a long way to go.
test_accuracy = accuracy_score(
    y_true=predictions["Actual"],
    y_pred=predictions["Predicted"],
)
test_accuracy

How to Improve Model Accuracy 

In [None]:
# Using other models (e.g. XGBoost, Random Forest Classifier)
# Optimizing the hyperparameters of sfs, i.e. changing the number of features to select and the direction of the feature selection
# Optimizing the hyperparameters of the model, e.g. changing the alpha value of the RidgeClassifier
# Changing the window of the rolling averages: previous 5 games, previous 15 games, etc.


How to Get Predictions for Future Games 

In [None]:
# Get up to date data 
# Fill in the missing rows(games that have not happened yet) 
# Fit the sfs to target value of 2; indicates that game has not happened yet  
# Get predictions for these games 