### Baseline Model and Improvement for NBA Statistics and RAG Prediction Engine

#### Imports

In [119]:
# pandas/numpy for dataframe handling
import pandas as pd
import numpy as np

# display to look at data
from IPython.display import display, HTML

# pickle for data saving
import pickle

# Random Forest and Gradient Boosting for Model Building
# (Source: Enhancing Basketball Game Outcome Prediction through Fused Graph Convolutional Networks and Random Forest Algorithm (Zhao et al., 2023))
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

from collections import Counter

#### Filepaths

Loading relevant data which was scraped from various sources like basketball-reference and nba.com

In [102]:
nba_data_path = 'data/NBA_2024_cleaned_results.csv'
advanced_stats_path = 'data/advanced_team_stats_2024.csv'
clutch_stats_path = 'data/clutch_team_stats_2024.csv'
nba_data = pd.read_csv(nba_data_path)
advanced_stats = pd.read_csv(advanced_stats_path)
clutch_stats = pd.read_csv(clutch_stats_path)

#### Label data
using the cleaned results and passing game_id, home_team, away_team, home_win (binary) and point differential

In [103]:
# game_id column
nba_data['game_id'] = nba_data.index + 1

# home_win column: 1 if home team won, 0 if away team won
nba_data['home_win'] = (nba_data['Home Team'] == nba_data['Winner']).astype(int)

# renaming columns for better understanding
labeled_data = nba_data[['game_id', 'Home Team', 'Away Team', 'home_win', 'Point Differential']]
labeled_data.columns = ['game_id', 'home_team', 'away_team', 'home_win', 'point_differential']

After running feature importances with radient forest decided on these statistics

In [104]:
# for advanced stats
advanced_stats_selected = advanced_stats[['Team', 'ORtg', 'DRtg', 'NRtg', 'Off_eFGpct', 'Off_TOVpct', 'Def_DRBpct']]

# for clutch stats
clutch_stats_selected = clutch_stats[['TEAM', 'PTS', 'FGpct', 'REB', 'AST', 'TOV']]

# renaming the columns to avoid conflicts when merging
advanced_stats_selected.columns = ['team', 'ORtg', 'DRtg', 'NRtg', 'Off_eFG%', 'Off_TOV%', 'Def_DRB%']
clutch_stats_selected.columns = ['team', 'clutch_PTS', 'clutch_FG%', 'clutch_REB', 'clutch_AST', 'clutch_TOV']

#### Merging Datasets
merge the datasets on the Team column and since we have a couple of duplicate columns I used a feature importance beforehand (randomforest) and dropped the ones with lower values
fixing the label data as well and merge it with the main data

In [106]:
# merging data
team_stats = pd.merge(advanced_stats_selected, clutch_stats_selected, on='team')

In [107]:
# add labeled data for both home and away teams
labeled_data = labeled_data.merge(team_stats, left_on='home_team', right_on='team', suffixes=('', '_home')).drop(columns=['team'])
labeled_data = labeled_data.merge(team_stats, left_on='away_team', right_on='team', suffixes=('', '_away')).drop(columns=['team'])

In [109]:
# rename the data to allow for knowing which team has which stats
# Rename columns to indicate home and away team stats
labeled_data.columns = [
    'game_id', 'home_team', 'away_team', 'home_win', 'point_differential',
    'home_ORtg', 'home_DRtg', 'home_NRtg', 'home_Off_eFG%', 'home_Off_TOV%', 'home_Def_DRB%',
    'home_clutch_PTS', 'home_clutch_FG%', 'home_clutch_REB', 'home_clutch_AST', 'home_clutch_TOV',
    'away_ORtg', 'away_DRtg', 'away_NRtg', 'away_Off_eFG%', 'away_Off_TOV%', 'away_Def_DRB%',
    'away_clutch_PTS', 'away_clutch_FG%', 'away_clutch_REB', 'away_clutch_AST', 'away_clutch_TOV'
]

#### Descriptive Stats to get an overview over the data

In [111]:
# descriptive statistics for advanced stats
team_stats_numeric = team_stats.select_dtypes(include=[float, int])
team_stats_description = team_stats_numeric.describe()
team_stats_correlation = team_stats_numeric.corr()

# descriptive stats for clutch statistical analysis
labeled_data_numeric = labeled_data.select_dtypes(include=[float, int])
labeled_data_description = labeled_data_numeric.describe()
labeled_data_correlation = labeled_data_numeric.corr()

In [112]:
# tables for each result
team_stats_description_table = team_stats_description.to_html()
team_stats_correlation_table = team_stats_correlation.to_html()
labeled_data_description_table = labeled_data_description.to_html()
labeled_data_correlation_table = labeled_data_correlation.to_html()

display(HTML("<h2>Advanced Stats Description</h2>" + team_stats_description_table))
display(HTML("<h2>Advanced Stats Correlation</h2>" + team_stats_correlation_table))
display(HTML("<h2>Label Data Stats Description</h2>" + labeled_data_description_table))
display(HTML("<h2>Label Data Stats Correlation</h2>" + labeled_data_correlation_table))


Unnamed: 0,ORtg,DRtg,NRtg,Off_eFG%,Off_TOV%,Def_DRB%,clutch_PTS,clutch_FG%,clutch_REB,clutch_AST,clutch_TOV
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,115.283333,115.256667,0.026667,0.5465,12.146667,75.76,8.573333,43.79,3.3,1.54,0.95
std,3.877973,2.872423,5.619052,0.019273,0.883853,1.417282,1.380888,4.297902,0.492705,0.328634,0.225526
min,107.2,109.0,-10.5,0.503,10.7,72.5,5.5,35.4,2.6,0.9,0.5
25%,113.25,113.1,-4.4,0.5325,11.5,74.8,7.9,40.975,2.9,1.425,0.8
50%,115.75,115.25,1.95,0.5405,11.95,76.0,8.45,44.15,3.2,1.6,0.9
75%,117.75,117.225,3.075,0.56175,12.9,76.775,9.375,46.075,3.7,1.8,1.1
max,123.2,120.4,11.6,0.578,13.6,78.4,11.9,53.0,4.2,2.3,1.6


Unnamed: 0,ORtg,DRtg,NRtg,Off_eFG%,Off_TOV%,Def_DRB%,clutch_PTS,clutch_FG%,clutch_REB,clutch_AST,clutch_TOV
ORtg,1.0,-0.371852,0.880236,0.858327,-0.624016,0.121526,0.362254,0.450266,-0.157191,0.24081,-0.476088
DRtg,-0.371852,1.0,-0.767826,-0.352839,0.211757,-0.467576,-0.078108,-0.117098,0.069197,-0.048146,0.16315
NRtg,0.880236,-0.767826,1.0,0.772741,-0.538912,0.322893,0.289937,0.37061,-0.143858,0.190806,-0.411972
Off_eFG%,0.858327,-0.352839,0.772741,1.0,-0.400819,0.143664,0.21094,0.408369,-0.274535,0.187288,-0.389139
Off_TOV%,-0.624016,0.211757,-0.538912,-0.400819,1.0,0.048063,-0.418502,-0.399373,0.085518,-0.231022,0.491296
Def_DRB%,0.121526,-0.467576,0.322893,0.143664,0.048063,1.0,0.112904,-0.231997,0.210362,0.025024,0.003236
clutch_PTS,0.362254,-0.078108,0.289937,0.21094,-0.418502,0.112904,1.0,0.647843,0.650761,0.803321,0.116262
clutch_FG%,0.450266,-0.117098,0.37061,0.408369,-0.399373,-0.231997,0.647843,1.0,0.107799,0.702187,0.054964
clutch_REB,-0.157191,0.069197,-0.143858,-0.274535,0.085518,0.210362,0.650761,0.107799,1.0,0.432314,0.502728
clutch_AST,0.24081,-0.048146,0.190806,0.187288,-0.231022,0.025024,0.803321,0.702187,0.432314,1.0,0.209366


Unnamed: 0,game_id,home_win,point_differential,home_ORtg,home_DRtg,home_NRtg,home_Off_eFG%,home_Off_TOV%,home_Def_DRB%,home_clutch_PTS,home_clutch_FG%,home_clutch_REB,home_clutch_AST,home_clutch_TOV,away_ORtg,away_DRtg,away_NRtg,away_Off_eFG%,away_Off_TOV%,away_Def_DRB%,away_clutch_PTS,away_clutch_FG%,away_clutch_REB,away_clutch_AST,away_clutch_TOV
count,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0,1237.0
mean,619.0,0.544867,12.582053,115.294341,115.249555,0.044786,0.546554,12.142441,75.764915,8.582053,43.802264,3.301859,1.541552,0.950364,115.286176,115.261035,0.025141,0.546494,12.145432,75.764026,8.57114,43.783266,3.299192,1.539208,0.94996
std,357.23545,0.498184,9.521349,3.802533,2.818112,5.50832,0.018934,0.86821,1.394823,1.361301,4.21859,0.48569,0.323217,0.221448,3.811286,2.82487,5.520356,0.018936,0.868976,1.395192,1.361235,4.234576,0.485834,0.324472,0.221594
min,1.0,0.0,1.0,107.2,109.0,-10.5,0.503,10.7,72.5,5.5,35.4,2.6,0.9,0.5,107.2,109.0,-10.5,0.503,10.7,72.5,5.5,35.4,2.6,0.9,0.5
25%,310.0,0.0,6.0,113.2,113.0,-4.9,0.532,11.5,74.8,7.9,40.7,2.9,1.4,0.8,113.2,113.0,-4.9,0.532,11.5,74.8,7.9,40.7,2.9,1.4,0.8
50%,619.0,1.0,10.0,115.9,115.2,1.8,0.541,11.9,76.0,8.5,44.2,3.2,1.6,0.9,115.9,115.2,1.8,0.54,11.9,76.0,8.4,44.2,3.2,1.6,0.9
75%,928.0,1.0,17.0,117.8,117.5,3.1,0.562,13.0,76.9,9.4,46.2,3.7,1.8,1.1,117.8,117.5,3.1,0.562,13.0,76.9,9.4,46.2,3.7,1.8,1.1
max,1237.0,1.0,62.0,123.2,120.4,11.6,0.578,13.6,78.4,11.9,53.0,4.2,2.3,1.6,123.2,120.4,11.6,0.578,13.6,78.4,11.9,53.0,4.2,2.3,1.6


Unnamed: 0,game_id,home_win,point_differential,home_ORtg,home_DRtg,home_NRtg,home_Off_eFG%,home_Off_TOV%,home_Def_DRB%,home_clutch_PTS,home_clutch_FG%,home_clutch_REB,home_clutch_AST,home_clutch_TOV,away_ORtg,away_DRtg,away_NRtg,away_Off_eFG%,away_Off_TOV%,away_Def_DRB%,away_clutch_PTS,away_clutch_FG%,away_clutch_REB,away_clutch_AST,away_clutch_TOV
game_id,1.0,-0.049793,0.054916,-0.019225,-0.005138,-0.010642,-0.020283,0.014538,0.002695,-0.021484,-0.021955,0.00905,-0.026994,0.022089,0.018209,-0.000927,0.013046,0.013035,-0.02562,0.004845,0.015794,0.01349,-0.016662,0.021798,-0.024414
home_win,-0.049793,1.0,0.112692,0.325576,-0.254946,0.355186,0.285092,-0.196417,0.119747,0.175962,0.17261,0.0142,0.120056,-0.136004,-0.228216,0.229996,-0.275255,-0.211227,0.133585,-0.085268,-0.056608,-0.112876,0.081713,-0.040171,0.107933
point_differential,0.054916,0.112692,1.0,0.071227,-0.078903,0.089537,0.064408,-0.088482,-0.000325,0.032722,0.070031,0.003527,0.015769,-0.03287,-0.056939,0.10035,-0.090662,-0.040308,0.060988,-0.059217,-0.027474,0.003376,-0.000178,-0.014987,0.056457
home_ORtg,-0.019225,0.325576,0.071227,1.0,-0.370502,0.879878,0.857224,-0.62247,0.123377,0.359437,0.448785,-0.157198,0.239136,-0.475291,-0.029167,0.016032,-0.028341,-0.02388,0.036007,0.001375,-0.012972,-0.014621,0.01547,-0.013177,0.024599
home_DRtg,-0.005138,-0.254946,-0.078903,-0.370502,1.0,-0.767377,-0.350744,0.210701,-0.468273,-0.073333,-0.113585,0.071906,-0.043788,0.163536,0.023994,-0.03725,0.035627,0.021153,-0.02706,0.011378,-0.009033,-0.011631,-0.014898,-0.01785,-0.029426
home_NRtg,-0.010642,0.355186,0.089537,0.879878,-0.767377,1.0,0.771207,-0.537503,0.324743,0.285646,0.367919,-0.145305,0.187484,-0.411772,-0.03241,0.030125,-0.037792,-0.027307,0.038701,-0.004872,-0.004333,-0.004143,0.018301,3.5e-05,0.032036
home_Off_eFG%,-0.020283,0.285092,0.064408,0.857224,-0.350744,0.771207,1.0,-0.396701,0.146334,0.210083,0.407113,-0.271953,0.185978,-0.387275,-0.020558,0.00629,-0.017412,-0.024036,0.028869,-0.000596,0.001437,-0.002372,0.019196,0.001665,0.026973
home_Off_TOV%,0.014538,-0.196417,-0.088482,-0.62247,0.210701,-0.537503,-0.396701,1.0,0.045031,-0.415874,-0.396704,0.085078,-0.230164,0.489637,0.017559,-0.030176,0.027564,0.020151,-0.021582,-0.007575,0.030454,0.045821,-0.012981,0.05058,0.000535
home_Def_DRB%,0.002695,0.119747,-0.000325,0.123377,-0.468273,0.324743,0.146334,0.045031,1.0,0.115089,-0.230358,0.211017,0.026566,0.005044,-0.006325,0.026531,-0.017943,-0.002682,-0.002315,-0.036033,-0.001642,0.013982,-0.01105,0.003185,0.004079
home_clutch_PTS,-0.021484,0.175962,0.032722,0.359437,-0.073333,0.285646,0.210083,-0.415874,0.115089,1.0,0.646302,0.653692,0.803594,0.117762,0.012366,-0.003853,0.01051,0.012133,0.01508,-0.008038,-0.051472,-0.018737,-0.046948,-0.039857,-0.022022


In [114]:
# Save the final dataset to a CSV file
labeled_data.to_csv('data/NBA_Matchups_with_Features.csv', index=False)

# Display the first few rows of the integrated dataset
print(labeled_data.head())

   game_id              home_team           away_team  home_win  \
0        1         Denver Nuggets  Los Angeles Lakers         1   
1        2  Golden State Warriors        Phoenix Suns         0   
2        3          Orlando Magic     Houston Rockets         1   
3        4        New York Knicks      Boston Celtics         0   
4        5         Indiana Pacers  Washington Wizards         1   

   point_differential  home_ORtg  home_DRtg  home_NRtg  home_Off_eFG%  \
0                  12      118.5      113.0        5.5          0.562   
1                   4      117.8      115.2        2.6          0.557   
2                  30      113.4      111.3        2.1          0.541   
3                   4      118.2      113.4        4.8          0.540   
4                  23      121.0      118.0        3.0          0.578   

   home_Off_TOV%  ...  away_DRtg  away_NRtg  away_Off_eFG%  away_Off_TOV%  \
0           11.5  ...      115.3        0.6          0.566           12.5   
1   

#### Data Preparation
Merging the team stats with the match results to prepare features for the model and splitting the data afterwards

In [115]:
# Train test splitting
X = labeled_data.drop(columns=['game_id', 'home_team', 'away_team', 'home_win', 'point_differential'])
y = labeled_data['home_win']

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Random Forest again for Feature Importance

training a RandomForestRegressor model using the training set (source: https://www.mdpi.com/1099-4300/25/5/765)
validating the model using cross-validation and evaluated its performance used this on the data earlier - during this run not needed

In [93]:
# using the random forest for feature importance
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X_train, y_train)
# importances = rf_model.feature_importances_

In [94]:
# grid search now needed for gradient boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

#### Model Training + Gradient Boosting
I was training the Gradient Boosting Classifier on a subset of the data to ensure faster computation

In [117]:
gbc  = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_gbc = grid_search.best_estimator_

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [120]:
# Predict on the test set
y_pred = best_gbc.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Parameters:", best_params)
print("Test Set Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Test Set Accuracy: 0.6733870967741935
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.60      0.64       121
           1       0.66      0.75      0.70       127

    accuracy                           0.67       248
   macro avg       0.68      0.67      0.67       248
weighted avg       0.68      0.67      0.67       248



In [121]:
# save model
model_path = 'data/gradient_boosting_nba_model.pkl'
joblib.dump(best_gbc, model_path)

print(f"Model saved to {model_path}")

Model saved to data/gradient_boosting_nba_model.pkl


In [126]:
import joblib
from sklearn.ensemble import GradientBoostingClassifier

# Assuming X_train and y_train are already defined
gbc = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=3,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42
)
gbc.fit(X_train, y_train)

# Save the trained model
model_path_corrected = 'gradient_boosting_nba_model_corrected.pkl'
joblib.dump(gbc, model_path_corrected)

['gradient_boosting_nba_model_corrected.pkl']

In [127]:
import joblib

# Load the Gradient Boosting model using joblib
model_path = 'gradient_boosting_nba_model_corrected.pkl'
loaded_object = joblib.load(model_path)

# Check the type and attributes of the loaded object
loaded_object_type = type(loaded_object)
has_predict_method = hasattr(loaded_object, 'predict')

print(f"Loaded object type: {loaded_object_type}")
if has_predict_method:
    print("The loaded object is a valid model.")
else:
    raise TypeError("The loaded object is not a model. Please provide a valid model.")

Loaded object type: <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
The loaded object is a valid model.


#### Feature importances for matchup -> redundant method used before to decide for the most relevant stats

After Optimizing the model using GridSearch to find the best parameters for a Gradient Boosting Regressor or Classifier (GBR, GBC) and to find the most important features for a possible prediction
Evaluation the Model based on its performance

In [99]:
# saving top 6 features
top_features_indices = np.argsort(importances)[-6:]
top_features = features.columns[top_features_indices]

# for each matchup
matchup_top_features = features.iloc[:, top_features_indices]
matchup_top_features['Winner'] = labels
matchup_top_features.to_csv('data/matchup_top_features.csv', index=False)

# Evaluate models
rf_predictions = rf_model.predict(X_test)
gb_predictions = best_gb_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gb_accuracy = accuracy_score(y_test, gb_predictions)

print(f'Random Forest Accuracy: {rf_accuracy}')
print(f'Gradient Boosting Accuracy: {gb_accuracy}')

Random Forest Accuracy: 0.782258064516129
Gradient Boosting Accuracy: 0.9717741935483871


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matchup_top_features['Winner'] = labels


gradient boosting will be used in rag system

#### Bracket
running the prediction model on the playoff matchups

In [283]:
# bracket

{'Round 1': {'Western': {'Thunder vs Pelicans': 'Oklahoma City Thunder',
   'Clippers vs Mavericks': 'Dallas Mavericks',
   'Timberwolves vs Suns': 'Minnesota Timberwolves',
   'Nuggets vs Lakers': 'Denver Nuggets'},
  'Eastern': {'Boston vs Heat': 'Boston Celtics',
   'Cavaliers vs Orlando': 'Orlando Magic',
   'Milwaukee vs Pacers': 'Indiana Pacers',
   'Knicks vs 76ers': 'Philadelphia 76ers'}},
 'Round 2': {'Western': {'Dallas Mavericks vs Oklahoma City Thunder': 'Oklahoma City Thunder',
   'Denver Nuggets vs Minnesota Timberwolves': 'Denver Nuggets'},
  'Eastern': {'Orlando Magic vs Boston Celtics': 'Boston Celtics',
   'Philadelphia 76ers vs Indiana Pacers': 'Indiana Pacers'}},
 'Round 3': {'Western': {'winners': 'Denver Nuggets'},
  'Eastern': {'winners': 'Boston Celtics'}},
 'NBA Finals': {'winner': 'Denver Nuggets'}}


running the prediction model on the playoff matchups

Round 1: Predicted 6 out of 8 correctly (75%)
Round 2: Predicted 2 out of 4 correctly (50%)
Round 3: Predicted 1 out of 2 correctly (50%)
NBA Finals: Predicted 0 out of 1 correctly (0%)

Percentage accuracy: (9/15)×100=60
