In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor as xgb_model
import numpy as np

First lets load the data, split the labels from the features, and split into training and testing sets. We also need to convert strings to categorical type.

In [2]:
data = pd.read_csv('./combined_since_2010.csv')
#for now, lets remove some columns which aren't stats
X=data.drop(columns=["Team","Year","POff_W","Rk"])
Y= data["POff_W"]
X_Train,X_Test,Y_Train,Y_Test = train_test_split(X,Y,test_size=0.25)

Now, we need to create an XGBoost model and tell is to learn with the training data!

In [3]:
my_xgb_model = xgb_model()
my_xgb_model.fit(X_Train,Y_Train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

Now, lets see how our model does. Let's compare to a naive multilinear regression for reference

In [4]:
xgb_preds=my_xgb_model.predict(X_Test)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

my_linear_model = LinearRegression()
my_linear_model.fit(X_Train,Y_Train)
linear_preds = my_linear_model.predict(X_Test)

xgb_err = mean_squared_error(xgb_preds,Y_Test)
linear_err = mean_squared_error(linear_preds,Y_Test)
print("XGB MSE: "+str(xgb_err))
print("Linear MSE: " + str(linear_err))

XGB MSE: 13.887929983063435
Linear MSE: 12.322448299313507


We need to see if the model is overfitting, for that we should compare the models' performances on the test set!

In [5]:
linear_train_pred = my_linear_model.predict(X_Train)
xgb_train_pred = my_xgb_model.predict(X_Train)
xgb_train_err = mean_squared_error(xgb_train_pred,Y_Train)
linear_train_err = mean_squared_error(linear_train_pred,Y_Train)
print("XGB MSE: "+str(xgb_train_err))
print("Linear MSE: " + str(linear_train_err))

XGB MSE: 5.137047996458348e-07
Linear MSE: 8.740863800600735


The model fits the training data perfectly but doesn't do all that well on the test set. This means we're probably over fitting. Let's try to introduce some heavier regularization. First let's try limiting max_depth of a tree in each of the classifiers in the random forest model. You can play with changing these values!

In [6]:
depth_limited = xgb_model(max_depth=2)
depth_limited.fit(X_Train,Y_Train)
depth_limited_train_err = mean_squared_error(depth_limited.predict(X_Train),Y_Train)
depth_limited_test_err = mean_squared_error(depth_limited.predict(X_Test),Y_Test)
print('Depth Limited Test MSE: '+str(depth_limited_test_err))
print('Depth Limited Train MSE: '+str(depth_limited_train_err))
print('Linear Baseline: '+str(linear_err))

Depth Limited Test MSE: 13.565971544918723
Depth Limited Train MSE: 1.261341733484448
Linear Baseline: 12.322448299313507


Changing the training rate can also prevent overfitting. I don't find much benefit from changing this parameter

In [7]:
slow_learn = xgb_model(eta=0.4)
slow_learn.fit(X_Train,Y_Train)
slow_learn_train_err = mean_squared_error(slow_learn.predict(X_Train),Y_Train)
slow_learn_test_err = mean_squared_error(slow_learn.predict(X_Test),Y_Test)
print('Slow Learner Test MSE: '+str(slow_learn_test_err))
print('Slow Learner Train MSE: '+str(slow_learn_train_err))
print('Linear Baseline: '+str(linear_err))

Slow Learner Test MSE: 15.082310654735343
Slow Learner Train MSE: 5.582341266142695e-07
Linear Baseline: 12.322448299313507


XGBoost allows for both ride and lasso-type regularization penalties. Let's try both in the next block. Default valeus are lambda=1 and alpha=0. To more heavily regularize, we want to increase these valeus

In [8]:
ridge = xgb_model(reg_lambda=10)
ridge.fit(X_Train,Y_Train)
ridge_train_err = mean_squared_error(ridge.predict(X_Train),Y_Train)
ridge_test_err = mean_squared_error(ridge.predict(X_Test),Y_Test)
print('Ridge Test MSE: '+str(ridge_test_err))
print('Ridge Train MSE: '+str(ridge_train_err))
print('Linear Baseline: '+str(linear_err))

lasso = xgb_model(reg_alpha=3)
lasso.fit(X_Train,Y_Train)
lasso_train_err = mean_squared_error(lasso.predict(X_Train),Y_Train)
lasso_test_err = mean_squared_error(lasso.predict(X_Test),Y_Test)
print('Lasso Test MSE: '+str(lasso_test_err))
print('Lasso Train MSE: '+str(lasso_train_err))

Ridge Test MSE: 15.144644786493494
Ridge Train MSE: 0.0006792371655470172
Linear Baseline: 12.322448299313507
Lasso Test MSE: 15.500357415727535
Lasso Train MSE: 0.08191292315623806


What if we use the linear model's output as a feature in our classifier? This is an example of "feature engineering". 

In [11]:
data['Linear_Model']=my_linear_model.predict(X)
new_X=data.drop(columns=["Team","Year","POff_W","Rk"])
new_X_Train,new_X_Test,new_Y_Train,new_Y_Test = train_test_split(new_X,Y,test_size=0.25)
feature_engineered_model = xgb_model()
feature_engineered_model.fit(new_X_Train,new_Y_Train)
feature_engineered_test_err =mean_squared_error(feature_engineered_model.predict(new_X_Test),new_Y_Test)
feature_engineered_train_err =mean_squared_error(feature_engineered_model.predict(new_X_Train),new_Y_Train)
print('Feature Engineering Test MSE: '+str(feature_engineered_test_err))
print('Feature Engineering Train MSE: '+str(feature_engineered_train_err))
print('Linear Baseline: '+str(linear_err))

Lasso Test MSE: 9.459530934783102
Lasso Train MSE: 5.342420226585526e-07
Linear Baseline: 12.322448299313507


We want to check which features actually mattered. Notice how half of the importance comes from the linear model, other important parameters are 3P% and rebounds

In [16]:
importances=pd.DataFrame({'Feature':feature_engineered_model.get_booster().feature_names,'Importance':feature_engineered_model.feature_importances_})
importances

Unnamed: 0,Feature,Importance
0,G,0.012184
1,MP,0.010472
2,FG,0.005122
3,FGA,0.011676
4,FG%,0.034957
5,3P,0.024438
6,3PA,0.000486
7,3P%,0.045822
8,2P,0.007207
9,2PA,0.031971


Finally, lets see how the model works with this year's data. Who are the favorites?

In [19]:
data23=pd.read_csv('./2023.csv')
teams = data23["Team"] #for later
data23=data23.drop(columns=["Rk","Team"])
linear23_pred = my_linear_model.predict(data23)
data23["Linear_Model"]=linear23_pred
this_year_preds = feature_engineered_model.predict(data23)
preds = pd.DataFrame({'Team':teams,'Predicted Playoff Wins':this_year_preds})
preds

Unnamed: 0,Team,Predicted Playoff Wins
0,Sacramento Kings*,7.871833
1,Golden State Warriors*,8.713643
2,Atlanta Hawks*,0.425932
3,Boston Celtics*,3.335354
4,Oklahoma City Thunder*,0.082005
5,Los Angeles Lakers*,0.744034
6,Utah Jazz,4.61657
7,Memphis Grizzlies*,0.720668
8,Milwaukee Bucks*,7.052516
9,Indiana Pacers,-0.068852
