In [47]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

In [48]:
# Modify these according to the path of your desired datasets
train1 = pd.read_csv('/kaggle/input/nba-wins/combined_stats_19to20.csv')
train2 = pd.read_csv('/kaggle/input/nba-wins/combined_stats_20to21.csv')
test = pd.read_csv('/kaggle/input/nba-wins/combined_stats_21to22.csv')

train = pd.concat([train1, train2])

train.shape, test.shape

((62, 50), (31, 50))

# **Data Preprocessing**

In [49]:
train.describe()

Unnamed: 0,Rk_OFF,G_OFF,MP_OFF,FG_OFF,FGA_OFF,FG%_OFF,3P_OFF,3PA_OFF,3P%_OFF,2P_OFF,...,FT%_DEF,ORB_DEF,DRB_DEF,TRB_DEF,AST_DEF,STL_DEF,BLK_DEF,TOV_DEF,PF_DEF,PTS_DEF
count,60.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,...,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0
mean,15.5,71.306452,241.6,41.025806,88.609677,0.462984,12.45,34.369355,0.361919,28.577419,...,0.775161,9.964516,34.616129,44.570968,24.61129,7.612903,4.885484,14.195161,20.032258,111.966129
std,8.728484,2.399376,0.820835,1.503759,1.92964,0.014009,1.550436,3.768445,0.016575,2.011558,...,0.01235,0.653138,1.418634,1.805219,1.210894,0.642863,0.635261,1.296961,1.287999,3.510824
min,1.0,64.0,240.0,37.3,83.7,0.429,9.6,28.0,0.333,24.5,...,0.745,8.6,32.0,41.2,21.5,6.1,3.6,11.5,17.7,104.7
25%,8.0,72.0,241.0,40.0,87.825,0.45225,11.2,31.6,0.34925,27.15,...,0.767,9.525,33.325,42.9,23.825,7.2,4.5,13.2,19.125,109.125
50%,15.5,72.0,241.7,41.15,88.6,0.466,12.35,34.15,0.3635,28.6,...,0.777,10.0,34.55,44.8,24.75,7.6,4.9,14.15,20.0,111.95
75%,23.0,72.0,242.1,42.175,90.3,0.472,13.55,36.775,0.37375,30.025,...,0.783,10.4,35.65,45.9,25.4,8.0,5.3,15.0,21.0,114.675
max,30.0,75.0,243.1,44.7,91.8,0.494,16.7,45.3,0.411,33.0,...,0.803,11.5,37.7,47.9,27.1,9.4,6.4,18.3,22.9,119.7


In [50]:
yTrain = train["WON_MORE_THAN_HALF"]
xTrain = train.drop(["WON_MORE_THAN_HALF", "Team"], axis=1, inplace=False)
xTrain.shape, yTrain.shape

((62, 48), (62,))

In [51]:
yTest = test["WON_MORE_THAN_HALF"]
xTest = test.drop(["WON_MORE_THAN_HALF", "Team"], axis=1, inplace=False)
xTest.shape, yTest.shape

((31, 48), (31,))

In [52]:
for col in xTrain:
    mean = xTrain[col].mean()
    xTrain[col].fillna(value=mean, inplace=True)

In [53]:
for col in xTest:
    mean = xTest[col].mean()
    xTest[col].fillna(value=mean, inplace=True)

# **Tuning the Models**

In [54]:
def RandomForestTuning(xTrain, yTrain):
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestClassifier()
    randomForest = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)
    randomForest.fit(xTrain, yTrain)
    print(randomForest.best_params_)

**RandomForest Best Parameters:** {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}

In [55]:
def GradientBoostTuning(xTrain, yTrain):
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}
    gb = GradientBoostingClassifier()
    gradientBoost = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)
    gradientBoost.fit(xTrain, yTrain)
    print(gradientBoost.best_params_)

**GradientBoosting Best Parameters:** {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 40}

# **Random Forest Feature Selection**

In [56]:
def FeatureSelectionRandomForest(xTrain, yTrain):
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    sfs = SFS(RandomForestClassifier(n_estimators = 400, min_samples_split = 5, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 30, bootstrap = True),
             k_features = (1, 40),
              forward= False,
              floating = False,
              verbose= 2,
              scoring= 'accuracy',
              cv = 2,
              n_jobs= -1
             ).fit(xTrain, yTrain)
    sbs = sfs
    print(sbs.k_score_)
    print(sfs.k_feature_names_)
    print(sfs.k_feature_idx_)

**K Score of Selected Features:** 0.9516129032258065

**Names of Selected Features:** ('Rk_OFF', 'MP_OFF', 'FG_OFF', 'FG%_OFF', '3P_OFF', '3PA_OFF', '2P_OFF', '2PA_OFF', '2P%_OFF', 'FTA_OFF', 'DRB_OFF', 'AST_OFF', 'STL_OFF', 'BLK_OFF', 'PTS_OFF', 'MP_DEF', 'FG_DEF', '3PA_DEF', '3P%_DEF', '2PA_DEF', '2P%_DEF', 'FT_DEF', 'DRB_DEF', 'PTS_DEF')

**Index of Selected Features:** (0, 2, 3, 5, 6, 7, 9, 10, 11, 13, 16, 18, 19, 20, 23, 26, 27, 31, 32, 34, 35, 36, 40, 47)

In [57]:
names = [0, 2, 3, 5, 6, 7, 9, 10, 11, 13, 16, 18, 19, 20, 23, 26, 27, 31, 32, 34, 35, 36, 40, 47]
xTrainRandom = xTrain.iloc[:, names]
xTestRandom = xTest.iloc[:, names]
xTrainRandom.shape, xTestRandom.shape

((62, 24), (31, 24))

# **Fitting the Classifiers**

In [58]:
#RandomForestTuning(xTrain, yTrain)
#FeatureSelectionRandomForest(xTrain, yTrain)
rf = RandomForestClassifier(n_estimators = 400, min_samples_split = 5, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 30, bootstrap = True)
rf.fit(xTrainRandom, yTrain)

RandomForestClassifier(max_depth=30, max_features='sqrt', min_samples_split=5,
                       n_estimators=400)

In [59]:
#GradientBoostTuning(xTrain, yTrain)
gb = GradientBoostingClassifier(n_estimators = 1000, min_samples_split = 5, min_samples_leaf = 2, max_features = 'sqrt', max_depth = 40)
gb.fit(xTrain, yTrain)

GradientBoostingClassifier(max_depth=40, max_features='sqrt',
                           min_samples_leaf=2, min_samples_split=5,
                           n_estimators=1000)

In [60]:
lgbm = LGBMClassifier(n_estimators = 1000)
lgbm.fit(xTrain, yTrain)

LGBMClassifier(n_estimators=1000)

# **NBA Predictions**

In [61]:
rfPrediction = rf.predict(xTestRandom)
print("RF Predictions:", rfPrediction, "\n")
print(classification_report(yTest, rfPrediction, zero_division = 0))

print("Training Accuracy:", rf.score(xTrainRandom, yTrain))
print("Testing Accuracy:", rf.score(xTestRandom, yTest))

RF Predictions: ['0' '1' '1' '0' '0' '1' '1' '1' '0' '1' '0' '0' '1' '1' '0' '1' '1' '1'
 '1' '0' '1' '0' '0' '1' '1' '0' '0' '0' '1' '1' '0'] 

              precision    recall  f1-score   support

           0       0.79      0.92      0.85        12
           1       0.88      0.83      0.86        18
           X       0.00      0.00      0.00         1

    accuracy                           0.84        31
   macro avg       0.56      0.58      0.57        31
weighted avg       0.82      0.84      0.83        31

Training Accuracy: 0.9838709677419355
Testing Accuracy: 0.8387096774193549


# Random Forest Results

**Normal Training Accuracy:** 1.0

**Normal Testing Accuracy:** 0.7741935483870968

**Tuned Training Accuracy:** 1.0

**Tuned Testing Accuracy:** 0.8064516129032258

**Feature Set and Tuned Training Accuracy:** 0.9838709677419355

**Feature Set and Tuned Testing Accuracy:** 0.8387096774193549

In [62]:
gbPrediction = gb.predict(xTest)
print("GB Predictions:", gbPrediction, "\n")
print(classification_report(yTest, gbPrediction, zero_division = 0))

print("Training Accuracy:", gb.score(xTrain, yTrain))
print("Testing Accuracy:", gb.score(xTest, yTest))

GB Predictions: ['0' '1' '1' '0' '0' '1' '1' '1' '0' '1' '0' '0' '1' '1' '0' '1' '1' '1'
 '0' '0' '1' '0' '0' '1' '1' '0' '0' '0' '1' '1' '0'] 

              precision    recall  f1-score   support

           0       0.73      0.92      0.81        12
           1       0.88      0.78      0.82        18
           X       0.00      0.00      0.00         1

    accuracy                           0.81        31
   macro avg       0.54      0.56      0.55        31
weighted avg       0.79      0.81      0.79        31

Training Accuracy: 1.0
Testing Accuracy: 0.8064516129032258


# Gradient Boosting Results

**Normal Training Accuracy:** 1.0

**Normal Testing Accuracy:** 0.7419354838709677

**Tuned Training Accuracy:** 1.0

**Tuned Testing Accuracy:** 0.8064516129032258

In [63]:
lgbmPrediction = lgbm.predict(xTest)
print("LGBM Predictions:", lgbmPrediction, "\n")
print(classification_report(yTest, lgbmPrediction, zero_division = 0))

print("Training Accuracy:", lgbm.score(xTrain, yTrain))
print("Testing Accuracy:", lgbm.score(xTest, yTest))

LGBM Predictions: ['1' '1' '1' '1' '0' '1' '1' '1' '0' '1' '0' '0' '1' '1' '0' '1' '1' '1'
 '1' '0' '1' '0' '0' '1' '1' '0' '0' '0' '1' '1' '0'] 

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.89      0.94      0.92        18
           X       0.00      0.00      0.00         1

    accuracy                           0.90        31
   macro avg       0.60      0.62      0.61        31
weighted avg       0.87      0.90      0.89        31

Training Accuracy: 1.0
Testing Accuracy: 0.9032258064516129


# Light Gradient Boosting Machine Results

**Normal Training Accuracy:** 1.0

**Normal Testing Accuracy:** 0.8709677419354839

**Tuned Training Accuracy:** 1.0

**Tuned Testing Accuracy:** 0.9032258064516129