In [1]:
# We load the necessary libraries

import os
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Let pandas render up to 30 columns so the wide stats tables are not truncated.
pd.set_option("display.max_columns", 30)
# Ignore scikit-learn ConvergenceWarning messages (globally, for every later cell).
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# We load the library containing functions we made for this project.

from utils import *

In [2]:
# Load the two processed datasets and preview the first rows of each one:
# `df` holds every feature, `df_selected` only the selected features.

print("Dataset with all features : ")
df = load_data(data_class="Processed")
display(df.head(n=5))

print("\nDataset with selected features : ")
df_selected = load_data(
    data_class="Processed",
    data=r"nba_logreg_selected.csv",
)
display(df_selected.head(n=5))

Dataset with all features : 


Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,EFG%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,Target
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,37.5,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,35.1,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,46.8,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,42.7,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,53.3,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1



Dataset with selected features : 


Unnamed: 0,Name,GP,MIN,FGA,FG%,EFG%,FTM,FT%,OREB,DREB,AST,STL,BLK,TOV,Target
0,Brandon Ingram,36,27.4,7.6,34.7,37.5,1.6,69.9,0.7,3.4,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,6.7,29.6,35.1,2.6,76.5,0.5,2.0,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,4.7,42.2,46.8,0.9,67.0,0.5,1.7,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.5,42.6,42.7,0.9,68.9,1.0,0.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,3.0,52.4,53.3,1.3,67.4,1.0,1.5,0.3,0.3,0.4,0.8,1


In [3]:
# Check how the feature selection affects performance, using three
# lightweight reference classifiers on both versions of the dataset.

models = [
    LogisticRegression(solver="liblinear", class_weight="balanced", max_iter=1000, random_state=0),
    DecisionTreeClassifier(class_weight="balanced", max_depth=3, random_state=0),
    KNeighborsClassifier(n_neighbors=20),
]
compare_features(df, df_selected, models)

Unnamed: 0,All features,Selected features
LogisticRegression,0.701511,0.702307
DecisionTreeClassifier,0.69057,0.694647
KNeighborsClassifier,0.703178,0.719639


The selected features give slightly better performance for all three models, so we will continue developing our models with the selected features.

We now move on to model development. First, we define a baseline performance.

In [4]:
# Build the feature matrix and label vector, then hold out a stratified 20% test set.
# The first column (Name) and the last column (Target) are excluded from the features.

X, Y = df_selected.iloc[:, 1:-1].to_numpy(), df_selected.iloc[:, -1].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)
for split_label, split_x, split_y in (('Train set:', x_train, y_train), ('Test set:', x_test, y_test)):
    print(split_label, split_x.shape, split_y.shape)

Train set: (1046, 13) (1046,)
Test set: (262, 13) (262,)


In [5]:
# We normalize our data.
# The scaler is fitted on the training split only and then applied to the test
# split, so the test data does not influence the scaling parameters.
# NOTE(review): x_train / x_test are overwritten with their scaled versions, so
# re-running this cell without re-running the split cell would refit the scaler
# on already-scaled data before save_scaler is called again - always re-run
# from the split cell.

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Hand the fitted scaler to the project helper (presumably persisted for reuse
# at prediction time - confirm in utils).
save_scaler(scaler)

In [6]:
# First pass over the candidate classifiers: collect accuracy, precision,
# recall and F1-score for each one to decide which metric to optimize.
# The trailing 3 is presumably the number of folds - confirm in utils.

balanced_seeded = {"class_weight": "balanced", "random_state": 0}
models = [
    LogisticRegression(max_iter=1000, solver="liblinear", **balanced_seeded),
    SVC(max_iter=1000, random_state=0),
    SGDClassifier(**balanced_seeded),
    DecisionTreeClassifier(max_depth=3, **balanced_seeded),
    RandomForestClassifier(max_depth=3, **balanced_seeded),
    XGBClassifier(max_depth=3, use_label_encoder=False, random_state=0),
    LGBMClassifier(max_depth=3, objective="binary", **balanced_seeded),
    KNeighborsClassifier(n_neighbors=20),
]
comparison_metrics = compare_models_kfolds(x_train, y_train, models, 3)
comparison_metrics



Unnamed: 0,Accuracy,Precision,Recall,F1-score
LogisticRegression,0.705959,0.808742,0.685693,0.705472
SVC,0.698646,0.762442,0.839422,0.728552
SGDClassifier,0.638865,0.846806,0.556317,0.568621
DecisionTreeClassifier,0.720114,0.827814,0.681887,0.714798
RandomForestClassifier,0.731482,0.843161,0.675038,0.7217
XGBClassifier,0.983352,0.985002,0.99239,0.985632
LGBMClassifier,0.882299,0.948402,0.841705,0.873648
KNeighborsClassifier,0.728919,0.800701,0.790715,0.745138


The scorer function proposed in test.py uses recall as the evaluation metric for classification. Recall penalizes false negatives, which in our case represent missed investments in good players (career length >= 5 years). However, there is a second type of error to consider: false positives, which represent bad investments in players with short careers (career length < 5 years). This error is measured by precision. We can take both error types into account by using the F1-score, which is the harmonic mean of precision and recall. After the initial training, we notice that our models handle the false-negative error very well, as they all have a high recall, but they do not handle the false-positive error as well, since the precision is lower than the recall. Therefore, we will use the F1-score, since both errors carry the same weight in the investing process.

In [7]:
# We will train our models to define a baseline performance.
# Every candidate is fitted on the training split and scored on both splits;
# the resulting table has one row per model with "Training" and "Test" columns.

models = [LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear", random_state=0),
          SVC(max_iter=1000, random_state=0),
          SGDClassifier(class_weight="balanced", random_state=0),
          DecisionTreeClassifier(max_depth=3, class_weight="balanced", random_state=0),
          RandomForestClassifier(max_depth=3, class_weight="balanced", random_state=0),
          XGBClassifier(max_depth=3, use_label_encoder=False, random_state=0),
          LGBMClassifier(max_depth=3, objective="binary", class_weight="balanced", random_state=0),
          KNeighborsClassifier(20)]
# Build the storage path with os.path.join so the separator matches the host
# OS: a hard-coded backslash only denotes a sub-folder on Windows, while on
# POSIX it would become part of a single file/folder name.
comparison_models = compare_models(x_train, x_test, y_train, y_test, models,
                                   model_store_path=os.path.join("Models", "Baseline"))
comparison_models



Unnamed: 0,Training,Test
LogisticRegression,0.707345,0.662227
SVC,0.727172,0.706307
SGDClassifier,0.725023,0.719053
DecisionTreeClassifier,0.713278,0.639855
RandomForestClassifier,0.713249,0.639864
XGBClassifier,0.95098,0.667663
LGBMClassifier,0.834222,0.64324
KNeighborsClassifier,0.73858,0.669556


In [8]:
# Order the models from the highest to the lowest score in the "Test" column.

print("The ranking of our models based on the test F1-score: ")
comparison_models.loc[:, "Test"].sort_values(ascending=False)

The ranking of our models based on the test F1-score: 


SGDClassifier             0.719053
SVC                       0.706307
KNeighborsClassifier      0.669556
XGBClassifier             0.667663
LogisticRegression        0.662227
LGBMClassifier            0.643240
RandomForestClassifier    0.639864
DecisionTreeClassifier    0.639855
Name: Test, dtype: float64

We note that the baseline performance on the test set (F1-score) is about 0.70.<br>We will tune the top two models and then choose the one with the best performance:
- Stochastic Gradient Descent Classifier
- Support Vector Classifier

In [9]:
# Grid search over the Stochastic Gradient Descent Classifier hyper-parameters.
# NOTE(review): newer scikit-learn releases renamed the "log" loss to
# "log_loss"; check the installed version before re-running this search.

grid_param = dict(
    loss=["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
    penalty=['l2'],
    max_iter=[10, 25, 50, 100],
    alpha=[0.1, 0.05, 0.025, 0.01, 0.005, 0.002, 0.001],
    random_state=[0],
)
best_param_sgd = tune_model(x_train, y_train, SGDClassifier, 3, grid_param)

Best params with F1-score = 0.716337903007997 :

{'alpha': 0.001, 'loss': 'modified_huber', 'max_iter': 25, 'penalty': 'l2', 'random_state': 0}


In [10]:
# Grid search over the Support Vector Classifier hyper-parameters.
# The full grid is the Cartesian product of every list below.

grid_param = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "kernel": ['rbf', 'poly', 'sigmoid'],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "coef0": [0.001, 0.01, 0.1, 1, 10, 100],
    "max_iter": [10, 100, 250, 500],
    "random_state": [0],
}
best_param_svc = tune_model(x_train, y_train, SVC, 3, grid_param)

Best params with F1-score = 0.7152732215716958 :

{'C': 10, 'coef0': 0.001, 'gamma': 1, 'kernel': 'rbf', 'max_iter': 500, 'random_state': 0}


In [11]:
# We will compare our fine-tuned models.
# Both estimators are rebuilt from the best parameters returned by the tuning cells.

models = [SGDClassifier(**best_param_sgd), SVC(**best_param_svc)]
# os.path.join keeps the storage path valid on every OS; a hard-coded
# backslash is only a directory separator on Windows.
comparison_models = compare_models(x_train, x_test, y_train, y_test, models,
                                   model_store_path=os.path.join("Models", "TunedModels"))
comparison_models

Unnamed: 0,Training,Test
SGDClassifier,0.720697,0.735123
SVC,0.751524,0.716275


The Stochastic Gradient Descent Classifier achieved a higher test score than the tuned Support Vector Classifier (0.735 vs. 0.716), so we choose it for our application deployment.

In [12]:
# We will simulate the whole prediction process and get the final classification report.
# Steps: load the default dataset -> clean it -> derive EFG% -> keep the selected
# features -> stratified split -> fit scaler + classifier -> report on the test set.

feature_cols = ["GP", "MIN", "FGA", "FG%", "EFG%", "FTM",
                "FT%", "OREB", "DREB", "AST", "STL", "BLK", "TOV"]

# Harmonize the column names and replace missing values with 0.
data = (
    load_data()
    .rename(columns={"3P Made": "3PM", "TARGET_5Yrs": "Target"})
    .fillna(0)
)

# Rows sharing identical statistics (every column except the first and the
# last) may carry conflicting labels: give each such group its majority label.
# A boolean mask with label-based .loc is used so the update does not depend
# on the frame having a default 0..n-1 index or on "Target" being last.
stat_cols = list(data.columns[1:-1])
is_duplicated = data.duplicated(subset=stat_cols, keep=False)
if is_duplicated.any():
    data.loc[is_duplicated, "Target"] = (
        data.loc[is_duplicated]
        .groupby(stat_cols)["Target"]
        .transform(lambda target: int(target.mean() >= 0.5))
    )
data = data.drop_duplicates()

# Effective field-goal percentage, rounded to one decimal.
# NOTE(review): a row with FGA == 0 would produce inf/NaN here; confirm the
# raw data cannot contain such rows.
efg = ((data["FGM"] + 0.5 * data["3PM"]) * 100 / data["FGA"]).round(1)
data.insert(loc=10, column="EFG%", value=efg)

# Keep only the name, the selected features and the label, then split by column name.
data = data[["Name", *feature_cols, "Target"]]
X = data[feature_cols].to_numpy()
Y = data["Target"].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

# Scaling lives inside the pipeline, so it is fitted on the training split only.
pipeline = Pipeline([('scaler', MinMaxScaler()), ('Classifier', SGDClassifier(**best_param_sgd))])
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.65      0.62      0.63        97
         1.0       0.78      0.81      0.79       165

    accuracy                           0.74       262
   macro avg       0.72      0.71      0.71       262
weighted avg       0.73      0.74      0.74       262



In [13]:
# We save the final pipeline.
# `pipeline` is the fitted scaler + classifier pair, so both steps are handed
# to the project helper together (presumably persisted under "Models" -
# confirm in utils).

save_model(pipeline, model_store_path=r"Models")