In [None]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as pl
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
import seaborn as sns


In [None]:
def score_classifier(dataset,classifier,labels,n_splits=3,random_state=50):
    """Cross-validate `classifier` and report its confusion matrix and mean recall.

    The data is split with a shuffled K-fold; for every fold the classifier is
    refit (in place) on the training part and evaluated on the held-out part.
    The per-fold confusion matrices are summed and the per-fold recalls averaged.
    Both are printed.

    Parameters
    ----------
    dataset : array-like, one row per sample
        Feature matrix. Converted with ``np.asarray`` so that positional
        indexing also works for pandas objects.
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refit on every fold; on return it holds the fit of the last fold.
    labels : array-like, one entry per row of `dataset`
        Target labels, aligned with `dataset` row by row.
    n_splits : int, default 3
        Number of folds.
    random_state : int, default 50
        Seed of the K-fold shuffling.

    Returns
    -------
    float
        Recall averaged over the folds.
    """
    dataset = np.asarray(dataset)
    labels = np.asarray(labels)

    kf = KFold(n_splits=n_splits,random_state=random_state,shuffle=True)
    # Pin the class list so every fold yields a matrix of the same shape, even
    # when one class happens to be absent from a fold.
    classes = np.unique(labels)
    confusion_mat = np.zeros((len(classes),len(classes)))
    recall = 0
    for training_ids,test_ids in kf.split(dataset):
        training_set = dataset[training_ids]
        training_labels = labels[training_ids]
        test_set = dataset[test_ids]
        test_labels = labels[test_ids]
        classifier.fit(training_set,training_labels)
        predicted_labels = classifier.predict(test_set)
        confusion_mat+=confusion_matrix(test_labels,predicted_labels,labels=classes)
        recall += recall_score(test_labels, predicted_labels)
    # Average over the actual number of folds instead of a hard-coded 3.
    recall/=kf.get_n_splits()
    print(confusion_mat)
    print(recall)
    return recall

In [None]:
# Read the semicolon-delimited player statistics file into a DataFrame
df = pd.read_csv(
    "nba_logreg.csv",
    delimiter=";",
)
df

In [None]:
# First, let's see which features seem to have the greatest impact on the target feature.
# Only numeric columns are correlated: the frame also carries the players' 'Name'
# column, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
correlation = df.select_dtypes(include="number").corr()

pl.figure(figsize=(10,10))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
pl.title("Corrélation entre les features et la variable cible")
pl.show()

Unsurprisingly, what seems to make the biggest difference is the number of games the player played (i.e. a player has a better shot at lasting 5 seasons if he played a lot of games in a season, which suggests he was a positive asset for his team).
Let's see that visually with a graph.

In [None]:
# One colour per player: blue when the target equals 1, red otherwise
colors = df['TARGET_5Yrs'].map(lambda target: 'blue' if target == 1 else 'red').tolist()

In [None]:
# Games played against field-goal percentage, one point per player.
# `colors` is blue where TARGET_5Yrs == 1 and red otherwise.
pl.scatter(df['GP'],df['FG%'],c=colors)
pl.xlabel('GP')
pl.ylabel('FG%')
pl.title('GP vs FG% (blue: TARGET_5Yrs = 1, red: otherwise)')
pl.show()

In [None]:
# extract names, labels, features names and values
names = df['Name'].values.tolist() # players names
labels = df['TARGET_5Yrs'].values # labels
features_df = df.drop(['TARGET_5Yrs','Name'],axis=1)
paramset = features_df.columns.values

# Replace NaN values (only present when no 3-point attempts have been performed
# by a player) with 0.0, cell by cell.
# The previous loop over np.argwhere(...) indexed df_vals with each [row, col]
# pair, which selects two whole ROWS (row `row` and row `col`) and zeroed them
# entirely instead of the single NaN cell.
df_vals = features_df.fillna(0.0).values

# normalize dataset
X = MinMaxScaler().fit_transform(df_vals)

#example of scoring with support vector classifier
score_classifier(X,SVC(),labels)

# TODO build a training set and choose a classifier which maximize recall score returned by the score_classifier function


Now we shall build a new classifier with the intent of getting higher than ~82% accuracy in our prediction.
I chose random forests because they are both effective and quite easy to set up and modify, while not being too demanding in resources.

In [None]:
# Repeat the evaluation over 30 different train/test splits to average out the
# randomness of the sampling.
n = 30
scores = []

for i in range(n):

    X_train, X_test, y_train, y_test = train_test_split(df_vals, labels, test_size=0.25, random_state=i)

    clf = RandomForestClassifier(random_state=i, max_depth=2)
    clf.fit(X_train, y_train)

    # Score the held-out rows against THEIR OWN labels (y_test).
    # Passing X_test together with the full `labels` array to score_classifier
    # paired each test row with the label of an unrelated player, and
    # score_classifier refits the classifier internally, which discarded the
    # fit on X_train above.
    y_pred = clf.predict(X_test)
    recall = recall_score(y_test, y_pred)

    scores.append(recall)

mean_scores = np.mean(scores)

In [None]:
# Score of every run, with the mean over all runs drawn as a dashed red line
fig, ax = pl.subplots()
ax.plot(range(n), scores)
ax.axhline(
    mean_scores,
    color='red',
    linestyle='--',
    label=f'Moyenne = {mean_scores:.2f}',
)

ax.legend()
pl.show()

##### We get a 92% accuracy with this classifier, and this result is quite robust to variance in samples.

##### Now let's see which features the random forest classifier uses to make its predictions:

In [None]:
# Importance the fitted forest assigns to each input column
importances = clf.feature_importances_

# Column positions sorted from most to least important
indices = np.argsort(importances)[::-1]
n_features = X.shape[1]

print("Feature importance :")
for rank in range(n_features):
    column = indices[rank]
    print(f"Feature {column} : {importances[column]}")

# Column names in the same (descending importance) order, used as tick labels
nom_indices = paramset[indices].tolist()

# Bar chart of the importances, most important feature first
positions = list(range(n_features))
fig, ax = pl.subplots(figsize=(15, 8))
ax.set_title("Feature importance :")
ax.bar(positions, importances[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(nom_indices)
pl.show()

##### Finally, let's pack it into a usable API so users can submit the season stats of one player and ask the model whether that player has a chance of playing for more than 5 years in the league

In [None]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded by the prediction API.
# NOTE: despite the file name, `clf` is a RandomForestClassifier.
joblib.dump(value=clf, filename='decision_tree_model.pkl')


In [None]:
from flask import Flask, request, jsonify
import joblib
import numpy as np


app = Flask(__name__)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load('decision_tree_model.pkl')

# JSON fields read from the request body, in the order they are fed to the model.
# NOTE(review): the classifier is fitted on every dataset column except 'Name'
# and 'TARGET_5Yrs', in file order. Confirm that these names, their order and
# their count match the CSV header (e.g. there is no 3-point-attempts field
# here - verify whether the dataset has one).
FEATURE_NAMES = (
    'GP',
    'MIN',
    'PTS',
    'FGM',
    'FGA',
    'FG%',
    '3PM',
    '3P%',
    'FTM',
    'FTA',
    'FT%',
    'OREB',
    'DREB',
    'REB',
    'AST',
    'STL',
    'BLK',
    'TOV',
)


#POST request the stats required
@app.route('/predict', methods=['POST'])
def predict():
    """Predict the target class for one player's season statistics.

    Expects a JSON object holding a numeric value for every name in
    FEATURE_NAMES. The values are passed to the model as given (no scaling).

    Returns
    -------
    JSON ``{'prediction': <int>}`` on success.
    HTTP 400 with an ``error`` message when the body is not a JSON object, a
    feature is missing/null, or a value is not a finite number.
    HTTP 500 when the loaded model expects a different number of features
    than FEATURE_NAMES provides.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    # A missing key would otherwise become None and produce an object-dtype
    # array that the model cannot consume.
    missing = [name for name in FEATURE_NAMES if data.get(name) is None]
    if missing:
        return jsonify({'error': 'Missing features', 'missing': missing}), 400

    try:
        values = [float(data[name]) for name in FEATURE_NAMES]
    except (TypeError, ValueError):
        return jsonify({'error': 'All features must be numeric'}), 400

    features = np.array([values], dtype=float)
    if not np.isfinite(features).all():
        return jsonify({'error': 'All features must be finite numbers'}), 400

    # Fail with an explicit message if the request schema and the fitted model
    # disagree on the number of input columns.
    expected = getattr(model, 'n_features_in_', None)
    if expected is not None and expected != features.shape[1]:
        return jsonify({
            'error': f'Model expects {expected} features but the API provides {features.shape[1]}'
        }), 500

    #make the model predict
    prediction = model.predict(features)

    #return said prediction
    return jsonify({'prediction': int(prediction[0])})