In [218]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

## Load and Refine Dataset

In [220]:
NUMERIC_COLUMNS = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'eFG%', 'TRB', 'AST', 'STL', 'BLK', 'PTS']
POSITIONS = ['PG', 'SG', 'SF', 'PF', 'C']

df = pd.read_csv('./nba_data_processed.csv')
df = df[df['Pos'].notna()]

# Keep only the columns needed
df = df[['Player', 'Pos', 'G', 'MP'] + NUMERIC_COLUMNS]

# Normalize position values
def normalize(x):
    if ('-' in x):
        return x[0:x.find('-')]
    return x

df['Pos'] = df['Pos'].apply(normalize)

# Group up players that have played for multiple teams
df = df.groupby(['Player', 'Pos']).mean()
df = df.reset_index()

# The original columns are distributed in "value/game", so we convert them to "value/minute"
games_played = df['G']
minutes_played = df['MP'] * games_played

df = df.drop(['Player', 'G', 'MP'], axis=1)

for column in NUMERIC_COLUMNS:
    df[column] = (df[column] * games_played) / minutes_played

## Define prediction algorithms

In [221]:
RANDOM_STATE = 42

def prepare(x_cols, y_cols):
    global X, y, X_train, X_test, y_train, y_test
    
    X = df[x_cols]
    y = df[y_cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

def run_predictions():
    dt_prediction = decision_tree()
    knn_prediction = knn()
    lr_prediction = logistic_regression()

    print("Decision Tree:")
    print_results(dt_prediction)

    print("KNN:")
    print_results(knn_prediction)

    print("Logistic Regression:")
    print_results(lr_prediction)

def decision_tree():
    clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
    return predict(clf)

def knn():
    pipeline = make_pipeline(SimpleImputer(), KNeighborsClassifier(n_neighbors=5))
    return predict(pipeline)

def logistic_regression():
    pipeline = make_pipeline(SimpleImputer(), LogisticRegression(max_iter=1000))
    return predict(pipeline)

def predict(classifier):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    return y_pred

def print_results(y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    
    print(f"Accuracy: {accuracy:.3f} | Recall: {recall:.3f} | Precision: {precision:.3f} | F1-Score: {f1:.3f}\n")

## Analyzing results

In [222]:
# Testing with all columns
prepare(x_cols=['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'eFG%', 'TRB', 'AST', 'STL', 'BLK', 'PTS'], y_cols='Pos')
run_predictions()

Decision Tree:
Accuracy: 0.400 | Recall: 0.400 | Precision: 0.432 | F1-Score: 0.411

KNN:
Accuracy: 0.473 | Recall: 0.473 | Precision: 0.494 | F1-Score: 0.481

Logistic Regression:
Accuracy: 0.564 | Recall: 0.564 | Precision: 0.653 | F1-Score: 0.541



In [223]:
# Removing 'PTS'
prepare(x_cols=['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'eFG%', 'TRB', 'AST', 'STL', 'BLK'], y_cols='Pos')
run_predictions()

Decision Tree:
Accuracy: 0.400 | Recall: 0.400 | Precision: 0.420 | F1-Score: 0.406

KNN:
Accuracy: 0.500 | Recall: 0.500 | Precision: 0.522 | F1-Score: 0.503

Logistic Regression:
Accuracy: 0.564 | Recall: 0.564 | Precision: 0.679 | F1-Score: 0.538



In [224]:
# Keeping only "Game Impact" columns
prepare(x_cols=['TRB', 'AST', 'STL', 'BLK'], y_cols='Pos')
run_predictions()

Decision Tree:
Accuracy: 0.409 | Recall: 0.409 | Precision: 0.504 | F1-Score: 0.436

KNN:
Accuracy: 0.573 | Recall: 0.573 | Precision: 0.632 | F1-Score: 0.592

Logistic Regression:
Accuracy: 0.564 | Recall: 0.667 | Precision: 0.693 | F1-Score: 0.620



In [225]:
# Including '2P%' and '3P%'
prepare(x_cols=['TRB', 'AST', 'STL', 'BLK', '2P%', '3P%'], y_cols='Pos')
run_predictions()

Decision Tree:
Accuracy: 0.400 | Recall: 0.400 | Precision: 0.466 | F1-Score: 0.422

KNN:
Accuracy: 0.609 | Recall: 0.609 | Precision: 0.632 | F1-Score: 0.615

Logistic Regression:
Accuracy: 0.573 | Recall: 0.677 | Precision: 0.695 | F1-Score: 0.633



## Confusion Matrix

In [226]:
prepare(x_cols=['TRB', 'AST', 'STL', 'BLK', '2P%', '3P%'], y_cols='Pos')

dt_prediction = decision_tree()
knn_prediction = knn()
lr_prediction = logistic_regression()

cm_dt = confusion_matrix(y_test, dt_prediction)
cm_knn = confusion_matrix(y_test, knn_prediction)
cm_lr = confusion_matrix(y_test, lr_prediction)

print(f"Decision Tree: \n{cm_dt}\n")
print(f"KNN: \n{cm_knn}\n")
print(f"Logistic Regression: \n{cm_lr}")

Decision Tree: 
[[16 11  0  6  1]
 [ 4  4  0  1  2]
 [ 0  0 12  5  4]
 [ 2  7  1  2  5]
 [ 2  6  7  2 10]]

KNN: 
[[30  2  0  2  0]
 [ 5  3  1  2  0]
 [ 0  1 14  2  4]
 [ 0  8  2  6  1]
 [ 2  2  4  5 14]]

Logistic Regression: 
[[26  5  0  0  3]
 [ 3  3  0  0  5]
 [ 0  0  9  0 12]
 [ 1  3  0  0 13]
 [ 0  2  0  0 25]]
