In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Load the raw shot log from the data directory (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer="../data/NBA_2024_Shots.csv")

In [18]:
# Target: the SHOT_MADE column.
y = df["SHOT_MADE"]

# Features: everything except the target and EVENT_TYPE.
# NOTE(review): EVENT_TYPE presumably restates the shot outcome and would leak
# the target — confirm against the dataset description.
X = df.drop(["SHOT_MADE", "EVENT_TYPE"], axis=1)

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# LabelEncoder stays imported in case a later cell still references it;
# this cell itself now encodes features with OrdinalEncoder.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Columns excluded from the feature set.
drop_cols = [
    'SEASON_2', 'TEAM_NAME', 'PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID',
    'GAME_DATE', 'GAME_ID', 'HOME_TEAM', 'AWAY_TEAM',
    'POSITION_GROUP', 'POSITION', 'ZONE_NAME', 'ZONE_ABB', 'EVENT_TYPE'
]

# Keep only the names that are actually present so the drop below cannot raise
# a KeyError (this also makes the cell safe to re-run).
drop_cols = [col for col in drop_cols if col in X_train.columns]

# Apply the same column removal to both splits so their schemas stay aligned.
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)


# Categorical features that are converted to integer codes for the model.
cat_cols = ['ACTION_TYPE', 'SHOT_TYPE', 'BASIC_ZONE', 'ZONE_RANGE']

# LabelEncoder is meant for target labels and its transform() raises
# ValueError when the test split contains a category never seen during fit.
# OrdinalEncoder is the feature-side equivalent: it assigns the same sorted
# integer codes to known categories and maps unseen ones to -1 instead of
# failing. It is fitted on the training split only, so nothing from the test
# split influences the encoding. (Requires scikit-learn >= 0.24.)
encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    dtype="int64",
)
train_codes = encoder.fit_transform(X_train[cat_cols])
test_codes = encoder.transform(X_test[cat_cols])

# Assign column by column so each feature is replaced by a fresh int64 column
# rather than being written into the existing object-typed one.
for position, col in enumerate(cat_cols):
    X_train[col] = train_codes[:, position]
    X_test[col] = test_codes[:, position]

print(X_train.dtypes)

SEASON_1           int64
ACTION_TYPE        int64
SHOT_TYPE          int64
BASIC_ZONE         int64
ZONE_RANGE         int64
LOC_X            float64
LOC_Y            float64
SHOT_DISTANCE      int64
QUARTER            int64
MINS_LEFT          int64
SECS_LEFT          int64
dtype: object


In [25]:
# Random forest with library-default hyperparameters; only the seed is pinned
# so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [26]:
import sys
import os

# Make the project's src/ directory importable from the notebook. The
# membership check keeps the cell idempotent: re-running it no longer appends
# the same path to sys.path again.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from model_utils import evaluate_model

# Score the fitted model on the held-out split. Judging by the recorded output,
# evaluate_model prints a confusion matrix and a classification report.
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)


Confusion Matrix:
[[16159  6861]
 [10684 10037]]

Classification Report:
              precision    recall  f1-score   support

       False       0.60      0.70      0.65     23020
        True       0.59      0.48      0.53     20721

    accuracy                           0.60     43741
   macro avg       0.60      0.59      0.59     43741
weighted avg       0.60      0.60      0.59     43741



In [27]:
# Sanity check: feature-matrix sizes, then the class balance of each split's target.
print(X_train.shape, X_test.shape)
for target in (y_train, y_test):
    print(target.value_counts())

(174960, 11) (43741, 11)
False    91942
True     83018
Name: SHOT_MADE, dtype: int64
False    23020
True     20721
Name: SHOT_MADE, dtype: int64


In [28]:
# Row labels shared by both splits; an empty set means no row appears in both.
print(set(X_train.index) & set(X_test.index))

set()


In [29]:
import pandas as pd

# Rank the features by the fitted forest's importance scores, highest first.
importances = model.feature_importances_
feat_imp = (
    pd.Series(data=importances, index=X_train.columns)
    .sort_values(ascending=False)
)
print(feat_imp)


SECS_LEFT        0.218189
LOC_X            0.209966
LOC_Y            0.206415
MINS_LEFT        0.113088
ACTION_TYPE      0.087217
SHOT_DISTANCE    0.074126
QUARTER          0.056312
BASIC_ZONE       0.020202
ZONE_RANGE       0.007852
SHOT_TYPE        0.006633
SEASON_1         0.000000
dtype: float64
