In [2]:
import pandas as pd

df = pd.read_csv('../data/NBA_2024_Shots.csv')

df.head()

Unnamed: 0,SEASON_1,SEASON_2,TEAM_ID,TEAM_NAME,PLAYER_ID,PLAYER_NAME,POSITION_GROUP,POSITION,GAME_DATE,GAME_ID,...,BASIC_ZONE,ZONE_NAME,ZONE_ABB,ZONE_RANGE,LOC_X,LOC_Y,SHOT_DISTANCE,QUARTER,MINS_LEFT,SECS_LEFT
0,2024,2023-24,1610612764,Washington Wizards,1629673,Jordan Poole,G,SG,11-03-2023,22300003,...,In The Paint (Non-RA),Center,C,8-16 ft.,-0.4,17.45,12,1,11,1
1,2024,2023-24,1610612764,Washington Wizards,1630166,Deni Avdija,F,SF,11-03-2023,22300003,...,Above the Break 3,Center,C,24+ ft.,1.5,30.55,25,1,10,26
2,2024,2023-24,1610612764,Washington Wizards,1626145,Tyus Jones,G,PG,11-03-2023,22300003,...,Restricted Area,Center,C,Less Than 8 ft.,-3.3,6.55,3,1,9,46
3,2024,2023-24,1610612764,Washington Wizards,1629673,Jordan Poole,G,SG,11-03-2023,22300003,...,Restricted Area,Center,C,Less Than 8 ft.,-1.0,5.85,1,1,8,30
4,2024,2023-24,1610612764,Washington Wizards,1626145,Tyus Jones,G,PG,11-03-2023,22300003,...,Restricted Area,Center,C,Less Than 8 ft.,-0.0,6.25,1,1,8,8


In [None]:
df.columns

df.info()

In [5]:
#Clean and prepare the data
df_clean = df.dropna()

df_clean['SHOT_MADE'] = df_clean['SHOT_MADE'].astype(int)

df_clean['TIME_LEFT_SEC'] = df_clean['MINS_LEFT'] * 60 + df_clean['SECS_LEFT']

features = [
    'SHOT_TYPE', 'ACTION_TYPE', 'BASIC_ZONE', 'ZONE_RANGE',
    'TIME_LEFT_SEC', 'POSITION'
]

target = 'SHOT_MADE'

df_model = df_clean[features + [target]]
df_model.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['SHOT_MADE'] = df_clean['SHOT_MADE'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['TIME_LEFT_SEC'] = df_clean['MINS_LEFT'] * 60 + df_clean['SECS_LEFT']


Unnamed: 0,SHOT_TYPE,ACTION_TYPE,BASIC_ZONE,ZONE_RANGE,TIME_LEFT_SEC,POSITION,SHOT_MADE
0,2PT Field Goal,Driving Floating Jump Shot,In The Paint (Non-RA),8-16 ft.,661,SG,0
1,3PT Field Goal,Jump Shot,Above the Break 3,24+ ft.,626,SF,1
2,2PT Field Goal,Driving Layup Shot,Restricted Area,Less Than 8 ft.,586,PG,1
3,2PT Field Goal,Running Finger Roll Layup Shot,Restricted Area,Less Than 8 ft.,510,SG,1
4,2PT Field Goal,Cutting Layup Shot,Restricted Area,Less Than 8 ft.,488,PG,1


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X = df_model.drop('SHOT_MADE', axis=1)
y = df_model['SHOT_MADE']

categorical_cols = X.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # keep numeric features like TIME_LEFT_SEC
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[13973  8896]
 [10205 10414]]
              precision    recall  f1-score   support

           0       0.58      0.61      0.59     22869
           1       0.54      0.51      0.52     20619

    accuracy                           0.56     43488
   macro avg       0.56      0.56      0.56     43488
weighted avg       0.56      0.56      0.56     43488

