In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("C:/Users/poke5/Desktop/Projects/NBA_Prediction/data/Regular_Season_Total_Data.csv")

In [3]:
data['GAME_DATE'] = pd.to_datetime(data['GAME_DATE'])
data['Home'] = data['MATCHUP'].str.contains('vs\\.')

In [4]:
# Feature engineering
def compute_eff_index(df):
    df = df.copy()
    df['MissedFG'] = np.maximum(df['FGA'] - df['FGM'], 0)
    df['MissedFT'] = np.maximum(df['FTA'] - df['FTM'], 0)
    df['TSA'] = df['FGA'] + 0.44 * df['FTA']
    df['TS'] = np.where(df['TSA'] > 0, df['PTS'] / (2 * df['TSA']), np.nan)
    df['TS_Pct'] = df['TS'] * 100
    df['EffIndex'] = df['PTS'] + df['REB'] + df['AST'] + df['STL'] + df['BLK'] - (df['MissedFG'] + df['MissedFT'] + df['TOV'])
    df['FG_Eff'] = df['FGM'] / (df['FGA'] + 1)
    df['RebRatio'] = df['OREB'] / (df['OREB'] + df['DREB'] + 1)
    return df

data = compute_eff_index(data)

In [5]:
def make_rolling(series, window=10):
    return series.rolling(window=window, min_periods=1).mean().shift(1)

In [6]:
# Apply rolling features per team
data = data.sort_values(['TEAM_ABBREVIATION', 'GAME_DATE'])
# Compute rolling averages for individual features (10-game rolling, shifted to avoid leakage)
data[['r10_MissedFG', 'r10_MissedFT', 'r10_TSA', 'r10_TS_Pct', 'r10_FG_Eff', 'r10_RebRatio', 'r10_TS']] = (
    data.groupby('TEAM_ABBREVIATION', group_keys=False)
        .apply(lambda g: g.assign(
            r10_MissedFG=make_rolling(g['MissedFG']),
            r10_MissedFT=make_rolling(g['MissedFT']),
            r10_TSA=make_rolling(g['TSA']),
            r10_TS_Pct=make_rolling(g['TS_Pct']),
            r10_FG_Eff=make_rolling(g['FG_Eff']),
            r10_RebRatio=make_rolling(g['RebRatio']),
            r10_TS=make_rolling(g['TS'])
        ))[['r10_MissedFG', 'r10_MissedFT', 'r10_TSA', 'r10_TS_Pct', 'r10_FG_Eff', 'r10_RebRatio', 'r10_TS']]
)

# Drop any rows missing the new rolling features
data = data.dropna(subset=['r10_MissedFG', 'r10_MissedFT', 'r10_TSA', 'r10_TS_Pct', 'r10_FG_Eff', 'r10_RebRatio', 'r10_TS'])
data_sorted = data.sort_values('GAME_DATE').reset_index(drop=True)

  .apply(lambda g: g.assign(


In [7]:
n = len(data_sorted)
train_end = int(0.7 * n)
test_end = int(0.9 * n)

In [8]:
train_cutoff_date = data_sorted.loc[train_end, 'GAME_DATE']
test_cutoff_date = data_sorted.loc[test_end, 'GAME_DATE']

In [9]:
train_data = data_sorted[data_sorted['GAME_DATE'] <= train_cutoff_date]
test_data = data_sorted[(data_sorted['GAME_DATE'] > train_cutoff_date) & (data_sorted['GAME_DATE'] <= test_cutoff_date)]
validation_data = data_sorted[data_sorted['GAME_DATE'] > test_cutoff_date]

train_data.to_csv("C:/Users/poke5/Desktop/Projects/NBA_Prediction/data/processed/train.csv", index=False)
test_data.to_csv("C:/Users/poke5/Desktop/Projects/NBA_Prediction/data/processed/test.csv", index=False)
validation_data.to_csv("C:/Users/poke5/Desktop/Projects/NBA_Prediction/data/processed/validation.csv", index=False)