# To do:

 - Figure out how to signal injuries
 - Add shooting percentages features for PTS stats
 - Add team shooting percentages for AST stats
 - Add def team shooting percentages for REB stats
 - Add TOV stats for STL
 - Add field goal attempts and 3 point attempts features FOR pts stats

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import math         # haversine_km()
import os

import xgboost as xgb
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)
tgt_stat = "PTS"
print('Target Stat:', tgt_stat)

Today's date: 2025-12-22
Target Stat: PTS


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model):
    importance = model.get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
def create_baseline_model(df, pred_col, train_df, val_df, test_df):

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP',
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            'starter', 'bench', 'reserve'
        ]
    else:
        print('Stats Model')
        feature_cols = [
            tgt_stat,
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}',
            f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
            'DaysLstGm'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    feature_cols = [col for col in df.columns if col not in ['Date', pred_col]]

    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst, (X_train, y_train, X_val, y_val, X_test, y_test)

In [5]:
def hyperparam_tuning(splits, n_iter=20, early_stopping_rounds=50):
    """
    Hyperparameter tuning using native XGBoost API and DMatrix,
    with early stopping support (compatible with XGBoost 3.1.2)
    """

    X_train, y_train, X_val, y_val, X_test, y_test = splits

    # Convert datasets to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.01, 0.05),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(1, 8),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 2),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 2)
    }

    # Sample n_iter random parameter combinations
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k, v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_mae = float('inf')
    best_params = None
    best_bst = None

    # Manual hyperparameter search
    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')
        params.update({
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "device": "cuda",
            "seed": 42
        })
        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        # Predict on validation set to compute MAE
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        mae = mean_absolute_error(y_val, val_preds)
        print(f"Validation MAE: {mae:.4f}")
        if mae < best_mae:
            best_mae = mae
            best_params = params.copy()
            best_bst = bst

    print("\nBest validation MAE:", best_mae)
    print("Best parameters:", best_params)

    # Predict on test set using best model
    preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    print("\nTest Metrics:")
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))

    return best_bst, preds

### Create Base df

In [6]:
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
for i in [2022, 2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_schd = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_schd['Season'] = i
    df2 = pd.concat([df2, df_schd])
    
    df_gms = pd.read_csv(f"../tables/{i}/season_gamelogs.csv")
    df_gms['Season'] = i
    df3 = pd.concat([df3, df_gms])

df['Date'] = pd.to_datetime(df.Date)
df2['Date'] = pd.to_datetime(df2.Date)
df3['Date'] = pd.to_datetime(df3.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)].drop(['WrngTm', 'WrngOpp', 'Tms'], axis=1)

df3 = df3[['game_id', 'Date', 'Team', 'Player', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TOV', 'PF', '+/-']]\
        .rename(columns={"3PA": "TPA", "3P%": "TP%"})
df3 = df3[~df3[['Date', 'Team', 'Player']].duplicated(keep='last')]
df = df.merge(df3, on=['game_id', 'Date', 'Team', 'Player'])

df_mtch = df2[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Season', 'Date', 'Team', 'Team_type', 'AwayPTS', 'HomePTS', 'cup_gm', 'pstszn_gm']]
df_mtch = df_mtch.sort_values(["Team", "Date"])
df_mtch['team_game_num'] = df_mtch.groupby(["Team", "Season"]).cumcount() + 1
df_mtch['Spread'] = np.where(df_mtch.Team_type == 'Home', df_mtch.HomePTS - df_mtch.AwayPTS, df_mtch.AwayPTS - df_mtch.HomePTS)
df_mtch['Total'] = df_mtch.AwayPTS + df_mtch.HomePTS
df_mtch['is_Win'] = np.where(df_mtch.Spread > 0, 1, 0)
df_mtch['Szn_Wins'] = df_mtch.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()
position_encoder = LabelEncoder()

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df["Pos"] = position_encoder.fit_transform(df["Pos"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
print('base df created', datetime.now())

base df created 2025-12-22 12:31:32.917972


# Minutes Projection Model

In [7]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 'TOV', 'PF', '+/-',
             'Spread', 'Total', 'team_game_num', 'Szn_Wins', 'cup_gm', 'pstszn_gm']]
    
    for col in ['MP', 'TOV', 'PF', '+/-']:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )

        df[f'{col}_last_5_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=5, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

        df[f'{col}_last_10_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=10, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
        
    
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
      
    df['missed_games'] = (
        df.groupby(['Player', 'Team', 'Season'])['team_game_num']      
          .diff()
          .sub(1)
          .fillna(0)
          .astype(int)
    )

#     df["career_min_mins"] = (
#         df.assign(MP_nonzero=df["MP"].replace(0, np.nan))
#           .groupby("Player")["MP_nonzero"]
#           .cummin()
#           .shift(1)
#     )
#     df["career_max_mins"] = df.groupby("Player")["MP"].cummax().shift(1)
    
    df['blowout'] = np.where(abs(df.Spread >= 15), 1, 0)
    
#     df['Szn_Wins'] = df.groupby(['Player', 'Season', 'Team'])['Szn_Wins'].shift(1).fillna(0)
#     df['Win_Pct'] = df.Szn_Wins / df.team_game_num
    
    # Have to derive OppSzn_Wins and then add it to the df
#     df['OppSzn_Wins'] = df.groupby(['Player', 'Season', 'Opp'])['Szn_Wins'].shift(1).fillna(0)
    
    df = df.drop(['reserve_td', 'bench_td', 'starter_td', 'Szn_Wins', 'TOV', 'PF', '+/-'], axis=1)    
    
    return df

In [8]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

n = len(df_mins)
train_end = int(0.8 * n)
val_end   = int(0.9 * n)
mins_train_df = df_mins.iloc[:train_end]
mins_val_df   = df_mins.iloc[train_end:val_end]
mins_test_df  = df_mins.iloc[val_end:]

mins_model, mins_splits = create_baseline_model(df_mins, "MP", mins_train_df, mins_val_df, mins_test_df)
mins_model, mins_preds = hyperparam_tuning(mins_splits, n_iter=1)
# feature_importance(mins_model)

Unnamed: 0,Season,Date,Team,Team_type,Opp,Player,Pos,B2B,MP,Spread,Total,team_game_num,cup_gm,pstszn_gm,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,TOV_lst_gm,TOV_last_5_avg,TOV_last_10_avg,PF_lst_gm,PF_last_5_avg,PF_last_10_avg,+/-_lst_gm,+/-_last_5_avg,+/-_last_10_avg,games_last_7_days,prev_team_mins_pct,reserve,bench,starter,missed_games,blowout
0,2022,2022-10-21,0,1,21,2,3,0,0.00,10.0,206.0,2,0,0,,25.226,18.698,,0.8,0.7,,1.8,1.6,,-1.0,-1.1,3,,0,1,0,0,0
1,2022,2022-10-21,0,1,21,5,2,0,14.37,10.0,206.0,2,0,0,,24.286,26.067,,1.0,1.4,,2.2,1.9,,17.0,12.5,3,,1,0,0,0,0
2,2022,2022-10-21,0,1,21,120,0,0,31.62,10.0,206.0,2,0,0,,0.934,3.465,,0.0,0.0,,0.0,0.0,,1.8,1.2,3,,0,0,1,0,0
3,2022,2022-10-21,0,1,21,171,3,0,32.53,10.0,206.0,2,0,0,,29.318,30.218,,2.2,3.2,,2.4,2.1,,10.8,3.8,4,,0,0,1,0,0
4,2022,2022-10-21,0,1,21,178,4,0,39.62,10.0,206.0,2,0,0,,29.280,28.101,,1.4,1.0,,2.8,2.0,,-4.0,0.9,2,,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95276,2025,2025-12-20,28,1,21,711,1,0,10.63,-1.0,255.0,27,0,0,24.73,6.464,8.926,0.0,0.0,0.3,0.0,0.2,0.2,-9.0,-3.8,-2.7,4,0.103042,0,1,0,3,0
95277,2025,2025-12-20,29,0,14,89,4,0,39.43,8.0,252.0,26,0,0,29.10,33.262,31.922,2.0,2.8,1.8,0.0,2.4,2.2,-15.0,-9.6,-6.3,3,0.121250,0,0,1,0,0
95278,2025,2025-12-20,29,0,14,474,3,0,22.12,8.0,252.0,26,0,0,25.18,27.544,26.354,2.0,2.4,2.0,2.0,2.6,2.8,-19.0,-1.2,-3.3,3,0.104917,0,0,1,4,0
95279,2025,2025-12-20,29,0,14,491,3,0,39.23,8.0,252.0,26,0,0,36.08,32.572,31.845,1.0,3.0,3.0,4.0,4.0,3.5,-21.0,-8.0,-8.1,3,0.150333,0,0,1,0,0


Minutes Model
Train: 76224 / Validation: 9528 / Test: 9529
RMSE: 6.028734525410876
MAE: 4.368276925351835
R²: 0.7590828378459518

Trial 1/1: {'n_estimators': 525, 'learning_rate': 0.05087242544996178, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.8009191034232703, 'colsample_bytree': 0.9251858422222503, 'gamma': 1.224939044253881, 'reg_lambda': 2.391097315424174, 'reg_alpha': 1.091653768788486}
Validation MAE: 4.9237

Best validation MAE: 4.923689562191391
Best parameters: {'learning_rate': 0.05087242544996178, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.8009191034232703, 'colsample_bytree': 0.9251858422222503, 'gamma': 1.224939044253881, 'reg_lambda': 2.391097315424174, 'reg_alpha': 1.091653768788486, 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 6.032971683194684
MAE: 4.376883731401284
R²: 0.758744072637791


In [9]:
rmse = np.sqrt(mean_squared_error(mins_splits[5], mins_preds)) # splits[5] = y_test
mae = mean_absolute_error(mins_splits[5], mins_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'MP']], on=['Date', 'Team', 'Player'])
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', 'MP_last_5_avg']][df_yesterday.MP > 0]

df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff'] <= rmse, 1, 0)

print("\nYesterday's Results:")
print("Total Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))

df_yesterday = df_yesterday.drop(['Diff'], axis=1)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 6.032971683194684

Yesterday's Results:
Total Accuracy (InRMSE_Range): nan


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,MP_last_5_avg,InRMSE_Range


# Main Model

In [10]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [11]:
def setup_df_main(df):
    
    # Minutes based Features
    df['MP_lst_gm'] = (
        df
        .groupby(['Player', 'Season'])['MP']
        .shift(1)
    )

    df['MP_last_5_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=5, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    df['MP_last_10_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=10, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)
    
    # Efficiency metrics
    df['three_rate_raw'] =  np.where(df.FGA > 0, df['TPA'] / df['FGA'], 0)
    df['ft_rate_raw']    =  np.where(df.FGA > 0, df['FTA'] / df['FGA'], 0)
    df['eFG_raw'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS_raw'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))    
    df['usage_proxy_raw'] =  np.where(df.MP > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
    eff_cols = []
    for w in [3, 5, 10]:
        for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
            col = f"{metric}_L{w}"
            df[col] = (
                df.groupby(['Player','Season'])[f'{metric}_raw']
                  .rolling(w, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0,1], drop=True)
            )
            eff_cols.append(col)
    for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
        col = f'{metric}_weighted'
        df[col] = (
            0.6 * df[f'{metric}_L3'] +
            0.3 * df[f'{metric}_L5'] +
            0.1 * df[f'{metric}_L10']
        )
        eff_cols.append(col)
    
    df['FGA_L5_avg'] = df.groupby(['Player', 'Season'])['FGA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['TPA_L5_avg'] = df.groupby(['Player', 'Season'])['TPA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['FTA_L5_avg'] = df.groupby(['Player', 'Season'])['FTA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    
    stat_cols = []
    for col in [f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}']:
        stat_cols.append(col)

    final_cols = ['Date', 'Team', 'Team_type', 'B2B', 'cup_gm', 'pstszn_gm', 'Player', 'Pos', 'Opp', 
                  'MP', 'MP_lst_gm', 'MP_last_5_avg', 'MP_last_10_avg', 'Spread', 'Total',
                  'DaysLstGm', 'travel_km', 'travel_hours', 'PrevLocation', 'is_long_trip', 'same_arena', 
                  'FGA_L5_avg', 'TPA_L5_avg', 'FTA_L5_avg',
                  tgt_stat] + stat_cols + eff_cols
    df = df[final_cols]
    
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0)
        
    # PRA features
    if tgt_stat == 'PTS':
        pass
    elif tgt_stat == 'PRA':
        df['PRA_per_min'] = np.where(df.MP > 0, df.PRA / df['MP'], 0)
        df['PRA_last_5_per_min_avg'] = df.groupby('Player')['PRA_per_min'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0,1], drop=True)
        df = df.drop(['PRA_per_min'], axis=1)
        
    return df

In [12]:
df_main = df.copy()
df_main = setup_df_main(df_main)
display(df_main)

n = len(df)
train_end = int(0.65 * n)
val_end   = int(0.85 * n)
main_train_df = df_main.iloc[:train_end]
main_val_df   = df_main.iloc[train_end:val_end]
main_test_df  = df_main.iloc[val_end:]

stat_model, main_splits = create_baseline_model(df_main, tgt_stat, main_train_df, main_val_df, main_test_df)
stat_model, stat_preds = hyperparam_tuning(main_splits, n_iter=1)
# feature_importance(stat_model)

Unnamed: 0,Date,Team,Team_type,B2B,cup_gm,pstszn_gm,Player,Pos,Opp,MP,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,Spread,Total,DaysLstGm,travel_km,travel_hours,PrevLocation,is_long_trip,same_arena,FGA_L5_avg,TPA_L5_avg,FTA_L5_avg,PTS,Off_PTS,Off_L5_PTS,Def_PTS,Def_L5_PTS,three_rate_L3,ft_rate_L3,eFG_L3,TS_L3,usage_proxy_L3,three_rate_L5,ft_rate_L5,eFG_L5,TS_L5,usage_proxy_L5,three_rate_L10,ft_rate_L10,eFG_L10,TS_L10,usage_proxy_L10,three_rate_weighted,ft_rate_weighted,eFG_weighted,TS_weighted,usage_proxy_weighted
0,2022-10-21,0,1,0,0,0,2,3,21,0.00,0.00,25.226,18.698,10.0,206.0,0,0.0,0.0,0.0,0,0,8.4,4.0,5.2,0,0.000000,0.0,8.000000,8.0,0.508995,0.681481,0.505556,0.606562,0.419508,0.486349,0.646984,0.496984,0.583267,0.428189,0.551508,0.369325,0.479742,0.513125,0.425240,0.506452,0.639917,0.500403,0.590230,0.422685
1,2022-10-21,0,1,0,0,0,5,2,21,14.37,0.00,24.286,26.067,10.0,206.0,0,0.0,0.0,0.0,0,0,11.4,4.4,4.2,4,5.000000,5.0,18.000000,18.0,0.559259,0.829630,0.416667,0.519801,0.616138,0.508283,0.691717,0.475758,0.563433,0.552491,0.479785,0.587601,0.523390,0.606242,0.509510,0.536019,0.764053,0.445066,0.541535,0.586381
2,2022-10-21,0,1,0,0,0,120,0,21,31.62,0.00,0.934,3.465,10.0,206.0,0,0.0,0.0,0.0,0,0,0.2,0.2,0.0,8,2.000000,2.0,14.000000,14.0,0.333333,0.000000,1.500000,1.500000,0.071378,0.200000,0.000000,1.500000,1.500000,0.042827,0.300000,0.000000,0.600000,0.600000,0.088741,0.290000,0.000000,1.410000,1.410000,0.064549
3,2022-10-21,0,1,0,0,0,171,3,21,32.53,0.00,29.318,30.218,10.0,206.0,0,0.0,0.0,0.0,0,0,12.8,5.6,2.8,12,22.000000,22.0,8.000000,8.0,0.411422,0.320513,0.337558,0.394174,0.462433,0.434304,0.245641,0.446849,0.485416,0.477099,0.403221,0.308395,0.509975,0.558581,0.554476,0.417466,0.296839,0.387587,0.437987,0.476037
4,2022-10-21,0,1,0,0,0,178,4,21,39.62,0.00,29.280,28.101,10.0,206.0,0,0.0,0.0,0.0,0,0,9.4,0.0,1.0,20,20.000000,20.0,19.000000,19.0,0.000000,0.134680,0.821549,0.799477,0.334001,0.000000,0.098990,0.705051,0.694954,0.338075,0.000000,0.175991,0.721115,0.718415,0.332501,0.000000,0.128104,0.776556,0.760014,0.335073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95276,2025-12-20,28,1,0,0,0,711,1,21,10.63,24.73,6.464,8.926,-1.0,255.0,13,0.0,0.0,20.0,0,0,2.8,1.6,1.0,6,6.750000,8.0,15.588235,19.0,0.233333,0.166667,0.566667,0.576503,0.307915,0.340000,0.100000,0.877778,0.884335,0.507329,0.340000,0.116667,0.452381,0.473601,0.389357,0.276000,0.141667,0.648571,0.658562,0.375883
95277,2025-12-20,29,0,0,0,0,89,4,14,39.43,29.10,33.262,31.922,8.0,252.0,2,0.0,0.0,26.0,0,0,16.4,7.4,3.6,28,18.600000,21.2,17.235294,16.8,0.344444,0.150000,0.494444,0.507257,0.531173,0.454510,0.203725,0.557059,0.581515,0.535877,0.453431,0.138815,0.563123,0.575743,0.562953,0.388363,0.164999,0.520097,0.536383,0.535762
95278,2025-12-20,29,0,0,0,0,474,3,14,22.12,25.18,27.544,26.354,8.0,252.0,16,0.0,0.0,1.0,0,0,9.2,2.2,1.4,6,10.437500,11.0,17.600000,16.2,0.216667,0.127778,0.641667,0.635210,0.413624,0.255000,0.151667,0.505833,0.521756,0.357809,0.310357,0.147262,0.537202,0.556965,0.326939,0.237536,0.136893,0.590470,0.593349,0.388211
95279,2025-12-20,29,0,0,0,0,491,3,14,39.23,36.08,32.572,31.845,8.0,252.0,2,0.0,0.0,26.0,0,0,10.4,5.8,1.8,28,14.695652,13.0,17.600000,16.2,0.551049,0.208858,0.466084,0.483617,0.380428,0.572188,0.182458,0.601728,0.616889,0.344574,0.514665,0.170450,0.494804,0.520920,0.339360,0.553752,0.197097,0.509649,0.527329,0.365565


Stats Model
Train: 61932 / Validation: 19056 / Test: 14293
RMSE: 4.620586248085948
MAE: 3.222500801086426
R²: 0.7486312389373779

Trial 1/1: {'n_estimators': 738, 'learning_rate': 0.05700578128429378, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.768747489969833, 'colsample_bytree': 0.8192643843047207, 'gamma': 0.34556419878845346, 'reg_lambda': 3.682015605039073, 'reg_alpha': 1.532619403160661}
Validation MAE: 3.0699

Best validation MAE: 3.0699033737182617
Best parameters: {'learning_rate': 0.05700578128429378, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.768747489969833, 'colsample_bytree': 0.8192643843047207, 'gamma': 0.34556419878845346, 'reg_lambda': 3.682015605039073, 'reg_alpha': 1.532619403160661, 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 4.619746755950084
MAE: 3.229062557220459
R²: 0.7487226128578186


In [13]:
rmse = np.sqrt(mean_squared_error(main_splits[5], stat_preds)) # splits[5] = y_test
mae = mean_absolute_error(main_splits[5], stat_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)
df_gms = df_gms.rename(columns={"TRB": "REB", "3PM": "TPM", "3PA": "TPA"})
df_gms['STL_BLK'] = df_gms.STL + df_gms.BLK
df_gms['PR'] = df_gms.PTS + df_gms.REB 
df_gms['PA'] = df_gms.PTS + df_gms.AST
df_gms['RA'] = df_gms.REB + df_gms.AST
df_gms['PRA'] = df_gms.PTS + df_gms.REB + df_gms.AST

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)

df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff2'] <= rmse, 1, 0)

print("Total Accuracy (ParlayHit):", ((df_yesterday.ParlayHit == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.ParlayHit == 1).sum(), "/", df_yesterday.shape[0])

print("\nTotal Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.InRMSE_Range == 1).sum(), "/", df_yesterday.shape[0])

df_yesterday = df_yesterday.drop(['Diff', 'Diff2', 'Act_Res', 'Pred_Res'], axis=1).sort_values(f'{tgt_stat}_line', ascending=False)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 4.619746755950084
Total Accuracy (ParlayHit): nan
0 / 0

Total Accuracy (InRMSE_Range): nan
0 / 0


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,PTS_line,PTS_proj,PTS,ParlayHit,InRMSE_Range


### Today's predictions

In [14]:
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)

df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
df_pred['Date'] = pd.to_datetime(df_pred.Date)
df_pred['Season'] = 2025
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred[col].astype('category')
df_pred = df_pred.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])
df_pred[tgt_stat] = 0

# Predict minutes
df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP', 'TPM']], on=['Date', 'Team', 'Player'], how='left')
df_pred = df_pred.merge(df3[['Date', 'Team', 'Player', 'TOV', 'PF', '+/-', 'FGA', 'FG', 'TPA', 'FT', 'FTA']], on=['Date', 'Team', 'Player'], how='left')

df_pred = df_pred.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_pred_mins = setup_df_mins(con, df_pred)

df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
df_pred_mins["Opp"] = team_encoder.transform(df_pred_mins["Opp"])
df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
df_pred_mins["Pos"] = position_encoder.transform(df_pred_mins["Pos"])
DM_mins = xgb.DMatrix(df_pred_mins)
df_pred['MP'] = mins_model.predict(DM_mins)
df_pred['N_TPM'] = df_pred.FG - df_pred.TPM
df_pred['PTS'] = (df_pred.FT * 1) + (df_pred.N_TPM * 2) + (df_pred.TPM * 3)
df_pred = setup_df_main(df_pred)
feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
df_pred = df_pred[df_pred.Date == now][feature_cols]

# Predict stat
df_pred["Team"] = team_encoder.transform(df_pred["Team"])
df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
df_pred = df_pred[~(df_pred.PrevLocation.isnull())] # Filters out players who are debuting on the year
df_pred["PrevLocation"] = team_encoder.transform(df_pred["PrevLocation"])
df_pred["Player"] = player_encoder.transform(df_pred["Player"])
df_pred["Pos"] = position_encoder.transform(df_pred["Pos"])
df_pred['Team_type'] = team_type_encoder.transform(df_pred['Team_type'])
DM_stats = xgb.DMatrix(df_pred)
df_pred[f"{tgt_stat}_proj"] = stat_model.predict(DM_stats)

df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
df_pred['Opp'] = team_encoder.inverse_transform(df_pred["Opp"])
df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])
df_pred['Pos'] = position_encoder.inverse_transform(df_pred["Pos"])

df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

tds_picks = df_pred[~(df_pred[f'{tgt_stat}_line'].isnull())]\
            [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_last_5_avg', f'{tgt_stat}_line', f'{tgt_stat}_proj']]
tds_picks['Diff'] = abs((df_pred[f'{tgt_stat}_line'] - df_pred[f'{tgt_stat}_proj']))
tds_picks['Diff2'] = abs((df_pred['MP'] - df_pred['MP_last_5_avg']))
tds_picks = tds_picks[(tds_picks.Diff >= mae) & (tds_picks.Diff2 <= 5)].sort_values('Diff', ascending=False).drop(['Diff', 'Diff2'], axis=1)
display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/2025/gmday_preds_{tgt_stat}.csv")

ValueError: You are trying to merge on float64 and object columns for key 'Team'. If you wish to proceed you should use pd.concat