# To do:

 - Figure out how to signal injuries
 - Add team shooting percentages for AST stats
 - Add def team shooting percentages for REB stats
 - Add TOV stats for STL

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import math         # haversine_km()
import os

import xgboost as xgb
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)
tgt_stat = "PRA"
print('Target Stat:', tgt_stat)

Today's date: 2025-12-28
Target Stat: PRA


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model):
    importance = model.get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
def create_baseline_model(df, pred_col, DFS):
    
    train_df, val_df, test_df = DFS

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            'starter', 'bench', 'reserve'
        ]
    else:
        print(f'{pred_col} Stats Model')
        feature_cols = [
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            f'{pred_col}_last_3_avg', f'{pred_col}_last_5_avg', f'{pred_col}_last_10_avg',
            f'Def_{pred_col}', f'Def_L5_{pred_col}'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst

In [42]:
def hyperparam_tuning(DFS, pred_col, n_iter=20, early_stopping_rounds=50):
    """
    Hyperparameter tuning using native XGBoost API and DMatrix,
    with early stopping support (compatible with XGBoost 3.1.2)
    """

    train_df, val_df, test_df = DFS
    feature_cols = [col for col in train_df.columns if col not in ['Date', pred_col]]
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert datasets to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.01, 0.05),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(1, 8),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 2),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 2)
    }

    # Sample n_iter random parameter combinations
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k, v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_mae = float('inf')
    best_params = None
    best_bst = None

    # Manual hyperparameter search
    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')
        params.update({
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "device": "cuda",
            "seed": 42
        })
        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        # Predict on validation set to compute MAE
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        mae = mean_absolute_error(y_val, val_preds)
        print(f"Validation MAE: {mae:.4f}")
        if mae < best_mae:
            best_mae = mae
            best_params = params.copy()
            best_bst = bst

    print("\nBest validation MAE:", best_mae)
    print("Best parameters:", best_params)

    # Predict on test set using best model
    preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    test_df[pred_col] = y_test
    test_df[f'{pred_col}_preds'] = preds
    test_df['Team'] = team_encoder.inverse_transform(test_df["Team"])
    test_df['Opp'] = team_encoder.inverse_transform(test_df["Opp"])
    test_df['Player'] = player_encoder.inverse_transform(test_df["Player"])
    test_df['Pos'] = position_encoder.inverse_transform(test_df["Pos"])
    analyze_df = test_df[['Date', 'Team', 'Player', 'Pos', 'Opp', pred_col, f'{pred_col}_preds']]
    print("\nTest Metrics:")
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))

    return best_bst, preds, y_test, analyze_df

### Create Base df

In [51]:
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
for i in [2022, 2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_schd = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_schd['Season'] = i
    df2 = pd.concat([df2, df_schd])
    
    df_gms = pd.read_csv(f"../tables/{i}/season_gamelogs.csv")
    df_gms['Season'] = i
    df3 = pd.concat([df3, df_gms])
    
    df_inj = pd.read_csv(f"../tables/{i}/injuries.csv")
    df_inj['Season'] = i
    df4 = pd.concat([df4, df_inj])

df['Date'] = pd.to_datetime(df.Date)
df2['Date'] = pd.to_datetime(df2.Date)
df3['Date'] = pd.to_datetime(df3.Date)
df3 = df3[~df3[['Date', 'Team', 'Player']].duplicated(keep='last')]
df4['Date'] = pd.to_datetime(df4.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)].drop(['WrngTm', 'WrngOpp', 'Tms'], axis=1)

df3_temp = df3[['game_id', 'Date', 'Team', 'Player', 'Active', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TOV', 'PF', '+/-']]\
        .rename(columns={"3PA": "TPA", "3P%": "TP%"})
df = df.merge(df3_temp, on=['game_id', 'Date', 'Team', 'Player'])

df_mtch = df2[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Season', 'Date', 'Team', 'Team_type', 'AwayPTS', 'HomePTS', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch = df_mtch.sort_values(["Team", "Date"])
df_mtch['team_game_num'] = df_mtch.groupby(["Team", "Season"]).cumcount() + 1
df_mtch['Spread'] = np.where(df_mtch.Team_type == 'Home', df_mtch.HomePTS - df_mtch.AwayPTS, df_mtch.AwayPTS - df_mtch.HomePTS)
df_mtch['Total'] = df_mtch.AwayPTS + df_mtch.HomePTS
df_mtch['is_Win'] = np.where(df_mtch.Spread > 0, 1, 0)
df_mtch['Szn_Wins'] = df_mtch.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])

df = df.merge(df4[['Date', 'Team', 'Player', 'Status']], on=['Date', 'Team', 'Player'], how='left')
df['Status'] = np.where((df.Active == 1), 'Available', df.Status)
df['Status'] = np.where((df.Active == 0), 'Out', df.Status)
df['Status'] = np.where((df.Status == 'Out') & (df.Active != 0), 'Available', df.Status)

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()
position_encoder = LabelEncoder()

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df["Pos"] = position_encoder.fit_transform(df["Pos"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
df = df[(df.Active == 1) & (df.MP > 0)].sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
print('base df created', datetime.now())

base df created 2025-12-28 23:03:41.241675


# Minutes Projection Model

In [44]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [45]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 'TOV', 'PF', '+/-',
             'Spread', 'Total', 'team_game_num', 'Szn_Wins', 'cup_gm', 'pstszn_gm', 'is_OT']]
    
    for col in ['MP', 'TOV', 'PF', '+/-']:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )
        for N in [3, 5, 10]:
            df[f'{col}_last_{N}_avg'] = (
                df.groupby(['Player', 'Season'])[col]
                  .rolling(window=N, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0, 1], drop=True)
            )
            df[f"{col}_last_{N}_std"] = (
                df.groupby(['Player', 'Season'])[col]
                  .shift(1)
                  .rolling(window=N, min_periods=1)
                  .std()
            )
        df[f"{col}_change_L1"] = df[f"{col}_lst_gm"] - df[f"{col}_last_5_avg"]
        df[f"{col}_change_L3"] = df[f"{col}_last_3_avg"] - df[f"{col}_last_10_avg"]
        df[f"{col}_pct_change"] = (
            (df[f"{col}_lst_gm"] - df[f"{col}_last_10_avg"]) /
            (df[f"{col}_last_10_avg"] + 1e-6)
        )
    df["MP_spike"] = (df["MP_lst_gm"] > df["MP_last_10_avg"] + 8).astype(int)
    df["MP_drop"]  = (df["MP_lst_gm"] < df["MP_last_10_avg"] - 8).astype(int)
    df["MP_trend"] = df["MP_last_3_avg"] - df["MP_last_10_avg"]

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
           
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
      
    df['missed_games'] = (
        df.groupby(['Player', 'Team', 'Season'])['team_game_num']      
          .diff()
          .sub(1)
          .fillna(0)
          .astype(int)
    )
    
    df['blowout'] = np.where(abs(df.Spread >= 15), 1, 0)
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)
    
    df = df.drop(['reserve_td', 'bench_td', 'starter_td', 'Szn_Wins', 'TOV', 'PF', '+/-', 
                  'PrevOpp', 'PrevLocation', 'Location'], axis=1)    
    
    return df

In [46]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

n = len(df_mins)
train_end = int(0.8 * n)
val_end   = int(0.9 * n)
mins_train_df = df_mins.iloc[:train_end]
mins_val_df   = df_mins.iloc[train_end:val_end]
mins_test_df  = df_mins.iloc[val_end:]
mins_DFS = (mins_train_df, mins_val_df, mins_test_df)

mins_model = create_baseline_model(df_mins, "MP", mins_DFS)
mins_model, mins_preds, y_test_mins, analyze_df_mins = hyperparam_tuning(mins_DFS, "MP", n_iter=1)
# feature_importance(mins_model)

Unnamed: 0,Season,Date,Team,Team_type,Opp,Player,Pos,B2B,MP,Spread,Total,team_game_num,cup_gm,pstszn_gm,is_OT,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,MP_change_L1,MP_change_L3,MP_pct_change,TOV_lst_gm,TOV_last_3_avg,TOV_last_3_std,TOV_last_5_avg,TOV_last_5_std,TOV_last_10_avg,TOV_last_10_std,TOV_change_L1,TOV_change_L3,TOV_pct_change,PF_lst_gm,PF_last_3_avg,PF_last_3_std,PF_last_5_avg,PF_last_5_std,PF_last_10_avg,PF_last_10_std,PF_change_L1,PF_change_L3,PF_pct_change,+/-_lst_gm,+/-_last_3_avg,+/-_last_3_std,+/-_last_5_avg,+/-_last_5_std,+/-_last_10_avg,+/-_last_10_std,+/-_change_L1,+/-_change_L3,+/-_pct_change,MP_spike,MP_drop,MP_trend,games_last_7_days,prev_team_mins_pct,reserve,bench,starter,missed_games,blowout,DaysLstGm,travel_km,travel_hours,is_long_trip,same_arena
0,2021,2021-10-22,1,1,27,7,3,0,3.98,-32.0,198.0,2,0,0,0,,24.423333,,24.570,,23.240,,,1.183333,,,1.666667,,1.2,,1.1,,,0.566667,,,1.333333,,1.6,,1.8,,,-0.466667,,,4.333333,,3.6,,7.8,,,-3.466667,,0,0,1.183333,3,,1,0,0,0,0,0,0.0,0.0,0,0
1,2021,2021-10-22,1,1,27,219,2,0,22.72,-32.0,198.0,2,0,0,0,,39.463333,,38.628,,36.743,,,2.720333,,,5.000000,,4.8,,4.3,,,0.700000,,,3.000000,,2.6,,2.9,,,0.100000,,,-4.333333,,-2.4,,-1.4,,,-2.933333,,0,0,2.720333,3,,0,0,1,0,0,0,0.0,0.0,0,0
2,2021,2021-10-22,1,1,27,276,0,0,5.28,-32.0,198.0,2,0,0,0,,15.806667,,11.134,,7.479,,,8.327667,,,0.666667,,0.4,,0.2,,,0.466667,,,0.666667,,0.4,,0.4,,,0.266667,,,-6.000000,,-4.8,,-3.4,,,-2.600000,,0,0,8.327667,2,,0,1,0,0,0,0,0.0,0.0,0,0
3,2021,2021-10-22,1,1,27,315,1,0,10.38,-32.0,198.0,2,0,0,0,,9.460000,,13.036,,13.560,,,-4.100000,,,0.666667,,0.6,,0.6,,,0.066667,,,1.333333,,1.4,,1.7,,,-0.366667,,,-10.333333,,-10.8,,-4.1,,,-6.233333,,0,0,-4.100000,1,,0,0,1,0,0,0,0.0,0.0,0,0
4,2021,2021-10-22,1,1,27,429,3,0,28.40,-32.0,198.0,2,0,0,0,,15.055000,,15.055,,15.055,,,0.000000,,,1.500000,,1.5,,1.5,,,0.000000,,,1.000000,,1.0,,1.0,,,0.000000,,,-8.000000,,-8.0,,-8.0,,,0.000000,,0,0,0.000000,2,,0,0,1,0,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107186,2025,2025-12-26,29,1,27,68,4,0,25.97,21.0,255.0,29,0,0,0,22.57,26.680000,11.049689,28.976,9.501056,26.939,7.589945,-6.406,-0.259000,-0.162181,3.0,2.333333,1.732051,2.4,1.516575,1.7,1.370320,0.6,0.633333,0.764705,2.0,1.666667,2.000000,2.8,1.673320,2.6,1.449138,-0.8,-0.933333,-0.230769,-18.0,-13.000000,7.094599,-5.4,8.792042,-5.6,7.549099,-12.6,-7.400000,2.214286,0,0,-0.259000,1,0.094042,0,0,1,3,1,8,0.0,0.0,0,0
107187,2025,2025-12-26,29,1,27,104,4,0,30.78,21.0,255.0,29,0,0,0,34.17,34.233333,9.972002,34.696,9.443306,32.890,7.671929,-0.526,1.343333,0.038918,2.0,1.333333,1.527525,1.2,1.516575,1.7,1.316561,0.8,-0.366667,0.176470,3.0,2.000000,1.527525,2.2,1.516575,2.2,1.398412,0.8,-0.200000,0.363636,-2.0,-0.666667,10.969655,1.4,8.105554,-6.2,8.974656,-3.4,5.533333,-0.677419,0,0,1.343333,3,0.142375,0,0,1,1,1,5,0.0,0.0,0,0
107188,2025,2025-12-26,29,1,27,564,3,0,19.33,21.0,255.0,29,0,0,0,22.12,26.240000,6.830874,27.152,9.137200,26.376,7.455239,-5.032,-0.136000,-0.161359,1.0,1.666667,1.000000,2.2,1.303840,1.9,1.333333,-1.2,-0.233333,-0.473684,2.0,2.333333,0.577350,2.8,1.483240,2.7,1.418136,-0.8,-0.366667,-0.259259,0.0,-7.333333,9.865766,0.2,9.964939,-1.7,10.163114,-0.2,-5.633333,-1.000001,0,0,-0.136000,1,0.092167,0,0,1,2,1,6,0.0,0.0,0,0
107189,2025,2025-12-26,29,1,27,583,3,0,28.47,21.0,255.0,29,0,0,0,33.27,36.193333,6.712364,34.158,8.371425,32.618,7.416872,-0.888,3.575333,0.019989,2.0,2.333333,0.577350,2.4,1.140175,2.7,1.286684,-0.4,-0.366667,-0.259259,5.0,4.666667,1.527525,4.6,1.816590,4.0,1.563472,0.4,0.666667,0.250000,-14.0,-7.333333,7.571878,-2.2,10.039920,-6.0,9.843215,-11.8,-1.333333,1.333334,0,0,3.575333,3,0.138625,0,0,1,1,1,5,0.0,0.0,0,0


Minutes Model
Train: 85752 / Validation: 10719 / Test: 10720
RMSE: 6.530190855150937
MAE: 4.955414635999878
R²: 0.6287075720455628

Trial 1/1: {'n_estimators': 1463, 'learning_rate': np.float64(0.03991729738916267), 'max_depth': 3, 'min_child_weight': 6, 'subsample': np.float64(0.7437685799309767), 'colsample_bytree': np.float64(0.8007706056844097), 'gamma': np.float64(0.19594266464297405), 'reg_lambda': np.float64(3.575020574642522), 'reg_alpha': np.float64(0.027558284055451576)}
Validation MAE: 4.6135

Best validation MAE: 4.613475174094683
Best parameters: {'learning_rate': np.float64(0.03991729738916267), 'max_depth': 3, 'min_child_weight': 6, 'subsample': np.float64(0.7437685799309767), 'colsample_bytree': np.float64(0.8007706056844097), 'gamma': np.float64(0.19594266464297405), 'reg_lambda': np.float64(3.575020574642522), 'reg_alpha': np.float64(0.027558284055451576), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 6.1062

In [47]:
rmse = np.sqrt(mean_squared_error(y_test_mins, mins_preds)) # splits[5] = y_test
mae = mean_absolute_error(y_test_mins, mins_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'MP']], on=['Date', 'Team', 'Player'])
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', 'MP_last_5_avg']][df_yesterday.MP > 0]

df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff'] <= rmse, 1, 0)

print("\nYesterday's Results:")
print("Total Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.InRMSE_Range == 1).sum(), '/', df_yesterday.shape[0])

df_yesterday = df_yesterday.drop(['Diff'], axis=1)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 6.106293561782478

Yesterday's Results:
Total Accuracy (InRMSE_Range): 0.8529411764705882
29 / 34


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,MP_last_5_avg,InRMSE_Range
0,2025-12-27,DEN,Nikola Jokic,C,ORL,37.244587,38.02,36.226789,1
1,2025-12-27,NYK,Karl-Anthony Towns,C,ATL,29.840424,29.8,33.430463,1
2,2025-12-27,CLE,Donovan Mitchell,SG,HOU,31.292988,30.45,35.315555,1
3,2025-12-27,ATL,Jalen Johnson,SF,NYK,34.768074,37.07,32.978499,1
4,2025-12-27,DAL,Cooper Flagg,PG,SAC,32.576683,34.22,37.789293,1
5,2025-12-27,MIN,Anthony Edwards,SG,BRK,37.231415,34.7,37.179587,1
6,2025-12-27,DAL,P.J. Washington,PF,SAC,25.979902,33.82,32.823985,0
7,2025-12-27,ATL,Trae Young,PG,NYK,28.952557,31.38,28.294092,1
8,2025-12-27,BRK,Michael Porter Jr.,SF,MIN,28.458183,33.07,32.378516,1
9,2025-12-27,NYK,Jalen Brunson,PG,ATL,36.708183,37.67,36.814547,1


# Main Model

In [52]:
def setup_df_main(df):
    
    # Stat dependent features 
    if tgt_stat == 'PTS':
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]
        # Efficiency metrics
        df['three_rate_raw'] =  np.where(df.FGA > 0, df['TPA'] / df['FGA'], 0)
        df['ft_rate_raw']    =  np.where(df.FGA > 0, df['FTA'] / df['FGA'], 0)
        df['eFG_raw'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
        df['TS_raw'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))    
        df['usage_proxy_raw'] =  np.where(df.MP > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
        
        for w in [3, 5, 10]:
            for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
                col = f"{metric}_L{w}"
                df[col] = (
                    df.groupby(['Player','Season'])[f'{metric}_raw']
                      .rolling(w, min_periods=1)
                      .mean()
                      .shift(1)
                      .reset_index(level=[0,1], drop=True)
                )
        for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
            col = f'{metric}_weighted'
            df[col] = (
                0.6 * df[f'{metric}_L3'] +
                0.3 * df[f'{metric}_L5'] +
                0.1 * df[f'{metric}_L10']
            )
            df = df.drop(f'{metric}_raw', axis=1)
        tgt_stat_cols = ['PTS']
        
    elif tgt_stat == 'PRA':
        tgt_stat_cols = ['PTS', 'REB', 'AST']
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]
        
    else:
        tgt_stat_cols = []
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]

    
    # Create rolling + lag features    
    for col in ['MP', 'FGA', 'TPA', 'FTA', tgt_stat] + tgt_stat_cols:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )
        for N in [3, 5, 10]:
            df[f'{col}_last_{N}_avg'] = (
                df.groupby(['Player', 'Season'])[col]
                  .rolling(window=N, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0, 1], drop=True)
            )
            df[f"{col}_last_{N}_std"] = (
                df.groupby(['Player', 'Season'])[col]
                  .shift(1)
                  .rolling(window=N, min_periods=1)
                  .std()
            )

    # Role identifiers features
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
    
    for col in categories + ['FG', 'FGA', 'FT', 'FTA', 'TPM', 'TPA', 'reserve_td', 'bench_td', 'starter_td'] + tgt_stat_cols:
        if col == tgt_stat:
            continue
        if col in df.columns:
            df = df.drop(col, axis=1)
        
    return df

In [53]:
df_main = df.copy()
df_main = setup_df_main(df_main)
display(df_main)

n = len(df_main)
train_end = int(0.65 * n)
val_end   = int(0.85 * n)
main_train_df = df_main.iloc[:train_end]
main_val_df   = df_main.iloc[train_end:val_end]
main_test_df  = df_main.iloc[val_end:]
main_DFS = (main_train_df, main_val_df, main_test_df)

# stat_model = create_baseline_model(df_main, tgt_stat, main_DFS)
stat_model, stat_preds, y_test_stat, analyze_df_stat = hyperparam_tuning(main_DFS, tgt_stat, n_iter=1)
# feature_importance(stat_model)

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,B2B,MP,PRA,Def_PRA,Def_L5_PRA,Spread,Total,cup_gm,pstszn_gm,is_OT,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,FGA_lst_gm,FGA_last_3_avg,FGA_last_3_std,FGA_last_5_avg,FGA_last_5_std,FGA_last_10_avg,FGA_last_10_std,TPA_lst_gm,TPA_last_3_avg,TPA_last_3_std,TPA_last_5_avg,TPA_last_5_std,TPA_last_10_avg,TPA_last_10_std,FTA_lst_gm,FTA_last_3_avg,FTA_last_3_std,FTA_last_5_avg,FTA_last_5_std,FTA_last_10_avg,FTA_last_10_std,PRA_lst_gm,PRA_last_3_avg,PRA_last_3_std,PRA_last_5_avg,PRA_last_5_std,PRA_last_10_avg,PRA_last_10_std,PTS_lst_gm,PTS_last_3_avg,PTS_last_3_std,PTS_last_5_avg,PTS_last_5_std,PTS_last_10_avg,PTS_last_10_std,REB_lst_gm,REB_last_3_avg,REB_last_3_std,REB_last_5_avg,REB_last_5_std,REB_last_10_avg,REB_last_10_std,AST_lst_gm,AST_last_3_avg,AST_last_3_std,AST_last_5_avg,AST_last_5_std,AST_last_10_avg,AST_last_10_std,reserve,bench,starter
0,2022,2022-10-21,0,21,5,2,0,14.37,7,29.00000,29.0,10.0,206.0,0,0,0,,20.166667,,24.286,,26.067000,,,11.333333,,11.4,,11.2,,,4.000000,,4.4,,4.400000,,,3.333333,,4.2,,4.500000,,,23.666667,,24.2,,24.300000,,,16.000000,,16.6,,16.800000,,,7.333333,,6.8,,6.400000,,,0.333333,,0.8,,1.1,,0,1,0
1,2022,2022-10-21,0,21,120,0,0,31.62,15,23.50000,23.5,10.0,206.0,0,0,0,,4.266667,,7.614,,19.136000,,,1.333333,,2.2,,4.0,,,1.333333,,2.0,,2.900000,,,0.000000,,0.0,,0.300000,,,1.333333,,3.0,,10.800000,,,1.000000,,1.6,,5.100000,,,0.000000,,0.8,,4.200000,,,0.333333,,0.6,,1.5,,0,0,1
2,2022,2022-10-21,0,21,171,3,0,32.53,19,17.00000,17.0,10.0,206.0,0,0,0,,26.603333,,29.612,,30.320000,,,12.666667,,13.2,,15.2,,,4.666667,,5.4,,6.200000,,,2.333333,,3.4,,4.600000,,,24.333333,,25.6,,29.700000,,,13.666667,,15.8,,19.800000,,,5.000000,,3.8,,4.100000,,,5.666667,,6.0,,5.8,,0,0,1
3,2022,2022-10-21,0,21,178,4,0,39.62,38,26.00000,26.0,10.0,206.0,0,0,0,,30.693333,,30.206,,28.729000,,,8.666667,,8.4,,8.1,,,0.000000,,0.0,,0.000000,,,1.333333,,0.8,,1.500000,,,21.666667,,21.8,,22.300000,,,13.666667,,12.2,,13.100000,,,7.666667,,9.0,,8.500000,,,0.333333,,0.6,,0.7,,0,0,1
4,2022,2022-10-21,0,21,321,3,0,11.77,5,17.00000,17.0,10.0,206.0,0,0,0,,23.016667,,22.238,,17.904286,,,6.333333,,6.8,,5.0,,,2.666667,,4.0,,2.857143,,,1.666667,,2.2,,1.857143,,,12.666667,,13.6,,10.571429,,,8.333333,,9.4,,7.285714,,,1.666667,,1.6,,1.285714,,,2.666667,,2.6,,2.0,,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80232,2025,2025-12-26,29,27,59,4,0,25.97,30,23.80000,24.8,21.0,255.0,0,0,0,22.57,26.680000,11.049689,28.976,9.501056,26.939000,7.589945,7.0,7.000000,7.234178,7.2,5.310367,6.5,4.779586,4.0,3.666667,2.645751,3.8,2.588436,3.100000,2.131770,2.0,1.000000,2.516611,2.2,2.167948,2.600000,2.054805,9.0,10.666667,16.196707,13.0,15.368800,13.100000,12.613925,3.0,4.000000,13.076697,6.4,9.964939,6.800000,8.350649,4.0,4.666667,3.511885,4.2,4.898979,3.500000,4.022161,2.0,2.000000,1.000000,2.4,3.114482,2.8,3.405877,0,0,1
80233,2025,2025-12-26,29,27,89,4,0,30.78,28,23.80000,24.8,21.0,255.0,0,0,0,34.17,34.233333,9.972002,34.696,9.443306,32.890000,7.671929,15.0,14.333333,4.358899,16.2,5.357238,16.1,4.831609,5.0,7.333333,0.577350,6.8,2.073644,7.700000,2.043961,2.0,2.333333,1.154701,3.2,1.949359,2.800000,1.932184,22.0,26.333333,8.144528,28.4,13.773162,26.800000,12.149074,15.0,18.000000,6.244998,19.8,9.884331,19.800000,8.393119,3.0,4.000000,1.527525,4.4,3.114482,3.200000,3.835507,4.0,4.333333,2.000000,4.2,2.774887,3.8,3.299832,0,0,1
80234,2025,2025-12-26,29,27,473,3,0,19.33,13,27.62069,24.4,21.0,255.0,0,0,0,22.12,26.240000,6.830874,27.152,9.137200,26.376000,7.455239,9.0,10.333333,4.163332,9.8,5.540758,8.8,4.289522,3.0,3.000000,1.000000,2.2,2.280351,2.500000,2.043961,4.0,2.666667,1.154701,2.2,1.949359,1.600000,2.011080,13.0,20.333333,6.658328,22.4,11.886968,18.500000,11.830281,6.0,12.333333,6.244998,11.8,9.813256,10.100000,8.230026,3.0,3.333333,0.577350,4.6,2.588436,4.100000,3.910101,4.0,4.666667,1.154701,6.0,1.788854,4.3,3.134042,0,0,1
80235,2025,2025-12-26,29,27,490,3,0,28.47,32,27.62069,24.4,21.0,255.0,0,0,0,33.27,36.193333,6.712364,34.158,8.371425,32.618000,7.416872,13.0,14.000000,3.055050,12.6,3.435113,10.9,4.301163,7.0,6.000000,2.000000,6.4,1.483240,5.700000,2.170509,3.0,3.333333,1.000000,3.0,1.483240,2.100000,1.828782,25.0,30.333333,6.244998,27.4,7.949843,23.700000,11.671428,14.0,17.666667,4.932883,15.4,5.357238,13.200000,8.055364,4.0,5.666667,0.577350,5.4,1.224745,5.000000,3.915780,7.0,7.000000,1.732051,6.6,2.607681,5.5,3.197221,0,0,1



Trial 1/1: {'n_estimators': 596, 'learning_rate': np.float64(0.03044645156904415), 'max_depth': 3, 'min_child_weight': 4, 'subsample': np.float64(0.8734391207022232), 'colsample_bytree': np.float64(0.9423618545450435), 'gamma': np.float64(1.0861822604748617), 'reg_lambda': np.float64(4.0612047148317085), 'reg_alpha': np.float64(1.267956018002808)}
Validation MAE: 4.2475

Best validation MAE: 4.247480869293213
Best parameters: {'learning_rate': np.float64(0.03044645156904415), 'max_depth': 3, 'min_child_weight': 4, 'subsample': np.float64(0.8734391207022232), 'colsample_bytree': np.float64(0.9423618545450435), 'gamma': np.float64(1.0861822604748617), 'reg_lambda': np.float64(4.0612047148317085), 'reg_alpha': np.float64(1.267956018002808), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 5.837077038490368
MAE: 4.39534330368042
R²: 0.784623384475708


In [54]:
analyze_df_stat = main_test_df.drop([tgt_stat, f'{tgt_stat}_preds'], axis=1)\
                .merge(analyze_df_stat[['Date', 'Team', 'Player', tgt_stat, f'{tgt_stat}_preds']], on=['Date', 'Team', 'Player'])
analyze_df_stat['Diff'] = analyze_df_stat[tgt_stat] - analyze_df_stat[f'{tgt_stat}_preds']
analyze_df_stat[analyze_df_stat.MP > 38].sort_values('Diff', ascending=True).head(15)

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,B2B,MP,Def_PRA,Def_L5_PRA,Spread,Total,cup_gm,pstszn_gm,is_OT,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,FGA_lst_gm,FGA_last_3_avg,FGA_last_3_std,FGA_last_5_avg,FGA_last_5_std,FGA_last_10_avg,FGA_last_10_std,TPA_lst_gm,TPA_last_3_avg,TPA_last_3_std,TPA_last_5_avg,TPA_last_5_std,TPA_last_10_avg,TPA_last_10_std,FTA_lst_gm,FTA_last_3_avg,FTA_last_3_std,FTA_last_5_avg,FTA_last_5_std,FTA_last_10_avg,FTA_last_10_std,PRA_lst_gm,PRA_last_3_avg,PRA_last_3_std,PRA_last_5_avg,PRA_last_5_std,PRA_last_10_avg,PRA_last_10_std,PTS_lst_gm,PTS_last_3_avg,PTS_last_3_std,PTS_last_5_avg,PTS_last_5_std,PTS_last_10_avg,PTS_last_10_std,REB_lst_gm,REB_last_3_avg,REB_last_3_std,REB_last_5_avg,REB_last_5_std,REB_last_10_avg,REB_last_10_std,AST_lst_gm,AST_last_3_avg,AST_last_3_std,AST_last_5_avg,AST_last_5_std,AST_last_10_avg,AST_last_10_std,reserve,bench,starter,PRA,PRA_preds,Diff
69,2024,2025-04-01,DEN,MIN,Peyton Watson,SF,0,45.32,19.5,18.2,-1.0,279.0,0,0,2,27.22,33.53,5.419339,29.712,10.309997,26.426,8.115482,9.0,9.666667,8.020806,9.6,6.503845,7.6,4.972145,2.0,2.0,2.081666,2.4,1.923538,1.9,2.424413,0.0,2.666667,6.928203,2.2,5.272571,2.5,3.835507,17.0,22.666667,20.420578,20.4,16.8523,15.5,11.392005,8.0,14.666667,13.868429,13.0,10.521407,9.6,7.805981,5.0,6.0,5.507571,5.6,5.338539,4.2,3.949684,4.0,2.0,1.154701,1.8,2.167948,1.7,1.699673,0,1,0,10,31.934019,-21.934019
10641,2025,2025-12-07,LAL,PHI,Austin Reaves,SG,0,39.13,21.047619,12.6,4.0,220.0,0,0,0,33.32,35.83,7.681154,37.688,7.805798,37.192,6.604033,18.0,17.0,8.717798,16.2,6.870226,15.8,5.035982,8.0,8.666667,4.163332,8.2,3.63318,7.2,2.635231,17.0,11.0,9.291573,10.8,7.224957,9.7,5.600099,47.0,43.0,24.33105,44.8,17.564168,41.6,13.106402,36.0,32.0,19.731531,33.4,14.142136,30.2,10.405661,3.0,4.0,0.0,5.0,2.607681,5.9,2.108185,8.0,7.0,4.618802,6.4,3.361547,5.5,2.472066,0,0,1,19,40.650291,-21.650291
2934,2024,2025-04-29,IND,MIL,Pascal Siakam,PF,0,41.2,21.691589,24.6,1.0,237.0,0,1,1,29.4,31.586667,8.857998,32.142,7.805467,31.082,8.068112,10.0,15.0,3.0,14.0,3.04959,14.6,4.268749,2.0,3.666667,1.0,3.6,1.140175,3.6,3.212822,2.0,3.666667,2.516611,3.2,2.073644,3.1,2.162817,19.0,30.666667,8.326664,29.2,7.231874,27.8,7.015063,12.0,21.333333,6.082763,19.8,5.357238,18.2,5.652925,3.0,6.333333,2.0,6.8,1.67332,6.7,1.988858,4.0,3.0,1.527525,2.6,1.643168,2.9,1.433721,0,0,1,15,36.574711,-21.574711
11892,2025,2025-12-25,DEN,MIN,Peyton Watson,SF,0,42.22,,,4.0,280.0,0,0,1,27.85,18.84,6.957032,23.038,5.162843,28.033,7.933442,9.0,7.666667,2.0,8.6,3.435113,10.2,6.398785,4.0,3.0,1.732051,3.0,4.764452,4.0,3.977716,4.0,4.0,1.154701,3.4,2.280351,3.0,4.157991,25.0,18.0,8.962886,18.0,8.734987,20.9,12.976903,20.0,15.333333,4.163332,13.0,6.024948,14.4,9.527154,3.0,1.666667,5.291503,3.8,4.219005,4.7,4.571652,2.0,1.0,6.658328,1.2,5.07937,1.8,3.977716,0,0,1,13,33.275703,-20.275703
3750,2024,2025-05-26,MIN,OKC,Anthony Edwards,SG,0,40.93,19.727723,28.8,-2.0,254.0,0,1,0,29.7,35.446667,3.691021,36.154,7.683709,38.754,13.967116,17.0,18.666667,7.023769,18.4,6.797058,19.7,8.962267,8.0,8.333333,4.163332,9.0,3.03315,8.9,3.984693,2.0,6.333333,1.154701,4.8,1.414214,7.0,2.110819,45.0,40.666667,15.69501,40.4,14.202113,41.1,17.088007,30.0,26.666667,12.055428,26.4,10.382678,26.9,12.547244,9.0,9.0,2.309401,7.6,3.34664,8.5,3.359894,6.0,5.0,3.21455,6.4,2.280351,5.7,3.560587,0,0,1,26,45.571358,-19.571358
7317,2025,2025-11-14,DAL,LAC,Cooper Flagg,PG,0,47.65,27.230769,27.8,-6.0,260.0,1,0,2,33.7,34.056667,15.287004,32.894,11.035132,32.702,11.630775,15.0,15.0,7.571878,14.8,5.761944,13.2,7.14065,7.0,4.666667,3.05505,4.6,3.03315,4.3,3.665151,4.0,4.666667,2.081666,3.8,1.788854,3.4,2.002776,28.0,30.666667,14.798649,28.6,12.095454,25.3,12.073847,16.0,18.0,9.539392,17.2,7.582875,15.3,7.456541,6.0,7.333333,2.645751,7.4,2.345208,6.6,3.835507,6.0,5.333333,3.0,4.0,3.130495,3.4,3.05505,0,0,1,21,39.899536,-18.899536
1960,2024,2025-04-13,LAC,GSW,Norman Powell,SG,0,43.87,20.329412,17.6,5.0,243.0,0,0,1,35.97,35.433333,10.191011,33.386,8.20071,32.685,12.276044,11.0,14.333333,7.637626,13.0,5.683309,13.6,5.51362,3.0,5.0,0.0,5.0,2.12132,5.8,2.270585,4.0,4.0,3.05505,3.2,3.04959,3.0,2.875181,19.0,20.666667,16.093477,18.4,15.132746,21.1,13.341664,16.0,16.666667,12.0,15.0,9.016651,16.9,8.14862,2.0,2.666667,2.886751,2.0,4.505552,2.2,4.376706,1.0,1.333333,1.527525,1.4,4.207137,2.0,3.047768,0,0,1,13,30.864157,-17.864157
2838,2024,2025-04-27,NYK,DET,OG Anunoby,PF,0,39.05,24.0,25.0,1.0,187.0,0,1,0,42.82,43.113333,17.820158,41.09,17.731586,38.1,16.918771,17.0,15.0,7.81025,15.2,7.231874,15.9,7.572611,8.0,6.666667,4.163332,6.8,3.63318,6.8,3.583915,6.0,4.0,3.464102,4.0,2.683282,4.2,4.45845,24.0,23.666667,11.015141,23.2,13.311649,29.3,16.983979,22.0,18.333333,11.269428,16.6,10.667708,21.7,12.355835,2.0,4.666667,1.527525,5.2,2.774887,5.4,3.900142,0.0,0.666667,0.0,1.4,1.341641,2.2,3.705851,0,0,1,10,27.190142,-17.190142
2439,2024,2025-04-21,NYK,DET,OG Anunoby,PF,0,42.42,24.063636,24.8,-6.0,194.0,0,1,0,44.1,40.07,15.958283,38.248,13.487347,36.913,12.410126,18.0,16.333333,8.660254,16.6,6.745369,17.0,7.763876,8.0,7.333333,4.163332,6.8,3.286335,7.1,2.699794,4.0,4.0,2.309401,3.4,1.788854,4.7,3.155243,30.0,25.0,12.897028,29.6,10.871982,32.3,14.041209,23.0,17.0,11.930353,21.4,9.066422,24.4,10.564616,7.0,6.333333,3.21455,5.8,3.114482,5.4,3.665151,0.0,1.666667,2.309401,2.4,1.788854,2.5,2.828427,0,0,1,17,34.18071,-17.18071
2228,2024,2025-04-19,DEN,LAC,Christian Braun,SG,0,45.12,19.197452,17.4,2.0,222.0,0,1,1,31.4,33.94,5.389122,35.774,6.310741,37.151,7.623109,7.0,9.333333,6.0,11.2,7.092249,11.5,6.795423,1.0,2.666667,3.05505,3.0,2.792848,3.3,2.915476,3.0,3.333333,4.041452,2.8,3.781534,2.4,2.973961,24.0,24.666667,12.288206,28.6,14.720734,27.7,12.493999,11.0,16.0,14.0119,18.8,14.656057,17.6,10.616549,7.0,4.333333,1.154701,5.6,2.645751,6.3,3.777124,6.0,4.333333,1.527525,4.2,1.30384,3.8,2.451757,0,0,1,15,31.720213,-16.720213


In [40]:
rmse = np.sqrt(mean_squared_error(y_test_stat, stat_preds)) # splits[5] = y_test
mae = mean_absolute_error(y_test_stat, stat_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)
df_gms = df_gms.rename(columns={"TRB": "REB", "3PM": "TPM", "3PA": "TPA"})
df_gms['STL_BLK'] = df_gms.STL + df_gms.BLK
df_gms['PR'] = df_gms.PTS + df_gms.REB 
df_gms['PA'] = df_gms.PTS + df_gms.AST
df_gms['RA'] = df_gms.REB + df_gms.AST
df_gms['PRA'] = df_gms.PTS + df_gms.REB + df_gms.AST

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff2'] <= rmse, 1, 0)

print("Total Accuracy (ParlayHit):", ((df_yesterday.ParlayHit == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.ParlayHit == 1).sum(), "/", df_yesterday.shape[0])

print("\nTotal Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.InRMSE_Range == 1).sum(), "/", df_yesterday.shape[0])

df_yesterday = df_yesterday.drop(['Diff', 'Act_Res', 'Pred_Res'], axis=1).sort_values(f'{tgt_stat}_line', ascending=False)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 5.814462309994425
Total Accuracy (ParlayHit): 0.6176470588235294
21 / 34

Total Accuracy (InRMSE_Range): 0.5
17 / 34


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,PRA_line,PRA_proj,PRA,Diff2,ParlayHit,InRMSE_Range
0,2025-12-27,DEN,Nikola Jokic,C,ORL,37.244587,38.02,53.5,31.522596,67,35.477404,0,0
3,2025-12-27,ATL,Jalen Johnson,SF,NYK,34.768074,37.07,45.5,30.498135,41,10.501865,1,0
9,2025-12-27,NYK,Jalen Brunson,PG,ATL,36.708183,37.67,40.5,28.051573,42,13.948427,0,0
1,2025-12-27,NYK,Karl-Anthony Towns,C,ATL,29.840424,29.8,38.5,19.302729,52,32.697271,0,0
2,2025-12-27,CLE,Donovan Mitchell,SG,HOU,31.292988,30.45,37.5,21.881079,25,3.118921,1,1
13,2025-12-27,PHO,Devin Booker,SG,NOP,34.234928,34.1,37.5,26.514217,27,0.485783,1,1
15,2025-12-27,UTA,Keyonte George,PG,SAS,33.336834,34.77,37.5,28.045895,39,10.954105,0,0
18,2025-12-27,ORL,Paolo Banchero,PF,DEN,32.875816,36.0,37.5,28.592785,24,4.592785,1,1
20,2025-12-27,DEN,Jamal Murray,PG,ORL,35.24255,36.62,37.5,28.80798,32,3.19202,1,1
5,2025-12-27,MIN,Anthony Edwards,SG,BRK,37.231415,34.7,36.5,22.389414,37,14.610586,0,0


### Today's predictions

In [41]:
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)

df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
df_pred['Date'] = pd.to_datetime(df_pred.Date)
df_pred['Season'] = 2025
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred[col].astype('category')
df_pred = df_pred.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])
df_pred[tgt_stat] = 0

# Predict minutes
df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP', 'TPM']], on=['Date', 'Team', 'Player'], how='left')
df_temp = df.copy()
df_temp["Team"] = team_encoder.inverse_transform(df_temp["Team"])
df_temp['Player'] = player_encoder.inverse_transform(df_temp["Player"])
df_pred = df_pred.merge(df_temp[['Date', 'Team', 'Player', 'TOV', 'PF', '+/-', 'FGA', 'FG', 'TPA', 'FT', 'FTA',
                             'AST', 'REB', 'PR', 'PA', 'RA', 'STL', 'BLK', 'STL_BLK']], on=['Date', 'Team', 'Player'], how='left')

df_pred = df_pred.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_pred_mins = setup_df_mins(con, df_pred)

df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
df_pred_mins["Opp"] = team_encoder.transform(df_pred_mins["Opp"])
df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
df_pred_mins["Pos"] = position_encoder.transform(df_pred_mins["Pos"])
DM_mins = xgb.DMatrix(df_pred_mins)
df_pred['MP'] = mins_model.predict(DM_mins)
df_pred['N_TPM'] = df_pred.FG - df_pred.TPM
df_pred['PTS'] = (df_pred.FT * 1) + (df_pred.N_TPM * 2) + (df_pred.TPM * 3)
df_pred = setup_df_main(df_pred)
feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
df_pred = df_pred[df_pred.Date == now][feature_cols]

# Predict stat
df_pred["Team"] = team_encoder.transform(df_pred["Team"])
df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
df_pred["Player"] = player_encoder.transform(df_pred["Player"])
df_pred["Pos"] = position_encoder.transform(df_pred["Pos"])
DM_stats = xgb.DMatrix(df_pred)
df_pred[f"{tgt_stat}_proj"] = stat_model.predict(DM_stats)

df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
df_pred['Opp'] = team_encoder.inverse_transform(df_pred["Opp"])
df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])
df_pred['Pos'] = position_encoder.inverse_transform(df_pred["Pos"])

df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

tds_picks = df_pred[~(df_pred[f'{tgt_stat}_line'].isnull())]\
            [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_last_5_avg', f'{tgt_stat}_line', f'{tgt_stat}_proj']]
tds_picks['Diff'] = abs((df_pred[f'{tgt_stat}_line'] - df_pred[f'{tgt_stat}_proj']))
tds_picks['Diff2'] = abs((df_pred['MP'] - df_pred['MP_last_5_avg']))
tds_picks = tds_picks[(tds_picks.Diff >= mae) & (tds_picks.Diff2 <= 8)].sort_values('Diff', ascending=False).drop(['Diff', 'Diff2'], axis=1)
display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/2025/gmday_preds_{tgt_stat}.csv")

Unnamed: 0,Team,Player,Pos,Opp,MP,MP_last_5_avg,PRA_line,PRA_proj
1,LAL,Luka Doncic,PG,SAC,35.485744,35.731909,53.5,26.552055
0,DET,Cade Cunningham,PG,LAC,36.126133,35.855542,42.5,27.377708
25,DET,Jalen Duren,C,LAC,27.187101,29.982531,29.5,18.012392
3,LAC,James Harden,PG,DET,37.331711,33.186158,38.5,27.190819
5,SAC,Russell Westbrook,PG,LAL,25.505262,33.125482,30.5,19.636219
16,LAL,LeBron James,SF,SAC,35.170448,32.810431,36.5,27.452024
29,SAC,Precious Achiuwa,C,LAL,10.727637,18.176271,16.5,8.030124
13,LAC,Kawhi Leonard,SF,DET,37.063404,30.802118,39.5,31.503956
15,DET,Ausar Thompson,SF,LAC,23.884089,26.455087,19.5,13.248004
24,LAL,Deandre Ayton,C,SAC,30.635414,27.253936,26.5,20.847326


../tables/2025/gmday_preds_PRA.csv saved!
