# To do:

 - Figure out how to signal injuries
 - Add team shooting percentages for AST stats
 - Add def team shooting percentages for REB stats
 - Add TOV stats for STL

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import math         # haversine_km()
import os

import xgboost as xgb
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)
tgt_stat = "PTS"
print('Target Stat:', tgt_stat)

Today's date: 2025-12-28
Target Stat: PTS


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model):
    importance = model.get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
def create_baseline_model(df, pred_col, DFS):
    
    train_df, val_df, test_df = DFS

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            'starter', 'bench', 'reserve'
        ]
    else:
        print(f'{pred_col} Stats Model')
        feature_cols = [
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            f'{pred_col}_last_3_avg', f'{pred_col}_last_5_avg', f'{pred_col}_last_10_avg',
            f'Def_{pred_col}', f'Def_L5_{pred_col}'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst

In [5]:
def hyperparam_tuning(DFS, pred_col, n_iter=20, early_stopping_rounds=50):
    """
    Hyperparameter tuning using native XGBoost API and DMatrix,
    with early stopping support (compatible with XGBoost 3.1.2)
    """

    train_df, val_df, test_df = DFS
    feature_cols = [col for col in train_df.columns if col not in ['Date', pred_col]]
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert datasets to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.01, 0.05),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(1, 8),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 2),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 2)
    }

    # Sample n_iter random parameter combinations
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k, v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_mae = float('inf')
    best_params = None
    best_bst = None

    # Manual hyperparameter search
    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')
        params.update({
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "device": "cuda",
            "seed": 42
        })
        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        # Predict on validation set to compute MAE
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        mae = mean_absolute_error(y_val, val_preds)
        print(f"Validation MAE: {mae:.4f}")
        if mae < best_mae:
            best_mae = mae
            best_params = params.copy()
            best_bst = bst

    print("\nBest validation MAE:", best_mae)
    print("Best parameters:", best_params)

    # Predict on test set using best model
    preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    test_df[pred_col] = y_test
    test_df[f'{pred_col}_preds'] = preds
    test_df['Team'] = team_encoder.inverse_transform(test_df["Team"])
    test_df['Opp'] = team_encoder.inverse_transform(test_df["Opp"])
    test_df['Player'] = player_encoder.inverse_transform(test_df["Player"])
    test_df['Pos'] = position_encoder.inverse_transform(test_df["Pos"])
    analyze_df = test_df[['Date', 'Team', 'Player', 'Pos', 'Opp', pred_col, f'{pred_col}_preds']]
    print("\nTest Metrics:")
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))

    return best_bst, preds, y_test, analyze_df

### Create Base df

In [6]:
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
for i in [2022, 2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_schd = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_schd['Season'] = i
    df2 = pd.concat([df2, df_schd])
    
    df_gms = pd.read_csv(f"../tables/{i}/season_gamelogs.csv")
    df_gms['Season'] = i
    df3 = pd.concat([df3, df_gms])
    
    df_inj = pd.read_csv(f"../tables/{i}/injuries.csv")
    df_inj['Season'] = i
    df4 = pd.concat([df4, df_inj])

df['Date'] = pd.to_datetime(df.Date)
df2['Date'] = pd.to_datetime(df2.Date)
df3['Date'] = pd.to_datetime(df3.Date)
df3 = df3[~df3[['Date', 'Team', 'Player']].duplicated(keep='last')]
df4['Date'] = pd.to_datetime(df4.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)].drop(['WrngTm', 'WrngOpp', 'Tms'], axis=1)

df3_temp = df3[['game_id', 'Date', 'Team', 'Player', 'Active', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TOV', 'PF', '+/-']]\
        .rename(columns={"3PA": "TPA", "3P%": "TP%"})
df = df.merge(df3_temp, on=['game_id', 'Date', 'Team', 'Player'])

df_mtch = df2[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Season', 'Date', 'Team', 'Team_type', 'AwayPTS', 'HomePTS', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch = df_mtch.sort_values(["Team", "Date"])
df_mtch['team_game_num'] = df_mtch.groupby(["Team", "Season"]).cumcount() + 1
df_mtch['Spread'] = np.where(df_mtch.Team_type == 'Home', df_mtch.HomePTS - df_mtch.AwayPTS, df_mtch.AwayPTS - df_mtch.HomePTS)
df_mtch['Total'] = df_mtch.AwayPTS + df_mtch.HomePTS
df_mtch['is_Win'] = np.where(df_mtch.Spread > 0, 1, 0)
df_mtch['Szn_Wins'] = df_mtch.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])

df = df.merge(df4[['Date', 'Team', 'Player', 'Status']], on=['Date', 'Team', 'Player'], how='left')
df['Status'] = np.where((df.Active == 1), 'Available', df.Status)
df['Status'] = np.where((df.Active == 0), 'Out', df.Status)
df['Status'] = np.where((df.Status == 'Out') & (df.Active != 0), 'Available', df.Status)

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()
position_encoder = LabelEncoder()

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df["Pos"] = position_encoder.fit_transform(df["Pos"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
df = df[(df.Active == 1) & (df.MP > 0)].sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
print('base df created', datetime.now())

base df created 2025-12-28 20:55:55.301469


# Minutes Projection Model

In [7]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [8]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 'TOV', 'PF', '+/-',
             'Spread', 'Total', 'team_game_num', 'Szn_Wins', 'cup_gm', 'pstszn_gm', 'is_OT']]
    
    for col in ['MP', 'TOV', 'PF', '+/-']:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )
        for N in [3, 5, 10]:
            df[f'{col}_last_{N}_avg'] = (
                df.groupby(['Player', 'Season'])[col]
                  .rolling(window=N, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0, 1], drop=True)
            )
            df[f"{col}_last_{N}_std"] = (
                df.groupby(['Player', 'Season'])[col]
                  .shift(1)
                  .rolling(window=N, min_periods=1)
                  .std()
            )
        df[f"{col}_change_L1"] = df[f"{col}_lst_gm"] - df[f"{col}_last_5_avg"]
        df[f"{col}_change_L3"] = df[f"{col}_last_3_avg"] - df[f"{col}_last_10_avg"]
        df[f"{col}_pct_change"] = (
            (df[f"{col}_lst_gm"] - df[f"{col}_last_10_avg"]) /
            (df[f"{col}_last_10_avg"] + 1e-6)
        )
    df["MP_spike"] = (df["MP_lst_gm"] > df["MP_last_10_avg"] + 8).astype(int)
    df["MP_drop"]  = (df["MP_lst_gm"] < df["MP_last_10_avg"] - 8).astype(int)
    df["MP_trend"] = df["MP_last_3_avg"] - df["MP_last_10_avg"]

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
           
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
      
    df['missed_games'] = (
        df.groupby(['Player', 'Team', 'Season'])['team_game_num']      
          .diff()
          .sub(1)
          .fillna(0)
          .astype(int)
    )
    
    df['blowout'] = np.where(abs(df.Spread >= 15), 1, 0)
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)
    
    df = df.drop(['reserve_td', 'bench_td', 'starter_td', 'Szn_Wins', 'TOV', 'PF', '+/-', 
                  'PrevOpp', 'PrevLocation', 'Location'], axis=1)    
    
    return df

In [9]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

n = len(df_mins)
train_end = int(0.8 * n)
val_end   = int(0.9 * n)
mins_train_df = df_mins.iloc[:train_end]
mins_val_df   = df_mins.iloc[train_end:val_end]
mins_test_df  = df_mins.iloc[val_end:]
mins_DFS = (mins_train_df, mins_val_df, mins_test_df)

mins_model = create_baseline_model(df_mins, "MP", mins_DFS)
mins_model, mins_preds, y_test_mins, analyze_df_mins = hyperparam_tuning(mins_DFS, "MP", n_iter=1)
# feature_importance(mins_model)

Unnamed: 0,Season,Date,Team,Team_type,Opp,Player,Pos,B2B,MP,Spread,Total,team_game_num,cup_gm,pstszn_gm,is_OT,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,MP_change_L1,MP_change_L3,MP_pct_change,TOV_lst_gm,TOV_last_3_avg,TOV_last_3_std,TOV_last_5_avg,TOV_last_5_std,TOV_last_10_avg,TOV_last_10_std,TOV_change_L1,TOV_change_L3,TOV_pct_change,PF_lst_gm,PF_last_3_avg,PF_last_3_std,PF_last_5_avg,PF_last_5_std,PF_last_10_avg,PF_last_10_std,PF_change_L1,PF_change_L3,PF_pct_change,+/-_lst_gm,+/-_last_3_avg,+/-_last_3_std,+/-_last_5_avg,+/-_last_5_std,+/-_last_10_avg,+/-_last_10_std,+/-_change_L1,+/-_change_L3,+/-_pct_change,MP_spike,MP_drop,MP_trend,games_last_7_days,prev_team_mins_pct,reserve,bench,starter,missed_games,blowout,DaysLstGm,travel_km,travel_hours,is_long_trip,same_arena
0,2022,2022-10-21,0,1,21,5,2,0,14.37,10.0,206.0,2,0,0,0,,20.166667,,24.286,,26.067000,,,-5.900333,,,0.666667,,1.0,,1.400000,,,-0.733333,,,3.333333,,2.4,,2.000000,,,1.333333,,,18.333333,,20.2,,14.100000,,,4.233333,,0,0,-5.900333,3,,0,1,0,0,0,0,0.0,0.0,0,0
1,2022,2022-10-21,0,1,21,120,0,0,31.62,10.0,206.0,2,0,0,0,,4.266667,,7.614,,19.136000,,,-14.869333,,,0.000000,,0.0,,0.300000,,,-0.300000,,,0.000000,,0.0,,0.400000,,,-0.400000,,,3.333333,,3.2,,-1.500000,,,4.833333,,0,0,-14.869333,1,,0,0,1,0,0,0,0.0,0.0,0,0
2,2022,2022-10-21,0,1,21,171,3,0,32.53,10.0,206.0,2,0,0,0,,26.603333,,29.612,,30.320000,,,-3.716667,,,1.333333,,2.0,,2.900000,,,-1.566667,,,2.333333,,2.8,,2.300000,,,0.033333,,,8.333333,,6.4,,4.400000,,,3.933333,,0,0,-3.716667,4,,0,0,1,0,0,0,0.0,0.0,0,0
3,2022,2022-10-21,0,1,21,178,4,0,39.62,10.0,206.0,2,0,0,0,,30.693333,,30.206,,28.729000,,,1.964333,,,1.333333,,1.2,,1.000000,,,0.333333,,,3.333333,,3.2,,2.100000,,,1.233333,,,-12.000000,,-6.6,,-1.500000,,,-10.500000,,0,0,1.964333,1,,0,0,1,0,0,0,0.0,0.0,0,0
4,2022,2022-10-21,0,1,21,322,3,0,11.77,10.0,206.0,2,0,0,0,,23.016667,,22.238,,17.904286,,,5.112381,,,2.333333,,1.8,,1.428571,,,0.904762,,,1.333333,,1.4,,1.428571,,,-0.095238,,,-6.000000,,-3.2,,-1.142857,,,-4.857143,,0,0,5.112381,2,,0,1,0,0,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80245,2025,2025-12-26,29,1,27,59,4,0,25.97,21.0,255.0,29,0,0,0,22.57,26.680000,11.049689,28.976,9.501056,26.939000,7.589945,-6.406,-0.259000,-0.162181,3.0,2.333333,1.732051,2.4,1.516575,1.700000,1.370320,0.6,0.633333,0.764705,2.0,1.666667,2.000000,2.8,1.673320,2.600000,1.449138,-0.8,-0.933333,-0.230769,-18.0,-13.000000,7.094599,-5.4,8.792042,-5.600000,7.549099,-12.6,-7.400000,2.214286,0,0,-0.259000,1,0.094042,0,0,1,3,1,8,0.0,0.0,0,0
80246,2025,2025-12-26,29,1,27,89,4,0,30.78,21.0,255.0,29,0,0,0,34.17,34.233333,9.972002,34.696,9.443306,32.890000,7.671929,-0.526,1.343333,0.038918,2.0,1.333333,1.527525,1.2,1.516575,1.700000,1.316561,0.8,-0.366667,0.176470,3.0,2.000000,1.527525,2.2,1.516575,2.200000,1.398412,0.8,-0.200000,0.363636,-2.0,-0.666667,10.969655,1.4,8.105554,-6.200000,8.974656,-3.4,5.533333,-0.677419,0,0,1.343333,3,0.142375,0,0,1,1,1,5,0.0,0.0,0,0
80247,2025,2025-12-26,29,1,27,474,3,0,19.33,21.0,255.0,29,0,0,0,22.12,26.240000,6.830874,27.152,9.137200,26.376000,7.455239,-5.032,-0.136000,-0.161359,1.0,1.666667,1.000000,2.2,1.303840,1.900000,1.333333,-1.2,-0.233333,-0.473684,2.0,2.333333,0.577350,2.8,1.483240,2.700000,1.418136,-0.8,-0.366667,-0.259259,0.0,-7.333333,9.865766,0.2,9.964939,-1.700000,10.163114,-0.2,-5.633333,-1.000001,0,0,-0.136000,1,0.092167,0,0,1,2,1,6,0.0,0.0,0,0
80248,2025,2025-12-26,29,1,27,491,3,0,28.47,21.0,255.0,29,0,0,0,33.27,36.193333,6.712364,34.158,8.371425,32.618000,7.416872,-0.888,3.575333,0.019989,2.0,2.333333,0.577350,2.4,1.140175,2.700000,1.286684,-0.4,-0.366667,-0.259259,5.0,4.666667,1.527525,4.6,1.816590,4.000000,1.563472,0.4,0.666667,0.250000,-14.0,-7.333333,7.571878,-2.2,10.039920,-6.000000,9.843215,-11.8,-1.333333,1.333334,0,0,3.575333,3,0.138625,0,0,1,1,1,5,0.0,0.0,0,0


Minutes Model
Train: 64200 / Validation: 8025 / Test: 8025
RMSE: 5.985580583930471
MAE: 4.586915904582847
R²: 0.6477458857409831

Trial 1/1: {'n_estimators': 947, 'learning_rate': np.float64(0.04222986985376435), 'max_depth': 5, 'min_child_weight': 1, 'subsample': np.float64(0.7400242148692038), 'colsample_bytree': np.float64(0.8168664978927977), 'gamma': np.float64(1.4367156616000394), 'reg_lambda': np.float64(1.2045640695602955), 'reg_alpha': np.float64(1.1551296103085138)}
Validation MAE: 4.8567

Best validation MAE: 4.856668867535457
Best parameters: {'learning_rate': np.float64(0.04222986985376435), 'max_depth': 5, 'min_child_weight': 1, 'subsample': np.float64(0.7400242148692038), 'colsample_bytree': np.float64(0.8168664978927977), 'gamma': np.float64(1.4367156616000394), 'reg_lambda': np.float64(1.2045640695602955), 'reg_alpha': np.float64(1.1551296103085138), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 5.71956610074

In [10]:
rmse = np.sqrt(mean_squared_error(y_test_mins, mins_preds)) # splits[5] = y_test
mae = mean_absolute_error(y_test_mins, mins_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'MP']], on=['Date', 'Team', 'Player'])
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', 'MP_last_5_avg']][df_yesterday.MP > 0]

df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff'] <= rmse, 1, 0)

print("\nYesterday's Results:")
print("Total Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.InRMSE_Range == 1).sum(), '/', df_yesterday.shape[0])

df_yesterday = df_yesterday.drop(['Diff'], axis=1)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 5.719566100748775

Yesterday's Results:
Total Accuracy (InRMSE_Range): nan
0 / 0


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,MP_last_5_avg,InRMSE_Range


# Main Model

In [24]:
def setup_df_main(df):
    
    # Stat dependent features 
    if tgt_stat == 'PTS':
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]
        # Efficiency metrics
        df['three_rate_raw'] =  np.where(df.FGA > 0, df['TPA'] / df['FGA'], 0)
        df['ft_rate_raw']    =  np.where(df.FGA > 0, df['FTA'] / df['FGA'], 0)
        df['eFG_raw'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
        df['TS_raw'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))    
        df['usage_proxy_raw'] =  np.where(df.MP > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
        
        for w in [3, 5, 10]:
            for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
                col = f"{metric}_L{w}"
                df[col] = (
                    df.groupby(['Player','Season'])[f'{metric}_raw']
                      .rolling(w, min_periods=1)
                      .mean()
                      .shift(1)
                      .reset_index(level=[0,1], drop=True)
                )
        for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
            col = f'{metric}_weighted'
            df[col] = (
                0.6 * df[f'{metric}_L3'] +
                0.3 * df[f'{metric}_L5'] +
                0.1 * df[f'{metric}_L10']
            )
            df = df.drop(f'{metric}_raw', axis=1)
        tgt_stat_cols = ['PTS']
        
    elif tgt_stat == 'PRA':
        tgt_stat_cols = ['PTS', 'REB', 'AST']
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]
        
    else:
        tgt_stat_cols = []
        df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 
         'PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK',
         'FG', 'FGA', 'TPA', 'FT', 'FTA', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
         'Spread', 'Total', 'cup_gm', 'pstszn_gm', 'is_OT']]

    
    # Create rolling + lag features    
    for col in ['MP', 'FGA', 'TPA', 'FTA', tgt_stat] + tgt_stat_cols:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )
        for N in [3, 5, 10]:
            df[f'{col}_last_{N}_avg'] = (
                df.groupby(['Player', 'Season'])[col]
                  .rolling(window=N, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0, 1], drop=True)
            )
            df[f"{col}_last_{N}_std"] = (
                df.groupby(['Player', 'Season'])[col]
                  .shift(1)
                  .rolling(window=N, min_periods=1)
                  .std()
            )

    # Role identifiers features
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
    
    for col in categories + ['FG', 'FGA', 'FT', 'FTA', 'TPM', 'TPA', 'reserve_td', 'bench_td', 'starter_td'] + tgt_stat_cols:
        if col == tgt_stat:
            continue
        if col in df.columns:
            df = df.drop(col, axis=1)
        
    return df

In [25]:
df_main = df.copy()
df_main = setup_df_main(df_main)
display(df_main)

n = len(df_main)
train_end = int(0.65 * n)
val_end   = int(0.85 * n)
main_train_df = df_main.iloc[:train_end]
main_val_df   = df_main.iloc[train_end:val_end]
main_test_df  = df_main.iloc[val_end:]
main_DFS = (main_train_df, main_val_df, main_test_df)

# stat_model = create_baseline_model(df_main, tgt_stat, main_DFS)
stat_model, stat_preds, y_test_stat, analyze_df_stat = hyperparam_tuning(main_DFS, tgt_stat, n_iter=1)
# feature_importance(stat_model)

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,B2B,MP,PTS,Def_PTS,Def_L5_PTS,Spread,Total,cup_gm,pstszn_gm,is_OT,three_rate_L3,ft_rate_L3,eFG_L3,TS_L3,usage_proxy_L3,three_rate_L5,ft_rate_L5,eFG_L5,TS_L5,usage_proxy_L5,three_rate_L10,ft_rate_L10,eFG_L10,TS_L10,usage_proxy_L10,three_rate_weighted,ft_rate_weighted,eFG_weighted,TS_weighted,usage_proxy_weighted,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,FGA_lst_gm,FGA_last_3_avg,FGA_last_3_std,FGA_last_5_avg,FGA_last_5_std,FGA_last_10_avg,FGA_last_10_std,TPA_lst_gm,TPA_last_3_avg,TPA_last_3_std,TPA_last_5_avg,TPA_last_5_std,TPA_last_10_avg,TPA_last_10_std,FTA_lst_gm,FTA_last_3_avg,FTA_last_3_std,FTA_last_5_avg,FTA_last_5_std,FTA_last_10_avg,FTA_last_10_std,PTS_lst_gm,PTS_last_3_avg,PTS_last_3_std,PTS_last_5_avg,PTS_last_5_std,PTS_last_10_avg,PTS_last_10_std,reserve,bench,starter
0,2022,2022-10-21,0,21,5,2,0,14.37,4,18.000000,18.0,10.0,206.0,0,0,0,0.559259,0.829630,0.416667,0.519801,0.616138,0.508283,0.691717,0.475758,0.563433,0.552491,0.479785,0.587601,0.523390,0.606242,0.509510,0.536019,0.764053,0.445066,0.541535,0.586381,,20.166667,,24.286,,26.067000,,,11.333333,,11.4,,11.2,,,4.000000,,4.4,,4.400000,,,3.333333,,4.2,,4.500000,,,16.000000,,16.6,,16.800000,,0,1,0
1,2022,2022-10-21,0,21,120,0,0,31.62,8,14.000000,14.0,10.0,206.0,0,0,0,0.666667,0.000000,0.750000,0.750000,0.219526,0.700000,0.000000,0.575000,0.575000,0.294441,0.662500,0.033333,0.655247,0.658118,0.242095,0.676250,0.003333,0.688025,0.688312,0.244257,,4.266667,,7.614,,19.136000,,,1.333333,,2.2,,4.0,,,1.333333,,2.0,,2.900000,,,0.000000,,0.0,,0.300000,,,1.000000,,1.6,,5.100000,,0,0,1
2,2022,2022-10-21,0,21,171,3,0,32.53,12,8.000000,8.0,10.0,206.0,0,0,0,0.380383,0.219298,0.403010,0.445128,0.503382,0.413871,0.277220,0.460781,0.500643,0.492433,0.413396,0.304184,0.502432,0.548425,0.565033,0.393730,0.245163,0.430284,0.472112,0.506262,,26.603333,,29.612,,30.320000,,,12.666667,,13.2,,15.2,,,4.666667,,5.4,,6.200000,,,2.333333,,3.4,,4.600000,,,13.666667,,15.8,,19.800000,,0,0,1
3,2022,2022-10-21,0,21,178,4,0,39.62,20,19.000000,19.0,10.0,206.0,0,0,0,0.000000,0.134680,0.765993,0.743921,0.314865,0.000000,0.080808,0.726263,0.713020,0.298860,0.000000,0.160606,0.727525,0.729702,0.302772,0.000000,0.121111,0.750227,0.733229,0.308854,,30.693333,,30.206,,28.729000,,,8.666667,,8.4,,8.1,,,0.000000,,0.0,,0.000000,,,1.333333,,0.8,,1.500000,,,13.666667,,12.2,,13.100000,,0,0,1
4,2022,2022-10-21,0,21,322,3,0,11.77,4,8.000000,8.0,10.0,206.0,0,0,0,0.564815,0.208333,0.400463,0.423747,0.294991,0.652525,0.361364,0.394823,0.466264,0.339934,0.466089,0.543831,0.495686,0.565859,0.264007,0.581255,0.287792,0.408293,0.450714,0.305375,,23.016667,,22.238,,17.904286,,,6.333333,,6.8,,5.0,,,2.666667,,4.0,,2.857143,,,1.666667,,2.2,,1.857143,,,8.333333,,9.4,,7.285714,,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80245,2025,2025-12-26,29,27,59,4,0,25.97,21,15.933333,16.4,21.0,255.0,0,0,0,0.538624,0.132275,0.243915,0.253706,0.279905,0.534286,0.323810,0.324127,0.367726,0.282941,0.477143,0.418571,0.353730,0.435044,0.288068,0.531175,0.218365,0.278960,0.306046,0.281632,22.57,26.680000,11.049689,28.976,9.501056,26.939000,7.589945,7.0,7.000000,7.234178,7.2,5.310367,6.5,4.779586,4.0,3.666667,2.645751,3.8,2.588436,3.100000,2.131770,2.0,1.000000,2.516611,2.2,2.167948,2.600000,2.054805,3.0,4.000000,13.076697,6.4,9.964939,6.800000,8.350649,0,0,1
80246,2025,2025-12-26,29,27,89,4,0,30.78,21,15.933333,16.4,21.0,255.0,0,0,0,0.500000,0.148611,0.547222,0.566619,0.446228,0.423333,0.179167,0.533333,0.552659,0.503966,0.476431,0.160732,0.536789,0.555656,0.525934,0.474643,0.158990,0.542012,0.561335,0.471520,34.17,34.233333,9.972002,34.696,9.443306,32.890000,7.671929,15.0,14.333333,4.358899,16.2,5.357238,16.1,4.831609,5.0,7.333333,0.577350,6.8,2.073644,7.700000,2.043961,2.0,2.333333,1.154701,3.2,1.949359,2.800000,1.932184,15.0,18.000000,6.244998,19.8,9.884331,19.800000,8.393119,0,0,1
80247,2025,2025-12-26,29,27,474,3,0,19.33,6,17.931034,15.8,21.0,255.0,0,0,0,0.294444,0.275926,0.497222,0.528147,0.446920,0.221667,0.240556,0.505833,0.544184,0.405263,0.310357,0.191706,0.487202,0.518179,0.361884,0.274202,0.256893,0.498804,0.531962,0.425919,22.12,26.240000,6.830874,27.152,9.137200,26.376000,7.455239,9.0,10.333333,4.163332,9.8,5.540758,8.8,4.289522,3.0,3.000000,1.000000,2.2,2.280351,2.500000,2.043961,4.0,2.666667,1.154701,2.2,1.949359,1.600000,2.011080,6.0,12.333333,6.244998,11.8,9.813256,10.100000,8.230026,0,0,1
80248,2025,2025-12-26,29,27,491,3,0,28.47,23,17.931034,15.8,21.0,255.0,0,0,0,0.427885,0.232372,0.521635,0.551437,0.426350,0.525822,0.233969,0.515708,0.541782,0.407127,0.533690,0.181919,0.537832,0.559930,0.358152,0.467846,0.227806,0.521476,0.549390,0.413763,33.27,36.193333,6.712364,34.158,8.371425,32.618000,7.416872,13.0,14.000000,3.055050,12.6,3.435113,10.9,4.301163,7.0,6.000000,2.000000,6.4,1.483240,5.700000,2.170509,3.0,3.333333,1.000000,3.0,1.483240,2.100000,1.828782,14.0,17.666667,4.932883,15.4,5.357238,13.200000,8.055364,0,0,1



Trial 1/1: {'n_estimators': 1138, 'learning_rate': np.float64(0.03487394491104341), 'max_depth': 3, 'min_child_weight': 3, 'subsample': np.float64(0.7602821281787269), 'colsample_bytree': np.float64(0.7315988696375748), 'gamma': np.float64(1.1559489691213587), 'reg_lambda': np.float64(0.2436253715692982), 'reg_alpha': np.float64(1.3319457172942208)}
Validation MAE: 3.6020

Best validation MAE: 3.6020121574401855
Best parameters: {'learning_rate': np.float64(0.03487394491104341), 'max_depth': 3, 'min_child_weight': 3, 'subsample': np.float64(0.7602821281787269), 'colsample_bytree': np.float64(0.7315988696375748), 'gamma': np.float64(1.1559489691213587), 'reg_lambda': np.float64(0.2436253715692982), 'reg_alpha': np.float64(1.3319457172942208), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 4.987448961192779
MAE: 3.730304479598999
R²: 0.6980206966400146


In [26]:
analyze_df_stat = main_test_df.drop([tgt_stat, f'{tgt_stat}_preds'], axis=1)\
                .merge(analyze_df_stat[['Date', 'Team', 'Player', tgt_stat, f'{tgt_stat}_preds']], on=['Date', 'Team', 'Player'])
analyze_df_stat['Diff'] = analyze_df_stat[tgt_stat] - analyze_df_stat[f'{tgt_stat}_preds']
analyze_df_stat[analyze_df_stat.MP > 38].sort_values('Diff', ascending=True).head(15)

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,B2B,MP,Def_PTS,Def_L5_PTS,Spread,Total,cup_gm,pstszn_gm,is_OT,three_rate_L3,ft_rate_L3,eFG_L3,TS_L3,usage_proxy_L3,three_rate_L5,ft_rate_L5,eFG_L5,TS_L5,usage_proxy_L5,three_rate_L10,ft_rate_L10,eFG_L10,TS_L10,usage_proxy_L10,three_rate_weighted,ft_rate_weighted,eFG_weighted,TS_weighted,usage_proxy_weighted,MP_lst_gm,MP_last_3_avg,MP_last_3_std,MP_last_5_avg,MP_last_5_std,MP_last_10_avg,MP_last_10_std,FGA_lst_gm,FGA_last_3_avg,FGA_last_3_std,FGA_last_5_avg,FGA_last_5_std,FGA_last_10_avg,FGA_last_10_std,TPA_lst_gm,TPA_last_3_avg,TPA_last_3_std,TPA_last_5_avg,TPA_last_5_std,TPA_last_10_avg,TPA_last_10_std,FTA_lst_gm,FTA_last_3_avg,FTA_last_3_std,FTA_last_5_avg,FTA_last_5_std,FTA_last_10_avg,FTA_last_10_std,PTS_lst_gm,PTS_last_3_avg,PTS_last_3_std,PTS_last_5_avg,PTS_last_5_std,PTS_last_10_avg,PTS_last_10_std,reserve,bench,starter,PTS,PTS_preds,Diff
9516,2025,2025-11-30,ATL,PHI,Onyeka Okongwu,C,0,47.35,15.190476,14.4,8.0,276.0,0,0,2,0.391534,0.108466,0.607143,0.608324,0.576733,0.433522,0.080464,0.654496,0.657608,0.513632,0.448825,0.225629,0.663438,0.678703,0.493529,0.40986,0.111781,0.626978,0.630147,0.549482,33.7,29.366667,6.204418,29.362,9.444189,29.844,12.926151,14.0,15.333333,7.505553,14.0,7.79102,13.4,6.964194,5.0,6.0,5.686241,6.0,4.358899,6.2,3.95109,1.0,1.666667,2.081666,1.2,4.84768,2.4,5.143496,18.0,19.333333,9.165151,18.8,11.458621,19.6,11.421714,0,0,1,10,28.76804,-18.76804
1874,2024,2025-04-13,CLE,IND,Tristan Thompson,C,0,42.7,14.221053,17.6,-8.0,244.0,0,0,2,0.0,0.083333,0.458333,0.464715,0.471186,0.0,0.05,0.475,0.478829,0.440973,0.0,0.025,0.463542,0.465935,0.358056,0.0,0.0675,0.463854,0.469071,0.450809,19.98,9.793333,9.938236,8.872,9.308119,6.512,11.137116,8.0,3.333333,4.041452,2.8,3.701351,1.9,2.838231,0.0,0.0,0.0,0.0,1.414214,0.0,1.47573,2.0,0.666667,1.154701,0.4,0.894427,0.2,1.523884,7.0,3.0,3.605551,3.0,3.209361,1.9,3.887301,1,0,0,3,20.949051,-17.949051
2532,2024,2025-04-23,MIA,CLE,Bam Adebayo,C,0,42.4,12.891089,12.2,-9.0,233.0,0,1,0,0.227543,0.221861,0.484984,0.493516,0.488495,0.276526,0.279784,0.55099,0.565301,0.520204,0.242249,0.250807,0.544331,0.567886,0.58376,0.243709,0.242133,0.51072,0.522488,0.507534,39.07,39.66,10.221952,35.176,10.700403,34.201,11.016809,22.0,17.333333,6.658328,15.8,7.362065,17.8,6.773314,5.0,4.0,1.0,4.2,2.408319,4.2,3.645393,6.0,4.0,3.05505,4.4,2.44949,4.3,2.065591,24.0,18.666667,9.0185,19.4,9.273618,21.9,6.480741,0,0,1,11,27.862408,-16.862408
10643,2025,2025-12-07,LAL,PHI,Austin Reaves,SG,0,39.13,14.047619,7.8,4.0,220.0,0,0,0,0.517196,0.580688,0.64881,0.715542,0.604655,0.510317,0.628413,0.735952,0.792469,0.556912,0.454022,0.613003,0.685423,0.750524,0.539471,0.508815,0.598237,0.678614,0.742118,0.583814,33.32,35.83,7.681154,37.688,7.805798,37.192,6.604033,18.0,17.0,8.717798,16.2,6.870226,15.8,5.035982,8.0,8.666667,4.163332,8.2,3.63318,7.2,2.635231,17.0,11.0,9.291573,10.8,7.224957,9.7,5.600099,36.0,32.0,19.731531,33.4,14.142136,30.2,10.405661,0,0,1,11,27.791676,-16.791676
7386,2025,2025-11-14,MIL,CHO,Giannis Antetokounmpo,PF,0,40.4,15.384615,11.8,13.0,281.0,1,0,1,0.08162,0.523472,0.571505,0.598167,0.828511,0.06802,0.485512,0.609569,0.623323,0.794738,0.06406,0.577833,0.644246,0.65905,0.767151,0.075784,0.51752,0.590198,0.611802,0.812243,37.38,36.763333,16.469467,33.148,12.310803,33.04875,10.764952,18.0,25.0,9.291573,22.0,6.9857,20.625,6.549809,2.0,2.0,1.154701,1.6,0.894427,1.375,2.936362,12.0,12.333333,6.928203,10.4,5.366563,11.125,3.713339,30.0,36.0,16.772994,32.6,12.409674,33.25,10.87096,0,0,1,25,41.452866,-16.452866
9889,2025,2025-12-02,NYK,BOS,Jalen Brunson,PG,0,39.22,17.25,18.4,-6.0,240.0,0,0,0,0.27193,0.305973,0.514829,0.566411,0.722496,0.343609,0.272306,0.563784,0.612314,0.686039,0.359142,0.278386,0.559474,0.607156,0.67078,0.302155,0.293114,0.53398,0.584256,0.706388,33.73,35.593333,15.328227,35.268,11.11003,34.556,9.300263,19.0,22.666667,7.0,21.6,5.069517,20.7,4.532598,6.0,6.0,1.732051,7.2,1.30384,7.1,2.869379,5.0,6.666667,2.886751,5.8,2.19089,5.8,3.071373,18.0,29.333333,5.507571,29.6,5.770615,28.1,5.291503,0,0,1,15,30.555347,-15.555347
3739,2024,2025-05-26,MIN,OKC,Anthony Edwards,SG,0,40.93,12.891089,19.8,-2.0,254.0,0,1,0,0.477376,0.346908,0.611237,0.641648,0.600132,0.511187,0.263383,0.640075,0.657936,0.571395,0.460463,0.360547,0.561782,0.59797,0.58548,0.485828,0.323214,0.614943,0.642167,0.590046,29.7,35.446667,3.691021,36.154,7.683709,38.754,13.967116,17.0,18.666667,7.023769,18.4,6.797058,19.7,8.962267,8.0,8.333333,4.163332,9.0,3.03315,8.9,3.984693,2.0,6.333333,1.154701,4.8,1.414214,7.0,2.110819,30.0,26.666667,12.055428,26.4,10.382678,26.9,12.547244,0,0,1,16,31.126701,-15.126701
11894,2025,2025-12-25,DEN,MIN,Peyton Watson,SF,0,42.22,16.923077,12.8,4.0,280.0,0,0,1,0.414815,0.414815,0.789815,0.776998,0.554193,0.366061,0.36,0.655707,0.637303,0.484028,0.374102,0.378864,0.624039,0.641772,0.436179,0.396117,0.394775,0.733005,0.721567,0.521342,27.85,18.84,6.957032,23.038,5.162843,28.033,7.933442,9.0,7.666667,2.0,8.6,3.435113,10.2,6.398785,4.0,3.0,1.732051,3.0,4.764452,4.0,3.977716,4.0,4.0,1.154701,3.4,2.280351,3.0,4.157991,20.0,15.333333,4.163332,13.0,6.024948,14.4,9.527154,0,0,1,9,24.064016,-15.064016
3326,2024,2025-05-09,OKC,DEN,Shai Gilgeous-Alexander,PG,0,44.8,15.483333,22.6,-9.0,217.0,0,1,1,0.183761,0.577991,0.649038,0.714316,0.706397,0.233068,0.443612,0.553349,0.619837,0.733287,0.280931,0.373205,0.521308,0.582002,0.749988,0.20827,0.517199,0.607559,0.672741,0.718823,30.18,36.103333,9.420766,37.378,7.485457,34.736,7.439153,13.0,21.0,6.350853,23.6,5.01996,22.8,3.977716,2.0,4.0,4.041452,5.8,2.915476,6.3,2.347576,11.0,11.0,5.196152,9.2,4.38178,7.8,3.771236,34.0,35.0,15.947832,32.6,11.335784,30.1,8.937437,0,0,1,18,32.803684,-14.803684
1951,2024,2025-04-13,LAL,POR,Bronny James,SG,0,38.3,14.833333,12.8,-28.0,190.0,0,0,0,0.416667,0.0,0.9375,0.9375,0.451257,0.43,0.12,0.66875,0.723524,0.486305,0.556667,0.15,0.632292,0.665591,0.498315,0.434667,0.051,0.826354,0.846116,0.466477,4.38,2.74,15.883189,8.96,15.090576,7.874,14.117658,4.0,1.666667,3.511885,3.8,6.760178,3.3,5.812821,1.0,0.666667,1.0,1.6,0.894427,2.0,1.988858,0.0,0.0,2.309401,0.6,2.683282,0.7,2.674987,3.0,2.0,6.506407,5.0,10.256705,4.4,8.769265,1,0,0,4,18.704647,-14.704647


In [27]:
rmse = np.sqrt(mean_squared_error(y_test_stat, stat_preds)) # splits[5] = y_test
mae = mean_absolute_error(y_test_stat, stat_preds)
print('RMSE:', rmse)

df_yesterday = pd.read_csv(f'../tables/2025/gmday_preds_{tgt_stat}.csv')
df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
                .rename(columns={"MP": "MP_proj"})

df_gms = pd.read_csv(f"../tables/2025/season_gamelogs.csv")
df_gms['Date'] = pd.to_datetime(df_gms.Date)
df_gms = df_gms.rename(columns={"TRB": "REB", "3PM": "TPM", "3PA": "TPA"})
df_gms['STL_BLK'] = df_gms.STL + df_gms.BLK
df_gms['PR'] = df_gms.PTS + df_gms.REB 
df_gms['PA'] = df_gms.PTS + df_gms.AST
df_gms['RA'] = df_gms.REB + df_gms.AST
df_gms['PRA'] = df_gms.PTS + df_gms.REB + df_gms.AST

df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
display(df_yesterday)
df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'Pos', 'Opp', 'MP_proj', 'MP', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)

df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff2'] <= rmse, 1, 0)

print("Total Accuracy (ParlayHit):", ((df_yesterday.ParlayHit == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.ParlayHit == 1).sum(), "/", df_yesterday.shape[0])

print("\nTotal Accuracy (InRMSE_Range):", ((df_yesterday.InRMSE_Range == 1).sum() / df_yesterday.shape[0]))
print((df_yesterday.InRMSE_Range == 1).sum(), "/", df_yesterday.shape[0])

df_yesterday = df_yesterday.drop(['Diff', 'Diff2', 'Act_Res', 'Pred_Res'], axis=1).sort_values(f'{tgt_stat}_line', ascending=False)

if df_yesterday.shape[0] >= 50:
    for tm in df_yesterday.Team.unique():
        display(df_yesterday[df_yesterday.Team == tm])
else:
    display(df_yesterday)

RMSE: 4.987448961192779


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP_last_5_avg,PTS_line,PTS_proj,PTS,MP


Total Accuracy (ParlayHit): nan
0 / 0

Total Accuracy (InRMSE_Range): nan
0 / 0


Unnamed: 0,Date,Team,Player,Pos,Opp,MP_proj,MP,PTS_line,PTS_proj,PTS,ParlayHit,InRMSE_Range


### Today's predictions

In [28]:
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)

df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
df_pred['Date'] = pd.to_datetime(df_pred.Date)
df_pred['Season'] = 2025
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred[col].astype('category')
df_pred = df_pred.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])
df_pred[tgt_stat] = 0

# Predict minutes
df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP', 'TPM']], on=['Date', 'Team', 'Player'], how='left')
df_temp = df.copy()
df_temp["Team"] = team_encoder.inverse_transform(df_temp["Team"])
df_temp['Player'] = player_encoder.inverse_transform(df_temp["Player"])
df_pred = df_pred.merge(df_temp[['Date', 'Team', 'Player', 'TOV', 'PF', '+/-', 'FGA', 'FG', 'TPA', 'FT', 'FTA',
                             'AST', 'REB', 'PR', 'PA', 'RA', 'STL', 'BLK', 'STL_BLK']], on=['Date', 'Team', 'Player'], how='left')

df_pred = df_pred.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_pred_mins = setup_df_mins(con, df_pred)

df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
df_pred_mins["Opp"] = team_encoder.transform(df_pred_mins["Opp"])
df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
df_pred_mins["Pos"] = position_encoder.transform(df_pred_mins["Pos"])
DM_mins = xgb.DMatrix(df_pred_mins)
df_pred['MP'] = mins_model.predict(DM_mins)
df_pred['N_TPM'] = df_pred.FG - df_pred.TPM
df_pred['PTS'] = (df_pred.FT * 1) + (df_pred.N_TPM * 2) + (df_pred.TPM * 3)
df_pred = setup_df_main(df_pred)
feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
df_pred = df_pred[df_pred.Date == now][feature_cols]

# Predict stat
df_pred["Team"] = team_encoder.transform(df_pred["Team"])
df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
df_pred["Player"] = player_encoder.transform(df_pred["Player"])
df_pred["Pos"] = position_encoder.transform(df_pred["Pos"])
DM_stats = xgb.DMatrix(df_pred)
df_pred[f"{tgt_stat}_proj"] = stat_model.predict(DM_stats)

df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
df_pred['Opp'] = team_encoder.inverse_transform(df_pred["Opp"])
df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])
df_pred['Pos'] = position_encoder.inverse_transform(df_pred["Pos"])

df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

tds_picks = df_pred[~(df_pred[f'{tgt_stat}_line'].isnull())]\
            [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_last_5_avg', f'{tgt_stat}_line', f'{tgt_stat}_proj']]
tds_picks['Diff'] = abs((df_pred[f'{tgt_stat}_line'] - df_pred[f'{tgt_stat}_proj']))
tds_picks['Diff2'] = abs((df_pred['MP'] - df_pred['MP_last_5_avg']))
tds_picks = tds_picks[(tds_picks.Diff >= mae) & (tds_picks.Diff2 <= 8)].sort_values('Diff', ascending=False).drop(['Diff', 'Diff2'], axis=1)
display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/2025/gmday_preds_{tgt_stat}.csv")

Unnamed: 0,Team,Player,Pos,Opp,MP,MP_last_5_avg,PTS_line,PTS_proj
1,LAL,Luka Doncic,PG,SAC,35.549591,35.775349,35.5,27.726818
12,SAC,Keon Ellis,SG,LAL,9.802944,15.128835,9.5,2.905766
29,SAC,Precious Achiuwa,C,LAL,10.736481,18.521255,8.5,4.2775


../tables/2025/gmday_preds_PTS.csv saved!
