# To do:

 - Figure out how to signal injuries
 - Add shooting percentages features for PTS stats
 - Add team shooting percentages for AST stats
 - Add def team shooting percentages for REB stats
 - Add TOV stats for STL
 - Add field goal attempts and 3 point attempts features FOR pts stats

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import math         # haversine_km()

import xgboost as xgb
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)

Today's date: 2025-12-18


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model):
    importance = model.get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
def create_baseline_model(df, pred_col, train_df, val_df, test_df):

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP'
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            'starter', 'bench', 'reserve'
        ]
    else:
        print('Stats Model')
        feature_cols = [
            tgt_stat,
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            'starter', 'bench', 'reserve',
            f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}',
            f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
            'DaysLstGm'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    feature_cols = [col for col in df.columns if col not in ['Date', pred_col]]

    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst, (X_train, y_train, X_val, y_val, X_test, y_test)

In [5]:
def hyperparam_tuning(splits, n_iter=20, early_stopping_rounds=50):
    """
    Hyperparameter tuning using native XGBoost API and DMatrix,
    with early stopping support (compatible with XGBoost 3.1.2)
    """

    X_train, y_train, X_val, y_val, X_test, y_test = splits

    # Convert datasets to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.01, 0.05),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(1, 8),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 2),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 2)
    }

    # Sample n_iter random parameter combinations
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k, v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_mae = float('inf')
    best_params = None
    best_bst = None

    # Manual hyperparameter search
    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')
        params.update({
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "device": "cuda",
            "seed": 42
        })
        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        # Predict on validation set to compute MAE
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        mae = mean_absolute_error(y_val, val_preds)
        print(f"Validation MAE: {mae:.4f}")
        if mae < best_mae:
            best_mae = mae
            best_params = params.copy()
            best_bst = bst

    print("\nBest validation MAE:", best_mae)
    print("Best parameters:", best_params)

    # Predict on test set using best model
    preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    print("\nTest Metrics:")
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))

    return best_bst, preds

### Create Base df

In [28]:
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
for i in [2022, 2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_schd = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_schd['Season'] = i
    df2 = pd.concat([df2, df_schd])
    
    df_gms = pd.read_csv(f"../tables/{i}/season_gamelogs.csv")
    df_gms['Season'] = i
    df3 = pd.concat([df3, df_gms])

df['Date'] = pd.to_datetime(df.Date)
df2['Date'] = pd.to_datetime(df2.Date)
df3['Date'] = pd.to_datetime(df3.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)].drop(['WrngTm', 'WrngOpp', 'Tms'], axis=1)

df3 = df3[['game_id', 'Date', 'Team', 'Player', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TOV', 'PF', '+/-']]\
        .rename(columns={"3PA": "TPA", "3P%": "TP%"})
df3 = df3[~df3[['Date', 'Team', 'Player']].duplicated(keep='last')]
df = df.merge(df3, on=['game_id', 'Date', 'Team', 'Player'])

df_mtch = df2[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Season', 'Date', 'Team', 'Team_type', 'AwayPTS', 'HomePTS', 'cup_gm', 'pstszn_gm']]
df_mtch = df_mtch.sort_values(["Team", "Date"])
df_mtch['team_game_num'] = df_mtch.groupby(["Team", "Season"]).cumcount() + 1
df_mtch['Spread'] = np.where(df_mtch.Team_type == 'Home', df_mtch.HomePTS - df_mtch.AwayPTS, df_mtch.AwayPTS - df_mtch.HomePTS)
df_mtch['Total'] = df_mtch.AwayPTS + df_mtch.HomePTS
df_mtch['is_Win'] = np.where(df_mtch.Spread > 0, 1, 0)
df_mtch['Szn_Wins'] = df_mtch.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()
position_encoder = LabelEncoder()

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df["Pos"] = position_encoder.fit_transform(df["Pos"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
print('base df created', datetime.now())

base df created 2025-12-18 21:13:20.510792


# Minutes Projection Model

In [29]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 'TOV', 'PF', '+/-',
             'Spread', 'Total', 'team_game_num', 'Szn_Wins', 'cup_gm', 'pstszn_gm']]
    
    for col in ['MP', 'TOV', 'PF', '+/-']:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )

        df[f'{col}_last_5_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=5, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

        df[f'{col}_last_10_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=10, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
        
    
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
      
    df['missed_games'] = (
        df.groupby(['Player', 'Team', 'Season'])['team_game_num']      
          .diff()
          .sub(1)
          .fillna(0)
          .astype(int)
    )

#     df["career_min_mins"] = (
#         df.assign(MP_nonzero=df["MP"].replace(0, np.nan))
#           .groupby("Player")["MP_nonzero"]
#           .cummin()
#           .shift(1)
#     )
#     df["career_max_mins"] = df.groupby("Player")["MP"].cummax().shift(1)
    
    df['blowout'] = np.where(abs(df.Spread >= 15), 1, 0)
    
#     df['Szn_Wins'] = df.groupby(['Player', 'Season', 'Team'])['Szn_Wins'].shift(1).fillna(0)
#     df['Win_Pct'] = df.Szn_Wins / df.team_game_num
    
    # Have to derive OppSzn_Wins and then add it to the df
#     df['OppSzn_Wins'] = df.groupby(['Player', 'Season', 'Opp'])['Szn_Wins'].shift(1).fillna(0)
    
    df = df.drop(['reserve_td', 'bench_td', 'starter_td', 'Szn_Wins', 'TOV', 'PF', '+/-'], axis=1)    
    
    return df

In [30]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

n = len(df_mins)
train_end = int(0.8 * n)
val_end   = int(0.9 * n)
mins_train_df = df_mins.iloc[:train_end]
mins_val_df   = df_mins.iloc[train_end:val_end]
mins_test_df  = df_mins.iloc[val_end:]

mins_model, mins_splits = create_baseline_model(df_mins, "MP", mins_train_df, mins_val_df, mins_test_df)
mins_model, mins_preds = hyperparam_tuning(mins_splits, n_iter=1)
# feature_importance(mins_model)

Unnamed: 0,Season,Date,Team,Team_type,Opp,Player,Pos,B2B,MP,Spread,Total,team_game_num,cup_gm,pstszn_gm,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,TOV_lst_gm,TOV_last_5_avg,TOV_last_10_avg,PF_lst_gm,PF_last_5_avg,PF_last_10_avg,+/-_lst_gm,+/-_last_5_avg,+/-_last_10_avg,games_last_7_days,prev_team_mins_pct,reserve,bench,starter,missed_games,blowout
0,2022,2022-10-21,0,1,21,2,3,0,0.00,10.0,206.0,2,0,0,,25.226,18.698,,0.8,0.7,,1.8,1.6,,-1.0,-1.1,3,,0,1,0,0,0
1,2022,2022-10-21,0,1,21,5,2,0,14.37,10.0,206.0,2,0,0,,24.286,26.067,,1.0,1.4,,2.2,1.9,,17.0,12.5,3,,1,0,0,0,0
2,2022,2022-10-21,0,1,21,120,0,0,31.62,10.0,206.0,2,0,0,,0.934,3.465,,0.0,0.0,,0.0,0.0,,1.8,1.2,3,,0,0,1,0,0
3,2022,2022-10-21,0,1,21,171,3,0,32.53,10.0,206.0,2,0,0,,32.588,32.252,,3.0,3.4,,2.0,2.0,,1.0,2.9,3,,0,0,1,0,0
4,2022,2022-10-21,0,1,21,178,4,0,39.62,10.0,206.0,2,0,0,,29.280,28.101,,1.4,1.0,,2.8,2.0,,-4.0,0.9,2,,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94923,2025,2025-12-17,14,0,17,361,4,0,31.35,6.0,226.0,27,0,0,29.67,28.694,29.841,1.0,1.0,0.8,2.0,2.0,2.1,-6.0,0.6,6.0,2,0.123625,0,0,1,0,0
94924,2025,2025-12-17,14,0,17,381,0,0,29.92,6.0,226.0,27,0,0,27.48,21.520,19.429,0.0,0.8,0.9,1.0,2.8,2.2,29.0,9.6,0.2,2,0.114500,0,1,0,0,0
94925,2025,2025-12-17,14,0,17,457,4,0,23.22,6.0,226.0,27,0,0,23.37,20.348,20.683,0.0,1.4,1.6,1.0,0.4,0.5,30.0,7.8,2.1,2,0.097375,0,1,0,0,0
94926,2025,2025-12-17,14,0,17,680,1,0,34.32,6.0,226.0,27,0,0,30.80,28.738,27.989,1.0,0.8,0.7,1.0,2.0,1.4,6.0,5.4,0.3,2,0.128333,0,0,1,0,0


Minutes Model
Train: 75942 / Validation: 9493 / Test: 9493
RMSE: 5.974635818133943
MAE: 4.337981226472192
R²: 0.7694824183580958

Trial 1/1: {'n_estimators': 303, 'learning_rate': np.float64(0.05069257860381463), 'max_depth': 4, 'min_child_weight': 6, 'subsample': np.float64(0.9988788050446745), 'colsample_bytree': np.float64(0.8481765469183018), 'gamma': np.float64(0.7320437677241025), 'reg_lambda': np.float64(4.999187380460121), 'reg_alpha': np.float64(1.2705156147504577)}
Validation MAE: 4.9864

Best validation MAE: 4.986422777070419
Best parameters: {'learning_rate': np.float64(0.05069257860381463), 'max_depth': 4, 'min_child_weight': 6, 'subsample': np.float64(0.9988788050446745), 'colsample_bytree': np.float64(0.8481765469183018), 'gamma': np.float64(0.7320437677241025), 'reg_lambda': np.float64(4.999187380460121), 'reg_alpha': np.float64(1.2705156147504577), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 5.9941612439500

In [31]:
rmse = np.sqrt(mean_squared_error(mins_splits[5], mins_preds)) # splits[5] = y_test
mins_test_df['MP_pred'] = mins_preds
df_test = mins_test_df.copy()

df_test['Team'] = team_encoder.inverse_transform(df_test['Team'])
df_test['Opp'] = team_encoder.inverse_transform(df_test['Opp'])
df_test['Player'] = player_encoder.inverse_transform(df_test['Player'])
df_test['Pos'] = position_encoder.inverse_transform(df_test['Pos'])

df_test['Diff'] = abs(df_test['MP_pred'] - df_test['MP'])
df_test['InRMSE_Range'] = np.where(df_test['Diff'] <= rmse, 1, 0)

print("Total Accuracy (InRMSE_Range):", ((df_test.InRMSE_Range == 1).sum() / df_test.shape[0]))
print((df_test.InRMSE_Range == 1).sum(), "/", df_test.shape[0])

df_ystrday = df_test[(df_test.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
            [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_pred', 'InRMSE_Range', 'Diff', 'Spread']]
print("\nYesterday's Results:")
print("Total Accuracy (InRMSE_Range):", ((df_ystrday.InRMSE_Range == 1).sum() / df_ystrday.shape[0]))
if df_ystrday.shape[0] >= 50:
    for tm in df_ystrday.Team.unique():
        display(df_ystrday[df_ystrday.Team == tm])
else:
    display(df_ystrday)

Total Accuracy (InRMSE_Range): 0.7492889497524492
7113 / 9493

Yesterday's Results:
Total Accuracy (InRMSE_Range): 0.8181818181818182


Unnamed: 0,Team,Player,Pos,Opp,MP,MP_pred,InRMSE_Range,Diff,Spread
94906,CHI,Coby White,SG,CLE,30.37,28.862825,1,1.507175,16.0
94907,CHI,Isaac Okoro,SG,CLE,30.9,21.867006,0,9.032994,16.0
94908,CHI,Josh Giddey,PG,CLE,33.5,31.645359,1,1.854641,16.0
94909,CHI,Kevin Huerter,SF,CLE,25.92,21.976873,1,3.943127,16.0
94910,CHI,Matas Buzelis,PF,CLE,22.25,27.596657,1,5.346657,16.0
94911,CHI,Nikola Vucevic,C,CLE,28.95,27.948284,1,1.001716,16.0
94912,CHI,Patrick Williams,PF,CLE,10.53,21.503721,0,10.973721,16.0
94913,CHI,Tre Jones,PG,CLE,24.28,25.284369,1,1.004369,16.0
94914,CLE,Darius Garland,PG,CHI,30.18,31.039795,1,0.859795,-16.0
94915,CLE,De'Andre Hunter,SF,CHI,27.77,26.876097,1,0.893903,-16.0


# Main Model

In [32]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [33]:
def setup_df_main(df):
    
    # Minutes based Features
    df['MP_lst_gm'] = (
        df
        .groupby(['Player', 'Season'])['MP']
        .shift(1)
    )

    df['MP_last_5_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=5, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    df['MP_last_10_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=10, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)
    
    # Efficiency metrics
    df['three_rate_raw'] =  np.where(df.FGA > 0, df['TPA'] / df['FGA'], 0)
    df['ft_rate_raw']    =  np.where(df.FGA > 0, df['FTA'] / df['FGA'], 0)
    df['eFG_raw'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS_raw'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))    
    df['usage_proxy_raw'] =  np.where(df.MP > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
    eff_cols = []
    for w in [3, 5, 10]:
        for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
            col = f"{metric}_L{w}"
            df[col] = (
                df.groupby(['Player','Season'])[f'{metric}_raw']
                  .rolling(w, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0,1], drop=True)
            )
            eff_cols.append(col)
    for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
        col = f'{metric}_weighted'
        df[col] = (
            0.6 * df[f'{metric}_L3'] +
            0.3 * df[f'{metric}_L5'] +
            0.1 * df[f'{metric}_L10']
        )
        eff_cols.append(col)
    
    df['FGA_L5_avg'] = df.groupby(['Player', 'Season'])['FGA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['TPA_L5_avg'] = df.groupby(['Player', 'Season'])['TPA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['FTA_L5_avg'] = df.groupby(['Player', 'Season'])['FTA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    
    stat_cols = []
    for col in [f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}']:
        stat_cols.append(col)

    final_cols = ['Date', 'Team', 'Team_type', 'B2B', 'cup_gm', 'pstszn_gm', 'Player', 'Pos', 'Opp', 
                  'MP', 'MP_lst_gm', 'MP_last_5_avg', 'MP_last_10_avg', 'Spread', 'Total',
                  'DaysLstGm', 'travel_km', 'travel_hours', 'PrevLocation', 'is_long_trip', 'same_arena', 
#                   'three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy',
                  'FGA_L5_avg', 'TPA_L5_avg', 'FTA_L5_avg',
                  tgt_stat] + stat_cols + eff_cols
    df = df[final_cols]
    
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0)
        
    # PRA features
    if tgt_stat == 'PTS':
        pass
    elif tgt_stat == 'PRA':
        df['PRA_per_min'] = np.where(df.MP > 0, df.PRA / df['MP'], 0)
        df['PRA_last_5_per_min_avg'] = df.groupby('Player')['PRA_per_min'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0,1], drop=True)
        df = df.drop(['PRA_per_min'], axis=1)
        
    return df

In [34]:
tgt_stat = "PTS"
df_main = df.copy()
df_main = setup_df_main(df_main)
display(df_main)

n = len(df)
train_end = int(0.65 * n)
val_end   = int(0.85 * n)
main_train_df = df_main.iloc[:train_end]
main_val_df   = df_main.iloc[train_end:val_end]
main_test_df  = df_main.iloc[val_end:]

stat_model, main_splits = create_baseline_model(df_main, tgt_stat, main_train_df, main_val_df, main_test_df)
stat_model, stat_preds = hyperparam_tuning(main_splits, n_iter=1)
# feature_importance(stat_model)

Unnamed: 0,Date,Team,Team_type,B2B,cup_gm,pstszn_gm,Player,Pos,Opp,MP,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,Spread,Total,DaysLstGm,travel_km,travel_hours,PrevLocation,is_long_trip,same_arena,FGA_L5_avg,TPA_L5_avg,FTA_L5_avg,PTS,Off_PTS,Off_L5_PTS,Def_PTS,Def_L5_PTS,three_rate_L3,ft_rate_L3,eFG_L3,TS_L3,usage_proxy_L3,three_rate_L5,ft_rate_L5,eFG_L5,TS_L5,usage_proxy_L5,three_rate_L10,ft_rate_L10,eFG_L10,TS_L10,usage_proxy_L10,three_rate_weighted,ft_rate_weighted,eFG_weighted,TS_weighted,usage_proxy_weighted
0,2022-10-21,0,1,0,0,0,2,3,21,0.00,0.00,25.226,18.698,10.0,206.0,0,0.0,0.0,0.0,0,0,8.4,4.0,5.2,0,0.000000,0.0,8.000000,8.0,0.508995,0.681481,0.505556,0.606562,0.419508,0.486349,0.646984,0.496984,0.583267,0.428189,0.551508,0.369325,0.479742,0.513125,0.425240,0.506452,0.639917,0.500403,0.590230,0.422685
1,2022-10-21,0,1,0,0,0,5,2,21,14.37,0.00,24.286,26.067,10.0,206.0,0,0.0,0.0,0.0,0,0,11.4,4.4,4.2,4,5.000000,5.0,18.000000,18.0,0.559259,0.829630,0.416667,0.519801,0.616138,0.508283,0.691717,0.475758,0.563433,0.552491,0.479785,0.587601,0.523390,0.606242,0.509510,0.536019,0.764053,0.445066,0.541535,0.586381
2,2022-10-21,0,1,0,0,0,120,0,21,31.62,0.00,0.934,3.465,10.0,206.0,0,0.0,0.0,0.0,0,0,0.2,0.2,0.0,8,2.000000,2.0,14.000000,14.0,0.333333,0.000000,1.500000,1.500000,0.071378,0.200000,0.000000,1.500000,1.500000,0.042827,0.300000,0.000000,0.600000,0.600000,0.088741,0.290000,0.000000,1.410000,1.410000,0.064549
3,2022-10-21,0,1,0,0,0,171,3,21,32.53,0.00,32.588,32.252,10.0,206.0,0,0.0,0.0,0.0,0,0,15.0,6.6,4.0,12,22.000000,22.0,8.000000,8.0,0.466264,0.242735,0.561036,0.585356,0.497347,0.444352,0.281526,0.532794,0.576458,0.514305,0.389527,0.322184,0.539072,0.589485,0.616220,0.452017,0.262317,0.550367,0.583100,0.514322
4,2022-10-21,0,1,0,0,0,178,4,21,39.62,0.00,29.280,28.101,10.0,206.0,0,0.0,0.0,0.0,0,0,9.4,0.0,1.0,20,20.000000,20.0,19.000000,19.0,0.000000,0.134680,0.821549,0.799477,0.334001,0.000000,0.098990,0.705051,0.694954,0.338075,0.000000,0.175991,0.721115,0.718415,0.332501,0.000000,0.128104,0.776556,0.760014,0.335073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94923,2025-12-17,14,0,0,0,0,361,4,17,31.35,29.67,28.694,29.841,6.0,226.0,2,0.0,0.0,12.0,0,0,10.2,4.2,5.2,17,12.576923,17.6,15.761905,13.2,0.435897,0.628205,0.571154,0.658394,0.442408,0.419114,0.554701,0.652793,0.709326,0.434508,0.491355,0.560587,0.683579,0.745407,0.395807,0.436408,0.599392,0.606888,0.682375,0.435378
94924,2025-12-17,14,0,0,0,0,381,0,17,29.92,27.48,21.520,19.429,6.0,226.0,2,0.0,0.0,12.0,0,0,6.4,1.8,2.4,20,10.692308,9.6,17.333333,17.8,0.400000,0.609524,0.580952,0.636532,0.337575,0.280000,0.365714,0.613571,0.646919,0.367822,0.281212,0.275887,0.564462,0.588620,0.544887,0.352121,0.503017,0.589089,0.634857,0.367380
94925,2025-12-17,14,0,0,0,0,457,4,17,23.22,23.37,20.348,20.683,6.0,226.0,2,0.0,0.0,12.0,0,0,5.8,2.8,1.0,12,8.083333,6.8,15.761905,13.2,0.441667,0.125000,0.520833,0.536123,0.352644,0.545000,0.475000,0.652500,0.627631,0.307409,0.609167,0.354167,0.632083,0.628206,0.304825,0.489417,0.252917,0.571458,0.572784,0.334292
94926,2025-12-17,14,0,0,0,0,680,1,17,34.32,30.80,28.738,27.989,6.0,226.0,2,0.0,0.0,12.0,0,0,9.0,4.4,3.4,8,13.076923,11.6,17.958333,12.0,0.626263,0.434343,0.546465,0.564226,0.386664,0.482424,0.385051,0.434545,0.477413,0.360514,0.383779,0.344390,0.485441,0.533506,0.389254,0.558863,0.410560,0.506787,0.535110,0.379078


Stats Model
Train: 61703 / Validation: 18985 / Test: 14240
RMSE: 4.606658238367816
MAE: 3.2075507640838623
R²: 0.7477636933326721

Trial 1/1: {'n_estimators': 907, 'learning_rate': np.float64(0.014749498332016211), 'max_depth': 4, 'min_child_weight': 7, 'subsample': np.float64(0.9071104603222299), 'colsample_bytree': np.float64(0.9868507194609104), 'gamma': np.float64(0.2700739863271604), 'reg_lambda': np.float64(0.9817742137557806), 'reg_alpha': np.float64(1.3768846615996062)}
Validation MAE: 3.0562

Best validation MAE: 3.0561838150024414
Best parameters: {'learning_rate': np.float64(0.014749498332016211), 'max_depth': 4, 'min_child_weight': 7, 'subsample': np.float64(0.9071104603222299), 'colsample_bytree': np.float64(0.9868507194609104), 'gamma': np.float64(0.2700739863271604), 'reg_lambda': np.float64(0.9817742137557806), 'reg_alpha': np.float64(1.3768846615996062), 'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 4.5927176

In [35]:
rmse = np.sqrt(mean_squared_error(main_splits[5], stat_preds)) # splits[5] = y_test
mae = mean_absolute_error(main_splits[5], stat_preds)
pred_col = f'{tgt_stat}_Pred'
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[['Date', 'Team', 'Player', f'{tgt_stat}_line']]
main_test_df[pred_col] = stat_preds
main_test_df['Team'] = team_encoder.inverse_transform(main_test_df["Team"])
main_test_df['Player'] = player_encoder.inverse_transform(main_test_df["Player"])

df_test = main_test_df.merge(df_lines, on=['Date', 'Team', 'Player'])
df_test = df_test[[c for c in df_test.columns if c != pred_col] + [pred_col]]

df_test['Diff'] = df_test[f'{tgt_stat}_Pred'] - df_test[f'{tgt_stat}_line']
df_test['Act_Res'] = np.where(df_test[tgt_stat] > df_test[f'{tgt_stat}_line'], 'O', 'U')
df_test['Pred_Res'] = np.where(df_test[pred_col] > df_test[f'{tgt_stat}_line'], 'O', 'U')
df_test['ParlayHit'] = np.where(df_test['Act_Res'] == df_test['Pred_Res'], 1, 0)

df_test['Diff2'] = abs(df_test[f'{tgt_stat}_Pred'] - df_test[tgt_stat])
df_test['InRMSE_Range'] = np.where(df_test['Diff2'] <= rmse, 1, 0)

print("Total Accuracy (ParlayHit):", ((df_test.ParlayHit == 1).sum() / df_test.shape[0]))
print((df_test.ParlayHit == 1).sum(), "/", df_test.shape[0])

print("\nTotal Accuracy (InRMSE_Range):", ((df_test.InRMSE_Range == 1).sum() / df_test.shape[0]))
print((df_test.InRMSE_Range == 1).sum(), "/", df_test.shape[0])

df_ystrday = df_test[(df_test.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
            [['Team', 'Player', tgt_stat, f'{tgt_stat}_line', f'{tgt_stat}_Pred', 'ParlayHit', 'Diff', 'InRMSE_Range', 'Diff2']]\
            .sort_values(f'{tgt_stat}_line', ascending=False)

print("\nYesterday's Results:")
print("Total Accuracy (ParlayHit):", ((df_ystrday.ParlayHit == 1).sum() / df_ystrday.shape[0]))
print("Total Accuracy (InRMSE_Range):", ((df_ystrday.InRMSE_Range == 1).sum() / df_ystrday.shape[0]))
if df_ystrday.shape[0] >= 50:
    for tm in df_ystrday.Team.unique():
        display(df_ystrday[df_ystrday.Team == tm])
else:
    display(df_ystrday)

Total Accuracy (ParlayHit): 0.676056338028169
1392 / 2059

Total Accuracy (InRMSE_Range): 0.6376881981544439
1313 / 2059

Yesterday's Results:
Total Accuracy (ParlayHit): 0.8181818181818182
Total Accuracy (InRMSE_Range): 0.7727272727272727


Unnamed: 0,Team,Player,PTS,PTS_line,PTS_Pred,ParlayHit,Diff,InRMSE_Range,Diff2
2048,CLE,Donovan Mitchell,32,31.5,25.210854,0,-6.289146,0,6.789146
2037,CHI,Coby White,25,21.5,24.607243,1,3.107243,1,0.392757
2045,CLE,Darius Garland,15,20.5,17.24332,1,-3.25668,1,2.24332
2053,MEM,Jaren Jackson Jr.,28,19.5,19.976357,1,0.476357,0,8.023643
2039,CHI,Josh Giddey,23,18.5,22.212685,1,3.712685,1,0.787315
2057,MEM,Santi Aldama,8,15.5,16.969547,0,1.469547,0,8.969547
2052,MEM,Cedric Coward,13,15.5,13.272451,1,-2.227549,1,0.272451
2046,CLE,De'Andre Hunter,12,14.5,13.143043,1,-1.356957,1,1.143043
2049,CLE,Jarrett Allen,14,14.5,14.503229,0,0.003229,1,0.503229
2042,CHI,Nikola Vucevic,20,14.5,17.480221,1,2.980221,1,2.519779


### Today's predictions

In [48]:
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)

df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
df_pred['Date'] = pd.to_datetime(df_pred.Date)
df_pred['Season'] = 2025
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred[col].astype('category')
df_pred = df_pred.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])
df_pred[tgt_stat] = 0

# Predict minutes
df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP', 'TPM']], on=['Date', 'Team', 'Player'], how='left')
df_pred = df_pred.merge(df3[['Date', 'Team', 'Player', 'TOV', 'PF', '+/-', 'FGA', 'FG', 'TPA', 'FT', 'FTA']], on=['Date', 'Team', 'Player'], how='left')

df_pred = df_pred.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_pred_mins = setup_df_mins(con, df_pred)

df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
df_pred_mins["Opp"] = team_encoder.transform(df_pred_mins["Opp"])
df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
df_pred_mins["Pos"] = position_encoder.transform(df_pred_mins["Pos"])
DM_mins = xgb.DMatrix(df_pred_mins)
df_pred['MP'] = mins_model.predict(DM_mins)

df_pred = setup_df_main(df_pred)
feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
df_pred = df_pred[df_pred.Date == now][feature_cols]

# Predict stat
df_pred["Team"] = team_encoder.transform(df_pred["Team"])
df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
df_pred = df_pred[~(df_pred.PrevLocation.isnull())] # Filters out players who are debuting on the year
df_pred["PrevLocation"] = team_encoder.transform(df_pred["PrevLocation"])
df_pred["Player"] = player_encoder.transform(df_pred["Player"])
df_pred["Pos"] = position_encoder.transform(df_pred["Pos"])
df_pred['Team_type'] = team_type_encoder.transform(df_pred['Team_type'])
DM_stats = xgb.DMatrix(df_pred)
df_pred[f"{tgt_stat}_proj"] = stat_model.predict(DM_stats)

df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
df_pred['Opp'] = team_encoder.inverse_transform(df_pred["Opp"])
df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])
df_pred['Pos'] = position_encoder.inverse_transform(df_pred["Pos"])

df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

tds_picks = df_pred[~(df_pred[f'{tgt_stat}_line'].isnull())]\
            [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_last_5_avg', f'{tgt_stat}_line', f'{tgt_stat}_proj']]
tds_picks['Diff'] = abs((df_pred[f'{tgt_stat}_line'] - df_pred[f'{tgt_stat}_proj']))
tds_picks['Diff2'] = abs((df_pred['MP'] - df_pred['MP_last_5_avg']))
display(tds_picks[(tds_picks.Diff >= mae) & (tds_picks.Diff2 <= 3)].sort_values('Diff', ascending=False).drop(['Diff', 'Diff2'], axis=1))

Unnamed: 0,Team,Player,Pos,Opp,MP,MP_last_5_avg,PTS_line,PTS_proj
76,NOP,Saddiq Bey,SF,HOU,32.751221,31.995854,10.5,18.833736
3,OKC,Shai Gilgeous-Alexander,PG,LAC,31.008839,31.633666,30.5,22.176846
47,ATL,Dyson Daniels,SG,CHO,36.886982,34.362868,12.5,18.015385
75,NOP,Trey Murphy III,SF,HOU,33.383663,32.281582,17.5,23.000835
92,CHO,Miles Bridges,PF,ATL,35.561947,34.00483,20.5,25.807915
13,BRK,Egor Demin,PG,MIA,9.215707,10.992871,8.5,3.36436
25,NOP,Jeremiah Fears,PG,HOU,29.892656,30.33221,12.5,17.210964
99,NOP,Zion Williamson,PF,HOU,30.573845,31.383968,18.5,23.141972
103,MIL,Bobby Portis,PF,TOR,19.606655,21.822358,14.5,9.900615
79,MIA,Simone Fontecchio,SF,BRK,14.710382,16.275378,10.5,5.932743
