# To do:

 - Both
     - Fix Injuries data
         - Find a better source for roster data (I found, work on the plyr_pos_xref notebook)
     - Signal Opp Injuries
 - Mins
 - PTS
     - Find more effective way to signal Defensive stats
 - Res_PTS
     - Try some sort of defensive rk L5 - d_rk today feature to measure difficulty increase/decrease
     - Create some short term Lx Average features for the target columns (Res_PTS/Bet)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import os
import json

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from scipy.stats import randint, uniform

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import PartialDependenceDisplay

import joblib
import warnings
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")

cwd = os.path.abspath(os.getcwd()).replace("\\", "/")
if cwd.startswith("C:/Users/Rodolfo/"):
    RUN_LOCATION = "local"
    MDL_PATH = "../ML_models/dev"
else:
    RUN_LOCATION = "cloud"
    MDL_PATH = "../ML_models"
time_offset = {"local": 3, "cloud": -5}
now = str((datetime.now() + timedelta(hours=time_offset[RUN_LOCATION]) + timedelta(hours=-3)).date())
print(f"Today's date:", now)

tgt_stat = "PTS"
print('Target Stat:', tgt_stat)

Today's date: 2026-01-30
Target Stat: PTS


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model, all_features):
    booster = model.get_booster()
    score = booster.get_score(importance_type="gain")

    df_importance = pd.DataFrame({
            "feature": all_features,
            "importance": [score.get(f, 0.0) for f in all_features]
        }).sort_values("importance", ascending=False).reset_index(drop=True)
    
    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    df_importance['importance'] = df_importance['importance'].map('{:.4f}'.format)
    if df_importance.shape[0] >= 50:
        with pd.option_context('display.max_rows', None):
            display(df_importance)
    else:
        display(df_importance)
    
    xgb.plot_importance(model)
    plt.show()

In [4]:
def compute_sample_weights(df, decay=0.99):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    max_date = df['Date'].max()
    df["days_old"] = (max_date - df['Date']).dt.days
    weights = decay ** df["days_old"]
    
    return weights.values

In [5]:
def quantile_loss(y_true, y_pred, q):
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

In [6]:
def create_baseline_model(df, pred_col, DFS):
    
    train_df, val_df, test_df = DFS

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg', 'game_spread_type'
        ]
    else:
        print(f'{pred_col} Stats Model')
        feature_cols = [
            'MP_L5_avg',
            'MP_L10_avg',
            f'{pred_col}_last_3_avg', f'{pred_col}_last_5_avg', f'{pred_col}_last_10_avg',
            f'Def_{pred_col}', f'Def_L5_{pred_col}'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst

In [7]:
def hyperparam_tuning(DFS, pred_col, is_classification=False, quantile=False, n_iter=20, early_stopping_rounds=50, decay=1, q_val=0.5):
    """
    Hyperparameter tuning for XGBRegressor or XGBClassifier using native XGBoost API
    """
    train_df, val_df, test_df = DFS
    feature_cols = [col for col in train_df.columns if col not in ['Season', 'Date', pred_col]]
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Sample Weights (decay < 1)
    w_train = compute_sample_weights(train_df, decay=decay)
    w_val   = compute_sample_weights(val_df, decay=decay)
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train, enable_categorical=True)
    dval   = xgb.DMatrix(X_val, label=y_val, weight=w_val, enable_categorical=True)
    dtest  = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.005, 0.08),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(3, 10),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.5, 0.5),
        "gamma": uniform(0, 0.8),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 1)
    }

    # Generate n_iter random parameter sets
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k,v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_score = float('inf') if not is_classification else 0
    best_params = None
    best_bst = None

    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')

        # Set objective based on regression or classification
        if is_classification:
            params.update({
                "objective": "binary:logistic",
                "enable_categorical": True,
                "eval_metric": "logloss",
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })
        elif quantile:
            params.update({
                "objective": "reg:quantileerror",
                "quantile_alpha": q_val, 
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        else:
            params.update({
                "objective": "reg:squarederror",
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )

        # Validation scoring
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        if is_classification:
            val_class = (val_preds > 0.5).astype(int)
            score = (val_class == y_val.values).mean()  # accuracy
            print(f"Validation Accuracy: {score:.4f}")
            if score > best_score:
                best_score = score
                best_params = params.copy()
                best_bst = bst
        elif quantile:
            q_loss = quantile_loss(y_val.values, val_preds, q_val)
            print(f"Validation Quantile Loss (q_val={q_val}): {q_loss:.4f}")
            if q_loss < best_score:
                best_score = q_loss
                best_params = params.copy()
                best_bst = bst
        else:
            mae = mean_absolute_error(y_val, val_preds)
            print(f"Validation MAE: {mae:.4f}")
            if mae < best_score:
                best_score = mae
                best_params = params.copy()
                best_bst = bst

    print("\nBest score:", best_score)
    print("Best parameters:", best_params)

    # Test predictions
    test_preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    if is_classification:
        test_class = (test_preds > 0.5).astype(int)
        acc = (test_class == y_test.values).mean()
        print("\nTest Accuracy:", acc)
    else:
        print("\nTest Metrics:")
        if quantile:
            ql = quantile_loss(y_test, test_preds, q_val)
            print("Quantile loss:", ql)
            coverage = np.mean(y_test <= test_preds)
            print(f"Coverage for q_val={q_val}: {coverage:.2f}")
        else:
            print("RMSE:", np.sqrt(mean_squared_error(y_test, test_preds)))
            print("MAE:", mean_absolute_error(y_test, test_preds))
            print("R²:", r2_score(y_test, test_preds))

    return best_params

In [8]:
def refit_model(df, pred_col, params_file, min_train_days=0, rolling_window=None, decay=1):
    df = df.sort_values("Date")
    dates = df["Date"].unique()
    print(f'Rows: {df.shape[0]}, Dates: {len(dates)}, min_train_days: {min_train_days}')

    feature_cols = [c for c in df.columns if c not in ["Season", "Date", pred_col]]

    # Load hyperparameters
    with open(f"{MDL_PATH}/{params_file}.json", "r") as f:
        loaded_params = json.load(f)

    preds, actuals, dates_out, predictions = [], [], [], []
    total_iters = len(dates) - min_train_days

    for idx, i in enumerate(range(min_train_days, len(dates)), start=1):
        test_date = dates[i]
        test_season = df.loc[df["Date"] == test_date, "Season"].iloc[0]

        if rolling_window:
            train_start_idx = max(0, i - rolling_window)
        else:
            train_start_idx = 0

        train_dates = dates[train_start_idx:i]

        train_df = df[df["Date"].isin(train_dates)]
        test_df  = df[df["Date"] == test_date]

        if test_df.empty:
            continue

        X_train, y_train = train_df[feature_cols], train_df[pred_col]
        X_test, y_test   = test_df[feature_cols], test_df[pred_col]

        predictions.append(test_df)
        if pred_col == 'Bet':
            model = XGBClassifier(**loaded_params)
            model.fit(X_train, y_train)
            
            test_df['pred_prob'] = model.predict_proba(X_test)[:,1]
            test_df['pred_class'] = (test_df['pred_prob'] > 0.5).astype(int)
        else:
            model = XGBRegressor(**loaded_params)
            sample_weights = compute_sample_weights(train_df, decay=decay)
            model.fit(X_train, y_train, sample_weight=sample_weights)

            y_pred = model.predict(X_test)
            preds.extend(y_pred)
            actuals.extend(y_test.values)
            dates_out.extend([test_date] * len(y_pred))

        if idx % max(1, total_iters // 20) == 0:
            pct = 100 * idx / total_iters
            print(f"Progress: {pct:6.2f}% ({idx}/{total_iters})")
            
    results = pd.concat(predictions)
    if pred_col == 'Res_PTS':
        results['Actuals'] = actuals
        results['Predictions'] = preds
        mae = mean_absolute_error(actuals, preds)
        print("Walk-forward MAE:", mae)
        results["Correct_Direction"] = (np.sign(results["Predictions"]) == np.sign(results["Actuals"])).astype(int)
        for t in [0, 1, 2, 3]:
            subset = results[results["Predictions"].abs() >= t]
            acc = subset["Correct_Direction"].mean() if len(subset) > 0 else np.nan
            print(f"|Pred| >= {t}: accuracy = {acc:.3f}, n = {len(subset)}")
    elif pred_col == 'Bet':
        cm = confusion_matrix(results['Bet'], results['pred_class'])
        print("Confusion Matrix:\n", cm)
        report = classification_report(results['Bet'], results['pred_class'])
        print("Classification Report:\n", report)
        auc = roc_auc_score(results['Bet'], results['pred_prob'])
        print(f"ROC AUC: {auc:.3f}")

        high_confidence = results.copy()
        high_confidence['pred_prob'] = np.where(high_confidence.pred_prob > 0.5, 1 - high_confidence.pred_prob, high_confidence.pred_prob)
        high_confidence = high_confidence[high_confidence['pred_prob'] <= 0.3]
        if len(high_confidence) > 0:
            hit_rate = (high_confidence['pred_class'] == high_confidence['Bet']).mean()
            print(f"High-confidence hit rate (<= 0.3 & >= 0.7): {hit_rate:.2f}")

    else:
        results['Actuals'] = actuals
        results['Predictions'] = preds
        if loaded_params['objective'] == 'reg:quantileerror':
            ql = quantile_loss(results['Actuals'], results['Predictions'], loaded_params['quantile_alpha'])
            print("Quantile loss:", ql)
            coverage = np.mean(results['Actuals'] <= results['Predictions'])
            print(f"Coverage for q_val={loaded_params['quantile_alpha']}: {coverage:.2f}")
        else:
            mae = mean_absolute_error(actuals, preds)
            rmse = np.sqrt(mean_squared_error(actuals, preds))
            r2 = r2_score(actuals, preds)
            print(f"Walk-forward RMSE: {rmse:.3f}")
            print(f"Walk-forward MAE: {mae:.3f}")
            print(f"Walk-forward R²: {r2:.3f}")

    return model, results

### Create Base df

In [9]:
def load_df(file_name):
    df = pd.DataFrame()
    for i in [2021, 2022, 2023, 2024, 2025]:
        df_temp = pd.read_csv(f"../tables/{i}/{file_name}.csv")
        df_temp['Season'] = i
        df = pd.concat([df, df_temp])
        
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df.Date)
    if file_name == "season_gamelogs":
        df = df[~df[['Date', 'Team', 'Player']].duplicated(keep='last')]
    
    return df

In [10]:
# Load dfs
df = load_df('nba_schedule')
df2 = load_df('season_gamelogs')
# df3 = load_df('REPLACE ME')
df4 = load_df('injuries')
df5 = load_df('plyr_pos_xref')
df6 = load_df('daily_lineups')
gmlog_cols = ['game_id', 'Player', 'MP', 'PF', 'PTS', 'FG', 'FGA', 'FT', 'FTA', '3PM', '3PA', 'ORB', 'TOV']
df7 = load_df('h1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h1", "PTS": "PTS_h1", "FG": "FG_h1", "FGA": "FGA_h1", "FT": "FT_h1", "FTA": "FTA_h1", "3PM": "TPM_h1", "3PA": "TPA_h1", "PF": "PF_h1", "TOV": "TOV_h1", "ORB": "ORB_h1"})
df8 = load_df('h2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h2", "PTS": "PTS_h2", "FG": "FG_h2", "FGA": "FGA_h2", "FT": "FT_h2", "FTA": "FTA_h2", "3PM": "TPM_h2", "3PA": "TPA_h2", "PF": "PF_h2", "TOV": "TOV_h2", "ORB": "ORB_h2"})
df9 = load_df('q1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q1", "PTS": "PTS_q1", "FG": "FG_q1", "FGA": "FGA_q1", "FT": "FT_q1", "FTA": "FTA_q1", "3PM": "TPM_q1", "3PA": "TPA_q1", "PF": "PF_q1", "TOV": "TOV_q1", "ORB": "ORB_q1"})
df10 = load_df('q2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q2", "PTS": "PTS_q2", "FG": "FG_q2", "FGA": "FGA_q2", "FT": "FT_q2", "FTA": "FTA_q2", "3PM": "TPM_q2", "3PA": "TPA_q2", "PF": "PF_q2", "TOV": "TOV_q2", "ORB": "ORB_q2"})
df11 = load_df('q3_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q3", "PTS": "PTS_q3", "FG": "FG_q3", "FGA": "FGA_q3", "FT": "FT_q3", "FTA": "FTA_q3", "3PM": "TPM_q3", "3PA": "TPA_q3", "PF": "PF_q3", "TOV": "TOV_q3", "ORB": "ORB_q3"})
df12 = load_df('q4_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q4", "PTS": "PTS_q4", "FG": "FG_q4", "FGA": "FGA_q4", "FT": "FT_q4", "FTA": "FTA_q4", "3PM": "TPM_q4", "3PA": "TPA_q4", "PF": "PF_q4", "TOV": "TOV_q4", "ORB": "ORB_q4"})

df_mtch = df[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df = pd.concat([df_mtch, df_mtch2])
df = df.sort_values(["Team", "Date"])
df['team_game_num'] = df.groupby(["Team", "Season"]).cumcount() + 1
df['Spread'] = np.where(df.Team_type == 'Home', df.AwayPTS - df.HomePTS, df.HomePTS - df.AwayPTS)
df['Total'] = df.AwayPTS + df.HomePTS
df['is_Win'] = np.where(df.Spread > 0, 1, 0)
df['Szn_Wins'] = df.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.merge(df5, on=['Season', 'Team'])

df2 = df2.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"})
df2['PR'] = df2.PTS + df2.REB 
df2['PA'] = df2.PTS + df2.AST
df2['RA'] = df2.REB + df2.AST
df2['PRA'] = df2.PTS + df2.REB + df2.AST
df2['STL_BLK'] = df2.STL + df2.BLK
df = df.merge(df2.drop(['Pos', 'Opp', 'Team_type'], axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')

df = df.merge(df4[['Date', 'Team', 'Player', 'Status']], on=['Date', 'Team', 'Player'], how='left')
df['Status'] = np.where((df.Active == 1) & (df.Status.isnull()), 'Available', df.Status)
df['Status'] = np.where((df.Active == 0), 'Out', df.Status)
df['Status'] = np.where((df.Status == 'Out') & (df.Active != 0), 'Available', df.Status)

df6['role'] = 1
df = df.merge(df6.drop('Pos', axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')
df['role'] = df.role.fillna(2).astype(int)

# Add gmlog splits
df_gmlog_comb = df7.merge(df8, on=['game_id', 'Player'])
for df_loop in (df9, df10, df11, df12):
    df_gmlog_comb = df_gmlog_comb.merge(df_loop, on=['game_id', 'Player'])
df = df.merge(df_gmlog_comb, on=['game_id', 'Player'], how='left')

df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
df = df.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df['Res_PTS'] = df.PTS - df.PTS_line

df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
df_td = df[df.Date == now]
df = df[(df.Active == 1) & (df.MP > 0)]
df_pred = df.copy()
df_pred = pd.concat([df_pred, df_td])
print('base df created', datetime.now())

base df created 2026-01-30 20:38:03.468369


### Feature Engineering Helper Functions

In [11]:
def create_df_missing(df, pred_col):

    df3 = load_df('season_gamelogs')
    df3 = df3.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"}).drop(['Pos', 'Opp'], axis=1)
    df4 = load_df('injuries')
    
    # Fill missing games from injuries.csv
    team_games = df_pred[['Season', 'Team', 'Date']].drop_duplicates()
    players = df_pred[['Season','Player','Team']].drop_duplicates()
    fabricated = (players.sort_values('Season').groupby('Player', as_index=False).last())
    fabricated['Season'] = fabricated['Season'] + 1
    players = pd.concat([players, fabricated], ignore_index=True).drop_duplicates(['Season','Player','Team'])
    expanded = team_games.merge(players, on=['Season', 'Team'], how='left')

    df5 = load_df('plyr_pos_xref')

    expanded = expanded.merge(df3[['Season', 'Player', 'Date', 'MP']], on=['Season', 'Player', 'Date'], how='left').drop_duplicates(['Season', 'Date', 'Player', 'Team'])
    expanded = expanded[(expanded.MP.isnull()) & (expanded.Date != now)].drop('MP', axis=1)
    expanded = pd.concat([expanded, df4[df4.Status == 'Out'][['Season', 'Team', 'Date', 'Player']]])
    df4 = df4.merge(expanded, on=['Season', 'Date', 'Team', 'Player'], how='right')

    # Grab outs from players season gamelogs
    df4 = df4.merge(df3, on=['Season', 'Date', 'Team', 'Player'], how='outer')
    df4['Status'] = np.where(((df4.Active == 1) | (df4.MP > 0)), 'Available', df4.Status)
    df4['Status'] = np.where(((df4.Active == 0) | (df4.MP == 0) | (df4.MP.isnull())), 'Out', df4.Status)
    df4['Status'] = np.where((df4.Status == 'Out') & (df4.MP > 0), 'Available', df4.Status)
    df4['Status'] = np.where((df4.Status != 'Out') & (df4.MP == 0), 'Out', df4.Status)
    df4 = df4[df4.Status == 'Out'][['Season', 'Date', 'Team', 'Player']].drop_duplicates()
    
    df_missing = df[['Season', 'Date', 'Team', 'Player', 'role', pred_col]].copy()
    df_missing[f'{pred_col}_L10'] = (
        df_missing.sort_values(['Player', 'Date']).groupby(['Player','Season'])[pred_col].shift(1)
                  .transform(lambda x: x.rolling(10, min_periods=10).mean())
    )
    df_missing['role_L10_mode'] = (
        df_missing.sort_values(['Player', 'Date'])
            .groupby(['Player', 'Season'])['role'].shift(1)
            .transform(lambda x: x.rolling(10, min_periods=10)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
    )
    df_missing = pd.merge_asof(df4, df_missing[["Season", "Player", "Date", "role", "role_L10_mode", f"{pred_col}_L10"]], 
                      on="Date", by=["Player", "Season"], direction="backward", allow_exact_matches=True).dropna()   
    df_missing = df_missing.merge(df5, on=['Season', 'Team', 'Player'])
    
    # Filter out old injuries
    df_missing = df_missing.sort_values(["Season", "Team", "Player", "Date"])
    df_missing["team_game_num"] = (df_missing.groupby(["Season", "Team"])["Date"].rank(method="dense").astype(int))
    df_missing["game_break"] = (df_missing.groupby(["Season", "Team", "Player"])["team_game_num"].diff().ne(1))
    df_missing["streak_id"] = (df_missing.groupby(["Season", "Team", "Player"])["game_break"].cumsum())
    df_missing["consecutive_games"] = (df_missing.groupby(["Season", "Team", "Player", "streak_id"]).cumcount().add(1))
    df_missing["eligible_today"] = (df_missing["consecutive_games"] <= 10).astype(int)
    df_missing["role_for_count"] = np.where(df_missing["eligible_today"] == 1, df_missing["role_L10_mode"], np.nan)    
    df_missing[f'{pred_col}_L10'] = np.where(df_missing['role_for_count'] == 1, df_missing[f'{pred_col}_L10'], 0)

#     display(df_missing[(df_missing.Team == 'CLE') & (df_missing.Date == '2026-01-23')].tail(10))

    out_minutes = (
    df_missing
      .groupby(["Season", "Date", "Team"])
      .agg(
          tgt_available=(f"{pred_col}_L10", lambda x: x.sum()),
          starters_out=("role_for_count", lambda x: (x == 1).sum())
      )
      .reset_index()
    ).rename(columns={"tgt_available": f"team_{pred_col}_available"})

    return out_minutes

In [12]:
def filter_out_early_exits(df):
    for N in [3, 5, 10]:
        df[f'MP_L{N}_avg'] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['MP'].shift(1)
             .rolling(window=N, min_periods=N)
             .mean()
        )
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)    
    df['Early_Exit'] = ((df['MP_base'].notna()) & (df['MP_base'] > 0) &
                        (
                          ((df['MP'] - df['MP_base']) / df['MP_base'] <= -0.4) |  
                          ((df['MP_q4'] == 0) & (df['role'] == 1))
                        )).astype(int)    
    df = df[df.Early_Exit == 0]
    df = df.drop('Early_Exit', axis=1)
    
    return df

# Minutes Projection Model

In [13]:
def setup_df_mins(df, fltr_ee=True):

    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'role',
             'MP', 'MP_q4', 'Spread', 'team_game_num', 'is_OT']]    
    cleanup_cols = []
    cold_features = []
    if fltr_ee == True:
        df = filter_out_early_exits(df)
    
    df['team_mins_pct'] = df['MP'] / (240 + (df.is_OT * 25))
    for col in ['MP', 'team_mins_pct']:
        for N in [3, 5, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                 .rolling(window=N, min_periods=N)
                 .mean()
            )
            df[f'is_cold_{col}_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
            cold_features.append(f'is_cold_{col}_L{N}')
            cleanup_cols.append(f'{col}_L{N}_avg')
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    df['MP_tm_pct_base'] = df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    
    df['role'] = np.where((df.role == 2) & (df.MP_base < 13), 3, df.role)
    
    games_last_14_days = df.sort_values(['Player', 'Season', 'Date']).groupby(['Player', 'Season']).rolling('14D', on='Date', closed='left')['MP'].count().reset_index().rename(columns={"MP": "gms_L14_days"})
    games_last_14_days = games_last_14_days.drop_duplicates(
        subset=['Player', 'Season', 'Date']
    )
    df = df.merge(games_last_14_days, on=['Player', 'Season', 'Date'])
    df['gms_L14_days'] = df.gms_L14_days.fillna(0).astype(int)    
    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
    df['games_since_return'] = (df.groupby(['Player', 'Team', 'Season']).apply(
                                    lambda g: (
                                        (g['team_game_num'].diff().sub(1).fillna(0).gt(0))
                                        .cumsum()
                                        .groupby((g['team_game_num'].diff().sub(1).fillna(0).gt(0)).cumsum()).cumcount()
                                    )
                                ).reset_index(level=[0,1,2], drop=True))

    for N in [1, 3, 5]:
        df[f"recent_role_L{N}"] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['role'].shift(1)
              .transform(lambda x: x.rolling(N, min_periods=N)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
        )
        df[f'is_cold_recent_role_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
        cold_features.append(f'is_cold_recent_role_L{N}')
        cleanup_cols.append(f"recent_role_L{N}")
    
#     df['game_spread_type'] = 0
#     df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
#     df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 21), 2, df.game_spread_type) 
#     df['game_spread_type'] = np.where(abs(df.Spread) > 21, 3, df.game_spread_type) 
    
    df2 = create_df_missing(df, 'MP')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_MP_available']:
        df[col] = df[col].fillna(0).astype(int)
        
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df['returning_MP'] = (
        (df['MP_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df['team_MP_available'] = df['team_MP_available'] - df['returning_MP']

    df['MP_Change'] = 0
    MP_Inc_conds = (
#                     ((df.role != 3) & (df.starters_out > 2)) | 
                    ((df.role == 1) & (df.recent_role_L1 > 1.0)).astype(int) + 
                    (df.team_MP_available >= 110).astype(int)
                   )
    
    MP_Dec_conds = (
                    ((df.role > 1) & ((df.recent_role_L1 == 1.0))).astype(int) + 
                    (df.team_MP_available < -23).astype(int)
                   ) * -1
    df['MP_Change'] = MP_Inc_conds + MP_Dec_conds

    df['scenario_mins'] = (
        df.sort_values(['Season','Team','role','Pos','Date'])
          .groupby(['Season','Team','role','Pos'])['MP'].shift(1)
          .expanding()
          .mean() 
          .reset_index(drop=True)
    )
    
    df['MP_trend'] = df['MP_L3_avg'] - df['MP_L10_avg']
    df['Expected_MP'] = (
        (0.8 * df['scenario_mins']) +
        (df['team_MP_available'] * df['MP_tm_pct_base']) + 
        (0.2 * df['MP_base']) + df['MP_trend']
    )
    
    df["is_cold_start"] = (df[cold_features].eq(1).any(axis=1).astype(int))
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['Team_type', 'team_game_num', 'is_OT', 'Spread', 'team_mins_pct', 'MP_tm_pct_base', 
                  'returning_MP', 'scenario_mins', 'MP_q4'] + cleanup_cols + cold_features, axis=1)    


    return df

In [188]:
df_mins = df.copy()
df_mins = setup_df_mins(df_mins)
display(df_mins)

game_dates = (df_mins[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

mins_train_df = df_mins[df_mins['Date'] <= train_end]
mins_val_df   = df_mins[(df_mins['Date'] > train_end) & (df_mins['Date'] <= val_end)]
mins_test_df  = df_mins[df_mins['Date'] > val_end]
mins_DFS = (mins_train_df, mins_val_df, mins_test_df)

# Prev r2/mae/rmse best: 0.7064/4.0508/5.1790 [1/24/2026]
# mins_params = hyperparam_tuning(mins_DFS, "MP", n_iter=25)
# with open(f"{MDL_PATH}/mins_params.json", "w") as f:
#     json.dump(mins_params, f)

# Prev r2/mae/rmse best: 0.729/3.615/4.599 [1/24/2026]
mins_model, mins_results = refit_model(df_mins, 'MP', 'mins_params', min_train_days=915, rolling_window=180)
# feature_importance(mins_model, df_mins.columns.tolist())

mins_model.get_booster().save_model(f"{MDL_PATH}/mins_model.json")
print('Saved Mins booster!')

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,MP_base,gms_L14_days,missed_games,games_since_return,team_MP_available,starters_out,starters_returning,MP_Change,MP_trend,Expected_MP,is_cold_start
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,1,22.98,,0,0,0,0.000,0,0,0,,,1
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,2,3.75,,0,0,0,0.000,0,0,0,,,1
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,2,3.75,,0,0,0,0.000,0,0,0,,,1
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,2,3.75,,0,0,0,0.000,0,0,0,,,1
4,2021,2021-10-19,BRK,MIL,James Harden,PG,1,30.63,,0,0,0,0.000,0,0,0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106669,2025,2026-01-28,UTA,GSW,Jusuf Nurkic,C,1,22.63,31.0421,4,1,0,-43.036,1,2,-1,1.435667,22.015835,0
106670,2025,2026-01-28,UTA,GSW,Keyonte George,PG,1,32.70,36.3345,6,1,0,-43.036,1,2,-1,0.811667,21.511724,0
106671,2025,2026-01-28,UTA,GSW,Kyle Anderson,SF,2,21.88,20.7864,6,0,2,-43.036,1,2,-1,-2.340667,18.010012,0
106672,2025,2026-01-28,UTA,GSW,Kyle Filipowski,C,2,16.83,22.9553,7,0,6,-43.036,1,2,-1,-4.264667,16.115122,0


Rows: 106674, Dates: 945, min_train_days: 910
Progress:   2.86% (1/35)
Progress:   5.71% (2/35)
Progress:   8.57% (3/35)
Progress:  11.43% (4/35)
Progress:  14.29% (5/35)
Progress:  17.14% (6/35)
Progress:  20.00% (7/35)
Progress:  22.86% (8/35)
Progress:  25.71% (9/35)
Progress:  28.57% (10/35)
Progress:  31.43% (11/35)
Progress:  34.29% (12/35)
Progress:  37.14% (13/35)
Progress:  40.00% (14/35)
Progress:  42.86% (15/35)
Progress:  45.71% (16/35)
Progress:  48.57% (17/35)
Progress:  51.43% (18/35)
Progress:  54.29% (19/35)
Progress:  57.14% (20/35)
Progress:  60.00% (21/35)
Progress:  62.86% (22/35)
Progress:  65.71% (23/35)
Progress:  68.57% (24/35)
Progress:  71.43% (25/35)
Progress:  74.29% (26/35)
Progress:  77.14% (27/35)
Progress:  80.00% (28/35)
Progress:  82.86% (29/35)
Progress:  85.71% (30/35)
Progress:  88.57% (31/35)
Progress:  91.43% (32/35)
Progress:  94.29% (33/35)
Progress:  97.14% (34/35)
Progress: 100.00% (35/35)
Walk-forward RMSE: 4.567
Walk-forward MAE: 3.596
Walk

In [15]:
# analyze_df = mins_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df[analyze_df.Date.isin(['2026-01-23'])].sort_values('Diff', ascending=False).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [128]:
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull())]

# df_lines["Team"] = team_encoder.transform(df_lines["Team"])
# df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
# df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
# df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
# df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
# df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
# df_prediction = df_pred.copy()

# # Predict Mins
# mins_booster = xgb.Booster()
# mins_booster.load_model("../ML_models/dev/mins_model.json")
# mins_model = XGBRegressor()
# mins_model._Booster = mins_booster

# df_prediction_mins = setup_df_mins(df_prediction)
# df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))
# df_prediction_mins = df_prediction_mins[df_prediction_mins.Date == now]

# df_prediction_mins['Team'] = team_encoder.inverse_transform(df_prediction_mins["Team"])
# df_prediction_mins['Opp'] = team_encoder.inverse_transform(df_prediction_mins["Opp"])
# df_prediction_mins['Player'] = player_encoder.inverse_transform(df_prediction_mins["Player"])

# if df_prediction_mins.shape[0] >= 50:
#     print(df_prediction_mins.shape[0], 'rows')
#     for tm in df_prediction_mins.Team.unique():
#         display(df_prediction_mins[df_prediction_mins.Team == tm])
# else:
#     display(df_prediction_mins)

# Stats Model

In [20]:
def setup_df_main(df, tgt_stat):
    
    df_prediction_mins = setup_df_mins(df, fltr_ee=False)
    mins_booster = xgb.Booster()
    mins_booster.load_model(f"{MDL_PATH}/mins_model.json")
    mins_model = XGBRegressor()
    mins_model._Booster = mins_booster

    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 'is_OT', 
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'Spread', 'Total',
            'ORB']]
    cleanup_cols = []
    df['MP_proj'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))
    
    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
    df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

#     df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
#         'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
#         'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
#         'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
#     }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
    
    
    # Create rolling + lag features
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    for col in [tgt_stat, 'FGA', 'FTA', 'ORB', 'TOV']:
        df[f'Team{col}'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])[col].transform('sum'))
        if col in [tgt_stat]:
            df[f'Team{col}_pct'] = df[col] / df[f'Team{col}']
            
    df['PlayerPace'] = np.where(df['MP'] > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
    df['TeamPace'] = ((df['TeamFGA'] + 0.44 * df['TeamFTA'] - df['TeamORB'] + df['TeamTOV']) / ((df['is_OT'] * 25) + 240))
    df['Player_Pace_Rel'] = df['PlayerPace'] / df['TeamPace']
    df['Pace_Minutes_Interaction'] = df['PlayerPace'] * df['MP']

    # Create rolling + lag features    
    for col in ['MP', tgt_stat, f'Team{tgt_stat}_pct', f'Def{tgt_stat}', 'FG', 'FGA', 'FT', 'FTA', 'Player_Pace_Rel', 'Pace_Minutes_Interaction']:
        for N in [1, 3, 5, 10]:
            if col == f'Def{tgt_stat}':
                df[f'Def{tgt_stat}_L{N}_avg'] = (
                    df[df.role <= 2]
                      .groupby(['Season', 'Date', 'Opp', 'Pos'])[tgt_stat]
                      .sum()
                      .groupby(['Opp', 'Pos', 'Season'])
                      .shift(1)
                      .rolling(window=N, min_periods=N)
                      .mean()
                      .reindex(df.set_index(['Season', 'Date', 'Opp', 'Pos']).index)
                      .values
                )
            else:
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].mul([0.60, 0.25, 0.15]).sum(axis=1, skipna=True) / df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].notna().mul([0.60, 0.25, 0.15]).sum(axis=1)    

    df2 = create_df_missing(df, tgt_stat)
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', f'team_{tgt_stat}_available']:
        df[col] = df[col].fillna(0)
    
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df[f'returning_{tgt_stat}'] = (
        (df[f'{tgt_stat}_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df[f'team_{tgt_stat}_available'] = df[f'team_{tgt_stat}_available'] - df[f'returning_{tgt_stat}']
    df[f'Player_share_avail_{tgt_stat}'] = (df[f'team_{tgt_stat}_available'] * df[f'Team{tgt_stat}_pct_base'])
        
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
        
    drop_list = ['team_game_num', 'is_OT', 'missed_games', 'MP', 'MP_q4', 'Spread', 'Total', 'role', 
                  f'Team{tgt_stat}', f'Team{tgt_stat}_pct', 'TeamFGA', 'TeamFTA', 'TeamORB', 'TeamTOV', 
                  'PlayerPace', 'TeamPace', 'Player_Pace_Rel', 'Pace_Minutes_Interaction',  
                 'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS', 'ORB', 
                 f'returning_{tgt_stat}', 'starters_out', 'starters_returning', f'team_{tgt_stat}_available'] 
    drop_list.remove(tgt_stat)
    df = df.drop(drop_list + cleanup_cols, axis=1)

    return df

In [37]:
for tgt_col in ['PTS', 'FG', 'FGA']:
    df_main = df.copy()
    df_main = setup_df_main(df_main, tgt_col)
    display(df_main)

    game_dates = (df_main[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
    n_days = len(game_dates)
    train_end = game_dates.loc[int(0.70 * n_days), 'Date']
    val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

    main_train_df = df_main[df_main['Date'] <= train_end]
    main_val_df   = df_main[(df_main['Date'] > train_end) & (df_main['Date'] <= val_end)]
    main_test_df  = df_main[df_main['Date'] > val_end]
    main_DFS = (main_train_df, main_val_df, main_test_df)

    # Prev r2/mae/rmse best: 0.6706/3.7633/5.0858 [1/24/2026]
    stat_params = hyperparam_tuning(main_DFS, tgt_col, n_iter=1, decay=0.99)
    with open(f"{MDL_PATH}/{tgt_col}_params.json", "w") as f:
        json.dump(stat_params, f)

    # Prev r2/mae/rmse best: 0.695/3.628/4.840 [1/21/2026]
    stat_model, stat_results = refit_model(df_main, tgt_col, f'{tgt_col}_params', min_train_days=915, decay=0.99)
    # feature_importance(stat_model, df_main.columns.tolist())

    stat_model.get_booster().save_model(f"{MDL_PATH}/{tgt_col}_model.json")
    print(f"Saved {tgt_col} booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,FG,MP_proj,TeamPTS,MP_base,FG_base,TeamFG_pct_base,DefFG_base,FGA_base,FT_base,FTA_base,Player_Pace_Rel_base,Pace_Minutes_Interaction_base,Player_share_avail_FG
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,2.0,24.448162,104.0,,,,,,,,,,
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,0.0,18.174227,104.0,,,,,,,,,,
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,0.0,17.841209,104.0,,,,,,,,,,
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,0.0,14.692163,104.0,,,,,,,,,,
4,2021,2021-10-19,BRK,MIL,James Harden,PG,6.0,31.534994,104.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127525,2025,2026-01-29,WAS,MIL,Khris Middleton,SF,4.0,26.459738,109.0,26.95825,4.470,0.118050,9.460,11.555,2.650,2.680,1.133867,12.7342,0.0
127526,2025,2026-01-29,WAS,MIL,Kyshawn George,SF,10.0,31.759531,109.0,31.73075,5.495,0.143804,9.460,16.005,3.820,4.245,1.349741,17.8728,0.0
127527,2025,2026-01-29,WAS,MIL,Malaki Branham,SG,2.0,13.204216,109.0,7.88085,2.240,0.052125,8.815,3.795,0.105,0.120,1.033221,3.8478,0.0
127528,2025,2026-01-29,WAS,MIL,Tre Johnson,SG,2.0,30.973494,109.0,33.28730,6.735,0.175929,8.815,15.345,2.075,2.470,1.153070,16.4318,0.0



Trial 1/1: {'n_estimators': 482, 'learning_rate': np.float64(0.03373994851872452), 'max_depth': 5, 'min_child_weight': 5, 'subsample': np.float64(0.9057919392807454), 'colsample_bytree': np.float64(0.858010448926647), 'gamma': np.float64(0.6137127110156679), 'reg_lambda': np.float64(3.079321799590166), 'reg_alpha': np.float64(0.9615682650304967)}
Validation MAE: 1.7115

Best score: 1.7115266185638085
Best parameters: {'learning_rate': np.float64(0.03373994851872452), 'max_depth': 5, 'min_child_weight': 5, 'subsample': np.float64(0.9057919392807454), 'colsample_bytree': np.float64(0.858010448926647), 'gamma': np.float64(0.6137127110156679), 'reg_lambda': np.float64(3.079321799590166), 'reg_alpha': np.float64(0.9615682650304967), 'objective': 'reg:squarederror', 'enable_categorical': True, 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 2.289756777694835
MAE: 1.7689713850710032
R²: 0.4983601459483967
Rows: 127530, Dates: 946, min_train_days: 910
Progress:   2.7

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,FGA,MP_proj,TeamPTS,MP_base,FGA_base,TeamFGA_pct_base,DefFGA_base,FG_base,FT_base,FTA_base,Player_Pace_Rel_base,Pace_Minutes_Interaction_base,Player_share_avail_FGA
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,5.0,24.448162,104.0,,,,,,,,,,
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,0.0,18.174227,104.0,,,,,,,,,,
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,2.0,17.841209,104.0,,,,,,,,,,
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,0.0,14.692163,104.0,,,,,,,,,,
4,2021,2021-10-19,BRK,MIL,James Harden,PG,16.0,31.534994,104.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127525,2025,2026-01-29,WAS,MIL,Khris Middleton,SF,9.0,26.459738,109.0,26.95825,11.555,0.129260,19.370,4.470,2.650,2.680,1.133867,12.7342,0.0
127526,2025,2026-01-29,WAS,MIL,Kyshawn George,SF,24.0,31.759531,109.0,31.73075,16.005,0.175023,19.370,5.495,3.820,4.245,1.349741,17.8728,0.0
127527,2025,2026-01-29,WAS,MIL,Malaki Branham,SG,6.0,13.204216,109.0,7.88085,3.795,0.041813,16.235,2.240,0.105,0.120,1.033221,3.8478,0.0
127528,2025,2026-01-29,WAS,MIL,Tre Johnson,SG,7.0,30.973494,109.0,33.28730,15.345,0.166454,16.235,6.735,2.075,2.470,1.153070,16.4318,0.0



Trial 1/1: {'n_estimators': 448, 'learning_rate': np.float64(0.06556522784380296), 'max_depth': 3, 'min_child_weight': 8, 'subsample': np.float64(0.9055469347840724), 'colsample_bytree': np.float64(0.6456017911362145), 'gamma': np.float64(0.5101605217809464), 'reg_lambda': np.float64(3.6448638891826803), 'reg_alpha': np.float64(0.9833390298465099)}
Validation MAE: 2.6932

Best score: 2.69320584175751
Best parameters: {'learning_rate': np.float64(0.06556522784380296), 'max_depth': 3, 'min_child_weight': 8, 'subsample': np.float64(0.9055469347840724), 'colsample_bytree': np.float64(0.6456017911362145), 'gamma': np.float64(0.5101605217809464), 'reg_lambda': np.float64(3.6448638891826803), 'reg_alpha': np.float64(0.9833390298465099), 'objective': 'reg:squarederror', 'enable_categorical': True, 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
RMSE: 3.6462634390158284
MAE: 2.7989287045595668
R²: 0.6277377548265843
Rows: 127530, Dates: 946, min_train_days: 910
Progress:   

In [125]:
# analyze_df = stat_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df.sort_values('Diff', ascending=False).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Residual PTS

In [58]:
def setup_df_res(df):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 'Res_PTS', 'PTS_line',  
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 
             'Spread', 'Total']]
    
    # Create rolling + lag features
    df3 = load_df('season_gamelogs')
    df3 = con.execute("""SELECT Date, Team, CAST(ROUND(SUM(MP), 0) as INT) as Team_Mins, 
                         CAST(SUM(FGA) as INT) as Team_FGA, CAST(SUM(FTA) as INT) as Team_FTA, CAST(SUM(TOV) as INT) as Team_TOV 
                         FROM df3
                         GROUP BY Date, Team""").fetchdf()
    df = df.merge(df3, on=['Date', 'Team'], how='left')
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['USG'] = (
        (df['FGA'] + 0.44*df['FTA'] + df['TOV']) * (df['Team_Mins'] / 5)
        / (df['MP'] * (df['Team_FGA'] + 0.44*df['Team_FTA'] + df['Team_TOV']))
    )

    LN_cols = []
    for col in ['MP', 'PTS', 'eFG', 'TS', 'USG']:
        for N in [3, 10]:
            if col != 'else':
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )

                if col not in ['MP']:
                    LN_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_trend'] = df[f'{col}_L3_avg'] - df[f'{col}_L10_avg']
    
    df['game_spread_type'] = 0
    df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
    df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 18), 2, df.game_spread_type) 
    df['game_spread_type'] = np.where(abs(df.Spread) > 18, 3, df.game_spread_type) 
    
    df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

    df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
        'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
        'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
        'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
    }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
        
    for col in ['TeamPTS']:
        for N in [3, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                  .rolling(window=N, min_periods=N)
                  .mean()
            )
            df[f'PTS_pct_L{N}'] = df[f'PTS_L{N}_avg'] / df[f'TeamPTS_L{N}_avg']
            df = df.drop(f'TeamPTS_L{N}_avg', axis=1)
        df['PTS_pct_trend'] = df['PTS_pct_L3'] - df['PTS_pct_L10']
        df = df.drop(['PTS_pct_L3', 'PTS_pct_L10'], axis=1)
    
    df2 = create_df_missing(df, 'PTS')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_PTS_available']:
        df[col] = df[col].fillna(0)
    df['starters_out_L1'] = (
        df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['starters_out'].shift(1)
          .rolling(window=1, min_periods=1)
          .mean()
    )
    df['starters_returning'] = np.where(df['starters_out_L1'] > df['starters_out'], df['starters_out_L1'] - df['starters_out'], 0)

    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['team_game_num', 'Spread', 'Total', 'MP_q4', 'TeamPTS', 'Team_FGA', 'Team_FTA', 'Team_TOV', 
                 'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS', 'USG', 
                  'Team_Mins', 'starters_out_L1'] + LN_cols, axis=1)
        
    return df

##### Regressor

In [21]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Prev r2/mae/rmse best: 0.2124/4.3069/5.6478 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Res_PTS', n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_RG_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev mae best: 4.1101 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Res_PTS', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_RG_model.json")
print("Saved Res_PTS_RG booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,Res_PTS,PTS_line,MP_L3_avg,MP_L10_avg,MP_trend,PTS_trend,eFG_trend,TS_trend,USG_trend,game_spread_type,pts_low,pts_mid,pts_high,PTS_pct_trend,team_PTS_available,starters_out,starters_returning
0,2025,2025-11-20,ATL,SAS,Dyson Daniels,SG,1,30.68,-3.5,11.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
1,2025,2025-11-20,ATL,SAS,Jalen Johnson,SF,1,37.95,3.5,22.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
2,2025,2025-11-20,ATL,SAS,Kristaps Porzingis,C,1,29.05,-0.5,16.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
3,2025,2025-11-20,ATL,SAS,Nickeil Alexander-Walker,SG,1,33.65,20.5,17.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
4,2025,2025-11-20,ATL,SAS,Onyeka Okongwu,C,2,25.13,0.5,14.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5754,2025,2026-01-26,POR,BOS,Jrue Holiday,PG,1,24.45,1.5,12.5,20.283333,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5755,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,15.80,0.5,3.5,19.746667,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5756,2025,2026-01-26,POR,BOS,Robert Williams,C,2,19.68,-0.5,6.5,18.656667,16.084,2.572667,-0.400000,,,-0.021068,1,0.8,0.0,0.0,-0.009271,35.0,2.0,0.0
5757,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,34.32,-14.5,23.5,33.440000,32.221,1.219000,1.500000,-0.029100,-0.031318,0.035742,1,0.8,0.0,0.0,0.015482,35.0,2.0,0.0


Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Walk-forward MAE: 4.229058576918787
|Pred| >= 0: accuracy = 0.631, n = 1612
|Pred| >= 1: accuracy = 0.678, n = 1142
|Pred| >= 2: accuracy = 0.735, n = 740
|Pred| >= 3: accuracy = 0.802, n = 449
Saved Res_PTS_RG booster!


##### Classifier

In [22]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
df_res['Bet'] = (df_res['Res_PTS'] > 0).astype(int)  # 1 = over, 0 = under
df_res = df_res.drop('Res_PTS', axis=1)
# display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_cut = int(0.70 * n_days)
val_cut   = int(0.80 * n_days)
train_end = game_dates.loc[train_cut, 'Date']
val_end   = game_dates.loc[val_cut, 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Test Accuracy: 0.6672 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Bet', is_classification=True, n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_CLF_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev roc_auc best: 0.720 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Bet', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_CLF_model.json")
print("Saved Res_PTS_CLF booster!")

Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Confusion Matrix:
 [[548 286]
 [300 478]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.66      0.65       834
           1       0.63      0.61      0.62       778

    accuracy                           0.64      1612
   macro avg       0.64      0.64      0.64      1612
weighted avg       0.64      0.64      0.64      1612

ROC AUC: 0.693
High-confidence hit rate (<= 0.3 & >= 0.7): 0.77
Saved Res_PTS_CLF booster!


# HT Stats

In [65]:
def setup_df_ht(df):
    
    df_prediction_mins = setup_df_mins(df, fltr_ee=False)
    mins_booster = xgb.Booster()
    mins_booster.load_model(f"{MDL_PATH}/mins_model.json")
    mins_model = XGBRegressor()
    mins_model._Booster = mins_booster
    df['MP_proj'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))
    
    for tgt_col in ['PTS', 'FG', 'FGA']:
        df_full = setup_df_main(df, tgt_col)
        stat_booster = xgb.Booster()
        stat_booster.load_model(f"{MDL_PATH}/{tgt_col}_model.json")
        stat_model = XGBRegressor()
        stat_model._Booster = stat_booster
        df[f"{tgt_col}_proj"] = stat_model.predict(df_full.drop(['Season', 'Date', tgt_col], axis=1))
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'MP', 
             'MP_h1', 'PTS_h1', 'PTS_h2', 'FG_h1', 'FGA_h1', 'FGA_h2', 'FT_h1', 'FTA_h1', 'TPM_h1', 'TPA_h1', 'PF_h1', 
             'ORB_h1', 'TOV_h1', 'PTS', 'FGA', 'MP_proj', 'PTS_proj', 'FG_proj', 'FGA_proj']]
    cleanup_cols = []
    for tgt_col in ['MP', 'PTS', 'FG', 'FGA']:
        df[f'{tgt_col}_proj_pct'] = np.where(df[f'{tgt_col}_proj'] > 0, (df[f'{tgt_col}_h1'] - df[f'{tgt_col}_proj']) / df[f'{tgt_col}_proj'], np.nan)

    
    for col in ['PTS', 'FGA', 'FTA', 'ORB', 'TOV']:
        df[f'Team{col}_h1'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])[f'{col}_h1'].transform('sum'))
        if col not in ['FTA', 'ORB', 'TOV']:
            df[f'Team{col}_pct_h1'] = df[f'{col}_h1'] / df[f'Team{col}_h1']
    
    df['OppTeamPTS_h1'] = (
        (
        df.groupby(['Season', 'Date', 'Team'], as_index=True)['TeamPTS_h1']
        .first()
        ).reindex(
        pd.MultiIndex.from_frame(df[['Season', 'Date', 'Opp']])
        ).to_numpy())
    df['Spread_h1'] = df['TeamPTS_h1'] - df['OppTeamPTS_h1']

    df['Player_Pace'] = np.where(df['MP_h1'] > 0, (df['FGA_h1'] + 0.44 * df['FTA_h1']) / df['MP_h1'], 0)
    df['Team_Pace'] = ((df['TeamFGA_h1'] + 0.44 * df['TeamFTA_h1'] - df['TeamORB_h1'] + df['TeamTOV_h1']) / 120)
    df['Player_Pace_Rel'] = df['Player_Pace'] / df['Team_Pace']
    df['Pace_Minutes_Interaction'] = df['Player_Pace'] * df['MP_h1']

    # Create rolling + lag features    
    for col in ['MP', 'PTS', 'FG_h1', 'FGA_h1', 'FGA_h2', 'PTS_h1', 'PTS_h2']:
        for N in [1, 3, 5, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                 .rolling(window=N, min_periods=N)
                 .mean()
            )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
        
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')    
    df = df.drop(['TeamFGA_h1', 'TeamPTS_h1', 'OppTeamPTS_h1', 'TeamORB_h1', 'TeamTOV_h1', 'TeamFTA_h1', 'Player_Pace', 'Team_Pace', 
                  'MP_base', 'PTS_base', 'MP', 'FGA', 'PTS_h2', 'FGA_h2', 'ORB_h1', 'TOV_h1'] + cleanup_cols, axis=1)

    return df

In [69]:
# Feature ideas:
# Shooting trend (compare shooting numbers between q1 vs q2) [PTS, FG, FGA]
# Projected FG% (FG_proj / FGA_proj, if FGA_proj > FG_proj else np.nan)

df_ht = df.copy()
df_ht = setup_df_ht(df_ht)
display(df_ht)

game_dates = (df_ht[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

ht_train_df = df_ht[df_ht['Date'] <= train_end]
ht_val_df   = df_ht[(df_ht['Date'] > train_end) & (df_ht['Date'] <= val_end)]
ht_test_df  = df_ht[df_ht['Date'] > val_end]
ht_DFS = (ht_train_df, ht_val_df, ht_test_df)

# 0.806 / 0.6905 / 0.965 [1/29/26]
ht_train_dict = {
                 'mean': {'quantile': False, 'q_val': 0.5}, 
                 'Qlow': {'quantile': True, 'q_val': 0.15}, 
                 'Qhigh': {'quantile': True, 'q_val': 0.85}
                }
for key in ht_train_dict.keys():
#     ht_params = hyperparam_tuning(ht_DFS, 'PTS', n_iter=1, decay=0.99, quantile=ht_train_dict[key]['quantile'], q_val=ht_train_dict[key]['q_val'])
#     with open(f"{MDL_PATH}/ht_PTS_{key}_params.json", "w") as f:
#         json.dump(ht_params, f)

    ht_model, ht_results = refit_model(df_ht, 'PTS', f'ht_PTS_{key}_params', min_train_days=915, decay=0.99)
#     feature_importance(ht_model, df_ht.columns.tolist())

    ht_model.get_booster().save_model(f"{MDL_PATH}/ht_PTS_{key}_model.json")
    print(f"Saved ht_PTS_{key} booster!")


Trial 1/1: {'n_estimators': 1446, 'learning_rate': np.float64(0.08338593800641908), 'max_depth': 4, 'min_child_weight': 9, 'subsample': np.float64(0.9956142970797353), 'colsample_bytree': np.float64(0.8611460805978861), 'gamma': np.float64(0.46215499445507063), 'reg_lambda': np.float64(1.4958610931644305), 'reg_alpha': np.float64(0.3637481025234386)}
Validation Quantile Loss (q_val=0.85): 1.0699

Best score: 1.0698966236136784
Best parameters: {'learning_rate': np.float64(0.08338593800641908), 'max_depth': 4, 'min_child_weight': 9, 'subsample': np.float64(0.9956142970797353), 'colsample_bytree': np.float64(0.8611460805978861), 'gamma': np.float64(0.46215499445507063), 'reg_lambda': np.float64(1.4958610931644305), 'reg_alpha': np.float64(0.3637481025234386), 'objective': 'reg:quantileerror', 'quantile_alpha': 0.85, 'enable_categorical': True, 'tree_method': 'hist', 'device': 'cuda', 'seed': 42}

Test Metrics:
Quantile loss: 1.1003126513131953
Coverage for q_val=0.85: 0.83
Rows: 127530,

In [56]:
# analyze_df = ht_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df.sort_values('Diff', ascending=True).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [55]:
df_ht = setup_df_ht(df_pred)
df_ht = df_ht[df_ht.Date == now]
display(df_ht)
partition_save_df(df_ht, f"../tables/{YEAR}/ht_api_input.csv")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,MP_h1,PTS_h1,FG_h1,FGA_h1,FT_h1,FTA_h1,TPM_h1,TPA_h1,PF_h1,PTS,MP_proj,PTS_proj,FG_proj,FGA_proj,MP_proj_pct,PTS_proj_pct,FG_proj_pct,FGA_proj_pct,TeamPTS_pct_h1,TeamFGA_pct_h1,Spread_h1,Player_Pace_Rel,Pace_Minutes_Interaction,FG_h1_base,FGA_h1_base,FGA_h2_base,PTS_h1_base,PTS_h2_base
257057,2025,2026-01-30,BOS,SAC,Amari Williams,PF,,,,,,,,,,,16.117100,4.648998,1.404279,3.553517,,,,,,,0.0,,,0.48,0.960,0.96,1.34,0.98
257058,2025,2026-01-30,BOS,SAC,Anfernee Simons,SG,,,,,,,,,,,25.760784,15.555498,5.735978,13.141344,,,,,,,0.0,,,3.00,7.160,5.40,7.89,6.38
257059,2025,2026-01-30,BOS,SAC,Baylor Scheierman,SG,,,,,,,,,,,25.732018,5.946781,2.670132,5.530028,,,,,,,0.0,,,0.62,1.790,1.99,1.76,2.49
257060,2025,2026-01-30,BOS,SAC,Chris Boucher,PF,,,,,,,,,,,11.515678,3.005308,1.601772,2.973618,,,,,,,0.0,,,0.25,0.625,1.75,0.75,1.75
257061,2025,2026-01-30,BOS,SAC,Derrick White,SG,,,,,,,,,,,34.776772,17.628517,6.362100,15.796325,,,,,,,0.0,,,2.26,7.000,6.83,5.67,8.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257369,2025,2026-01-30,WAS,LAL,Skal Labissiere,PF,,,,,,,,,,,13.541380,4.224597,1.983138,3.517972,,,,,,,0.0,,,,,,,
257370,2025,2026-01-30,WAS,LAL,Trae Young,PG,,,,,,,,,,,19.231426,6.760664,2.813472,5.640769,,,,,,,0.0,,,,,,,
257371,2025,2026-01-30,WAS,LAL,Tre Johnson,SG,,,,,,,,,,,25.497757,15.403737,5.483934,11.781894,,,,,,,0.0,,,2.70,7.010,6.11,8.72,7.12
257372,2025,2026-01-30,WAS,LAL,Tristan Vukcevic,C,,,,,,,,,,,11.533594,6.696543,2.359608,5.126812,,,,,,,0.0,,,1.02,2.800,3.12,2.76,4.78


../tables/2025/ht_api_input.csv saved!


# Today's predictions

In [147]:
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_PTS.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
#                 .rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
# df_gms = df_gms.merge(df_lines, on=['Date', 'Team', 'Player'])
# df_gms['Res_PTS'] = df_gms.PTS - df_gms.PTS_line

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'PTS', 'Res_PTS', 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', 'PTS_line', 'PTS_proj', 'PTS', 'Res_PTS_proj', 'Res_PTS', 'pred_prob', 'pred_class']][df_yesterday.MP > 0]

# # Mins
# df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 3, 1, 0)
# print("\nYesterday's Results:")
# print("Total Accuracy (Minutes-in-range):", (df_yesterday.InTgtRange == 1).mean())
# print((df_yesterday.InTgtRange == 1).sum(), '/', df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange'], axis=1)

# # Raw PTS
# df_yesterday['Diff'] = abs(df_yesterday['PTS'] - df_yesterday['PTS_proj'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 3, 1, 0)
# df_yesterday['Act_Res'] = np.where((df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['Pred_Res'] = np.where((df_yesterday.PTS_proj > df_yesterday.PTS_line), 1, 0)
# df_yesterday['PHit1'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (Raw PTS):", (df_yesterday.PHit1 == 1).mean())
# print((df_yesterday.PHit1 == 1).sum(), "/", df_yesterday.shape[0])
# print("Total Accuracy (Raw PTS-in-range):", (df_yesterday.InTgtRange == 1).mean())
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange', 'Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Regression)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday['Res_PTS_proj'] > 0, 'O', 'U')
# df_yesterday['PHit2'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (ResPTS Regression):", (df_yesterday.PHit2 == 1).mean())
# print((df_yesterday.PHit2 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Classifier)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 1, 0)
# df_yesterday['PHit3'] = np.where(df_yesterday['Act_Res'] == df_yesterday['pred_class'], 1, 0)
# df_yesterday['pred_class'] = np.where(df_yesterday['pred_class'] == 1, 'O', 'U')
# print("Total Accuracy (ResPTS Classification):", (df_yesterday.PHit3 == 1).mean())
# print((df_yesterday.PHit3 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res'], axis=1)

# df_yesterday['Majority'] = np.where(((df_yesterday.PTS_proj > df_yesterday.PTS_line).astype(int) + (df_yesterday['Res_PTS_proj'] > 0).astype(int) + (df_yesterday['pred_class'] == 1).astype(int)) >= 2, 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 1) & (df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 0) & (df_yesterday.PTS < df_yesterday.PTS_line), 1, df_yesterday.MajorityHit)
# print("Total Accuracy (MajorityHit):", (df_yesterday.MajorityHit == 1).mean())

# df_yesterday['AllAgree'] = '-'
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 1) & (df_yesterday.PHit2 == 1) & (df_yesterday.PHit3 == 1), 1, df_yesterday['AllAgree'])
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 0) & (df_yesterday.PHit2 == 0) & (df_yesterday.PHit3 == 0), 0, df_yesterday['AllAgree'])
# print("Total Accuracy (AllAgree):", ((df_yesterday.AllAgree == 1).sum() / ((df_yesterday.AllAgree == 0).sum() + (df_yesterday.AllAgree == 1).sum())))

# df_yesterday = df_yesterday.drop(['Majority', 'MajorityHit', 'AllAgree'], axis=1).sort_values('PTS_line', ascending=False)

# # if df_yesterday.shape[0] >= 50:
# #     for tm in df_yesterday.Team.unique():
# #         display(df_yesterday[(df_yesterday.Team == tm)]) #  & (df_yesterday.PHit == 1)
# # else:
# #     display(df_yesterday)

In [59]:
df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull())]

df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_prediction = df_pred.copy()

# Predict Stat
stat_booster = xgb.Booster()
stat_booster.load_model(f"{MDL_PATH}/{tgt_stat}_model.json")
stat_model = XGBRegressor()
stat_model._Booster = stat_booster
res_booster_RG = xgb.Booster()
res_booster_RG.load_model(f"{MDL_PATH}/Res_PTS_RG_model.json")
res_model_RG = XGBRegressor()
res_model_RG._Booster = res_booster_RG
res_model_CLF = XGBClassifier()
res_model_CLF.load_model(f"{MDL_PATH}/Res_PTS_CLF_model.json")

df_prediction = setup_df_main(df_prediction, 'PTS')
feature_cols = [col for col in df_prediction.columns if col not in ['Season', 'Date', 'MP_preds', 'PTS']]
df_prediction = df_prediction[df_prediction.Date == now][feature_cols]
df_prediction["PTS_proj"] = stat_model.predict(df_prediction)

df_prediction2 = df_pred[(~df_pred.PTS_line.isnull())].copy()
df_prediction2 = setup_df_res(df_prediction2)
feature_cols = [col for col in df_prediction2.columns if col not in ['Season', 'Date', 'Res_PTS']]
df_prediction2 = df_prediction2[df_prediction2.Date == now][feature_cols]
df_prediction2["Res_PTS_proj"] = res_model_RG.predict(df_prediction2)

df_prediction2['pred_prob'] = res_model_CLF.predict_proba(df_prediction2.drop('Res_PTS_proj', axis=1))[:,1]
df_prediction2['pred_class'] = (df_prediction2['pred_prob'] > 0.5).astype(int)

# Setup Today's Picks
df_lines = df_lines[df_lines.Date == now][['Team', 'Player', 'PTS_line']]
df_prediction = df_prediction.merge(df_lines, on=['Team', 'Player'])
df_prediction = df_prediction.merge(df_prediction2[['Team', 'Player', 'Res_PTS_proj', 'pred_prob', 'pred_class']], on=['Team', 'Player'])

tds_picks = df_prediction[~(df_prediction['PTS_line'].isnull())].rename(columns={"MP_proj": "MP"})\
            [['Team', 'Player', 'MP', 'PTS_line', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class']]

tds_picks['O/U'] = '-'
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 1) & (tds_picks.PTS_proj > tds_picks.PTS_line) & (tds_picks.Res_PTS_proj > 0)), 'O', tds_picks['O/U'])
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 0) & (tds_picks.PTS_proj < tds_picks.PTS_line) & (tds_picks.Res_PTS_proj < 0)), 'U', tds_picks['O/U'])

tds_picks = tds_picks.sort_values(['O/U', 'Team', 'Player'], ascending=[False, False, False])
if tds_picks.shape[0] >= 50:
    print(tds_picks.shape[0], 'rows')
    for tm in tds_picks.Team.unique():
        display(tds_picks[tds_picks.Team == tm])
else:
    display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/{YEAR}/gmday_preds_PTS.csv")

123 rows


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
118,UTA,Svi Mykhailiuk,25.677044,7.5,8.994377,5.536934,0.874478,1,O
116,UTA,Kyle Anderson,22.869228,6.5,7.799504,5.55453,0.767936,1,O
115,UTA,Keyonte George,34.728851,24.5,26.196766,7.451414,0.725839,1,O
114,UTA,Isaiah Collier,25.718155,8.5,11.018496,6.256546,0.725195,1,O
113,UTA,Cody Williams,27.009525,8.5,8.786075,5.398912,0.734908,1,O
117,UTA,Kyle Filipowski,27.894432,14.5,10.818471,4.164417,0.683584,1,-
112,UTA,Brice Sensabaugh,23.924534,16.5,16.493937,4.498955,0.75263,1,-
111,UTA,Ace Bailey,30.103168,15.5,15.480458,4.584274,0.649284,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
110,TOR,Scottie Barnes,33.29488,17.5,19.055714,4.980743,0.695015,1,O
109,TOR,Sandro Mamukelashvili,23.983885,10.5,11.349528,5.446475,0.894368,1,O
108,TOR,RJ Barrett,27.483448,15.5,16.434553,5.175906,0.828969,1,O
107,TOR,Jamal Shead,22.927189,5.5,6.644709,5.952062,0.8499,1,O
105,TOR,Collin Murray-Boyles,28.720457,8.5,8.603504,5.789605,0.807378,1,O
104,TOR,Brandon Ingram,34.60862,20.5,22.23826,5.675029,0.629996,1,O
106,TOR,Immanuel Quickley,32.090988,16.5,16.003244,5.102156,0.742063,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
102,SAC,Precious Achiuwa,27.291906,6.5,7.717474,6.534595,0.822657,1,O
101,SAC,Nique Clifford,22.53863,7.5,7.623115,4.823729,0.787545,1,O
100,SAC,Maxime Raynaud,26.311338,9.5,10.123234,4.122745,0.598557,1,O
98,SAC,Dennis Schroder,29.838774,13.5,13.528735,4.968122,0.672794,1,O
97,SAC,DeMar DeRozan,35.340824,17.5,19.743551,4.633007,0.659564,1,O
103,SAC,Zach LaVine,33.100948,16.5,18.186356,2.091098,0.494958,0,-
99,SAC,Malik Monk,24.698647,13.5,12.444138,4.859509,0.716686,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
96,POR,Toumani Camara,34.703365,12.5,13.143039,3.942077,0.64508,1,O
95,POR,Shaedon Sharpe,33.1143,21.5,21.827456,4.884027,0.613429,1,O
94,POR,Jrue Holiday,27.822439,11.5,13.360863,7.431259,0.794154,1,O
93,POR,Jerami Grant,26.63558,13.5,14.382161,7.903897,0.642085,1,O
92,POR,Donovan Clingan,28.825783,9.5,11.866452,5.802875,0.643544,1,O
91,POR,Deni Avdija,33.842194,21.5,25.750349,3.930582,0.604806,1,O


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
90,PHO,Royce O'Neale,30.995083,8.5,11.236658,5.084203,0.858841,1,O
88,PHO,Mark Williams,26.726439,9.5,12.089271,6.822806,0.859003,1,O
85,PHO,Grayson Allen,32.979088,18.5,18.72333,6.769444,0.716583,1,O
84,PHO,Dillon Brooks,33.037445,21.5,22.496235,5.846063,0.746311,1,O
83,PHO,Collin Gillespie,30.412436,13.5,15.572682,6.91378,0.831709,1,O
89,PHO,Oso Ighodaro,20.2258,6.5,5.158618,5.693331,0.82158,1,-
87,PHO,Jordan Goodwin,21.6558,7.5,7.117359,5.174699,0.793622,1,-
86,PHO,Jalen Green,14.008404,13.5,5.460039,6.910934,0.723159,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
82,ORL,Wendell Carter Jr.,31.078806,10.5,12.25052,6.907574,0.927597,1,O
81,ORL,Paolo Banchero,36.786259,23.5,24.890781,2.250544,0.634443,1,O
80,ORL,Jalen Suggs,27.516405,14.5,14.799756,5.36734,0.826852,1,O
78,ORL,Anthony Black,34.986698,16.5,17.622414,5.053198,0.993421,1,O
79,ORL,Desmond Bane,35.64743,19.5,19.444912,3.434643,0.657289,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
77,NYK,OG Anunoby,34.40889,15.5,16.190203,6.00343,0.783008,1,O
76,NYK,Mitchell Robinson,19.426762,5.5,5.8006,4.983651,0.816459,1,O
75,NYK,Mikal Bridges,34.711353,14.5,16.852037,5.347328,0.665451,1,O
74,NYK,Landry Shamet,19.923786,7.5,8.091944,7.653614,0.880415,1,O
72,NYK,Josh Hart,34.087433,12.5,13.211774,5.371821,0.626425,1,O
73,NYK,Karl-Anthony Towns,29.721233,19.5,17.11619,4.769833,0.891249,1,-
71,NYK,Jalen Brunson,34.180206,26.5,25.544508,3.028162,0.537799,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
69,NOP,Yves Missi,21.866106,6.5,7.290818,5.225692,0.796158,1,O
67,NOP,Saddiq Bey,31.69647,16.5,19.572887,7.041651,0.928567,1,O
66,NOP,Jose Alvarado,19.006754,5.5,6.762977,6.871393,0.809955,1,O
65,NOP,Jeremiah Fears,20.648163,9.5,10.544068,5.848349,0.828251,1,O
70,NOP,Zion Williamson,30.846575,22.5,21.381413,5.067556,0.834735,1,-
68,NOP,Trey Murphy III,35.678394,23.5,22.112005,7.577463,0.793057,1,-
64,NOP,Herbert Jones,28.889149,9.5,8.227095,6.127775,0.752728,1,-
63,NOP,Derik Queen,26.543676,10.5,10.469733,5.175247,0.836821,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
62,MEM,Vince Williams Jr.,20.048811,7.5,8.049873,3.711741,0.59087,1,O
57,MEM,Cedric Coward,27.803457,14.5,15.340719,5.729885,0.683547,1,O
61,MEM,Kentavious Caldwell-Pope,19.652252,8.5,8.022098,5.757865,0.806637,1,-
60,MEM,Jock Landale,26.802271,14.5,14.426738,6.2483,0.82502,1,-
59,MEM,Jaylen Wells,26.323767,12.5,12.288936,5.617818,0.778193,1,-
58,MEM,Jaren Jackson Jr.,31.839067,22.5,21.009914,6.150821,0.674721,1,-
56,MEM,Cam Spencer,28.421543,12.5,12.259137,6.353702,0.92551,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
54,LAL,Marcus Smart,29.568949,9.5,9.63614,6.925024,0.784476,1,O
51,LAL,Jaxson Hayes,17.445461,6.5,6.633598,6.846673,0.968946,1,O
50,LAL,Jake LaRavia,30.531136,9.5,10.062018,4.575361,0.752402,1,O
48,LAL,Austin Reaves,22.601053,15.5,18.0791,5.375216,0.734337,1,O
55,LAL,Rui Hachimura,23.465979,11.5,10.737215,4.831203,0.836767,1,-
53,LAL,Luka Doncic,35.652592,34.5,32.232166,3.259362,0.689501,1,-
52,LAL,LeBron James,33.266457,22.5,20.259176,7.1487,0.881737,1,-
49,LAL,Deandre Ayton,26.836842,11.5,10.820445,5.504934,0.823474,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
47,LAC,Nicolas Batum,18.795717,3.5,3.870655,5.115123,0.822225,1,O
45,LAC,Kawhi Leonard,32.137024,23.5,25.571623,7.048356,0.867249,1,O
44,LAC,John Collins,29.648497,12.5,13.22909,6.098096,0.922669,1,O
43,LAC,James Harden,36.174141,22.5,22.863646,4.64085,0.725477,1,O
41,LAC,Brook Lopez,16.916676,5.5,6.035843,3.880005,0.632251,1,O
46,LAC,Kris Dunn,28.480093,7.5,7.072093,4.756946,0.873634,1,-
42,LAC,Ivica Zubac,30.620195,13.5,13.06311,3.90396,0.722484,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
40,GSW,Stephen Curry,32.336643,28.5,28.857782,5.11798,0.747256,1,O
39,GSW,Quinten Post,17.473286,7.5,7.791207,6.480171,0.866505,1,O
37,GSW,Gui Santos,18.177795,5.5,6.770164,7.458117,0.719019,1,O
35,GSW,De'Anthony Melton,21.398369,12.5,12.915209,8.311174,0.795428,1,O
34,GSW,Buddy Hield,19.741131,7.5,7.896945,7.349835,0.758081,1,O
32,GSW,Al Horford,26.410759,7.5,8.094808,3.387411,0.699226,1,O
38,GSW,Moses Moody,27.203739,11.5,11.285499,5.073241,0.828917,1,-
36,GSW,Draymond Green,27.20933,7.5,7.2045,4.714188,0.692458,1,-
33,GSW,Brandin Podziemski,29.740448,12.5,12.32741,5.867133,0.765581,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
31,DET,Tobias Harris,29.92247,12.5,13.468185,4.747232,0.72618,1,O
29,DET,Isaiah Stewart,22.079271,7.5,8.666005,5.403678,0.635277,1,O
28,DET,Duncan Robinson,27.925745,9.5,10.467086,5.280965,0.778315,1,O
26,DET,Ausar Thompson,25.734707,8.5,8.994225,3.972944,0.595674,1,O
30,DET,Jalen Duren,28.869825,16.5,15.769996,3.693276,0.658222,1,-
27,DET,Cade Cunningham,35.548389,24.5,24.369274,1.973149,0.747251,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
25,DEN,Tim Hardaway Jr.,26.975237,11.5,12.73611,4.874007,0.761539,1,O
24,DEN,Peyton Watson,35.314293,13.5,19.440807,3.632836,0.765811,1,O
23,DEN,Nikola Jokic,32.809101,21.5,27.690844,3.667847,0.603283,1,O
22,DEN,Jonas Valanciunas,16.873547,10.5,10.642015,4.988082,0.877966,1,O
21,DEN,Jamal Murray,37.341427,23.5,25.054085,4.62127,0.686413,1,O
20,DEN,Jalen Pickett,27.772213,6.5,7.452413,6.006085,0.830874,1,O
19,DEN,Bruce Brown,23.559319,6.5,8.0923,2.858079,0.677746,1,O


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
18,CLE,Sam Merrill,25.101837,10.5,12.374696,5.745294,0.765221,1,O
14,CLE,Dean Wade,26.900541,5.5,6.741318,3.800869,0.754183,1,O
13,CLE,De'Andre Hunter,21.969164,11.5,11.93631,2.84391,0.759148,1,O
17,CLE,Jaylon Tyson,31.17222,16.5,16.409962,5.014479,0.847023,1,-
16,CLE,Jarrett Allen,29.913424,15.5,13.633236,3.370231,0.556824,1,-
15,CLE,Donovan Mitchell,34.403137,30.5,26.937765,2.177552,0.601896,1,-
12,CLE,Craig Porter Jr.,18.45867,5.5,4.061168,4.13716,0.788459,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
10,BRK,Nolan Traore,28.87933,8.5,10.801936,7.768179,0.831824,1,O
11,BRK,Terance Mann,27.198214,9.5,9.0566,5.238827,0.661703,1,-
9,BRK,Drake Powell,20.688896,8.5,7.05661,6.470465,0.733005,1,-
8,BRK,Day'Ron Sharpe,20.109846,9.5,9.209241,9.093816,0.810163,1,-
7,BRK,Danny Wolf,25.895535,13.5,10.732086,7.249903,0.830103,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
6,BOS,Sam Hauser,28.929213,11.5,14.266651,6.782626,0.751966,1,O
1,BOS,Baylor Scheierman,25.732018,5.5,5.946781,5.699912,0.704014,1,O
5,BOS,Payton Pritchard,32.678711,22.5,17.408855,5.085602,0.591553,1,-
4,BOS,Neemias Queta,25.883425,11.5,9.885349,5.207067,0.802519,1,-
3,BOS,Luka Garza,17.474705,9.5,9.404966,6.743474,0.843565,1,-
2,BOS,Derrick White,34.776772,22.5,17.628517,5.486843,0.545001,1,-
0,BOS,Anfernee Simons,25.760784,16.5,15.555498,6.092296,0.722095,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
122,WAS,Will Riley,17.930891,9.5,6.985455,6.430329,0.801019,1,-
121,WAS,Kyshawn George,32.155891,18.5,18.117847,6.848206,0.706195,1,-
120,WAS,Justin Champagnie,29.395548,11.5,11.011437,5.36194,0.892731,1,-
119,WAS,Bilal Coulibaly,27.349087,10.5,9.871196,3.881321,0.679895,1,-


In [144]:
# tds_picks['PTS_proj_mag'] = tds_picks.PTS_proj - tds_picks.PTS_line
# print('O:', tds_picks[tds_picks.PTS_proj_mag > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.PTS_proj_mag < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'PTS_proj_mag'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()
# tds_picks = tds_picks.drop('PTS_proj_mag', axis=1)

In [32]:
# print('O:',tds_picks[tds_picks.Res_PTS_proj > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.Res_PTS_proj < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'Res_PTS_proj'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [33]:
# print('O:', tds_picks[tds_picks.pred_class == 1].shape[0] / tds_picks.shape[0])
# print('U:', tds_picks[tds_picks.pred_class == 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'pred_prob'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Misc.

In [34]:
# # Historical Percentages
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_{tgt_stat}.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday.rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

# df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
# df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
# df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# df_yesterday['Diff3'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff3'] <= 5, 1, 0)

# for day in df_gms.Date.unique():
#     df_temp = df_yesterday[df_yesterday.Date == day]
#     if df_temp.shape[0] > 0:
#         print(f"{day.date()} Total PTS Accuracy:", f"{(df_temp.ParlayHit == 1).sum()}/{df_temp.shape[0]}", ((df_temp.ParlayHit == 1).sum() / df_temp.shape[0]))
#         print(f"{day.date()} Total MP Accuracy:", f"{(df_temp.InRMSE_Range == 1).sum()}/{df_temp.shape[0]}", ((df_temp.InRMSE_Range == 1).sum() / df_temp.shape[0]), "\n")

In [35]:
# df_mins = df.copy()
# df_mins = setup_df_mins(df_mins)

# train_summary = df_mins.drop(['Season', 'Date', 'MP'], axis=1).describe().T
# gameday_summary = df_prediction_mins[df_prediction_mins.Date == now].drop(['Season', 'Date', 'MP'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [36]:
# df_main = df.copy()
# df_main = setup_df_main(df_main, tgt_stat)

# train_summary = df_main.drop(['Season', 'Team', 'Opp', 'Player', 'Pos', 'Date', 'PTS'], axis=1).describe().T
# gameday_summary = df_prediction[(~df_prediction.PTS_line.isnull())].drop(['Pos', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class', 'PTS_line'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [37]:
# df_res = df[(~df.PTS_line.isnull())].copy()
# df_res = setup_df_res(df_res)

# train_summary = df_res.drop(['Date', 'Res_PTS'], axis=1).describe().T
# gameday_summary = df_prediction2[(~df_prediction2.PTS_line.isnull())].describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [38]:
# for col in mins_train_df.drop(['Season', 'Date', 'MP'], axis=1).columns:
#     if col not in ['Team', 'Player', 'Opp', 'Pos']:
#         PartialDependenceDisplay.from_estimator(
#             mins_model,
#             mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#             features=[col],
#             grid_resolution=25
#         )
        
# ##################
# features = [('role')]
# PartialDependenceDisplay.from_estimator(
#     mins_model,
#     mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#     features=features,
#     grid_resolution=25
# )