# To do:

 - Both
     - Fix Injuries data
         - Find a better source for roster data (I found, work on the plyr_pos_xref notebook)
     - Signal Opp Injuries
 - Mins
 - PTS
     - Find more effective way to signal Defensive stats
 - Res_PTS
     - See notes in Res_PTS section

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import os
import json

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from scipy.stats import randint, uniform

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import PartialDependenceDisplay

import joblib
import warnings
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")

cwd = os.path.abspath(os.getcwd()).replace("\\", "/")
if cwd.startswith("C:/Users/Rodolfo/"):
    RUN_LOCATION = "local"
    MDL_PATH = "../ML_models/dev"
else:
    RUN_LOCATION = "cloud"
    MDL_PATH = "../ML_models"
time_offset = {"local": 3, "cloud": -5}
now = str((datetime.now() + timedelta(hours=time_offset[RUN_LOCATION]) + timedelta(hours=-3)).date())
print(f"Today's date:", now)

tgt_stat = "PTS"
print('Target Stat:', tgt_stat)

Today's date: 2026-01-28
Target Stat: PTS


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model, all_features):
    booster = model.get_booster()
    score = booster.get_score(importance_type="gain")

    df_importance = pd.DataFrame({
            "feature": all_features,
            "importance": [score.get(f, 0.0) for f in all_features]
        }).sort_values("importance", ascending=False).reset_index(drop=True)
    
    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    df_importance['importance'] = df_importance['importance'].map('{:.4f}'.format)
    if df_importance.shape[0] >= 50:
        with pd.option_context('display.max_rows', None):
            display(df_importance)
    else:
        display(df_importance)
    
    xgb.plot_importance(model)
    plt.show()

In [4]:
def compute_sample_weights(df, decay=0.99):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    max_date = df['Date'].max()
    df["days_old"] = (max_date - df['Date']).dt.days
    weights = decay ** df["days_old"]
    
    return weights.values

In [5]:
def quantile_loss(y_true, y_pred, q):
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

In [6]:
def create_baseline_model(df, pred_col, DFS):
    
    train_df, val_df, test_df = DFS

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg', 'game_spread_type'
        ]
    else:
        print(f'{pred_col} Stats Model')
        feature_cols = [
            'MP_L5_avg',
            'MP_L10_avg',
            f'{pred_col}_last_3_avg', f'{pred_col}_last_5_avg', f'{pred_col}_last_10_avg',
            f'Def_{pred_col}', f'Def_L5_{pred_col}'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst

In [7]:
def hyperparam_tuning(DFS, pred_col, is_classification=False, quantile=False, n_iter=20, early_stopping_rounds=50, decay=1, q_val=0.5):
    """
    Hyperparameter tuning for XGBRegressor or XGBClassifier using native XGBoost API
    """
    train_df, val_df, test_df = DFS
    feature_cols = [col for col in train_df.columns if col not in ['Season', 'Date', pred_col]]
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Sample Weights (decay < 1)
    w_train = compute_sample_weights(train_df, decay=decay)
    w_val   = compute_sample_weights(val_df, decay=decay)
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train, enable_categorical=True)
    dval   = xgb.DMatrix(X_val, label=y_val, weight=w_val, enable_categorical=True)
    dtest  = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.005, 0.08),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(3, 10),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.5, 0.5),
        "gamma": uniform(0, 0.8),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 1)
    }

    # Generate n_iter random parameter sets
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k,v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_score = float('inf') if not is_classification else 0
    best_params = None
    best_bst = None

    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')

        # Set objective based on regression or classification
        if is_classification:
            params.update({
                "objective": "binary:logistic",
                "enable_categorical": True,
                "eval_metric": "logloss",
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })
        elif quantile:
            params.update({
                "objective": "reg:quantileerror",
                "quantile_alpha": q_val, 
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        else:
            params.update({
                "objective": "reg:squarederror",
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )

        # Validation scoring
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        if is_classification:
            val_class = (val_preds > 0.5).astype(int)
            score = (val_class == y_val.values).mean()  # accuracy
            print(f"Validation Accuracy: {score:.4f}")
            if score > best_score:
                best_score = score
                best_params = params.copy()
                best_bst = bst
        elif quantile:
            q_loss = quantile_loss(y_val.values, val_preds, q_val)
            print(f"Validation Quantile Loss (q_val={q_val}): {q_loss:.4f}")
            if q_loss < best_score:
                best_score = q_loss
                best_params = params.copy()
                best_bst = bst
        else:
            mae = mean_absolute_error(y_val, val_preds)
            print(f"Validation MAE: {mae:.4f}")
            if mae < best_score:
                best_score = mae
                best_params = params.copy()
                best_bst = bst

    print("\nBest score:", best_score)
    print("Best parameters:", best_params)

    # Test predictions
    test_preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    if is_classification:
        test_class = (test_preds > 0.5).astype(int)
        acc = (test_class == y_test.values).mean()
        print("\nTest Accuracy:", acc)
    else:
        print("\nTest Metrics:")
        if quantile:
            ql = quantile_loss(y_test, test_preds, q_val)
            print("Quantile loss:", ql)
            coverage = np.mean(y_test <= test_preds)
            print(f"Coverage for q_val={q_val}: {coverage:.2f}")
        else:
            print("RMSE:", np.sqrt(mean_squared_error(y_test, test_preds)))
            print("MAE:", mean_absolute_error(y_test, test_preds))
            print("R²:", r2_score(y_test, test_preds))

    return best_params

In [8]:
def refit_model(df, pred_col, params_file, min_train_days=0, rolling_window=None, decay=1):
    df = df.sort_values("Date")
    dates = df["Date"].unique()
    print(f'Rows: {df.shape[0]}, Dates: {len(dates)}, min_train_days: {min_train_days}')

    feature_cols = [c for c in df.columns if c not in ["Season", "Date", pred_col]]

    # Load hyperparameters
    with open(f"{MDL_PATH}/{params_file}.json", "r") as f:
        loaded_params = json.load(f)

    preds, actuals, dates_out, predictions = [], [], [], []
    total_iters = len(dates) - min_train_days

    for idx, i in enumerate(range(min_train_days, len(dates)), start=1):
        test_date = dates[i]
        test_season = df.loc[df["Date"] == test_date, "Season"].iloc[0]

        if rolling_window:
            train_start_idx = max(0, i - rolling_window)
        else:
            train_start_idx = 0

        train_dates = dates[train_start_idx:i]

        train_df = df[df["Date"].isin(train_dates)]
        test_df  = df[df["Date"] == test_date]

        if test_df.empty:
            continue

        X_train, y_train = train_df[feature_cols], train_df[pred_col]
        X_test, y_test   = test_df[feature_cols], test_df[pred_col]

        predictions.append(test_df)
        if pred_col == 'Bet':
            model = XGBClassifier(**loaded_params)
            model.fit(X_train, y_train)
            
            test_df['pred_prob'] = model.predict_proba(X_test)[:,1]
            test_df['pred_class'] = (test_df['pred_prob'] > 0.5).astype(int)
        else:
            model = XGBRegressor(**loaded_params)
            sample_weights = compute_sample_weights(train_df, decay=decay)
            model.fit(X_train, y_train, sample_weight=sample_weights)

            y_pred = model.predict(X_test)
            preds.extend(y_pred)
            actuals.extend(y_test.values)
            dates_out.extend([test_date] * len(y_pred))

        if idx % max(1, total_iters // 20) == 0:
            pct = 100 * idx / total_iters
            print(f"Progress: {pct:6.2f}% ({idx}/{total_iters})")
            
    results = pd.concat(predictions)
    if pred_col == 'Res_PTS':
        results['Actuals'] = actuals
        results['Predictions'] = preds
        mae = mean_absolute_error(actuals, preds)
        print("Walk-forward MAE:", mae)
        results["Correct_Direction"] = (np.sign(results["Predictions"]) == np.sign(results["Actuals"])).astype(int)
        for t in [0, 1, 2, 3]:
            subset = results[results["Predictions"].abs() >= t]
            acc = subset["Correct_Direction"].mean() if len(subset) > 0 else np.nan
            print(f"|Pred| >= {t}: accuracy = {acc:.3f}, n = {len(subset)}")
    elif pred_col == 'Bet':
        cm = confusion_matrix(results['Bet'], results['pred_class'])
        print("Confusion Matrix:\n", cm)
        report = classification_report(results['Bet'], results['pred_class'])
        print("Classification Report:\n", report)
        auc = roc_auc_score(results['Bet'], results['pred_prob'])
        print(f"ROC AUC: {auc:.3f}")

        high_confidence = results.copy()
        high_confidence['pred_prob'] = np.where(high_confidence.pred_prob > 0.5, 1 - high_confidence.pred_prob, high_confidence.pred_prob)
        high_confidence = high_confidence[high_confidence['pred_prob'] <= 0.3]
        if len(high_confidence) > 0:
            hit_rate = (high_confidence['pred_class'] == high_confidence['Bet']).mean()
            print(f"High-confidence hit rate (<= 0.3 & >= 0.7): {hit_rate:.2f}")

    else:
        results['Actuals'] = actuals
        results['Predictions'] = preds
        if loaded_params['objective'] == 'reg:quantileerror':
            ql = quantile_loss(results['Actuals'], results['Predictions'], loaded_params['quantile_alpha'])
            print("Quantile loss:", ql)
            coverage = np.mean(results['Actuals'] <= results['Predictions'])
            print(f"Coverage for q_val={loaded_params['quantile_alpha']}: {coverage:.2f}")
        else:
            mae = mean_absolute_error(actuals, preds)
            rmse = np.sqrt(mean_squared_error(actuals, preds))
            r2 = r2_score(actuals, preds)
            print(f"Walk-forward RMSE: {rmse:.3f}")
            print(f"Walk-forward MAE: {mae:.3f}")
            print(f"Walk-forward R²: {r2:.3f}")

    return model, results

### Create Base df

In [9]:
def load_df(file_name):
    df = pd.DataFrame()
    for i in [2021, 2022, 2023, 2024, 2025]:
        df_temp = pd.read_csv(f"../tables/{i}/{file_name}.csv")
        df_temp['Season'] = i
        df = pd.concat([df, df_temp])
        
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df.Date)
    if file_name == "season_gamelogs":
        df = df[~df[['Date', 'Team', 'Player']].duplicated(keep='last')]
    
    return df

In [10]:
# Load dfs
df = load_df('nba_schedule')
df2 = load_df('season_gamelogs')
# df3 = load_df('REPLACE ME')
df4 = load_df('injuries')
df5 = load_df('plyr_pos_xref')
df6 = load_df('daily_lineups')
gmlog_cols = ['game_id', 'Player', 'MP', 'PF', 'PTS', 'FG', 'FGA', 'FT', 'FTA', '3PM', '3PA', 'ORB', 'TOV']
df7 = load_df('h1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h1", "PTS": "PTS_h1", "FG": "FG_h1", "FGA": "FGA_h1", "FT": "FT_h1", "FTA": "FTA_h1", "3PM": "TPM_h1", "3PA": "TPA_h1", "PF": "PF_h1", "TOV": "TOV_h1", "ORB": "ORB_h1"})
df8 = load_df('h2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h2", "PTS": "PTS_h2", "FG": "FG_h2", "FGA": "FGA_h2", "FT": "FT_h2", "FTA": "FTA_h2", "3PM": "TPM_h2", "3PA": "TPA_h2", "PF": "PF_h2", "TOV": "TOV_h2", "ORB": "ORB_h2"})
df9 = load_df('q1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q1", "PTS": "PTS_q1", "FG": "FG_q1", "FGA": "FGA_q1", "FT": "FT_q1", "FTA": "FTA_q1", "3PM": "TPM_q1", "3PA": "TPA_q1", "PF": "PF_q1", "TOV": "TOV_q1", "ORB": "ORB_q1"})
df10 = load_df('q2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q2", "PTS": "PTS_q2", "FG": "FG_q2", "FGA": "FGA_q2", "FT": "FT_q2", "FTA": "FTA_q2", "3PM": "TPM_q2", "3PA": "TPA_q2", "PF": "PF_q2", "TOV": "TOV_q2", "ORB": "ORB_q2"})
df11 = load_df('q3_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q3", "PTS": "PTS_q3", "FG": "FG_q3", "FGA": "FGA_q3", "FT": "FT_q3", "FTA": "FTA_q3", "3PM": "TPM_q3", "3PA": "TPA_q3", "PF": "PF_q3", "TOV": "TOV_q3", "ORB": "ORB_q3"})
df12 = load_df('q4_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q4", "PTS": "PTS_q4", "FG": "FG_q4", "FGA": "FGA_q4", "FT": "FT_q4", "FTA": "FTA_q4", "3PM": "TPM_q4", "3PA": "TPA_q4", "PF": "PF_q4", "TOV": "TOV_q4", "ORB": "ORB_q4"})

df_mtch = df[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df = pd.concat([df_mtch, df_mtch2])
df = df.sort_values(["Team", "Date"])
df['team_game_num'] = df.groupby(["Team", "Season"]).cumcount() + 1
df['Spread'] = np.where(df.Team_type == 'Home', df.AwayPTS - df.HomePTS, df.HomePTS - df.AwayPTS)
df['Total'] = df.AwayPTS + df.HomePTS
df['is_Win'] = np.where(df.Spread > 0, 1, 0)
df['Szn_Wins'] = df.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.merge(df5, on=['Season', 'Team'])

df2 = df2.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"})
df2['PR'] = df2.PTS + df2.REB 
df2['PA'] = df2.PTS + df2.AST
df2['RA'] = df2.REB + df2.AST
df2['PRA'] = df2.PTS + df2.REB + df2.AST
df2['STL_BLK'] = df2.STL + df2.BLK
df = df.merge(df2.drop(['Pos', 'Opp', 'Team_type'], axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')

df = df.merge(df4[['Date', 'Team', 'Player', 'Status']], on=['Date', 'Team', 'Player'], how='left')
df['Status'] = np.where((df.Active == 1) & (df.Status.isnull()), 'Available', df.Status)
df['Status'] = np.where((df.Active == 0), 'Out', df.Status)
df['Status'] = np.where((df.Status == 'Out') & (df.Active != 0), 'Available', df.Status)

df6['role'] = 1
df = df.merge(df6.drop('Pos', axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')
df['role'] = df.role.fillna(2).astype(int)

# Add gmlog splits
df_gmlog_comb = df7.merge(df8, on=['game_id', 'Player'])
for df_loop in (df9, df10, df11, df12):
    df_gmlog_comb = df_gmlog_comb.merge(df_loop, on=['game_id', 'Player'])
df = df.merge(df_gmlog_comb, on=['game_id', 'Player'], how='left')

df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
df = df.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df['Res_PTS'] = df.PTS - df.PTS_line

df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
df_td = df[df.Date == now]
df = df[(df.Active == 1) & (df.MP > 0)]
df_pred = df.copy()
df_pred = pd.concat([df_pred, df_td])
print('base df created', datetime.now())

base df created 2026-01-28 22:03:25.623240


### Feature Engineering Helper Functions

In [11]:
def create_df_missing(df, pred_col):

    df3 = load_df('season_gamelogs')
    df3 = df3.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"}).drop(['Pos', 'Opp'], axis=1)
    df4 = load_df('injuries')
    
    # Fill missing games from injuries.csv
    team_games = df_pred[['Season', 'Team', 'Date']].drop_duplicates()
    players = df_pred[['Season','Player','Team']].drop_duplicates()
    fabricated = (players.sort_values('Season').groupby('Player', as_index=False).last())
    fabricated['Season'] = fabricated['Season'] + 1
    players = pd.concat([players, fabricated], ignore_index=True).drop_duplicates(['Season','Player','Team'])
    expanded = team_games.merge(players, on=['Season', 'Team'], how='left')

    df5 = load_df('plyr_pos_xref')

    expanded = expanded.merge(df3[['Season', 'Player', 'Date', 'MP']], on=['Season', 'Player', 'Date'], how='left').drop_duplicates(['Season', 'Date', 'Player', 'Team'])
    expanded = expanded[(expanded.MP.isnull()) & (expanded.Date != now)].drop('MP', axis=1)
    expanded = pd.concat([expanded, df4[df4.Status == 'Out'][['Season', 'Team', 'Date', 'Player']]])
    df4 = df4.merge(expanded, on=['Season', 'Date', 'Team', 'Player'], how='right')

    # Grab outs from players season gamelogs
    df4 = df4.merge(df3, on=['Season', 'Date', 'Team', 'Player'], how='outer')
    df4['Status'] = np.where(((df4.Active == 1) | (df4.MP > 0)), 'Available', df4.Status)
    df4['Status'] = np.where(((df4.Active == 0) | (df4.MP == 0) | (df4.MP.isnull())), 'Out', df4.Status)
    df4['Status'] = np.where((df4.Status == 'Out') & (df4.MP > 0), 'Available', df4.Status)
    df4['Status'] = np.where((df4.Status != 'Out') & (df4.MP == 0), 'Out', df4.Status)
    df4 = df4[df4.Status == 'Out'][['Season', 'Date', 'Team', 'Player']].drop_duplicates()
    
    df_missing = df[['Season', 'Date', 'Team', 'Player', 'role', pred_col]].copy()
    df_missing[f'{pred_col}_L10'] = (
        df_missing.sort_values(['Player', 'Date']).groupby(['Player','Season'])[pred_col].shift(1)
                  .transform(lambda x: x.rolling(10, min_periods=10).mean())
    )
    df_missing['role_L10_mode'] = (
        df_missing.sort_values(['Player', 'Date'])
            .groupby(['Player', 'Season'])['role'].shift(1)
            .transform(lambda x: x.rolling(10, min_periods=10)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
    )
    df_missing = pd.merge_asof(df4, df_missing[["Season", "Player", "Date", "role", "role_L10_mode", f"{pred_col}_L10"]], 
                      on="Date", by=["Player", "Season"], direction="backward", allow_exact_matches=True).dropna()   
    df_missing = df_missing.merge(df5, on=['Season', 'Team', 'Player'])
    
    # Filter out old injuries
    df_missing = df_missing.sort_values(["Season", "Team", "Player", "Date"])
    df_missing["team_game_num"] = (df_missing.groupby(["Season", "Team"])["Date"].rank(method="dense").astype(int))
    df_missing["game_break"] = (df_missing.groupby(["Season", "Team", "Player"])["team_game_num"].diff().ne(1))
    df_missing["streak_id"] = (df_missing.groupby(["Season", "Team", "Player"])["game_break"].cumsum())
    df_missing["consecutive_games"] = (df_missing.groupby(["Season", "Team", "Player", "streak_id"]).cumcount().add(1))
    df_missing["eligible_today"] = (df_missing["consecutive_games"] <= 10).astype(int)
    df_missing["role_for_count"] = np.where(df_missing["eligible_today"] == 1, df_missing["role_L10_mode"], np.nan)    
    df_missing[f'{pred_col}_L10'] = np.where(df_missing['role_for_count'] == 1, df_missing[f'{pred_col}_L10'], 0)

#     display(df_missing[(df_missing.Team == 'CLE') & (df_missing.Date == '2026-01-23')].tail(10))

    out_minutes = (
    df_missing
      .groupby(["Season", "Date", "Team"])
      .agg(
          tgt_available=(f"{pred_col}_L10", lambda x: x.sum()),
          starters_out=("role_for_count", lambda x: (x == 1).sum())
      )
      .reset_index()
    ).rename(columns={"tgt_available": f"team_{pred_col}_available"})

    return out_minutes

In [12]:
def filter_out_early_exits(df):
    for N in [3, 5, 10]:
        df[f'MP_L{N}_avg'] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['MP'].shift(1)
             .rolling(window=N, min_periods=N)
             .mean()
        )
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)    
    df['Early_Exit'] = ((df['MP_base'].notna()) & (df['MP_base'] > 0) &
                        (
                          ((df['MP'] - df['MP_base']) / df['MP_base'] <= -0.4) |  
                          ((df['MP_q4'] == 0) & (df['role'] == 1))
                        )).astype(int)    
    df = df[df.Early_Exit == 0]
    df = df.drop('Early_Exit', axis=1)
    
    return df

# Minutes Projection Model

In [13]:
def setup_df_mins(con, df):

    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'role',
             'MP', 'MP_q4', 'Spread', 'team_game_num', 'is_OT']]    
    cleanup_cols = []
    cold_features = []
    df = filter_out_early_exits(df)
    
    df['team_mins_pct'] = df['MP'] / (240 + (df.is_OT * 25))
    for col in ['MP', 'team_mins_pct']:
        for N in [3, 5, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                 .rolling(window=N, min_periods=N)
                 .mean()
            )
            df[f'is_cold_{col}_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
            cold_features.append(f'is_cold_{col}_L{N}')
            cleanup_cols.append(f'{col}_L{N}_avg')
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    df['MP_tm_pct_base'] = df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    
    df['role'] = np.where((df.role == 2) & (df.MP_base < 13), 3, df.role)
    
    games_last_14_days = df.sort_values(['Player', 'Season', 'Date']).groupby(['Player', 'Season']).rolling('14D', on='Date', closed='left')['MP'].count().reset_index().rename(columns={"MP": "gms_L14_days"})
    games_last_14_days = games_last_14_days.drop_duplicates(
        subset=['Player', 'Season', 'Date']
    )
    df = df.merge(games_last_14_days, on=['Player', 'Season', 'Date'])
    df['gms_L14_days'] = df.gms_L14_days.fillna(0).astype(int)    
    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
    df['games_since_return'] = (df.groupby(['Player', 'Team', 'Season']).apply(
                                    lambda g: (
                                        (g['team_game_num'].diff().sub(1).fillna(0).gt(0))
                                        .cumsum()
                                        .groupby((g['team_game_num'].diff().sub(1).fillna(0).gt(0)).cumsum()).cumcount()
                                    )
                                ).reset_index(level=[0,1,2], drop=True))

    for N in [1, 3, 5]:
        df[f"recent_role_L{N}"] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['role'].shift(1)
              .transform(lambda x: x.rolling(N, min_periods=N)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
        )
        df[f'is_cold_recent_role_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
        cold_features.append(f'is_cold_recent_role_L{N}')
        cleanup_cols.append(f"recent_role_L{N}")
    
#     df['game_spread_type'] = 0
#     df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
#     df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 21), 2, df.game_spread_type) 
#     df['game_spread_type'] = np.where(abs(df.Spread) > 21, 3, df.game_spread_type) 
    
    df2 = create_df_missing(df, 'MP')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_MP_available']:
        df[col] = df[col].fillna(0).astype(int)
        
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df['returning_MP'] = (
        (df['MP_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df['team_MP_available'] = df['team_MP_available'] - df['returning_MP']

    df['MP_Change'] = 0
    MP_Inc_conds = (
#                     ((df.role != 3) & (df.starters_out > 2)) | 
                    ((df.role == 1) & (df.recent_role_L1 > 1.0)).astype(int) + 
                    (df.team_MP_available >= 110).astype(int)
                   )
    
    MP_Dec_conds = (
                    ((df.role > 1) & ((df.recent_role_L1 == 1.0))).astype(int) + 
                    (df.team_MP_available < -23).astype(int)
                   ) * -1
    df['MP_Change'] = MP_Inc_conds + MP_Dec_conds

    df['scenario_mins'] = (
        df.sort_values(['Season','Team','role','Pos','Date'])
          .groupby(['Season','Team','role','Pos'])['MP'].shift(1)
          .expanding()
          .mean() 
          .reset_index(drop=True)
    )
    
    df['MP_trend'] = df['MP_L3_avg'] - df['MP_L10_avg']
    df['Expected_MP'] = (
        (0.8 * df['scenario_mins']) +
        (df['team_MP_available'] * df['MP_tm_pct_base']) + 
        (0.2 * df['MP_base']) + df['MP_trend']
    )
    
    df["is_cold_start"] = (df[cold_features].eq(1).any(axis=1).astype(int))
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['Team_type', 'team_game_num', 'is_OT', 'Spread', 'team_mins_pct', 'MP_tm_pct_base', 
                  'returning_MP', 'scenario_mins', 'MP_q4'] + cleanup_cols + cold_features, axis=1)    


    return df

In [14]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

game_dates = (df_mins[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

mins_train_df = df_mins[df_mins['Date'] <= train_end]
mins_val_df   = df_mins[(df_mins['Date'] > train_end) & (df_mins['Date'] <= val_end)]
mins_test_df  = df_mins[df_mins['Date'] > val_end]
mins_DFS = (mins_train_df, mins_val_df, mins_test_df)

# Prev r2/mae/rmse best: 0.7064/4.0508/5.1790 [1/24/2026]
# mins_params = hyperparam_tuning(mins_DFS, "MP", n_iter=25)
# with open(f"{MDL_PATH}/mins_params.json", "w") as f:
#     json.dump(mins_params, f)

# Prev r2/mae/rmse best: 0.729/3.615/4.599 [1/24/2026]
mins_model, mins_results = refit_model(df_mins, 'MP', 'mins_params', min_train_days=910, rolling_window=180)
# feature_importance(mins_model, df_mins.columns.tolist())

mins_model.get_booster().save_model(f"{MDL_PATH}/mins_model.json")
print('Saved Mins booster!')

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,MP_base,gms_L14_days,missed_games,games_since_return,team_MP_available,starters_out,starters_returning,MP_Change,MP_trend,Expected_MP,is_cold_start
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,1,22.98,,0,0,0,0.0,0,0,0,,,1
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,2,3.75,,0,0,0,0.0,0,0,0,,,1
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,2,3.75,,0,0,0,0.0,0,0,0,,,1
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,2,3.75,,0,0,0,0.0,0,0,0,,,1
4,2021,2021-10-19,BRK,MIL,James Harden,PG,1,30.63,,0,0,0,0.0,0,0,0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106379,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,15.80,16.9059,6,0,19,65.0,2,0,0,-2.325667,25.539790,0
106380,2025,2026-01-26,POR,BOS,Robert Williams,C,2,19.68,17.5051,4,1,0,65.0,2,0,0,1.450667,29.598207,0
106381,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,34.32,32.4946,6,0,28,65.0,2,0,0,1.219000,36.424146,0
106382,2025,2026-01-26,POR,BOS,Sidy Cissoko,SF,1,24.77,26.2039,6,0,16,65.0,2,0,0,2.881000,35.124206,0


Rows: 106384, Dates: 943, min_train_days: 910
Progress:   3.03% (1/33)
Progress:   6.06% (2/33)
Progress:   9.09% (3/33)
Progress:  12.12% (4/33)
Progress:  15.15% (5/33)
Progress:  18.18% (6/33)
Progress:  21.21% (7/33)
Progress:  24.24% (8/33)
Progress:  27.27% (9/33)
Progress:  30.30% (10/33)
Progress:  33.33% (11/33)
Progress:  36.36% (12/33)
Progress:  39.39% (13/33)
Progress:  42.42% (14/33)
Progress:  45.45% (15/33)
Progress:  48.48% (16/33)
Progress:  51.52% (17/33)
Progress:  54.55% (18/33)
Progress:  57.58% (19/33)
Progress:  60.61% (20/33)
Progress:  63.64% (21/33)
Progress:  66.67% (22/33)
Progress:  69.70% (23/33)
Progress:  72.73% (24/33)
Progress:  75.76% (25/33)
Progress:  78.79% (26/33)
Progress:  81.82% (27/33)
Progress:  84.85% (28/33)
Progress:  87.88% (29/33)
Progress:  90.91% (30/33)
Progress:  93.94% (31/33)
Progress:  96.97% (32/33)
Progress: 100.00% (33/33)
Walk-forward RMSE: 4.579
Walk-forward MAE: 3.603
Walk-forward R²: 0.732
Saved Mins booster!


In [15]:
# analyze_df = mins_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df[analyze_df.Date.isin(['2026-01-23'])].sort_values('Diff', ascending=False).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [16]:
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull())]

# df_lines["Team"] = team_encoder.transform(df_lines["Team"])
# df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
# df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
# df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
# df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
# df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
# df_prediction = df_pred.copy()

# # Predict Mins
# mins_booster = xgb.Booster()
# mins_booster.load_model("../ML_models/dev/mins_model.json")
# mins_model = XGBRegressor()
# mins_model._Booster = mins_booster

# df_prediction_mins = setup_df_mins(con, df_prediction)
# df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))
# df_prediction_mins = df_prediction_mins[df_prediction_mins.Date == now]

# df_prediction_mins['Team'] = team_encoder.inverse_transform(df_prediction_mins["Team"])
# df_prediction_mins['Opp'] = team_encoder.inverse_transform(df_prediction_mins["Opp"])
# df_prediction_mins['Player'] = player_encoder.inverse_transform(df_prediction_mins["Player"])

# if df_prediction_mins.shape[0] >= 50:
#     print(df_prediction_mins.shape[0], 'rows')
#     for tm in df_prediction_mins.Team.unique():
#         display(df_prediction_mins[df_prediction_mins.Team == tm])
# else:
#     display(df_prediction_mins)

# Stats Model

In [71]:
def setup_df_main(df, tgt_stat):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 'is_OT', 
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'Spread', 'Total',
            'ORB']]
    cleanup_cols = []

    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
#     df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

#     df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
#         'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
#         'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
#         'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
#     }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
    
    
    # Create rolling + lag features
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    for col in ['PTS', 'FGA', 'FTA', 'ORB', 'TOV']:
        df[f'Team{col}'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])[col].transform('sum'))
        if col in ['PTS']:
            df[f'Team{col}_pct'] = df[col] / df[f'Team{col}']
            
    df['PlayerPace'] = np.where(df['MP'] > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
    df['TeamPace'] = ((df['TeamFGA'] + 0.44 * df['TeamFTA'] - df['TeamORB'] + df['TeamTOV']) / ((df['is_OT'] * 25) + 240))
    df['Player_Pace_Rel'] = df['PlayerPace'] / df['TeamPace']
    df['Pace_Minutes_Interaction'] = df['PlayerPace'] * df['MP']

    # Create rolling + lag features    
    for col in ['MP', tgt_stat, 'TeamPTS_pct', 'FGA', 'Player_Pace_Rel', 'Pace_Minutes_Interaction']:
        for N in [1, 3, 5, 10]:
            if col == f'Def{tgt_stat}':
                df[f'Def{tgt_stat}_L{N}_avg'] = (
                    df[df.role <= 2]
                      .groupby(['Season', 'Date', 'Opp', 'Pos'])[tgt_stat]
                      .sum()
                      .groupby(['Opp', 'Pos', 'Season'])
                      .shift(1)
                      .rolling(window=N, min_periods=N)
                      .mean()
                      .reindex(df.set_index(['Season', 'Date', 'Opp', 'Pos']).index)
                      .values
                )
            else:
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].mul([0.60, 0.25, 0.15]).sum(axis=1, skipna=True) / df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].notna().mul([0.60, 0.25, 0.15]).sum(axis=1)
    
    df['role'] = np.where((df.role == 2) & (df.MP_base < 13), 3, df.role)
    

    df2 = create_df_missing(df, tgt_stat)
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', f'team_{tgt_stat}_available']:
        df[col] = df[col].fillna(0)
    
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df[f'returning_{tgt_stat}'] = (
        (df[f'{tgt_stat}_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df[f'team_{tgt_stat}_available'] = df[f'team_{tgt_stat}_available'] - df[f'returning_{tgt_stat}']
    
    df[f'scenario_{tgt_stat}'] = (
        df.sort_values(['Season','Team','role','Pos','Date'])
          .groupby(['Season','Team','role','Pos'])[tgt_stat].shift(1)
          .expanding()
          .mean() 
          .reset_index(drop=True)
    )
    
    df[f'Expected_{tgt_stat}'] = (
        (0.8 * df[f'scenario_{tgt_stat}']) +
        (df[f'team_{tgt_stat}_available'] * df['TeamPTS_pct_base']) + 
        (0.2 * df[f'{tgt_stat}_base'])
    )
    
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
        
    df = df.drop(['team_game_num', 'is_OT', 'missed_games', 'MP_q4', 'Spread', 'Total', 'role', 'MP_base', 
                  'TeamPTS', 'TeamPTS_pct', 'TeamFGA', 'TeamFTA', 'TeamORB', 'TeamTOV', 
                  'PlayerPace', 'TeamPace', 'Player_Pace_Rel', 'Pace_Minutes_Interaction',  
                 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS', 'ORB', 
                 f'returning_{tgt_stat}', 'starters_out', 'starters_returning', f'scenario_{tgt_stat}', f'team_{tgt_stat}_available'
                 ] + cleanup_cols, axis=1)

    return df

In [72]:
df_main = df.copy()
df_main = setup_df_main(df_main, tgt_stat)
display(df_main)

game_dates = (df_main[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

main_train_df = df_main[df_main['Date'] <= train_end]
main_val_df   = df_main[(df_main['Date'] > train_end) & (df_main['Date'] <= val_end)]
main_test_df  = df_main[df_main['Date'] > val_end]
main_DFS = (main_train_df, main_val_df, main_test_df)

# Prev r2/mae/rmse best: 0.6706/3.7633/5.0858 [1/24/2026]
# stat_params = hyperparam_tuning(main_DFS, tgt_stat, n_iter=1, decay=0.99)
# with open(f"{MDL_PATH}/{tgt_stat}_params.json", "w") as f:
#     json.dump(stat_params, f)

# Prev r2/mae/rmse best: 0.695/3.628/4.840 [1/21/2026]
stat_model, stat_results = refit_model(df_main, tgt_stat, f'{tgt_stat}_params', min_train_days=910, decay=0.99)
# feature_importance(stat_model, df_main.columns.tolist())

stat_model.get_booster().save_model(f"{MDL_PATH}/{tgt_stat}_model.json")
print(f"Saved {tgt_stat} booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,MP,PTS,PTS_base,TeamPTS_pct_base,FGA_base,Player_Pace_Rel_base,Pace_Minutes_Interaction_base,Expected_PTS
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,22.98,6.0,,,,,,
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,3.75,0.0,,,,,,
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,3.75,2.0,,,,,,
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,3.75,0.0,,,,,,
4,2021,2021-10-19,BRK,MIL,James Harden,PG,30.63,20.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127189,2025,2026-01-27,WAS,POR,Khris Middleton,SF,26.77,19.0,11.285,0.106367,10.935,1.082780,11.8238,8.690205
127190,2025,2026-01-27,WAS,POR,Kyshawn George,SF,36.60,19.0,16.865,0.159349,15.700,1.386299,17.3742,8.757108
127191,2025,2026-01-27,WAS,POR,Malaki Branham,SG,2.13,3.0,5.220,0.046381,3.940,0.912303,4.0940,8.664842
127192,2025,2026-01-27,WAS,POR,Tre Johnson,SG,32.03,18.0,18.335,0.171625,15.255,1.173034,16.2384,8.808023


Rows: 127194, Dates: 944, min_train_days: 910
Progress:   2.94% (1/34)
Progress:   5.88% (2/34)
Progress:   8.82% (3/34)
Progress:  11.76% (4/34)
Progress:  14.71% (5/34)
Progress:  17.65% (6/34)
Progress:  20.59% (7/34)
Progress:  23.53% (8/34)
Progress:  26.47% (9/34)
Progress:  29.41% (10/34)
Progress:  32.35% (11/34)
Progress:  35.29% (12/34)
Progress:  38.24% (13/34)
Progress:  41.18% (14/34)
Progress:  44.12% (15/34)
Progress:  47.06% (16/34)
Progress:  50.00% (17/34)
Progress:  52.94% (18/34)
Progress:  55.88% (19/34)
Progress:  58.82% (20/34)
Progress:  61.76% (21/34)
Progress:  64.71% (22/34)
Progress:  67.65% (23/34)
Progress:  70.59% (24/34)
Progress:  73.53% (25/34)
Progress:  76.47% (26/34)
Progress:  79.41% (27/34)
Progress:  82.35% (28/34)
Progress:  85.29% (29/34)
Progress:  88.24% (30/34)
Progress:  91.18% (31/34)
Progress:  94.12% (32/34)
Progress:  97.06% (33/34)
Progress: 100.00% (34/34)
Walk-forward RMSE: 4.817
Walk-forward MAE: 3.608
Walk-forward R²: 0.692
Saved P

In [68]:
# analyze_df = stat_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df.sort_values('Diff', ascending=True).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Residual PTS

##### Try some sort of defensive rk L5 - d_rk today feature to measure difficulty increase/decrease
##### Create some short term Lx Average features for the target columns (Res_PTS/Bet)

In [95]:
def setup_df_res(df):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 'Res_PTS', 'PTS_line',  
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 
             'Spread', 'Total']]
    
    # Create rolling + lag features
    df3 = load_df('season_gamelogs')
    df3 = con.execute("""SELECT Date, Team, CAST(ROUND(SUM(MP), 0) as INT) as Team_Mins, 
                         CAST(SUM(FGA) as INT) as Team_FGA, CAST(SUM(FTA) as INT) as Team_FTA, CAST(SUM(TOV) as INT) as Team_TOV 
                         FROM df3
                         GROUP BY Date, Team""").fetchdf()
    df = df.merge(df3, on=['Date', 'Team'], how='left')
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['USG'] = (
        (df['FGA'] + 0.44*df['FTA'] + df['TOV']) * (df['Team_Mins'] / 5)
        / (df['MP'] * (df['Team_FGA'] + 0.44*df['Team_FTA'] + df['Team_TOV']))
    )

    LN_cols = []
    for col in ['MP', 'PTS', 'eFG', 'TS', 'USG']:
        for N in [3, 10]:
            if col != 'else':
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )

                if col not in ['MP']:
                    LN_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_trend'] = df[f'{col}_L3_avg'] - df[f'{col}_L10_avg']
    
    df['game_spread_type'] = 0
    df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
    df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 18), 2, df.game_spread_type) 
    df['game_spread_type'] = np.where(abs(df.Spread) > 18, 3, df.game_spread_type) 
    
    df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

    df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
        'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
        'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
        'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
    }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
        
    for col in ['TeamPTS']:
        for N in [3, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                  .rolling(window=N, min_periods=N)
                  .mean()
            )
            df[f'PTS_pct_L{N}'] = df[f'PTS_L{N}_avg'] / df[f'TeamPTS_L{N}_avg']
            df = df.drop(f'TeamPTS_L{N}_avg', axis=1)
        df['PTS_pct_trend'] = df['PTS_pct_L3'] - df['PTS_pct_L10']
        df = df.drop(['PTS_pct_L3', 'PTS_pct_L10'], axis=1)
    
    df2 = create_df_missing(df, 'PTS')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_PTS_available']:
        df[col] = df[col].fillna(0)
    df['starters_out_L1'] = (
        df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['starters_out'].shift(1)
          .rolling(window=1, min_periods=1)
          .mean()
    )
    df['starters_returning'] = np.where(df['starters_out_L1'] > df['starters_out'], df['starters_out_L1'] - df['starters_out'], 0)

    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['team_game_num', 'Spread', 'Total', 'MP_q4', 'TeamPTS', 'Team_FGA', 'Team_FTA', 'Team_TOV', 
                 'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS', 'USG', 
                  'Team_Mins', 'starters_out_L1'] + LN_cols, axis=1)
        
    return df

##### Regressor

In [21]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Prev r2/mae/rmse best: 0.2124/4.3069/5.6478 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Res_PTS', n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_RG_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev mae best: 4.1101 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Res_PTS', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_RG_model.json")
print("Saved Res_PTS_RG booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,Res_PTS,PTS_line,MP_L3_avg,MP_L10_avg,MP_trend,PTS_trend,eFG_trend,TS_trend,USG_trend,game_spread_type,pts_low,pts_mid,pts_high,PTS_pct_trend,team_PTS_available,starters_out,starters_returning
0,2025,2025-11-20,ATL,SAS,Dyson Daniels,SG,1,30.68,-3.5,11.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
1,2025,2025-11-20,ATL,SAS,Jalen Johnson,SF,1,37.95,3.5,22.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
2,2025,2025-11-20,ATL,SAS,Kristaps Porzingis,C,1,29.05,-0.5,16.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
3,2025,2025-11-20,ATL,SAS,Nickeil Alexander-Walker,SG,1,33.65,20.5,17.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
4,2025,2025-11-20,ATL,SAS,Onyeka Okongwu,C,2,25.13,0.5,14.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5754,2025,2026-01-26,POR,BOS,Jrue Holiday,PG,1,24.45,1.5,12.5,20.283333,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5755,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,15.80,0.5,3.5,19.746667,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5756,2025,2026-01-26,POR,BOS,Robert Williams,C,2,19.68,-0.5,6.5,18.656667,16.084,2.572667,-0.400000,,,-0.021068,1,0.8,0.0,0.0,-0.009271,35.0,2.0,0.0
5757,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,34.32,-14.5,23.5,33.440000,32.221,1.219000,1.500000,-0.029100,-0.031318,0.035742,1,0.8,0.0,0.0,0.015482,35.0,2.0,0.0


Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Walk-forward MAE: 4.229058576918787
|Pred| >= 0: accuracy = 0.631, n = 1612
|Pred| >= 1: accuracy = 0.678, n = 1142
|Pred| >= 2: accuracy = 0.735, n = 740
|Pred| >= 3: accuracy = 0.802, n = 449
Saved Res_PTS_RG booster!


##### Classifier

In [22]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
df_res['Bet'] = (df_res['Res_PTS'] > 0).astype(int)  # 1 = over, 0 = under
df_res = df_res.drop('Res_PTS', axis=1)
# display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_cut = int(0.70 * n_days)
val_cut   = int(0.80 * n_days)
train_end = game_dates.loc[train_cut, 'Date']
val_end   = game_dates.loc[val_cut, 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Test Accuracy: 0.6672 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Bet', is_classification=True, n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_CLF_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev roc_auc best: 0.720 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Bet', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_CLF_model.json")
print("Saved Res_PTS_CLF booster!")

Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Confusion Matrix:
 [[548 286]
 [300 478]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.66      0.65       834
           1       0.63      0.61      0.62       778

    accuracy                           0.64      1612
   macro avg       0.64      0.64      0.64      1612
weighted avg       0.64      0.64      0.64      1612

ROC AUC: 0.693
High-confidence hit rate (<= 0.3 & >= 0.7): 0.77
Saved Res_PTS_CLF booster!


# HT Stats

In [92]:
def setup_df_ht(df):
    
    df_full = setup_df_main(df, 'PTS')
    stat_booster = xgb.Booster()
    stat_booster.load_model(f"../ML_models/dev/{tgt_stat}_model.json")
    stat_model = XGBRegressor()
    stat_model._Booster = stat_booster

    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 
             'MP_h1', 'PTS_h1', 'PTS_h2', 'FG_h1', 'FGA_h1', 'FGA_h2', 'FT_h1', 'FTA_h1', 'TPM_h1', 'TPA_h1', 'PF_h1', 
             'ORB_h1', 'TOV_h1', 'PTS', 'FGA']]
    df["PTS_proj"] = stat_model.predict(df_full.drop(['Season', 'Date', 'PTS'], axis=1))

    cleanup_cols = []
    df['MP_h2'] = df['MP'] - df['MP_h1']
    
    for col in ['PTS', 'FGA', 'FTA', 'ORB', 'TOV']:
        df[f'Team{col}_h1'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])[f'{col}_h1'].transform('sum'))
        if col not in ['FTA', 'ORB', 'TOV']:
            df[f'Team{col}_pct_h1'] = df[f'{col}_h1'] / df[f'Team{col}_h1']
    
    df['OppTeamPTS_h1'] = (
        (
        df.groupby(['Season', 'Date', 'Team'], as_index=True)['TeamPTS_h1']
        .first()
        ).reindex(
        pd.MultiIndex.from_frame(df[['Season', 'Date', 'Opp']])
        ).to_numpy())
    df['Spread_h1'] = df['TeamPTS_h1'] - df['OppTeamPTS_h1']

    df['Player_Pace'] = np.where(df['MP_h1'] > 0, (df['FGA_h1'] + 0.44 * df['FTA_h1']) / df['MP_h1'], 0)
    df['Team_Pace'] = ((df['TeamFGA_h1'] + 0.44 * df['TeamFTA_h1'] - df['TeamORB_h1'] + df['TeamTOV_h1']) / 120)
    df['Player_Pace_Rel'] = df['Player_Pace'] / df['Team_Pace']
    df['Pace_Minutes_Interaction'] = df['Player_Pace'] * df['MP_h1']

    # Create rolling + lag features    
    for col in ['MP', 'PTS', 'FG_h1', 'FGA_h1', 'FGA_h2', 'PTS_h1', 'PTS_h2']:
        for N in [1, 3, 5, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                 .rolling(window=N, min_periods=N)
                 .mean()
            )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
        
    for col in ['PTS', 'FG', 'FGA']:
        df[f'{col}Diff'] = df[f'{col}_h1'] - df[f'{col}_h1_base']
        
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')    
    df = df.drop(['TeamFGA_h1', 'TeamPTS_h1', 'OppTeamPTS_h1', 'TeamORB_h1', 'TeamTOV_h1', 'TeamFTA_h1', 'Player_Pace', 'Team_Pace', 
                  'MP_base', 'PTS_base', 'PTS_h2', 'MP', 'FGA', 'FGA_h2', 'ORB_h1', 'TOV_h1', 'MP_h2'] + cleanup_cols, axis=1)

    return df

In [103]:
df_ht = df.copy()
df_ht = setup_df_ht(df_ht)
display(df_ht)

game_dates = (df_ht[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

ht_train_df = df_ht[df_ht['Date'] <= train_end]
ht_val_df   = df_ht[(df_ht['Date'] > train_end) & (df_ht['Date'] <= val_end)]
ht_test_df  = df_ht[df_ht['Date'] > val_end]
ht_DFS = (ht_train_df, ht_val_df, ht_test_df)

# 0.843 / 0.651 / 0.826
ht_train_dict = {
                 'mean': {'quantile': False, 'q_val': 0.5}, 
                 'Qlow': {'quantile': True, 'q_val': 0.15}, 
                 'Qhigh': {'quantile': True, 'q_val': 0.85}
                }
for key in ht_train_dict.keys():
#     ht_params = hyperparam_tuning(ht_DFS, 'PTS', n_iter=10, decay=0.99, quantile=ht_train_dict[key]['quantile'], q_val=ht_train_dict[key]['q_val'])
#     with open(f"{MDL_PATH}/ht_PTS_{key}_params.json", "w") as f:
#         json.dump(ht_params, f)

    ht_model, ht_results = refit_model(df_ht, 'PTS', f'ht_PTS_{key}_params', min_train_days=910, decay=0.99)
#     feature_importance(ht_model, df_ht.columns.tolist())

    ht_model.get_booster().save_model(f"{MDL_PATH}/ht_PTS_{key}_model.json")
    print(f"Saved ht_PTS_{key} booster!")

Rows: 127194, Dates: 944, min_train_days: 910
Progress:   2.94% (1/34)
Progress:   5.88% (2/34)
Progress:   8.82% (3/34)
Progress:  11.76% (4/34)
Progress:  14.71% (5/34)
Progress:  17.65% (6/34)
Progress:  20.59% (7/34)
Progress:  23.53% (8/34)
Progress:  26.47% (9/34)
Progress:  29.41% (10/34)
Progress:  32.35% (11/34)
Progress:  35.29% (12/34)
Progress:  38.24% (13/34)
Progress:  41.18% (14/34)
Progress:  44.12% (15/34)
Progress:  47.06% (16/34)
Progress:  50.00% (17/34)
Progress:  52.94% (18/34)
Progress:  55.88% (19/34)
Progress:  58.82% (20/34)
Progress:  61.76% (21/34)
Progress:  64.71% (22/34)
Progress:  67.65% (23/34)
Progress:  70.59% (24/34)
Progress:  73.53% (25/34)
Progress:  76.47% (26/34)
Progress:  79.41% (27/34)
Progress:  82.35% (28/34)
Progress:  85.29% (29/34)
Progress:  88.24% (30/34)
Progress:  91.18% (31/34)
Progress:  94.12% (32/34)
Progress:  97.06% (33/34)
Progress: 100.00% (34/34)
Walk-forward RMSE: 3.433
Walk-forward MAE: 2.605
Walk-forward R²: 0.843
Saved h

In [107]:
# analyze_df = ht_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df.sort_values('Diff', ascending=True).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [108]:
df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull())]

df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_prediction = df_pred.copy()

mins_booster = xgb.Booster()
mins_booster.load_model(f"{MDL_PATH}/mins_model.json")
mins_model = XGBRegressor()
mins_model._Booster = mins_booster
df_prediction_mins = setup_df_mins(con, df_prediction)
df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))

df_ht = setup_df_ht(df_pred)
df_ht = df_ht[df_ht.Date == now]
df_ht = df_ht.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'])
df_ht['MP'] = df_ht['MP_preds']
df_ht = df_ht.drop('MP_preds', axis=1)
display(df_ht)
partition_save_df(df_ht, f"../tables/{YEAR}/ht_api_input.csv")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP_h1,PTS_h1,FG_h1,FGA_h1,FT_h1,FTA_h1,TPM_h1,TPA_h1,PF_h1,PTS,PTS_proj,TeamPTS_pct_h1,TeamFGA_pct_h1,Spread_h1,Player_Pace_Rel,Pace_Minutes_Interaction,FG_h1_base,FGA_h1_base,FGA_h2_base,PTS_h1_base,PTS_h2_base,PTSDiff,FGDiff,FGADiff,MP
0,2025,2026-01-28,ATL,BOS,Asa Newell,PF,2,,,,,,,,,,,18.183426,,,0.0,,,0.770000,1.670000,2.150,1.870000,2.760000,,,,12.499912
1,2025,2026-01-28,ATL,BOS,CJ McCollum,PG,2,,,,,,,,,,,20.154320,,,0.0,,,4.250000,8.000000,6.125,10.875000,7.375000,,,,25.929760
2,2025,2026-01-28,ATL,BOS,Caleb Houstan,SF,2,,,,,,,,,,,16.500847,,,0.0,,,0.120000,0.340000,1.110,0.480000,1.650000,,,,10.868979
3,2025,2026-01-28,ATL,BOS,Christian Koloko,C,2,,,,,,,,,,,15.659017,,,0.0,,,1.333333,1.666667,2.000,2.666667,3.666667,,,,16.663277
4,2025,2026-01-28,ATL,BOS,Corey Kispert,SF,1,,,,,,,,,,,18.008902,,,0.0,,,1.500000,3.875000,3.250,4.000000,4.000000,,,,24.454361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,2025,2026-01-28,UTA,GSW,Oscar Tshiebwe,C,2,,,,,,,,,,,21.127884,,,0.0,,,,,,,,,,,12.916346
313,2025,2026-01-28,UTA,GSW,Svi Mykhailiuk,SF,1,,,,,,,,,,,16.458509,,,0.0,,,1.550000,3.030000,2.190,4.410000,3.300000,,,,26.354691
314,2025,2026-01-28,UTA,GSW,Taylor Hendricks,PF,2,,,,,,,,,,,16.141790,,,0.0,,,0.350000,1.140000,1.730,1.050000,1.530000,,,,17.160366
315,2025,2026-01-28,UTA,GSW,Walker Kessler,C,2,,,,,,,,,,,16.974335,,,0.0,,,3.500000,4.750000,3.500,8.875000,6.000000,,,,21.350645


../tables/2025/ht_api_input.csv saved!


In [149]:
def get_live_stat():
    response = requests.get("https://site.api.espn.com/apis/site/v2/sports/basketball/nba/scoreboard")
    data = response.json()
    
    games = data.get('events', [])
    rows = []

    for game in games:
        game_id = game['id']
        summary = requests.get(f"https://site.api.espn.com/apis/site/v2/sports/basketball/nba/summary?event={game_id}").json()
        boxscore = summary.get("boxscore", {})

        players_teams = boxscore.get("players", [])
        if len(players_teams) < 2:
            continue

        for i, team_data in enumerate(players_teams):
            team = team_data.get("team", {})
            team_abbr = team.get("abbreviation", "")
            opp_abbr = players_teams[1-i]["team"].get("abbreviation", "")
            
            stats_blocks = team_data.get("statistics", [])
            if not stats_blocks:
                continue

            stats_block = stats_blocks[0]
            labels = stats_block.get("labels", stats_block.get("names", []))
            athletes = stats_block.get("athletes", [])

            for p in athletes:
                stats = p.get("stats", [])
                if not stats:
                    continue

                row = dict(zip(labels, stats))
                row["PLAYER"] = p["athlete"]["displayName"]
                row["TEAM"] = team_abbr
                row["OPP"] = opp_abbr
                row["STARTER"] = p.get("starter", False)
                rows.append(row)

    df = pd.DataFrame(rows)
    for col in ['FG', '3PT', 'FT']:
        if col in df.columns:
            df[f'{col}M'] = df[col].str.split('-').str[0]
            df[f'{col}A'] = df[col].str.split('-').str[1]
    df = df.drop(['FG', '3PT', 'FT'], axis=1, errors='ignore')
    df = df.rename(columns={"MIN": "MP", "3PTM": "TPM", "3PTA": "TPA", "FGM": "FG", "FTM": "FT", "OREB": "ORB", "TO": "TOV"})
    for col in df.columns.difference(['TEAM', 'PLAYER', 'STARTER', 'OPP']):
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    
    for col in ['PTS', 'FGA', 'FTA', 'ORB', 'TOV']:
        df[f'Team{col}'] = (df.sort_values(['TEAM']).groupby(['TEAM'])[col].transform('sum'))
        if col in ['PTS', 'FGA']:
            df[f'Team{col}_pct'] = df[f'{col}'] / df[f'Team{col}']
    
    df['OppTeamPTS'] = df['OPP'].map(df.groupby('TEAM')['TeamPTS'].first().to_dict())
    df['Spread'] = df['TeamPTS'] - df['OppTeamPTS']
    
    df['Player_Pace'] = df.apply(lambda row: (row['FGA'] + 0.44 * row['FTA']) / row['MP'] if row['MP'] > 0 else 0, axis=1)
    df['Team_Pace'] = ((df['TeamFGA'] + 0.44 * df['TeamFTA'] - df['TeamORB'] + df['TeamTOV']) / 120)
    df['Player_Pace_Rel'] = df['Player_Pace'] / df['Team_Pace']
    df['Pace_Minutes_Interaction'] = df['Player_Pace'] * df['MP']
    
    df = df.drop(['TeamPTS', 'OppTeamPTS', 'TeamFGA'], axis=1)
    
    return df

In [150]:
def predict(plyr):
    ht_booster_mean = xgb.Booster()
    ht_booster_mean.load_model("../ML_models/dev/ht_PTS_mean_model.json")
    ht_model_mean = XGBRegressor()
    ht_model_mean._Booster = ht_booster_mean

    ht_booster_Qlow = xgb.Booster()
    ht_booster_Qlow.load_model("../ML_models/dev/ht_PTS_Qlow_model.json")
    ht_model_Qlow = XGBRegressor()
    ht_model_Qlow._Booster = ht_booster_Qlow

    ht_booster_Qhigh = xgb.Booster()
    ht_booster_Qhigh.load_model("../ML_models/dev/ht_PTS_Qhigh_model.json")
    ht_model_Qhigh = XGBRegressor()
    ht_model_Qhigh._Booster = ht_booster_Qhigh
    
    df = pd.read_csv("../tables/2025/ht_api_input.csv")
    df['Date'] = pd.to_datetime(df.Date)
    
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')

    time = datetime.now() + timedelta(hours=-8)
    df = df[(df.Date == str(time.date())) & (df.Player == plyr)].drop(['Season', 'Date', 'PTS'], axis=1)

    df_preds = pd.read_csv("../tables/2025/gmday_preds_PTS.csv")
    df_preds['Date'] = pd.to_datetime(df_preds.Date)
    df_preds = df_preds[(df_preds.Date == str(time.date())) & (df_preds.Player == plyr)]
    if df_preds.shape[0]> 0:
        pregm_mins = float(round(df_preds['MP'].iloc[0], 1))
        pregm_pts = float(round(df_preds['PTS_proj'].iloc[0], 2))
    else:
        pregm_mins = 0
        pregm_pts = 0

    df_ht = get_live_stat()
    df_ht = df_ht[df_ht.PLAYER == plyr]
    if df_ht.shape[0] > 0:
        for catg in ['MP', 'PTS', 'FG', 'FGA', 'FT', 'FTA', 'TPM', 'TPA', 'PF', 'TeamPTS_pct', 'TeamFGA_pct', 'Spread']:
            if catg in ['TeamPTS_pct', 'TeamFGA_pct']:
                ht_stat = df_ht[catg].iloc[0]
            else:
                ht_stat = int(df_ht[catg].iloc[0])
            df.loc[df['Player'] == plyr, f'{catg}_h1'] = ht_stat
        df.loc[df['Player'] == plyr, 'PTS_proj'] = pregm_pts
        df.loc[df['Player'] == plyr, 'Player_Pace_Rel'] = df_ht['Player_Pace_Rel'].iloc[0]
        df.loc[df['Player'] == plyr, 'Pace_Minutes_Interaction'] = df_ht['Pace_Minutes_Interaction'].iloc[0]
        for col in ['PTS', 'FG', 'FGA']:
            df.loc[df['Player'] == plyr, f'{col}Diff'] = df[f'{col}_h1'] - df[f'{col}_h1_base']
        display(df)
        df = df.drop(['MP'], axis=1)
        pts_prediction_qlow = int(round(ht_model_Qlow.predict(df)[0], 0))
        pts_prediction_mean = int(round(ht_model_mean.predict(df)[0], 0))
        pts_prediction_qhigh = int(round(ht_model_Qhigh.predict(df)[0], 0))
        
        spread_factor = (df['Spread_h1'].abs() / 20).clip(lower=0, upper=1).iloc[0]
        if df['Spread_h1'].iloc[0] < 0:
            # favored → ceiling matters more
            pts_prediction = int(round(pts_prediction_mean * (1 - spread_factor) +
                                pts_prediction_qhigh * spread_factor, 0))
        else:
            # underdog → floor matters more
            pts_prediction = int(round(pts_prediction_mean * (1 - spread_factor) +
                                pts_prediction_qlow * spread_factor, 0))

        return {
            "player": plyr,
            "current_mins": int(df_ht['MP'].iloc[0]),
            "current_pts": int(df_ht['PTS'].iloc[0]),
            "predicted_final_pts": pts_prediction,
            "pregame_mins_preds": pregm_mins,
            "pregame_pts_preds": pregm_pts
        }
    else:
        return {
            "player": "Player Unavailable",
            "current_mins": 0,
            "current_pts": 0,
            "predicted_final_pts": 0,
            "pregame_mins_preds": 0,
            "pregame_pts_preds": 0
        }
import requests
predict('Lauri Markkanen')

Unnamed: 0,Team,Opp,Player,Pos,role,MP_h1,PTS_h1,FG_h1,FGA_h1,FT_h1,FTA_h1,TPM_h1,TPA_h1,PF_h1,PTS_proj,TeamPTS_pct_h1,TeamFGA_pct_h1,Spread_h1,Player_Pace_Rel,Pace_Minutes_Interaction,FG_h1_base,FGA_h1_base,FGA_h2_base,PTS_h1_base,PTS_h2_base,PTSDiff,FGDiff,FGADiff,MP
311,UTA,GSW,Lauri Markkanen,PF,1,33.0,18.0,6.0,14.0,5.0,5.0,1.0,6.0,1.0,28.52,0.145161,0.155556,-16.0,0.537688,16.2,4.63,10.16,8.52,13.58,13.08,4.42,1.37,3.84,35.18273


{'player': 'Lauri Markkanen',
 'current_mins': 33,
 'current_pts': 18,
 'predicted_final_pts': 32,
 'pregame_mins_preds': 35.2,
 'pregame_pts_preds': 28.52}

# Today's predictions

In [15]:
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_PTS.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
#                 .rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
# df_gms = df_gms.merge(df_lines, on=['Date', 'Team', 'Player'])
# df_gms['Res_PTS'] = df_gms.PTS - df_gms.PTS_line

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'PTS', 'Res_PTS', 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', 'PTS_line', 'PTS_proj', 'PTS', 'Res_PTS_proj', 'Res_PTS', 'pred_prob', 'pred_class']][df_yesterday.MP > 0]

# # Mins
# df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 5, 1, 0)
# print("\nYesterday's Results:")
# print("Total Accuracy (Minutes-in-range):", (df_yesterday.InTgtRange == 1).mean())
# print((df_yesterday.InTgtRange == 1).sum(), '/', df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange'], axis=1)

# # Raw PTS
# df_yesterday['Diff'] = abs(df_yesterday['PTS'] - df_yesterday['PTS_proj'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 3, 1, 0)
# df_yesterday['Act_Res'] = np.where((df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['Pred_Res'] = np.where((df_yesterday.PTS_proj > df_yesterday.PTS_line), 1, 0)
# df_yesterday['PHit1'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (Raw PTS):", (df_yesterday.PHit1 == 1).mean())
# print((df_yesterday.PHit1 == 1).sum(), "/", df_yesterday.shape[0])
# print("Total Accuracy (Raw PTS-in-range):", (df_yesterday.InTgtRange == 1).mean())
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange', 'Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Regression)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday['Res_PTS_proj'] > 0, 'O', 'U')
# df_yesterday['PHit2'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (ResPTS Regression):", (df_yesterday.PHit2 == 1).mean())
# print((df_yesterday.PHit2 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Classifier)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 1, 0)
# df_yesterday['PHit3'] = np.where(df_yesterday['Act_Res'] == df_yesterday['pred_class'], 1, 0)
# df_yesterday['pred_class'] = np.where(df_yesterday['pred_class'] == 1, 'O', 'U')
# print("Total Accuracy (ResPTS Classification):", (df_yesterday.PHit3 == 1).mean())
# print((df_yesterday.PHit3 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res'], axis=1)

# df_yesterday['Majority'] = np.where(((df_yesterday.PTS_proj > df_yesterday.PTS_line).astype(int) + (df_yesterday['Res_PTS_proj'] > 0).astype(int) + (df_yesterday['pred_class'] == 1).astype(int)) >= 2, 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 1) & (df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 0) & (df_yesterday.PTS < df_yesterday.PTS_line), 1, df_yesterday.MajorityHit)
# print("Total Accuracy (MajorityHit):", (df_yesterday.MajorityHit == 1).mean())

# df_yesterday['AllAgree'] = '-'
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 1) & (df_yesterday.PHit2 == 1) & (df_yesterday.PHit3 == 1), 1, df_yesterday['AllAgree'])
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 0) & (df_yesterday.PHit2 == 0) & (df_yesterday.PHit3 == 0), 0, df_yesterday['AllAgree'])
# print("Total Accuracy (AllAgree):", ((df_yesterday.AllAgree == 1).sum() / ((df_yesterday.AllAgree == 0).sum() + (df_yesterday.AllAgree == 1).sum())))

# df_yesterday = df_yesterday.drop(['Majority', 'MajorityHit', 'AllAgree'], axis=1).sort_values('PTS_line', ascending=False)

# # if df_yesterday.shape[0] >= 50:
# #     for tm in df_yesterday.Team.unique():
# #         display(df_yesterday[(df_yesterday.Team == tm)]) #  & (df_yesterday.PHit == 1)
# # else:
# #     display(df_yesterday)

In [96]:
df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull())]

df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_prediction = df_pred.copy()

# Predict Mins
mins_booster = xgb.Booster()
mins_booster.load_model(f"{MDL_PATH}/mins_model.json")
mins_model = XGBRegressor()
mins_model._Booster = mins_booster

df_prediction_mins = setup_df_mins(con, df_prediction)
df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))

# Predict Stat
stat_booster = xgb.Booster()
stat_booster.load_model(f"{MDL_PATH}/{tgt_stat}_model.json")
stat_model = XGBRegressor()
stat_model._Booster = stat_booster
res_booster_RG = xgb.Booster()
res_booster_RG.load_model(f"{MDL_PATH}/Res_PTS_RG_model.json")
res_model_RG = XGBRegressor()
res_model_RG._Booster = res_booster_RG
res_model_CLF = XGBClassifier()
res_model_CLF.load_model(f"{MDL_PATH}/Res_PTS_CLF_model.json")

df_prediction = setup_df_main(df_prediction, 'PTS')
df_prediction = df_prediction.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'], how='left')
df_prediction['MP'] = df_prediction.MP.fillna(df_prediction.MP_preds)
feature_cols = [col for col in df_prediction.columns if col not in ['Season', 'Date', 'MP_preds', 'PTS']]
df_prediction = df_prediction[df_prediction.Date == now][feature_cols]
df_prediction["PTS_proj"] = stat_model.predict(df_prediction)

df_prediction2 = df_pred[(~df_pred.PTS_line.isnull())].copy()
df_prediction2 = setup_df_res(df_prediction2)
df_prediction2 = df_prediction2.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'], how='left')
df_prediction2['Player'] = df_prediction2['Player'].astype('category')
df_prediction2['MP'] = df_prediction2.MP.fillna(df_prediction2.MP_preds)
feature_cols = [col for col in df_prediction2.columns if col not in ['Season', 'Date', 'MP_preds', 'Res_PTS']]
df_prediction2 = df_prediction2[df_prediction2.Date == now][feature_cols]
df_prediction2["Res_PTS_proj"] = res_model_RG.predict(df_prediction2)

df_prediction2['pred_prob'] = res_model_CLF.predict_proba(df_prediction2.drop('Res_PTS_proj', axis=1))[:,1]
df_prediction2['pred_class'] = (df_prediction2['pred_prob'] > 0.5).astype(int)

# Setup Today's Picks
df_lines = df_lines[df_lines.Date == now][['Team', 'Player', 'PTS_line']]
df_prediction = df_prediction.merge(df_lines, on=['Team', 'Player'])
df_prediction = df_prediction.merge(df_prediction2[['Team', 'Player', 'Res_PTS_proj', 'pred_prob', 'pred_class']], on=['Team', 'Player'])

tds_picks = df_prediction[~(df_prediction['PTS_line'].isnull())]\
            [['Team', 'Player', 'MP', 'PTS_line', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class']]

tds_picks['O/U'] = '-'
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 1) & (tds_picks.PTS_proj > tds_picks.PTS_line) & (tds_picks.Res_PTS_proj > 0)), 'O', tds_picks['O/U'])
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 0) & (tds_picks.PTS_proj < tds_picks.PTS_line) & (tds_picks.Res_PTS_proj < 0)), 'U', tds_picks['O/U'])

tds_picks = tds_picks.sort_values(['O/U', 'Team', 'Player'], ascending=[False, False, False])
if tds_picks.shape[0] >= 50:
    print(tds_picks.shape[0], 'rows')
    for tm in tds_picks.Team.unique():
        display(tds_picks[tds_picks.Team == tm])
else:
    display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/{YEAR}/gmday_preds_PTS.csv")

134 rows


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
128,UTA,Cody Williams,21.306187,6.5,6.171829,-0.493916,0.418416,0,U
133,UTA,Svi Mykhailiuk,26.354691,6.5,9.619009,2.400501,0.808317,1,O
132,UTA,Lauri Markkanen,35.182732,22.5,28.52359,0.809306,0.644454,1,O
131,UTA,Keyonte George,36.178844,22.5,25.759762,3.818277,0.553919,1,O
130,UTA,Jusuf Nurkic,29.147781,12.5,14.119341,2.703654,0.675811,1,O
126,UTA,Ace Bailey,29.706282,11.5,13.761344,0.312431,0.550956,1,O
129,UTA,Isaiah Collier,24.097258,8.5,8.422895,0.279352,0.387068,0,-
127,UTA,Brice Sensabaugh,23.460093,12.5,13.116328,1.064627,0.490736,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
122,TOR,Jamal Shead,23.400576,6.5,6.398445,-0.489943,0.449122,0,U
123,TOR,RJ Barrett,28.032782,14.5,18.160366,2.101881,0.510027,1,O
121,TOR,Ja'Kobe Walter,19.252653,5.5,6.897011,1.58056,0.523634,1,O
120,TOR,Immanuel Quickley,32.250401,16.5,17.724289,0.340351,0.511274,1,O
125,TOR,Scottie Barnes,33.691101,17.5,18.087616,0.362321,0.450286,0,-
124,TOR,Sandro Mamukelashvili,24.088079,10.5,12.080569,-0.449995,0.460613,0,-
119,TOR,Gradey Dick,18.960562,5.5,7.355232,-0.864902,0.402344,0,-
118,TOR,Collin Murray-Boyles,28.127724,8.5,9.809435,-0.162781,0.531734,1,-
117,TOR,Brandon Ingram,34.865784,20.5,21.00071,0.31627,0.292175,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
110,SAS,Devin Vassell,23.87056,10.5,10.310769,-0.606641,0.407405,0,U
114,SAS,Luke Kornet,20.311012,5.5,6.671309,1.666428,0.636001,1,O
113,SAS,Keldon Johnson,25.810066,11.5,13.92502,0.605305,0.650118,1,O
112,SAS,Julian Champagnie,31.147444,9.5,14.327088,0.945551,0.580772,1,O
116,SAS,Victor Wembanyama,31.387625,23.5,21.408607,1.718103,0.539197,1,-
115,SAS,Stephon Castle,32.100159,14.5,15.918314,-0.863739,0.212546,0,-
111,SAS,Harrison Barnes,27.067879,6.5,9.087519,0.509471,0.415011,0,-
109,SAS,De'Aaron Fox,33.516754,16.5,17.306877,-2.05433,0.25894,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
106,ORL,Moritz Wagner,16.948359,8.5,6.862045,-0.337324,0.409669,0,U
108,ORL,Wendell Carter Jr.,31.856203,11.5,12.365339,2.614792,0.635908,1,O
103,ORL,Anthony Black,35.34124,15.5,17.719006,4.433024,0.973828,1,O
107,ORL,Paolo Banchero,36.795315,23.5,24.068604,-2.262488,0.414397,0,-
105,ORL,Jalen Suggs,26.885761,14.5,14.511093,0.948709,0.472085,0,-
104,ORL,Desmond Bane,35.697788,19.5,21.056488,-1.619485,0.418017,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
102,NYK,OG Anunoby,34.851376,16.5,15.533647,-2.642068,0.409211,0,U
98,NYK,Karl-Anthony Towns,32.257942,20.5,19.285894,-1.842853,0.463174,0,U
101,NYK,Miles McBride,26.257391,10.5,12.270311,2.494807,0.670341,1,O
99,NYK,Landry Shamet,20.771441,8.5,9.288602,1.189191,0.651909,1,O
100,NYK,Mikal Bridges,35.162853,15.5,16.729496,0.023171,0.331138,0,-
97,NYK,Josh Hart,34.464626,11.5,13.54886,0.740651,0.491701,0,-
96,NYK,Jalen Brunson,36.117271,24.5,26.580009,-4.78986,0.304016,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
95,MIN,Rudy Gobert,31.732119,10.5,9.699678,-0.200298,0.479051,0,U
92,MIN,Jaden McDaniels,32.784672,13.5,13.876552,0.456564,0.683089,1,O
91,MIN,Donte DiVincenzo,32.50407,12.5,13.215135,0.196107,0.590183,1,O
94,MIN,Naz Reid,25.844696,14.5,13.847713,0.202226,0.534677,1,-
93,MIN,Julius Randle,33.737961,21.5,19.763731,1.110258,0.489447,0,-
90,MIN,Anthony Edwards,36.815159,28.5,28.456247,2.863827,0.653932,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
80,MEM,Vince Williams Jr.,20.230556,7.5,6.944017,-1.620516,0.436372,0,U
79,MEM,Santi Aldama,26.447964,13.5,13.120292,-0.723129,0.370907,0,U
73,MEM,Cam Spencer,27.56414,12.5,12.918036,2.823987,0.682889,1,O
78,MEM,Kentavious Caldwell-Pope,20.583794,8.5,8.033516,1.421992,0.73475,1,-
77,MEM,Jock Landale,26.989328,13.5,13.622349,0.95025,0.494492,0,-
76,MEM,Jaylen Wells,26.692636,11.5,11.260164,1.138114,0.655214,1,-
75,MEM,Jaren Jackson Jr.,32.002899,22.5,21.415703,-0.054336,0.590891,1,-
74,MEM,Cedric Coward,27.946356,14.5,14.457175,0.363391,0.505623,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
72,LAL,Rui Hachimura,25.760555,11.5,10.796467,-1.237734,0.438629,0,U
71,LAL,Marcus Smart,30.860548,9.5,11.51764,1.3726,0.584,1,O
69,LAL,LeBron James,34.77972,23.5,23.874508,0.942439,0.626993,1,O
68,LAL,Jaxson Hayes,17.132568,5.5,6.131641,1.25515,0.673754,1,O
67,LAL,Jake LaRavia,32.27026,8.5,11.032856,1.262001,0.621955,1,O
66,LAL,Deandre Ayton,29.163366,10.5,14.698052,1.499971,0.656301,1,O
70,LAL,Luka Doncic,37.157677,33.5,33.125084,1.254814,0.534179,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
61,IND,Bennedict Mathurin,22.492777,15.5,10.975784,-4.16624,0.180255,0,U
65,IND,T.J. McConnell,17.614693,8.5,10.186738,1.855935,0.514099,1,O
64,IND,Pascal Siakam,33.892799,24.5,24.806604,1.161651,0.634898,1,O
63,IND,Johnny Furphy,25.518938,5.5,8.223293,2.216329,0.618165,1,O
62,IND,Jarace Walker,28.902828,11.5,13.442444,3.381226,0.529835,1,O
60,IND,Andrew Nembhard,32.764053,16.5,16.587137,2.352827,0.511664,1,O
59,IND,Aaron Nesmith,30.808008,13.5,13.699953,-1.821951,0.228205,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
39,DAL,Cooper Flagg,28.80304,19.5,17.641901,-1.410859,0.474677,0,U
43,DAL,Naji Marshall,34.335163,18.5,20.043348,4.143479,0.934382,1,O
42,DAL,Max Christie,32.380116,15.5,16.240276,2.489353,0.687705,1,O
41,DAL,Klay Thompson,22.059982,11.5,11.577646,1.096285,0.679109,1,O
38,DAL,Caleb Martin,25.083073,6.5,7.314785,0.650877,0.540068,1,O
44,DAL,P.J. Washington,32.064095,13.5,12.956758,0.882267,0.429849,0,-
40,DAL,Dwight Powell,22.647526,5.5,5.584686,-0.152935,0.447695,0,-
37,DAL,Brandon Williams,25.300819,16.5,15.388064,1.40917,0.624373,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
34,CLE,Jarrett Allen,30.460924,16.5,12.958956,-1.963901,0.271438,0,U
33,CLE,Donovan Mitchell,35.03764,31.5,26.499044,-2.501465,0.470947,0,U
31,CLE,De'Andre Hunter,22.71706,13.5,12.754716,-2.167869,0.370479,0,U
30,CLE,Craig Porter Jr.,19.42251,5.5,5.264653,-0.665511,0.324083,0,U
36,CLE,Sam Merrill,27.138739,12.5,14.613729,2.342907,0.526429,1,O
32,CLE,Dean Wade,27.548403,6.5,7.443713,1.203988,0.596353,1,O
35,CLE,Jaylon Tyson,30.796,16.5,16.063614,2.299672,0.753486,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
9,BOS,Derrick White,35.785286,18.5,17.934444,-0.203596,0.447693,0,U
13,BOS,Sam Hauser,29.572908,10.5,13.869758,4.561625,0.6639,1,O
12,BOS,Payton Pritchard,33.409725,16.5,17.330622,1.243518,0.521252,1,O
10,BOS,Jaylen Brown,36.665852,29.5,32.173462,2.680717,0.613811,1,O
8,BOS,Anfernee Simons,27.367434,14.5,17.265013,1.36408,0.519782,1,O
11,BOS,Luka Garza,16.820793,9.5,8.381854,-0.586634,0.516288,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
3,ATL,Jalen Johnson,36.897755,22.5,21.379076,-2.067246,0.367365,0,U
6,ATL,Nickeil Alexander-Walker,33.524307,17.5,19.734295,3.715534,0.736963,1,O
7,ATL,Onyeka Okongwu,34.207535,16.5,16.711864,-1.450481,0.56871,1,-
5,ATL,Mouhamed Gueye,18.144245,4.5,4.811485,-1.420933,0.250698,0,-
4,ATL,Luke Kennard,23.003637,8.5,8.938796,1.26637,0.471703,0,-
2,ATL,Dyson Daniels,34.997303,11.5,12.50599,1.185488,0.392133,0,-
1,ATL,Corey Kispert,24.454361,7.5,9.824321,1.496033,0.493926,0,-
0,ATL,CJ McCollum,25.92976,17.5,12.223768,0.093695,0.366904,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
89,MIA,Simone Fontecchio,20.126413,7.5,8.656631,1.302751,0.584271,1,O
88,MIA,Pelle Larsson,28.185017,10.5,11.253278,1.273979,0.517354,1,O
85,MIA,Kasparas Jakucionis,22.669115,5.5,7.254601,1.511533,0.646035,1,O
84,MIA,Jaime Jaquez Jr.,28.605949,13.5,14.076456,1.816188,0.56261,1,O
82,MIA,Bam Adebayo,33.56797,18.5,20.052637,2.091797,0.671585,1,O
81,MIA,Andrew Wiggins,32.122559,13.5,16.65526,1.648292,0.500152,1,O
87,MIA,Norman Powell,31.805313,21.5,23.782722,0.620323,0.374659,0,-
86,MIA,Kel'el Ware,17.003757,7.5,7.194916,0.913896,0.499279,0,-
83,MIA,Davion Mitchell,25.520702,8.5,8.187589,1.585737,0.508714,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
58,HOU,Tari Eason,27.754837,11.5,13.057947,3.557105,0.744239,1,O
57,HOU,Reed Sheppard,23.173861,10.5,12.528052,0.440893,0.535194,1,O
55,HOU,Jabari Smith Jr.,36.128723,13.5,13.945484,1.059096,0.638172,1,O
56,HOU,Kevin Durant,38.704124,24.5,27.42786,0.52169,0.417597,0,-
54,HOU,Amen Thompson,37.868938,16.5,16.411047,1.962657,0.594819,1,-
53,HOU,Alperen Sengun,35.115601,18.5,20.631247,-1.192946,0.616146,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
50,GSW,Quinten Post,17.177095,7.5,8.477519,1.402624,0.666822,1,O
49,GSW,Moses Moody,27.040905,12.5,13.127312,0.861361,0.512838,1,O
48,GSW,Draymond Green,28.666599,8.5,9.888937,0.951479,0.588456,1,O
46,GSW,Brandin Podziemski,29.391123,13.5,14.890303,1.378177,0.516624,1,O
45,GSW,Al Horford,25.119907,8.5,9.209018,1.728902,0.608461,1,O
52,GSW,Will Richard,19.891607,5.5,7.191288,-0.899505,0.488706,0,-
51,GSW,Stephen Curry,32.727139,29.5,29.661711,-0.447781,0.558773,1,-
47,GSW,De'Anthony Melton,20.334774,13.5,12.486566,0.563936,0.371545,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
28,CHO,Moussa Diabate,29.230457,7.5,9.993827,0.955827,0.667756,1,O
25,CHO,Kon Knueppel,32.476696,17.5,19.276728,1.943693,0.586502,1,O
23,CHO,Collin Sexton,19.046099,11.5,13.121642,2.126216,0.714891,1,O
22,CHO,Brandon Miller,33.249443,20.5,22.849684,0.559426,0.521448,1,O
29,CHO,Ryan Kalkbrenner,20.815607,6.5,8.294676,-0.612254,0.619088,1,-
27,CHO,Miles Bridges,31.857027,16.5,18.871027,-0.747506,0.419393,0,-
26,CHO,LaMelo Ball,28.289276,17.5,20.876068,0.860339,0.473576,0,-
24,CHO,Grant Williams,19.559223,5.5,7.685681,-0.097282,0.354551,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
20,CHI,Matas Buzelis,29.422186,14.5,16.187704,1.955983,0.651568,1,O
19,CHI,Kevin Huerter,24.673458,8.5,11.268626,2.093251,0.735445,1,O
18,CHI,Josh Giddey,31.103487,17.5,18.405123,1.045653,0.576828,1,O
16,CHI,Isaac Okoro,26.789392,8.5,9.491913,1.03661,0.601694,1,O
15,CHI,Coby White,32.700916,20.5,22.559694,2.192879,0.684837,1,O
14,CHI,Ayo Dosunmu,24.845274,12.5,14.169738,0.916641,0.683964,1,O
21,CHI,Nikola Vucevic,32.329231,16.5,17.959341,1.81943,0.479854,0,-
17,CHI,Jalen Smith,23.717878,10.5,11.207622,0.955025,0.456001,0,-


../tables/2025/gmday_preds_PTS.csv saved!


In [31]:
# tds_picks['PTS_proj_mag'] = tds_picks.PTS_proj - tds_picks.PTS_line
# print('O:', tds_picks[tds_picks.PTS_proj_mag > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.PTS_proj_mag < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'PTS_proj_mag'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()
# tds_picks = tds_picks.drop('PTS_proj_mag', axis=1)

In [32]:
# print('O:',tds_picks[tds_picks.Res_PTS_proj > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.Res_PTS_proj < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'Res_PTS_proj'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [33]:
# print('O:', tds_picks[tds_picks.pred_class == 1].shape[0] / tds_picks.shape[0])
# print('U:', tds_picks[tds_picks.pred_class == 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'pred_prob'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Misc.

In [34]:
# # Historical Percentages
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_{tgt_stat}.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday.rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

# df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
# df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
# df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# df_yesterday['Diff3'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff3'] <= 5, 1, 0)

# for day in df_gms.Date.unique():
#     df_temp = df_yesterday[df_yesterday.Date == day]
#     if df_temp.shape[0] > 0:
#         print(f"{day.date()} Total PTS Accuracy:", f"{(df_temp.ParlayHit == 1).sum()}/{df_temp.shape[0]}", ((df_temp.ParlayHit == 1).sum() / df_temp.shape[0]))
#         print(f"{day.date()} Total MP Accuracy:", f"{(df_temp.InRMSE_Range == 1).sum()}/{df_temp.shape[0]}", ((df_temp.InRMSE_Range == 1).sum() / df_temp.shape[0]), "\n")

In [35]:
# df_mins = df.copy()
# df_mins = setup_df_mins(con, df_mins)

# train_summary = df_mins.drop(['Season', 'Date', 'MP'], axis=1).describe().T
# gameday_summary = df_prediction_mins[df_prediction_mins.Date == now].drop(['Season', 'Date', 'MP'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [36]:
# df_main = df.copy()
# df_main = setup_df_main(df_main, tgt_stat)

# train_summary = df_main.drop(['Season', 'Team', 'Opp', 'Player', 'Pos', 'Date', 'PTS'], axis=1).describe().T
# gameday_summary = df_prediction[(~df_prediction.PTS_line.isnull())].drop(['Pos', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class', 'PTS_line'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [37]:
# df_res = df[(~df.PTS_line.isnull())].copy()
# df_res = setup_df_res(df_res)

# train_summary = df_res.drop(['Date', 'Res_PTS'], axis=1).describe().T
# gameday_summary = df_prediction2[(~df_prediction2.PTS_line.isnull())].describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [38]:
# for col in mins_train_df.drop(['Season', 'Date', 'MP'], axis=1).columns:
#     if col not in ['Team', 'Player', 'Opp', 'Pos']:
#         PartialDependenceDisplay.from_estimator(
#             mins_model,
#             mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#             features=[col],
#             grid_resolution=25
#         )
        
# ##################
# features = [('role')]
# PartialDependenceDisplay.from_estimator(
#     mins_model,
#     mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#     features=features,
#     grid_resolution=25
# )