# To do:

 - Both
     - Fix Injuries data
         - Find a better source for roster data (I found, work on the plyr_pos_xref notebook)
     - Signal Opp Injuries
 - Mins
 - PTS
     - Find more effective way to signal Defensive stats
 - Res_PTS
     - See notes in Res_PTS section

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import os
import json

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from scipy.stats import randint, uniform

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import PartialDependenceDisplay

import joblib
import warnings
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")

cwd = os.path.abspath(os.getcwd()).replace("\\", "/")
if cwd.startswith("C:/Users/Rodolfo/"):
    RUN_LOCATION = "local"
    MDL_PATH = "../ML_models/dev"
else:
    RUN_LOCATION = "cloud"
    MDL_PATH = "../ML_models"
time_offset = {"local": 3, "cloud": -5}
now = str((datetime.now() + timedelta(hours=time_offset[RUN_LOCATION]) + timedelta(hours=-3)).date())
print(f"Today's date:", now)

tgt_stat = "PTS"
print('Target Stat:', tgt_stat)

Today's date: 2026-01-27
Target Stat: PTS


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model, all_features):
    booster = model.get_booster()
    score = booster.get_score(importance_type="gain")

    df_importance = pd.DataFrame({
            "feature": all_features,
            "importance": [score.get(f, 0.0) for f in all_features]
        }).sort_values("importance", ascending=False).reset_index(drop=True)
    
    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    df_importance['importance'] = df_importance['importance'].map('{:.4f}'.format)
    if df_importance.shape[0] >= 50:
        with pd.option_context('display.max_rows', None):
            display(df_importance)
    else:
        display(df_importance)
    
    xgb.plot_importance(model)
    plt.show()

In [4]:
def compute_sample_weights(df, decay=0.99):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    max_date = df['Date'].max()
    df["days_old"] = (max_date - df['Date']).dt.days
    weights = decay ** df["days_old"]
    
    return weights.values

In [5]:
def quantile_loss(y_true, y_pred, q):
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

In [6]:
def create_baseline_model(df, pred_col, DFS):
    
    train_df, val_df, test_df = DFS

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg', 'game_spread_type'
        ]
    else:
        print(f'{pred_col} Stats Model')
        feature_cols = [
            'MP_L5_avg',
            'MP_L10_avg',
            f'{pred_col}_last_3_avg', f'{pred_col}_last_5_avg', f'{pred_col}_last_10_avg',
            f'Def_{pred_col}', f'Def_L5_{pred_col}'
        ]
    
    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))
    
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Convert to DMatrix (XGBoost internal format)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test, label=y_test)

    params = {
        "objective": "reg:squarederror",
        "max_depth": 5,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    # Train using native XGBoost API with early stopping
    evals = [(dtrain, "train"), (dval, "val")]
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict on test set
    preds = bst.predict(dtest)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return bst

In [7]:
def hyperparam_tuning(DFS, pred_col, is_classification=False, quantile=False, n_iter=20, early_stopping_rounds=50, decay=1, q_val=0.5):
    """
    Hyperparameter tuning for XGBRegressor or XGBClassifier using native XGBoost API
    """
    train_df, val_df, test_df = DFS
    feature_cols = [col for col in train_df.columns if col not in ['Season', 'Date', pred_col]]
    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]

    # Sample Weights (decay < 1)
    w_train = compute_sample_weights(train_df, decay=decay)
    w_val   = compute_sample_weights(val_df, decay=decay)
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train, enable_categorical=True)
    dval   = xgb.DMatrix(X_val, label=y_val, weight=w_val, enable_categorical=True)
    dtest  = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

    # Hyperparameter search space
    param_dist = {
        "n_estimators": randint(300, 1500),
        "learning_rate": uniform(0.005, 0.08),
        "max_depth": randint(3, 6),
        "min_child_weight": randint(3, 10),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.5, 0.5),
        "gamma": uniform(0, 0.8),
        "reg_lambda": uniform(0, 5),
        "reg_alpha": uniform(0, 1)
    }

    # Generate n_iter random parameter sets
    param_list = []
    for _ in range(n_iter):
        sample = {k: (v.rvs() if hasattr(v, "rvs") else v) for k,v in param_dist.items()}
        sample['n_estimators'] = int(sample['n_estimators'])
        sample['max_depth'] = int(sample['max_depth'])
        sample['min_child_weight'] = int(sample['min_child_weight'])
        param_list.append(sample)

    best_score = float('inf') if not is_classification else 0
    best_params = None
    best_bst = None

    for i, params in enumerate(param_list):
        print(f"\nTrial {i+1}/{n_iter}: {params}")
        num_boost_round = params.pop('n_estimators')

        # Set objective based on regression or classification
        if is_classification:
            params.update({
                "objective": "binary:logistic",
                "enable_categorical": True,
                "eval_metric": "logloss",
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })
        elif quantile:
            params.update({
                "objective": "reg:quantileerror",
                "quantile_alpha": q_val, 
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        else:
            params.update({
                "objective": "reg:squarederror",
                "enable_categorical": True,
                "tree_method": "hist",
                "device": "cuda",
                "seed": 42
            })

        evals = [(dtrain, 'train'), (dval, 'val')]
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )

        # Validation scoring
        val_preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        if is_classification:
            val_class = (val_preds > 0.5).astype(int)
            score = (val_class == y_val.values).mean()  # accuracy
            print(f"Validation Accuracy: {score:.4f}")
            if score > best_score:
                best_score = score
                best_params = params.copy()
                best_bst = bst
        elif quantile:
            q_loss = quantile_loss(y_val.values, val_preds, q_val)
            print(f"Validation Quantile Loss (q_val={q_val}): {q_loss:.4f}")
            if q_loss < best_score:
                best_score = q_loss
                best_params = params.copy()
                best_bst = bst
        else:
            mae = mean_absolute_error(y_val, val_preds)
            print(f"Validation MAE: {mae:.4f}")
            if mae < best_score:
                best_score = mae
                best_params = params.copy()
                best_bst = bst

    print("\nBest score:", best_score)
    print("Best parameters:", best_params)

    # Test predictions
    test_preds = best_bst.predict(dtest, iteration_range=(0, best_bst.best_iteration))
    if is_classification:
        test_class = (test_preds > 0.5).astype(int)
        acc = (test_class == y_test.values).mean()
        print("\nTest Accuracy:", acc)
    else:
        print("\nTest Metrics:")
        if quantile:
            ql = quantile_loss(y_test, test_preds, q_val)
            print("Quantile loss:", ql)
            coverage = np.mean(y_test <= test_preds)
            print(f"Coverage for q_val={q_val}: {coverage:.2f}")
        else:
            print("RMSE:", np.sqrt(mean_squared_error(y_test, test_preds)))
            print("MAE:", mean_absolute_error(y_test, test_preds))
            print("R²:", r2_score(y_test, test_preds))

    return best_params

In [8]:
def refit_model(df, pred_col, params_file, min_train_days=0, rolling_window=None, decay=1):
    df = df.sort_values("Date")
    dates = df["Date"].unique()
    print(f'Rows: {df.shape[0]}, Dates: {len(dates)}, min_train_days: {min_train_days}')

    feature_cols = [c for c in df.columns if c not in ["Season", "Date", pred_col]]

    # Load hyperparameters
    with open(f"{MDL_PATH}/{params_file}.json", "r") as f:
        loaded_params = json.load(f)

    preds, actuals, dates_out, predictions = [], [], [], []
    total_iters = len(dates) - min_train_days

    for idx, i in enumerate(range(min_train_days, len(dates)), start=1):
        test_date = dates[i]
        test_season = df.loc[df["Date"] == test_date, "Season"].iloc[0]

        if rolling_window:
            train_start_idx = max(0, i - rolling_window)
        else:
            train_start_idx = 0

        train_dates = dates[train_start_idx:i]

        train_df = df[df["Date"].isin(train_dates)]
        test_df  = df[df["Date"] == test_date]

        if test_df.empty:
            continue

        X_train, y_train = train_df[feature_cols], train_df[pred_col]
        X_test, y_test   = test_df[feature_cols], test_df[pred_col]

        predictions.append(test_df)
        if pred_col == 'Bet':
            model = XGBClassifier(**loaded_params)
            model.fit(X_train, y_train)
            
            test_df['pred_prob'] = model.predict_proba(X_test)[:,1]
            test_df['pred_class'] = (test_df['pred_prob'] > 0.5).astype(int)
        else:
            model = XGBRegressor(**loaded_params)
            sample_weights = compute_sample_weights(train_df, decay=decay)
            model.fit(X_train, y_train, sample_weight=sample_weights)

            y_pred = model.predict(X_test)
            preds.extend(y_pred)
            actuals.extend(y_test.values)
            dates_out.extend([test_date] * len(y_pred))

        if idx % max(1, total_iters // 20) == 0:
            pct = 100 * idx / total_iters
            print(f"Progress: {pct:6.2f}% ({idx}/{total_iters})")
            
    results = pd.concat(predictions)
    if pred_col == 'Res_PTS':
        results['Actuals'] = actuals
        results['Predictions'] = preds
        mae = mean_absolute_error(actuals, preds)
        print("Walk-forward MAE:", mae)
        results["Correct_Direction"] = (np.sign(results["Predictions"]) == np.sign(results["Actuals"])).astype(int)
        for t in [0, 1, 2, 3]:
            subset = results[results["Predictions"].abs() >= t]
            acc = subset["Correct_Direction"].mean() if len(subset) > 0 else np.nan
            print(f"|Pred| >= {t}: accuracy = {acc:.3f}, n = {len(subset)}")
    elif pred_col == 'Bet':
        cm = confusion_matrix(results['Bet'], results['pred_class'])
        print("Confusion Matrix:\n", cm)
        report = classification_report(results['Bet'], results['pred_class'])
        print("Classification Report:\n", report)
        auc = roc_auc_score(results['Bet'], results['pred_prob'])
        print(f"ROC AUC: {auc:.3f}")

        high_confidence = results.copy()
        high_confidence['pred_prob'] = np.where(high_confidence.pred_prob > 0.5, 1 - high_confidence.pred_prob, high_confidence.pred_prob)
        high_confidence = high_confidence[high_confidence['pred_prob'] <= 0.3]
        if len(high_confidence) > 0:
            hit_rate = (high_confidence['pred_class'] == high_confidence['Bet']).mean()
            print(f"High-confidence hit rate (<= 0.3 & >= 0.7): {hit_rate:.2f}")

    else:
        results['Actuals'] = actuals
        results['Predictions'] = preds
        if loaded_params['objective'] == 'reg:quantileerror':
            ql = quantile_loss(results['Actuals'], results['Predictions'], loaded_params['quantile_alpha'])
            print("Quantile loss:", ql)
            coverage = np.mean(results['Actuals'] <= results['Predictions'])
            print(f"Coverage for q_val={loaded_params['quantile_alpha']}: {coverage:.2f}")
        else:
            mae = mean_absolute_error(actuals, preds)
            rmse = np.sqrt(mean_squared_error(actuals, preds))
            r2 = r2_score(actuals, preds)
            print(f"Walk-forward RMSE: {rmse:.3f}")
            print(f"Walk-forward MAE: {mae:.3f}")
            print(f"Walk-forward R²: {r2:.3f}")

    return model, results

### Create Base df

In [9]:
def load_df(file_name):
    df = pd.DataFrame()
    for i in [2021, 2022, 2023, 2024, 2025]:
        df_temp = pd.read_csv(f"../tables/{i}/{file_name}.csv")
        df_temp['Season'] = i
        df = pd.concat([df, df_temp])
        
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df.Date)
    if file_name == "season_gamelogs":
        df = df[~df[['Date', 'Team', 'Player']].duplicated(keep='last')]
    
    return df

In [10]:
# Load dfs
df = load_df('nba_schedule')
df2 = load_df('season_gamelogs')
# df3 = load_df('REPLACE ME')
df4 = load_df('injuries')
df5 = load_df('plyr_pos_xref')
df6 = load_df('daily_lineups')
gmlog_cols = ['game_id', 'Player', 'MP', 'PF', 'PTS', 'FG', 'FGA', 'FT', 'FTA', '3PM', '3PA']
df7 = load_df('h1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h1", "PF": "PF_h1", "PTS": "PTS_h1", "FG": "FG_h1", "FGA": "FGA_h1", "FT": "FT_h1", "FTA": "FTA_h1", "3PM": "TPM_h1", "3PA": "TPA_h1"})
df8 = load_df('h2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_h2", "PF": "PF_h2", "PTS": "PTS_h2", "FG": "FG_h2", "FGA": "FGA_h2", "FT": "FT_h2", "FTA": "FTA_h2", "3PM": "TPM_h2", "3PA": "TPA_h2"})
df9 = load_df('q1_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q1", "PF": "PF_q1", "PTS": "PTS_q1", "FG": "FG_q1", "FGA": "FGA_q1", "FT": "FT_q1", "FTA": "FTA_q1", "3PM": "TPM_q1", "3PA": "TPA_q1"})
df10 = load_df('q2_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q2", "PF": "PF_q2", "PTS": "PTS_q2", "FG": "FG_q2", "FGA": "FGA_q2", "FT": "FT_q2", "FTA": "FTA_q2", "3PM": "TPM_q2", "3PA": "TPA_q2"})
df11 = load_df('q3_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q3", "PF": "PF_q3", "PTS": "PTS_q3", "FG": "FG_q3", "FGA": "FGA_q3", "FT": "FT_q3", "FTA": "FTA_q3", "3PM": "TPM_q3", "3PA": "TPA_q3"})
df12 = load_df('q4_season_gamelogs')[gmlog_cols].rename(columns={"MP": "MP_q4", "PF": "PF_q4", "PTS": "PTS_q4", "FG": "FG_q4", "FGA": "FGA_q4", "FT": "FT_q4", "FTA": "FTA_q4", "3PM": "TPM_q4", "3PA": "TPA_q4"})

df_mtch = df[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'is_OT', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df = pd.concat([df_mtch, df_mtch2])
df = df.sort_values(["Team", "Date"])
df['team_game_num'] = df.groupby(["Team", "Season"]).cumcount() + 1
df['Spread'] = np.where(df.Team_type == 'Home', df.AwayPTS - df.HomePTS, df.HomePTS - df.AwayPTS)
df['Total'] = df.AwayPTS + df.HomePTS
df['is_Win'] = np.where(df.Spread > 0, 1, 0)
df['Szn_Wins'] = df.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.merge(df5, on=['Season', 'Team'])

df2 = df2.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"})
df2['PR'] = df2.PTS + df2.REB 
df2['PA'] = df2.PTS + df2.AST
df2['RA'] = df2.REB + df2.AST
df2['PRA'] = df2.PTS + df2.REB + df2.AST
df2['STL_BLK'] = df2.STL + df2.BLK
df = df.merge(df2.drop(['Pos', 'Opp', 'Team_type'], axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')

df = df.merge(df4[['Date', 'Team', 'Player', 'Status']], on=['Date', 'Team', 'Player'], how='left')
df['Status'] = np.where((df.Active == 1) & (df.Status.isnull()), 'Available', df.Status)
df['Status'] = np.where((df.Active == 0), 'Out', df.Status)
df['Status'] = np.where((df.Status == 'Out') & (df.Active != 0), 'Available', df.Status)

df6['role'] = 1
df = df.merge(df6.drop('Pos', axis=1), on=['Season', 'Date', 'Team', 'Player'], how='left')
df['role'] = df.role.fillna(2).astype(int)

# Add gmlog splits
df_gmlog_comb = df7.merge(df8, on=['game_id', 'Player'])
for df_loop in (df9, df10, df11, df12):
    df_gmlog_comb = df_gmlog_comb.merge(df_loop, on=['game_id', 'Player'])
df = df.merge(df_gmlog_comb, on=['game_id', 'Player'], how='left')

df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
df = df.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
df['Res_PTS'] = df.PTS - df.PTS_line

df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
df_td = df[df.Date == now]
df = df[(df.Active == 1) & (df.MP > 0)]
df_pred = df.copy()
df_pred = pd.concat([df_pred, df_td])
print('base df created', datetime.now())

base df created 2026-01-27 17:31:25.165188


### Feature Engineering Helper Functions

In [11]:
def create_df_missing(df, pred_col):

    df3 = load_df('season_gamelogs')
    df3 = df3.rename(columns={"3PM": "TPM", "3PA": "TPA", "3P%": "TP%", "TRB": "REB"}).drop(['Pos', 'Opp'], axis=1)
    df4 = load_df('injuries')
    
    # Fill missing games from injuries.csv
    team_games = df_pred[['Season', 'Team', 'Date']].drop_duplicates()
    players = df_pred[['Season','Player','Team']].drop_duplicates()
    fabricated = (players.sort_values('Season').groupby('Player', as_index=False).last())
    fabricated['Season'] = fabricated['Season'] + 1
    players = pd.concat([players, fabricated], ignore_index=True).drop_duplicates(['Season','Player','Team'])
    expanded = team_games.merge(players, on=['Season', 'Team'], how='left')

    df5 = load_df('plyr_pos_xref')

    expanded = expanded.merge(df3[['Season', 'Player', 'Date', 'MP']], on=['Season', 'Player', 'Date'], how='left').drop_duplicates(['Season', 'Date', 'Player', 'Team'])
    expanded = expanded[(expanded.MP.isnull()) & (expanded.Date != now)].drop('MP', axis=1)
    expanded = pd.concat([expanded, df4[df4.Status == 'Out'][['Season', 'Team', 'Date', 'Player']]])
    df4 = df4.merge(expanded, on=['Season', 'Date', 'Team', 'Player'], how='right')

    # Grab outs from players season gamelogs
    df4 = df4.merge(df3, on=['Season', 'Date', 'Team', 'Player'], how='outer')
    df4['Status'] = np.where(((df4.Active == 1) | (df4.MP > 0)), 'Available', df4.Status)
    df4['Status'] = np.where(((df4.Active == 0) | (df4.MP == 0) | (df4.MP.isnull())), 'Out', df4.Status)
    df4['Status'] = np.where((df4.Status == 'Out') & (df4.MP > 0), 'Available', df4.Status)
    df4['Status'] = np.where((df4.Status != 'Out') & (df4.MP == 0), 'Out', df4.Status)
    df4 = df4[df4.Status == 'Out'][['Season', 'Date', 'Team', 'Player']].drop_duplicates()
    
    df_missing = df[['Season', 'Date', 'Team', 'Player', 'role', pred_col]].copy()
    df_missing[f'{pred_col}_L10'] = (
        df_missing.sort_values(['Player', 'Date']).groupby(['Player','Season'])[pred_col].shift(1)
                  .transform(lambda x: x.rolling(10, min_periods=10).mean())
    )
    df_missing['role_L10_mode'] = (
        df_missing.sort_values(['Player', 'Date'])
            .groupby(['Player', 'Season'])['role'].shift(1)
            .transform(lambda x: x.rolling(10, min_periods=10)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
    )
    df_missing = pd.merge_asof(df4, df_missing[["Season", "Player", "Date", "role", "role_L10_mode", f"{pred_col}_L10"]], 
                      on="Date", by=["Player", "Season"], direction="backward", allow_exact_matches=True).dropna()   
    df_missing = df_missing.merge(df5, on=['Season', 'Team', 'Player'])
    
    # Filter out old injuries
    df_missing = df_missing.sort_values(["Season", "Team", "Player", "Date"])
    df_missing["team_game_num"] = (df_missing.groupby(["Season", "Team"])["Date"].rank(method="dense").astype(int))
    df_missing["game_break"] = (df_missing.groupby(["Season", "Team", "Player"])["team_game_num"].diff().ne(1))
    df_missing["streak_id"] = (df_missing.groupby(["Season", "Team", "Player"])["game_break"].cumsum())
    df_missing["consecutive_games"] = (df_missing.groupby(["Season", "Team", "Player", "streak_id"]).cumcount().add(1))
    df_missing["eligible_today"] = (df_missing["consecutive_games"] <= 10).astype(int)
    df_missing["role_for_count"] = np.where(df_missing["eligible_today"] == 1, df_missing["role_L10_mode"], np.nan)    
    df_missing[f'{pred_col}_L10'] = np.where(df_missing['role_for_count'] == 1, df_missing[f'{pred_col}_L10'], 0)

#     display(df_missing[(df_missing.Team == 'CLE') & (df_missing.Date == '2026-01-23')].tail(10))

    out_minutes = (
    df_missing
      .groupby(["Season", "Date", "Team"])
      .agg(
          tgt_available=(f"{pred_col}_L10", lambda x: x.sum()),
          starters_out=("role_for_count", lambda x: (x == 1).sum())
      )
      .reset_index()
    ).rename(columns={"tgt_available": f"team_{pred_col}_available"})

    return out_minutes

In [12]:
def filter_out_early_exits(df):
    for N in [3, 5, 10]:
        df[f'MP_L{N}_avg'] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['MP'].shift(1)
             .rolling(window=N, min_periods=N)
             .mean()
        )
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)    
    df['Early_Exit'] = ((df['MP_base'].notna()) & (df['MP_base'] > 0) &
                        (
                          ((df['MP'] - df['MP_base']) / df['MP_base'] <= -0.4) |  
                          ((df['MP_q4'] == 0) & (df['role'] == 1))
                        )).astype(int)    
    df = df[df.Early_Exit == 0]
    df = df.drop('Early_Exit', axis=1)
    
    return df

# Minutes Projection Model

In [13]:
def setup_df_mins(con, df):

    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'role',
             'MP', 'MP_q4', 'Spread', 'team_game_num', 'is_OT']]    
    cleanup_cols = []
    cold_features = []
    df = filter_out_early_exits(df)
    
    df['team_mins_pct'] = df['MP'] / (240 + (df.is_OT * 25))
    for col in ['MP', 'team_mins_pct']:
        for N in [3, 5, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                 .rolling(window=N, min_periods=N)
                 .mean()
            )
            df[f'is_cold_{col}_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
            cold_features.append(f'is_cold_{col}_L{N}')
            cleanup_cols.append(f'{col}_L{N}_avg')
    df['MP_base'] = df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['MP_L3_avg', 'MP_L5_avg', 'MP_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    df['MP_tm_pct_base'] = df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[['team_mins_pct_L3_avg', 'team_mins_pct_L5_avg', 'team_mins_pct_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
    
    df['role'] = np.where((df.role == 2) & (df.MP_base < 13), 3, df.role)
    
    games_last_14_days = df.sort_values(['Player', 'Season', 'Date']).groupby(['Player', 'Season']).rolling('14D', on='Date', closed='left')['MP'].count().reset_index().rename(columns={"MP": "gms_L14_days"})
    games_last_14_days = games_last_14_days.drop_duplicates(
        subset=['Player', 'Season', 'Date']
    )
    df = df.merge(games_last_14_days, on=['Player', 'Season', 'Date'])
    df['gms_L14_days'] = df.gms_L14_days.fillna(0).astype(int)    
    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
    df['games_since_return'] = (df.groupby(['Player', 'Team', 'Season']).apply(
                                    lambda g: (
                                        (g['team_game_num'].diff().sub(1).fillna(0).gt(0))
                                        .cumsum()
                                        .groupby((g['team_game_num'].diff().sub(1).fillna(0).gt(0)).cumsum()).cumcount()
                                    )
                                ).reset_index(level=[0,1,2], drop=True))

    for N in [1, 3, 5]:
        df[f"recent_role_L{N}"] = (
            df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['role'].shift(1)
              .transform(lambda x: x.rolling(N, min_periods=N)
                            .apply(lambda y: np.bincount(y.astype(np.int8), minlength=4).argmax(), raw=True))
        )
        df[f'is_cold_recent_role_L{N}'] = (df.groupby(['Player', 'Season']).cumcount() < N).astype(int)
        cold_features.append(f'is_cold_recent_role_L{N}')
        cleanup_cols.append(f"recent_role_L{N}")
    
#     df['game_spread_type'] = 0
#     df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
#     df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 21), 2, df.game_spread_type) 
#     df['game_spread_type'] = np.where(abs(df.Spread) > 21, 3, df.game_spread_type) 
    
    df2 = create_df_missing(df, 'MP')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_MP_available']:
        df[col] = df[col].fillna(0).astype(int)
        
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df['returning_MP'] = (
        (df['MP_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df['team_MP_available'] = df['team_MP_available'] - df['returning_MP']

    df['MP_Change'] = 0
    MP_Inc_conds = (
#                     ((df.role != 3) & (df.starters_out > 2)) | 
                    ((df.role == 1) & (df.recent_role_L1 > 1.0)).astype(int) + 
                    (df.team_MP_available >= 110).astype(int)
                   )
    
    MP_Dec_conds = (
                    ((df.role > 1) & ((df.recent_role_L1 == 1.0))).astype(int) + 
                    (df.team_MP_available < -23).astype(int)
                   ) * -1
    df['MP_Change'] = MP_Inc_conds + MP_Dec_conds

    df['scenario_mins'] = (
        df.sort_values(['Season','Team','role','Pos','Date'])
          .groupby(['Season','Team','role','Pos'])['MP'].shift(1)
          .expanding()
          .mean() 
          .reset_index(drop=True)
    )
    
    df['MP_trend'] = df['MP_L3_avg'] - df['MP_L10_avg']
    df['Expected_MP'] = (
        (0.8 * df['scenario_mins']) +
        (df['team_MP_available'] * df['MP_tm_pct_base']) + 
        (0.2 * df['MP_base']) + df['MP_trend']
    )
    
    df["is_cold_start"] = (df[cold_features].eq(1).any(axis=1).astype(int))
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['Team_type', 'team_game_num', 'is_OT', 'Spread', 'team_mins_pct', 'MP_tm_pct_base', 
                  'returning_MP', 'scenario_mins', 'MP_q4'] + cleanup_cols + cold_features, axis=1)    


    return df

In [14]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

game_dates = (df_mins[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

mins_train_df = df_mins[df_mins['Date'] <= train_end]
mins_val_df   = df_mins[(df_mins['Date'] > train_end) & (df_mins['Date'] <= val_end)]
mins_test_df  = df_mins[df_mins['Date'] > val_end]
mins_DFS = (mins_train_df, mins_val_df, mins_test_df)

# Prev r2/mae/rmse best: 0.7064/4.0508/5.1790 [1/24/2026]
# mins_params = hyperparam_tuning(mins_DFS, "MP", n_iter=25)
# with open(f"{MDL_PATH}/mins_params.json", "w") as f:
#     json.dump(mins_params, f)

# Prev r2/mae/rmse best: 0.729/3.615/4.599 [1/24/2026]
mins_model, mins_results = refit_model(df_mins, 'MP', 'mins_params', min_train_days=910, rolling_window=180)
# feature_importance(mins_model, df_mins.columns.tolist())

mins_model.get_booster().save_model(f"{MDL_PATH}/mins_model.json")
print('Saved Mins booster!')

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,MP_base,gms_L14_days,missed_games,games_since_return,team_MP_available,starters_out,starters_returning,MP_Change,MP_trend,Expected_MP,is_cold_start
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,1,22.98,,0,0,0,0.0,0,0,0,,,1
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,2,3.75,,0,0,0,0.0,0,0,0,,,1
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,2,3.75,,0,0,0,0.0,0,0,0,,,1
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,2,3.75,,0,0,0,0.0,0,0,0,,,1
4,2021,2021-10-19,BRK,MIL,James Harden,PG,1,30.63,,0,0,0,0.0,0,0,0,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106379,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,15.80,16.9059,6,0,19,65.0,2,0,0,-2.325667,25.539790,0
106380,2025,2026-01-26,POR,BOS,Robert Williams,C,2,19.68,17.5051,4,1,0,65.0,2,0,0,1.450667,29.598207,0
106381,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,34.32,32.4946,6,0,28,65.0,2,0,0,1.219000,36.424146,0
106382,2025,2026-01-26,POR,BOS,Sidy Cissoko,SF,1,24.77,26.2039,6,0,16,65.0,2,0,0,2.881000,35.124206,0


Rows: 106384, Dates: 943, min_train_days: 910
Progress:   3.03% (1/33)
Progress:   6.06% (2/33)
Progress:   9.09% (3/33)
Progress:  12.12% (4/33)
Progress:  15.15% (5/33)
Progress:  18.18% (6/33)
Progress:  21.21% (7/33)
Progress:  24.24% (8/33)
Progress:  27.27% (9/33)
Progress:  30.30% (10/33)
Progress:  33.33% (11/33)
Progress:  36.36% (12/33)
Progress:  39.39% (13/33)
Progress:  42.42% (14/33)
Progress:  45.45% (15/33)
Progress:  48.48% (16/33)
Progress:  51.52% (17/33)
Progress:  54.55% (18/33)
Progress:  57.58% (19/33)
Progress:  60.61% (20/33)
Progress:  63.64% (21/33)
Progress:  66.67% (22/33)
Progress:  69.70% (23/33)
Progress:  72.73% (24/33)
Progress:  75.76% (25/33)
Progress:  78.79% (26/33)
Progress:  81.82% (27/33)
Progress:  84.85% (28/33)
Progress:  87.88% (29/33)
Progress:  90.91% (30/33)
Progress:  93.94% (31/33)
Progress:  96.97% (32/33)
Progress: 100.00% (33/33)
Walk-forward RMSE: 4.579
Walk-forward MAE: 3.603
Walk-forward R²: 0.732
Saved Mins booster!


In [15]:
# analyze_df = mins_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df[analyze_df.Date.isin(['2026-01-23'])].sort_values('Diff', ascending=False).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [16]:
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull())]

# df_lines["Team"] = team_encoder.transform(df_lines["Team"])
# df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
# df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
# df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
# df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
# df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
# df_prediction = df_pred.copy()

# # Predict Mins
# mins_booster = xgb.Booster()
# mins_booster.load_model("../ML_models/dev/mins_model.json")
# mins_model = XGBRegressor()
# mins_model._Booster = mins_booster

# df_prediction_mins = setup_df_mins(con, df_prediction)
# df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))
# df_prediction_mins = df_prediction_mins[df_prediction_mins.Date == now]

# df_prediction_mins['Team'] = team_encoder.inverse_transform(df_prediction_mins["Team"])
# df_prediction_mins['Opp'] = team_encoder.inverse_transform(df_prediction_mins["Opp"])
# df_prediction_mins['Player'] = player_encoder.inverse_transform(df_prediction_mins["Player"])

# if df_prediction_mins.shape[0] >= 50:
#     print(df_prediction_mins.shape[0], 'rows')
#     for tm in df_prediction_mins.Team.unique():
#         display(df_prediction_mins[df_prediction_mins.Team == tm])
# else:
#     display(df_prediction_mins)

# Stats Model

In [17]:
def setup_df_main(df, tgt_stat):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'Spread', 'Total']]
    cleanup_cols = []

    df['missed_games'] = (df.groupby(['Player', 'Team', 'Season'])['team_game_num'].diff().sub(1).fillna(0).astype(int))
    df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

#     df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
#         'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
#         'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
#         'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
#     }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
    
    
    # Create rolling + lag features
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['TeamPTS_pct'] = df['PTS'] / df['TeamPTS']

    # Create rolling + lag features    
    for col in ['MP', tgt_stat, 'TeamPTS_pct', 'FGA']:
        for N in [1, 3, 5, 10]:
            if col == f'Def{tgt_stat}':
                df[f'Def{tgt_stat}_L{N}_avg'] = (
                    df[df.role <= 2]
                      .groupby(['Season', 'Date', 'Opp', 'Pos'])[tgt_stat]
                      .sum()
                      .groupby(['Opp', 'Pos', 'Season'])
                      .shift(1)
                      .rolling(window=N, min_periods=N)
                      .mean()
                      .reindex(df.set_index(['Season', 'Date', 'Opp', 'Pos']).index)
                      .values
                )
            else:
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = ((df[f'{col}_L3_avg'] * 0.15) + (df[f'{col}_L5_avg'] * 0.25) + (df[f'{col}_L10_avg'] * 0.6))
    
    df['role'] = np.where((df.role == 2) & (df.MP_base < 13), 3, df.role)

    df2 = create_df_missing(df, tgt_stat)
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', f'team_{tgt_stat}_available']:
        df[col] = df[col].fillna(0)
    
    df['starters_returning'] = ((df['missed_games'] > 0) & (df['role'] == 1)).astype(int)
    df[f'returning_{tgt_stat}'] = (
        (df[f'{tgt_stat}_L10_avg'] * df['starters_returning'])
        .groupby([df['Team'], df['Date']])
        .transform('sum')
    )
    
    df['starters_returning'] = df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['starters_returning'].transform('sum')
    df[f'team_{tgt_stat}_available'] = df[f'team_{tgt_stat}_available'] - df[f'returning_{tgt_stat}']
    
#     df['ProjPts'] = (df.TeamPTS * df.TeamPTS_pct_L10) + (df.team_PTS_available * df.TeamPTS_pct_L10) # TeamPTS is LEAKING
    df[f'scenario_{tgt_stat}'] = (
        df.sort_values(['Season','Team','role','Pos','Date'])
          .groupby(['Season','Team','role','Pos'])[tgt_stat].shift(1)
          .expanding()
          .mean() 
          .reset_index(drop=True)
    )
    
    df[f'Expected_{tgt_stat}'] = (
        (0.8 * df[f'scenario_{tgt_stat}']) +
        (df[f'team_{tgt_stat}_available'] * df['TeamPTS_pct_base']) + 
        (0.2 * df[f'{tgt_stat}_base'])
    )
    
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
        
    df = df.drop(['team_game_num', 'missed_games', 'MP_q4', 'Spread', 'Total', 'role', 'MP_base', 'TeamPTS', 'TeamPTS_pct', 
                 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS',
                 f'returning_{tgt_stat}', 'starters_out', 'starters_returning', f'scenario_{tgt_stat}', f'team_{tgt_stat}_available'
                 ] + cleanup_cols, axis=1)

    return df

In [18]:
df_main = df.copy()
df_main = setup_df_main(df_main, tgt_stat)
display(df_main)

game_dates = (df_main[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

main_train_df = df_main[df_main['Date'] <= train_end]
main_val_df   = df_main[(df_main['Date'] > train_end) & (df_main['Date'] <= val_end)]
main_test_df  = df_main[df_main['Date'] > val_end]
main_DFS = (main_train_df, main_val_df, main_test_df)

# Prev r2/mae/rmse best: 0.6706/3.7633/5.0858 [1/24/2026]
# stat_params = hyperparam_tuning(main_DFS, tgt_stat, n_iter=25, decay=0.99)
# with open(f"{MDL_PATH}/{tgt_stat}_params.json", "w") as f:
#     json.dump(stat_params, f)

# Prev r2/mae/rmse best: 0.695/3.628/4.840 [1/21/2026]
stat_model, stat_results = refit_model(df_main, tgt_stat, f'{tgt_stat}_params', min_train_days=910, decay=0.99)
# feature_importance(stat_model, df_main.columns.tolist())

stat_model.get_booster().save_model(f"{MDL_PATH}/{tgt_stat}_model.json")
print(f"Saved {tgt_stat} booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,MP,PTS,PTS_base,TeamPTS_pct_base,FGA_base,Expected_PTS
0,2021,2021-10-19,BRK,MIL,Blake Griffin,C,22.98,6.0,,,,
1,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,3.75,0.0,,,,
2,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,3.75,2.0,,,,
3,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,3.75,0.0,,,,
4,2021,2021-10-19,BRK,MIL,James Harden,PG,30.63,20.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
127044,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,15.80,4.0,3.27,0.029093,4.01,10.209732
127045,2025,2026-01-26,POR,BOS,Robert Williams,C,19.68,6.0,6.43,0.056638,3.14,11.805746
127046,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,34.32,9.0,24.05,0.207642,19.11,20.614838
127047,2025,2026-01-26,POR,BOS,Sidy Cissoko,SF,24.77,4.0,7.02,0.058351,5.76,11.983613


Rows: 127049, Dates: 943, min_train_days: 910
Progress:   3.03% (1/33)
Progress:   6.06% (2/33)
Progress:   9.09% (3/33)
Progress:  12.12% (4/33)
Progress:  15.15% (5/33)
Progress:  18.18% (6/33)
Progress:  21.21% (7/33)
Progress:  24.24% (8/33)
Progress:  27.27% (9/33)
Progress:  30.30% (10/33)
Progress:  33.33% (11/33)
Progress:  36.36% (12/33)
Progress:  39.39% (13/33)
Progress:  42.42% (14/33)
Progress:  45.45% (15/33)
Progress:  48.48% (16/33)
Progress:  51.52% (17/33)
Progress:  54.55% (18/33)
Progress:  57.58% (19/33)
Progress:  60.61% (20/33)
Progress:  63.64% (21/33)
Progress:  66.67% (22/33)
Progress:  69.70% (23/33)
Progress:  72.73% (24/33)
Progress:  75.76% (25/33)
Progress:  78.79% (26/33)
Progress:  81.82% (27/33)
Progress:  84.85% (28/33)
Progress:  87.88% (29/33)
Progress:  90.91% (30/33)
Progress:  93.94% (31/33)
Progress:  96.97% (32/33)
Progress: 100.00% (33/33)
Walk-forward RMSE: 4.871
Walk-forward MAE: 3.663
Walk-forward R²: 0.685
Saved PTS booster!


In [19]:
# analyze_df = stat_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df[analyze_df.Date.isin(['2026-01-24'])].sort_values('Diff', ascending=True).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Residual PTS

##### Try some sort of defensive rk L5 - d_rk today feature to measure difficulty increase/decrease
##### Create some short term Lx Average features for the target columns (Res_PTS/Bet)

In [20]:
def setup_df_res(df):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 'MP_q4', 'team_game_num', 'Res_PTS', 'PTS_line',  
             'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 
             'Spread', 'Total']]
    
    # Create rolling + lag features
    df3 = load_df('season_gamelogs')
    df3 = con.execute("""SELECT Date, Team, CAST(ROUND(SUM(MP), 0) as INT) as Team_Mins, 
                         CAST(SUM(FGA) as INT) as Team_FGA, CAST(SUM(FTA) as INT) as Team_FTA, CAST(SUM(TOV) as INT) as Team_TOV 
                         FROM df3
                         GROUP BY Date, Team""").fetchdf()
    df = df.merge(df3, on=['Date', 'Team'], how='left')
    df['eFG'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS']  = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['USG'] = (
        (df['FGA'] + 0.44*df['FTA'] + df['TOV']) * (df['Team_Mins'] / 5)
        / (df['MP'] * (df['Team_FGA'] + 0.44*df['Team_FTA'] + df['Team_TOV']))
    )

    LN_cols = []
    for col in ['MP', 'PTS', 'eFG', 'TS', 'USG']:
        for N in [3, 10]:
            if col != 'else':
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )

                if col not in ['MP']:
                    LN_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_trend'] = df[f'{col}_L3_avg'] - df[f'{col}_L10_avg']
    
    df['game_spread_type'] = 0
    df['game_spread_type'] = np.where(abs(df.Spread) < 13, 1, df.game_spread_type) 
    df['game_spread_type'] = np.where((abs(df.Spread) >= 13) & (abs(df.Spread) <= 18), 2, df.game_spread_type) 
    df['game_spread_type'] = np.where(abs(df.Spread) > 18, 3, df.game_spread_type) 
    
    df['TeamPTS'] = (df.Total + (df.Spread * -1)) / 2
#     df['TeamPTS_type'] = 0
#     df['TeamPTS_type'] = np.where((df.TeamPTS < 110), 1, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS >= 110) & (df.TeamPTS <= 130), 2, df.TeamPTS_type)
#     df['TeamPTS_type'] = np.where((df.TeamPTS > 130), 3, df.TeamPTS_type)

    df[['pts_low', 'pts_mid', 'pts_high']] = pd.DataFrame({
        'pts_low': ((110 - df.TeamPTS) / 20).clip(lower=0),
        'pts_mid': (1 - abs(df.TeamPTS - 120) / 20).clip(lower=0),
        'pts_high': ((df.TeamPTS - 130) / 20).clip(lower=0)
    }, index=df.index)
#     w = df[['pts_low','pts_mid','pts_high']]
#     df[['pts_low','pts_mid','pts_high']] = w.div(w.sum(axis=1).replace(0,1), axis=0)
        
    for col in ['TeamPTS']:
        for N in [3, 10]:
            df[f'{col}_L{N}_avg'] = (
                df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                  .rolling(window=N, min_periods=N)
                  .mean()
            )
            df[f'PTS_pct_L{N}'] = df[f'PTS_L{N}_avg'] / df[f'TeamPTS_L{N}_avg']
            df = df.drop(f'TeamPTS_L{N}_avg', axis=1)
        df['PTS_pct_trend'] = df['PTS_pct_L3'] - df['PTS_pct_L10']
        df = df.drop(['PTS_pct_L3', 'PTS_pct_L10'], axis=1)
    
    df2 = create_df_missing(df, 'PTS')
    df = df.merge(df2, on=["Season", "Date", "Team"], how='left')
    for col in ['starters_out', 'team_PTS_available']:
        df[col] = df[col].fillna(0)
    df['starters_out_L1'] = (
        df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])['starters_out'].shift(1)
          .rolling(window=1, min_periods=1)
          .mean()
    )
    df['starters_returning'] = np.where(df['starters_out_L1'] > df['starters_out'], df['starters_out_L1'] - df['starters_out'], 0)

    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')
    df = df.drop(['team_game_num', 'Spread', 'Total', 'MP_q4', 'TeamPTS', 'Team_FGA', 'Team_FTA', 'Team_TOV', 
                 'PTS', 'FG', 'FGA', 'FG%', 'TPA', 'TPM', 'TP%', 'FT', 'FTA', 'FT%', 'TOV', 'eFG', 'TS', 'USG', 
                  'Team_Mins', 'starters_out_L1'] + LN_cols, axis=1)
        
    return df

##### Regressor

In [21]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Prev r2/mae/rmse best: 0.2124/4.3069/5.6478 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Res_PTS', n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_RG_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev mae best: 4.1101 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Res_PTS', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_RG_model.json")
print("Saved Res_PTS_RG booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP,Res_PTS,PTS_line,MP_L3_avg,MP_L10_avg,MP_trend,PTS_trend,eFG_trend,TS_trend,USG_trend,game_spread_type,pts_low,pts_mid,pts_high,PTS_pct_trend,team_PTS_available,starters_out,starters_returning
0,2025,2025-11-20,ATL,SAS,Dyson Daniels,SG,1,30.68,-3.5,11.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
1,2025,2025-11-20,ATL,SAS,Jalen Johnson,SF,1,37.95,3.5,22.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
2,2025,2025-11-20,ATL,SAS,Kristaps Porzingis,C,1,29.05,-0.5,16.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
3,2025,2025-11-20,ATL,SAS,Nickeil Alexander-Walker,SG,1,33.65,20.5,17.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
4,2025,2025-11-20,ATL,SAS,Onyeka Okongwu,C,2,25.13,0.5,14.5,,,,,,,,1,0.0,0.7,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5754,2025,2026-01-26,POR,BOS,Jrue Holiday,PG,1,24.45,1.5,12.5,20.283333,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5755,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,15.80,0.5,3.5,19.746667,,,,,,,1,0.8,0.0,0.0,,35.0,2.0,0.0
5756,2025,2026-01-26,POR,BOS,Robert Williams,C,2,19.68,-0.5,6.5,18.656667,16.084,2.572667,-0.400000,,,-0.021068,1,0.8,0.0,0.0,-0.009271,35.0,2.0,0.0
5757,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,34.32,-14.5,23.5,33.440000,32.221,1.219000,1.500000,-0.029100,-0.031318,0.035742,1,0.8,0.0,0.0,0.015482,35.0,2.0,0.0


Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Walk-forward MAE: 4.229058576918787
|Pred| >= 0: accuracy = 0.631, n = 1612
|Pred| >= 1: accuracy = 0.678, n = 1142
|Pred| >= 2: accuracy = 0.735, n = 740
|Pred| >= 3: accuracy = 0.802, n = 449
Saved Res_PTS_RG booster!


##### Classifier

In [22]:
df_res = df[(~df.PTS_line.isnull())].copy()
df_res = setup_df_res(df_res)
df_res['Bet'] = (df_res['Res_PTS'] > 0).astype(int)  # 1 = over, 0 = under
df_res = df_res.drop('Res_PTS', axis=1)
# display(df_res)

game_dates = (df_res[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_cut = int(0.70 * n_days)
val_cut   = int(0.80 * n_days)
train_end = game_dates.loc[train_cut, 'Date']
val_end   = game_dates.loc[val_cut, 'Date']

res_train_df = df_res[df_res['Date'] <= train_end]
res_val_df   = df_res[(df_res['Date'] > train_end) & (df_res['Date'] <= val_end)]
res_test_df  = df_res[df_res['Date'] > val_end]
res_DFS = (res_train_df, res_val_df, res_test_df)

# Test Accuracy: 0.6672 [1/24/2026]
# res_params = hyperparam_tuning(res_DFS, 'Bet', is_classification=True, n_iter=25)
# with open(f"{MDL_PATH}/Res_PTS_CLF_params.json", "w") as f:
#     json.dump(res_params, f)

# Prev roc_auc best: 0.720 [1/18/2026]
res_model, res_results = refit_model(df_res, 'Bet', 'Res_PTS_RG_params', min_train_days=50)
# feature_importance(res_model, df_res.columns.tolist())

res_model.get_booster().save_model(f"{MDL_PATH}/Res_PTS_CLF_model.json")
print("Saved Res_PTS_CLF booster!")

Rows: 5759, Dates: 66, min_train_days: 50
Progress:   6.25% (1/16)
Progress:  12.50% (2/16)
Progress:  18.75% (3/16)
Progress:  25.00% (4/16)
Progress:  31.25% (5/16)
Progress:  37.50% (6/16)
Progress:  43.75% (7/16)
Progress:  50.00% (8/16)
Progress:  56.25% (9/16)
Progress:  62.50% (10/16)
Progress:  68.75% (11/16)
Progress:  75.00% (12/16)
Progress:  81.25% (13/16)
Progress:  87.50% (14/16)
Progress:  93.75% (15/16)
Progress: 100.00% (16/16)
Confusion Matrix:
 [[548 286]
 [300 478]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.66      0.65       834
           1       0.63      0.61      0.62       778

    accuracy                           0.64      1612
   macro avg       0.64      0.64      0.64      1612
weighted avg       0.64      0.64      0.64      1612

ROC AUC: 0.693
High-confidence hit rate (<= 0.3 & >= 0.7): 0.77
Saved Res_PTS_CLF booster!


# HT Stats

In [23]:
def setup_df_ht(df, tgt_stat):
    
    df = df[['Season', 'Date', 'Team', 'Opp', 'Player', 'Pos', 'role', 'MP', 
             'MP_h1', 'PTS_h1', 'PTS_h2', 'FG_h1', 'FGA_h1', 'FGA_h2', 'FT_h1', 'FTA_h1', 'TPM_h1', 'TPA_h1', 'PF_h1', 
             'PTS', 'FGA']]
    cleanup_cols = []
    df['MP_h2'] = df['MP'] - df['MP_h1']
    
    for col in ['PTS', 'FGA']:
        df[f'Team{col}_h1'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])[f'{col}_h1'].transform('sum'))
        df[f'Team{col}_pct_h1'] = df[f'{col}_h1'] / df[f'Team{col}_h1']

#     df['TeamPTS_h1'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['PTS_h1'].transform('sum'))
#     df['TeamPTS_pct_h1'] = df['PTS_h1'] / df['TeamPTS_h1']
#     df['TeamFGA_h1'] = (df.sort_values(['Team', 'Date']).groupby(['Team', 'Date'])['FGA_h1'].transform('sum'))
#     df['TeamFGA_pct_h1'] = df['FGA_h1'] / df['TeamFGA_h1']
    
    df['OppTeamPTS_h1'] = (
        (
        df.groupby(['Season', 'Date', 'Team'], as_index=True)['TeamPTS_h1']
        .first()
        ).reindex(
        pd.MultiIndex.from_frame(df[['Season', 'Date', 'Opp']])
        ).to_numpy())
    df['Spread_h1'] = df['TeamPTS_h1'] - df['OppTeamPTS_h1']
    
    # Create rolling + lag features
    df['role'] = np.where((df.role == 2) & (df.MP_h1 < 5), 3, df.role)

    # Create rolling + lag features    
    for col in ['MP', tgt_stat, 'FGA', 'FGA_h1', 'FGA_h2', 'PTS_h1', 'PTS_h2']:
        for N in [1, 3, 5, 10]:
            if col == f'Def{tgt_stat}':
                df[f'Def{tgt_stat}_L{N}_avg'] = (
                    df[df.role <= 2]
                      .groupby(['Season', 'Date', 'Opp', 'Pos'])[tgt_stat]
                      .sum()
                      .groupby(['Opp', 'Pos', 'Season'])
                      .shift(1)
                      .rolling(window=N, min_periods=N)
                      .mean()
                      .reindex(df.set_index(['Season', 'Date', 'Opp', 'Pos']).index)
                      .values
                )
            else:
                df[f'{col}_L{N}_avg'] = (
                    df.sort_values(['Player', 'Date']).groupby(['Player', 'Season'])[col].shift(1)
                     .rolling(window=N, min_periods=N)
                     .mean()
                )
            cleanup_cols.append(f'{col}_L{N}_avg')
        df[f'{col}_base'] = df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].mul([0.15, 0.25, 0.60]).sum(axis=1, skipna=True) / df[[f'{col}_L3_avg', f'{col}_L5_avg', f'{col}_L10_avg']].notna().mul([0.15, 0.25, 0.60]).sum(axis=1)
        
    df['PTSDiff'] = df.PTS_h1 - df.PTS_h1_base
    df['FGADiff'] = df.FGA_h1 - df.FGA_h1_base
        
    df['Team'] = df['Team'].astype('category')
    df['Opp'] = df['Opp'].astype('category')
    df['Player'] = df['Player'].astype('category')
    df['Pos'] = df['Pos'].astype('category')    
    df = df.drop(['PTS_h2', 'TeamFGA_h1', 'TeamPTS_h1', 'OppTeamPTS_h1', 'MP_base', 'PTS_base', 'FGA_base', 
                  'MP', 'FGA', 'FGA_h2'] + cleanup_cols, axis=1)

    return df

In [24]:
# FEATURES IDEAS:
# Get H2 rolling averages (h2, q3, q4)

df_ht = df.copy()
df_ht = setup_df_ht(df_ht, tgt_stat)
display(df_ht)

game_dates = (df_ht[['Date']].drop_duplicates().sort_values('Date').reset_index(drop=True))
n_days = len(game_dates)
train_end = game_dates.loc[int(0.70 * n_days), 'Date']
val_end   = game_dates.loc[int(0.80 * n_days), 'Date']

ht_train_df = df_ht[df_ht['Date'] <= train_end]
ht_val_df   = df_ht[(df_ht['Date'] > train_end) & (df_ht['Date'] <= val_end)]
ht_test_df  = df_ht[df_ht['Date'] > val_end]
ht_DFS = (ht_train_df, ht_val_df, ht_test_df)

# Prev r2/mae/rmse best: 0.8255/2.6876/3.6973 [1/26/2026]
# ht_params = hyperparam_tuning(ht_DFS, tgt_stat, n_iter=1, decay=0.99)
# with open(f"{MDL_PATH}/ht_{tgt_stat}_params.json", "w") as f:
#     json.dump(ht_params, f)

# Prev r2/mae/rmse best: 0.832/2.627/3.564 [1/26/2026]
ht_model, ht_results = refit_model(df_ht, tgt_stat, f'ht_{tgt_stat}_params', min_train_days=910, decay=0.99)
# feature_importance(ht_model, df_ht.columns.tolist())

ht_model.get_booster().save_model(f"{MDL_PATH}/ht_{tgt_stat}_model.json")
print(f"Saved ht_{tgt_stat} booster!")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP_h1,PTS_h1,FG_h1,FGA_h1,FT_h1,FTA_h1,TPM_h1,TPA_h1,PF_h1,PTS,MP_h2,TeamPTS_pct_h1,TeamFGA_pct_h1,Spread_h1,FGA_h1_base,FGA_h2_base,PTS_h1_base,PTS_h2_base,PTSDiff,FGADiff
1,2021,2021-10-19,BRK,MIL,Blake Griffin,C,1,11.98,6.0,2.0,3.0,2.0,2.0,0.0,0.0,0.0,6.0,11.00,0.101695,0.073171,-7.0,,,,,,
2,2021,2021-10-19,BRK,MIL,Bruce Brown,SF,3,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.75,0.000000,0.000000,-7.0,,,,,,
3,2021,2021-10-19,BRK,MIL,Cam Thomas,SG,3,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.75,0.000000,0.000000,-7.0,,,,,,
6,2021,2021-10-19,BRK,MIL,DeAndre' Bembry,SF,3,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.75,0.000000,0.000000,-7.0,,,,,,
9,2021,2021-10-19,BRK,MIL,James Harden,PG,1,17.83,15.0,5.0,9.0,2.0,2.0,3.0,6.0,2.0,20.0,12.80,0.254237,0.219512,-7.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256162,2025,2026-01-26,POR,BOS,Rayan Rupert,SG,2,7.18,2.0,0.0,2.0,2.0,2.0,0.0,1.0,3.0,4.0,8.62,0.054054,0.051282,-15.0,2.00,2.01,1.31,1.96,0.69,0.00
256163,2025,2026-01-26,POR,BOS,Robert Williams,C,2,8.53,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,11.15,0.054054,0.025641,-15.0,1.61,1.53,3.36,3.07,-1.36,-0.61
256165,2025,2026-01-26,POR,BOS,Shaedon Sharpe,SG,1,16.82,5.0,1.0,7.0,2.0,2.0,1.0,5.0,2.0,9.0,17.50,0.135135,0.179487,-15.0,9.26,9.85,10.84,13.21,-5.84,-2.26
256166,2025,2026-01-26,POR,BOS,Sidy Cissoko,SF,1,12.90,2.0,0.0,1.0,2.0,2.0,0.0,1.0,2.0,4.0,11.87,0.054054,0.025641,-15.0,2.99,2.77,4.96,2.06,-2.96,-1.99


Rows: 127049, Dates: 943, min_train_days: 910
Progress:   3.03% (1/33)
Progress:   6.06% (2/33)
Progress:   9.09% (3/33)
Progress:  12.12% (4/33)
Progress:  15.15% (5/33)
Progress:  18.18% (6/33)
Progress:  21.21% (7/33)
Progress:  24.24% (8/33)
Progress:  27.27% (9/33)
Progress:  30.30% (10/33)
Progress:  33.33% (11/33)
Progress:  36.36% (12/33)
Progress:  39.39% (13/33)
Progress:  42.42% (14/33)
Progress:  45.45% (15/33)
Progress:  48.48% (16/33)
Progress:  51.52% (17/33)
Progress:  54.55% (18/33)
Progress:  57.58% (19/33)
Progress:  60.61% (20/33)
Progress:  63.64% (21/33)
Progress:  66.67% (22/33)
Progress:  69.70% (23/33)
Progress:  72.73% (24/33)
Progress:  75.76% (25/33)
Progress:  78.79% (26/33)
Progress:  81.82% (27/33)
Progress:  84.85% (28/33)
Progress:  87.88% (29/33)
Progress:  90.91% (30/33)
Progress:  93.94% (31/33)
Progress:  96.97% (32/33)
Progress: 100.00% (33/33)
Walk-forward RMSE: 3.593
Walk-forward MAE: 2.675
Walk-forward R²: 0.829
Saved ht_PTS booster!


In [25]:
# analyze_df = ht_results.copy()
# analyze_df['Diff'] = analyze_df.Predictions - analyze_df.Actuals
# display(analyze_df.sort_values('Diff', ascending=False).head(10))

# plt.figure(figsize=(10,6))
# hist_col = 'Diff'
# plt.hist(analyze_df[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [26]:
df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull())]

df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_prediction = df_pred.copy()

mins_booster = xgb.Booster()
mins_booster.load_model("../ML_models/dev/mins_model.json")
mins_model = XGBRegressor()
mins_model._Booster = mins_booster
df_prediction_mins = setup_df_mins(con, df_prediction)
df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))

df_ht = setup_df_ht(df_pred, tgt_stat)
df_ht = df_ht[df_ht.Date == now]
df_ht = df_ht.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'])
df_ht['MP'] = df_ht['MP_preds']
df_ht = df_ht.drop('MP_preds', axis=1)
display(df_ht)
partition_save_df(df_ht, f"../tables/{YEAR}/ht_api_input.csv")

Unnamed: 0,Season,Date,Team,Opp,Player,Pos,role,MP_h1,PTS_h1,FG_h1,FGA_h1,FT_h1,FTA_h1,TPM_h1,TPA_h1,PF_h1,PTS,MP_h2,TeamPTS_pct_h1,TeamFGA_pct_h1,Spread_h1,FGA_h1_base,FGA_h2_base,PTS_h1_base,PTS_h2_base,PTSDiff,FGADiff,MP
0,2025,2026-01-27,BRK,PHO,Ben Saraf,SG,2,,,,,,,,,,,,,,0.0,2.60,3.24,2.56,4.25,,,17.133324
1,2025,2026-01-27,BRK,PHO,Cam Thomas,SG,2,,,,,,,,,,,,,,0.0,4.44,5.38,3.84,5.40,,,20.579041
2,2025,2026-01-27,BRK,PHO,Chaney Johnson,SF,2,,,,,,,,,,,,,,0.0,,,,,,,13.815017
3,2025,2026-01-27,BRK,PHO,Danny Wolf,PF,2,,,,,,,,,,,,,,0.0,3.56,3.90,3.78,4.58,,,18.632870
4,2025,2026-01-27,BRK,PHO,Day'Ron Sharpe,C,2,,,,,,,,,,,,,,0.0,3.90,1.31,5.85,2.16,,,19.446617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,2025,2026-01-27,WAS,POR,Sharife Cooper,PG,2,,,,,,,,,,,,,,0.0,0.00,0.50,0.00,0.75,,,10.618246
242,2025,2026-01-27,WAS,POR,Trae Young,PG,2,,,,,,,,,,,,,,0.0,,,,,,,18.241755
243,2025,2026-01-27,WAS,POR,Tre Johnson,SG,1,,,,,,,,,,,,,,0.0,6.75,7.02,7.96,8.08,,,29.523197
244,2025,2026-01-27,WAS,POR,Tristan Vukcevic,C,2,,,,,,,,,,,,,,0.0,2.80,3.12,2.76,4.78,,,13.842532


../tables/2025/ht_api_input.csv saved!


# Today's predictions

In [29]:
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_PTS.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday[(df_yesterday.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
#                 .rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)
# df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[~(df_lines.Team.isnull()) & ~(df_lines.PTS_line.isnull())].drop(['Pos', 'Spread', 'Total'], axis=1)
# df_gms = df_gms.merge(df_lines, on=['Date', 'Team', 'Player'])
# df_gms['Res_PTS'] = df_gms.PTS - df_gms.PTS_line

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', 'PTS', 'Res_PTS', 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', 'PTS_line', 'PTS_proj', 'PTS', 'Res_PTS_proj', 'Res_PTS', 'pred_prob', 'pred_class']][df_yesterday.MP > 0]

# # Mins
# df_yesterday['Diff'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 5, 1, 0)
# print("\nYesterday's Results:")
# print("Total Accuracy (Minutes-in-range):", (df_yesterday.InTgtRange == 1).mean())
# print((df_yesterday.InTgtRange == 1).sum(), '/', df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange'], axis=1)

# # Raw PTS
# df_yesterday['Diff'] = abs(df_yesterday['PTS'] - df_yesterday['PTS_proj'])
# df_yesterday['InTgtRange'] = np.where(df_yesterday['Diff'] <= 3, 1, 0)
# df_yesterday['Act_Res'] = np.where((df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['Pred_Res'] = np.where((df_yesterday.PTS_proj > df_yesterday.PTS_line), 1, 0)
# df_yesterday['PHit1'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (Raw PTS):", (df_yesterday.PHit1 == 1).mean())
# print((df_yesterday.PHit1 == 1).sum(), "/", df_yesterday.shape[0])
# print("Total Accuracy (Raw PTS-in-range):", (df_yesterday.InTgtRange == 1).mean())
# df_yesterday = df_yesterday.drop(['Diff', 'InTgtRange', 'Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Regression)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday['Res_PTS_proj'] > 0, 'O', 'U')
# df_yesterday['PHit2'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# print("Total Accuracy (ResPTS Regression):", (df_yesterday.PHit2 == 1).mean())
# print((df_yesterday.PHit2 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res', 'Pred_Res'], axis=1)

# # Res PTS (Classifier)
# df_yesterday['Act_Res'] = np.where(df_yesterday['Res_PTS'] > 0, 1, 0)
# df_yesterday['PHit3'] = np.where(df_yesterday['Act_Res'] == df_yesterday['pred_class'], 1, 0)
# df_yesterday['pred_class'] = np.where(df_yesterday['pred_class'] == 1, 'O', 'U')
# print("Total Accuracy (ResPTS Classification):", (df_yesterday.PHit3 == 1).mean())
# print((df_yesterday.PHit3 == 1).sum(), "/", df_yesterday.shape[0])
# df_yesterday = df_yesterday.drop(['Act_Res'], axis=1)

# df_yesterday['Majority'] = np.where(((df_yesterday.PTS_proj > df_yesterday.PTS_line).astype(int) + (df_yesterday['Res_PTS_proj'] > 0).astype(int) + (df_yesterday['pred_class'] == 1).astype(int)) >= 2, 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 1) & (df_yesterday.PTS > df_yesterday.PTS_line), 1, 0)
# df_yesterday['MajorityHit'] = np.where((df_yesterday.Majority == 0) & (df_yesterday.PTS < df_yesterday.PTS_line), 1, df_yesterday.MajorityHit)
# print("Total Accuracy (MajorityHit):", (df_yesterday.MajorityHit == 1).mean())

# df_yesterday['AllAgree'] = '-'
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 1) & (df_yesterday.PHit2 == 1) & (df_yesterday.PHit3 == 1), 1, df_yesterday['AllAgree'])
# df_yesterday['AllAgree'] = np.where((df_yesterday.PHit1 == 0) & (df_yesterday.PHit2 == 0) & (df_yesterday.PHit3 == 0), 0, df_yesterday['AllAgree'])
# print("Total Accuracy (AllAgree):", ((df_yesterday.AllAgree == 1).sum() / ((df_yesterday.AllAgree == 0).sum() + (df_yesterday.AllAgree == 1).sum())))

# df_yesterday = df_yesterday.drop(['Majority', 'MajorityHit', 'AllAgree'], axis=1).sort_values('PTS_line', ascending=False)

# # if df_yesterday.shape[0] >= 50:
# #     for tm in df_yesterday.Team.unique():
# #         display(df_yesterday[(df_yesterday.Team == tm)]) #  & (df_yesterday.PHit == 1)
# # else:
# #     display(df_yesterday)

In [30]:
df_lines = pd.read_csv(f"../tables/{YEAR}/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[~(df_lines.Team.isnull())]

df_pred = df_pred.merge(df_lines[['Date', 'Team', 'Spread', 'Total']], on=['Date', 'Team'], how='left')
df_pred = df_pred[~df_pred[['Date', 'Team', 'Player']].duplicated(keep='last')]
df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
df_prediction = df_pred.copy()

# Predict Mins
mins_booster = xgb.Booster()
mins_booster.load_model("../ML_models/dev/mins_model.json")
mins_model = XGBRegressor()
mins_model._Booster = mins_booster

df_prediction_mins = setup_df_mins(con, df_prediction)
df_prediction_mins['MP_preds'] = mins_model.predict(df_prediction_mins.drop(['Season', 'Date', 'MP'], axis=1))

# Predict Stat
stat_booster = xgb.Booster()
stat_booster.load_model(f"../ML_models/dev/{tgt_stat}_model.json")
stat_model = XGBRegressor()
stat_model._Booster = stat_booster
res_booster_RG = xgb.Booster()
res_booster_RG.load_model("../ML_models/dev/Res_PTS_RG_model.json")
res_model_RG = XGBRegressor()
res_model_RG._Booster = res_booster_RG
res_model_CLF = XGBClassifier()
res_model_CLF.load_model("../ML_models/dev/Res_PTS_CLF_model.json")

df_prediction = setup_df_main(df_prediction, 'PTS')
df_prediction = df_prediction.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'], how='left')
df_prediction['MP'] = df_prediction.MP.fillna(df_prediction.MP_preds)
feature_cols = [col for col in df_prediction.columns if col not in ['Season', 'Date', 'MP_preds', 'PTS']]
df_prediction = df_prediction[df_prediction.Date == now][feature_cols]
df_prediction["PTS_proj"] = stat_model.predict(df_prediction)

df_prediction2 = df_pred[(~df_pred.PTS_line.isnull())].copy()
df_prediction2 = setup_df_res(df_prediction2)
df_prediction2 = df_prediction2.merge(df_prediction_mins[['Date', 'Team', 'Player', 'MP_preds']], on=['Date', 'Team', 'Player'], how='left')
df_prediction2['Player'] = df_prediction2['Player'].astype('category')
df_prediction2['MP'] = df_prediction2.MP.fillna(df_prediction2.MP_preds)
feature_cols = [col for col in df_prediction2.columns if col not in ['Season', 'Date', 'MP_preds', 'Res_PTS']]
df_prediction2 = df_prediction2[df_prediction2.Date == now][feature_cols]
df_prediction2["Res_PTS_proj"] = res_model_RG.predict(df_prediction2)

df_prediction2['pred_prob'] = res_model_CLF.predict_proba(df_prediction2.drop('Res_PTS_proj', axis=1))[:,1]
df_prediction2['pred_class'] = (df_prediction2['pred_prob'] > 0.5).astype(int)

# Setup Today's Picks
df_lines = df_lines[df_lines.Date == now][['Team', 'Player', 'PTS_line']]
df_prediction = df_prediction.merge(df_lines, on=['Team', 'Player'])
df_prediction = df_prediction.merge(df_prediction2[['Team', 'Player', 'Res_PTS_proj', 'pred_prob', 'pred_class']], on=['Team', 'Player'])

tds_picks = df_prediction[~(df_prediction['PTS_line'].isnull())]\
            [['Team', 'Player', 'MP', 'PTS_line', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class']]

tds_picks['O/U'] = '-'
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 1) & (tds_picks.PTS_proj > tds_picks.PTS_line) & (tds_picks.Res_PTS_proj > 0)), 'O', tds_picks['O/U'])
tds_picks['O/U'] = np.where(((tds_picks.pred_class == 0) & (tds_picks.PTS_proj < tds_picks.PTS_line) & (tds_picks.Res_PTS_proj < 0)), 'U', tds_picks['O/U'])

tds_picks = tds_picks.sort_values(['O/U', 'Team', 'Player'], ascending=[False, False, False])
if tds_picks.shape[0] >= 50:
    print(tds_picks.shape[0], 'rows')
    for tm in tds_picks.Team.unique():
        display(tds_picks[tds_picks.Team == tm])
else:
    display(tds_picks)
tds_picks.insert(0, 'Date', pd.to_datetime(now))
partition_save_df(tds_picks, f"../tables/{YEAR}/gmday_preds_PTS.csv")

97 rows


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
96,WAS,Will Riley,18.169117,7.5,6.927652,-0.548306,0.485082,0,U
94,WAS,Kyshawn George,29.918226,16.5,14.52393,-1.46058,0.376255,0,U
93,WAS,Khris Middleton,25.069828,10.5,10.439105,-0.081297,0.41557,0,U
92,WAS,Justin Champagnie,22.562153,10.5,8.967732,-0.364148,0.394358,0,U
95,WAS,Tre Johnson,29.523197,17.5,14.976211,-0.396283,0.53648,1,-
91,WAS,Bilal Coulibaly,27.191515,10.5,11.208566,-0.264779,0.522145,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
84,SAC,Zach LaVine,27.060722,16.5,16.369507,-2.718373,0.228539,0,U
79,SAC,Keon Ellis,21.396004,7.5,6.185509,-0.514244,0.434499,0,U
83,SAC,Russell Westbrook,32.224388,15.5,17.758379,2.057286,0.547968,1,O
82,SAC,Precious Achiuwa,27.560024,7.5,10.373734,4.197327,0.750618,1,O
81,SAC,Nique Clifford,25.539791,9.5,9.601602,2.295359,0.679856,1,O
78,SAC,Domantas Sabonis,33.111877,16.5,17.333969,3.654681,0.523055,1,O
76,SAC,DeMar DeRozan,35.49823,18.5,20.794184,2.823322,0.569847,1,O
80,SAC,Malik Monk,24.788836,12.5,13.036776,1.388989,0.448271,0,-
77,SAC,Dennis Schroder,23.635395,11.5,11.218341,0.275613,0.461972,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
53,OKC,Kenrich Williams,22.360968,10.5,9.742764,-0.725772,0.355656,0,U
52,OKC,Isaiah Joe,22.785831,11.5,11.481876,-0.461269,0.382938,0,U
51,OKC,Chet Holmgren,31.123371,19.5,18.386627,-1.677577,0.435095,0,U
50,OKC,Cason Wallace,23.411282,8.5,6.766316,-0.706858,0.311425,0,U
49,OKC,Aaron Wiggins,29.45038,14.5,13.737965,-1.368131,0.329266,0,U
54,OKC,Luguentz Dort,31.950081,10.5,11.786724,1.546251,0.516243,1,O
56,OKC,Shai Gilgeous-Alexander,35.611759,32.5,30.446608,1.803972,0.512798,1,-
55,OKC,Ousmane Dieng,15.425275,5.5,5.828289,-2.711623,0.173845,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
41,NYK,Jalen Brunson,35.160316,27.5,26.100622,-0.785452,0.38751,0,U
48,NYK,OG Anunoby,34.656315,15.5,16.739769,1.120398,0.506455,1,O
46,NYK,Miles McBride,25.508888,10.5,12.71756,2.970942,0.503681,1,O
45,NYK,Mikal Bridges,35.107796,15.5,15.967882,2.888641,0.501758,1,O
44,NYK,Landry Shamet,20.986206,6.5,9.353056,1.206767,0.638007,1,O
42,NYK,Josh Hart,33.931778,11.5,12.920037,1.165097,0.542584,1,O
47,NYK,Mitchell Robinson,20.098225,5.5,5.411059,0.488617,0.581849,1,-
43,NYK,Karl-Anthony Towns,31.834221,20.5,19.016094,0.620636,0.510332,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
32,MIL,Ryan Rollins,33.421249,20.5,16.580988,-3.814228,0.387451,0,U
30,MIL,Kyle Kuzma,31.208509,14.5,14.362081,-2.164492,0.227805,0,U
29,MIL,Gary Trent Jr.,17.598537,7.5,5.718048,-1.144398,0.371909,0,U
28,MIL,Cole Anthony,20.269623,10.5,8.441941,-1.096728,0.33471,0,U
31,MIL,Myles Turner,30.033457,12.5,13.347614,1.199067,0.525613,1,O
27,MIL,Bobby Portis,26.677357,15.5,14.956202,2.195863,0.494486,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
20,LAC,Brook Lopez,17.288729,6.5,6.100579,-1.343718,0.29061,0,U
26,LAC,Nicolas Batum,19.225632,3.5,4.791473,0.019502,0.597665,1,O
25,LAC,Kris Dunn,31.354046,7.5,8.523602,2.227057,0.724795,1,O
24,LAC,Kawhi Leonard,32.766907,25.5,28.687513,4.341513,0.801556,1,O
23,LAC,John Collins,31.267538,14.5,14.798935,2.136174,0.660732,1,O
22,LAC,James Harden,37.335682,25.5,27.516479,0.831456,0.582542,1,O
21,LAC,Ivica Zubac,32.97356,15.5,15.538578,0.505604,0.454525,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
8,DEN,Jonas Valanciunas,21.975353,13.5,10.794977,-0.649408,0.421663,0,U
9,DEN,Peyton Watson,35.048546,16.5,17.695704,0.645914,0.524307,1,O
7,DEN,Jamal Murray,37.837948,26.5,27.201151,3.41561,0.608875,1,O
11,DEN,Zeke Nnaji,19.166176,6.5,6.742452,-0.36923,0.565964,1,-
10,DEN,Tim Hardaway Jr.,27.417459,13.5,13.160113,0.238885,0.47824,0,-
6,DEN,Jalen Pickett,29.78401,8.5,9.54452,0.444821,0.48524,0,-
5,DEN,Bruce Brown,22.725088,5.5,6.103909,-1.327698,0.319129,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
90,UTA,Lauri Markkanen,36.272144,24.5,28.168283,0.18525,0.665339,1,O
89,UTA,Kyle Filipowski,27.951279,10.5,11.498871,0.970758,0.659999,1,O
88,UTA,Isaiah Collier,28.474836,11.5,12.050806,2.431805,0.601238,1,O
87,UTA,Cody Williams,21.67098,6.5,7.514329,-0.446003,0.435531,0,-
86,UTA,Brice Sensabaugh,23.717064,14.5,13.068491,-0.203966,0.590121,1,-
85,UTA,Ace Bailey,30.00009,13.5,14.250902,1.302465,0.484673,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
75,POR,Toumani Camara,33.494236,12.5,13.275942,0.5086,0.522291,1,O
73,POR,Jrue Holiday,27.603306,11.5,14.195718,2.298589,0.698271,1,O
72,POR,Jerami Grant,27.432676,15.5,17.443642,1.514484,0.627801,1,O
71,POR,Donovan Clingan,28.563089,10.5,11.682961,1.523448,0.565318,1,O
70,POR,Deni Avdija,33.335232,23.5,25.950788,2.098074,0.610766,1,O
74,POR,Shaedon Sharpe,31.73233,21.5,23.098791,-0.271159,0.553172,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
69,PHO,Royce O'Neale,29.696455,9.5,10.360057,1.742227,0.681234,1,O
68,PHO,Oso Ighodaro,22.702623,6.5,6.509019,0.976178,0.546519,1,O
67,PHO,Mark Williams,26.29678,12.5,13.887484,3.941144,0.682534,1,O
66,PHO,Jordan Goodwin,27.482574,12.5,11.08531,1.883548,0.615459,1,-
65,PHO,Grayson Allen,32.113148,21.5,18.659916,1.374927,0.488518,0,-
64,PHO,Dillon Brooks,33.151993,22.5,20.455002,2.637224,0.662793,1,-
63,PHO,Collin Gillespie,27.343782,14.5,14.000179,0.84997,0.640114,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
62,PHI,VJ Edgecombe,36.332264,13.5,14.779477,1.15759,0.638509,1,O
61,PHI,Tyrese Maxey,39.604332,27.5,29.340471,3.690694,0.752989,1,O
60,PHI,Quentin Grimes,26.62133,7.5,10.858283,0.205567,0.651634,1,O
59,PHI,Paul George,32.523464,14.5,15.065503,2.508958,0.698074,1,O
58,PHI,Kelly Oubre Jr.,28.49218,12.5,13.222077,2.057247,0.603586,1,O
57,PHI,Joel Embiid,34.638844,27.5,26.999975,3.08945,0.707498,1,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
38,NOP,Trey Murphy III,36.069168,20.5,21.547558,3.197381,0.675898,1,O
37,NOP,Saddiq Bey,31.404303,15.5,17.701105,3.34155,0.683074,1,O
34,NOP,Herbert Jones,29.868355,8.5,8.968177,2.413419,0.603016,1,O
33,NOP,Derik Queen,27.418886,9.5,11.926513,1.963151,0.72962,1,O
40,NOP,Zion Williamson,31.340746,21.5,19.583261,1.334477,0.699718,1,-
39,NOP,Yves Missi,22.68922,5.5,6.993577,0.375407,0.456724,0,-
36,NOP,Micah Peavy,19.20027,4.5,5.937557,-0.413095,0.359536,0,-
35,NOP,Jeremiah Fears,23.235441,9.5,11.292857,-0.50254,0.479826,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
19,DET,Tobias Harris,31.974625,13.5,14.85787,4.350227,0.529443,1,O
16,DET,Jaden Ivey,17.13973,7.5,8.582063,0.727729,0.510361,1,O
14,DET,Duncan Robinson,29.149492,10.5,13.281934,2.047452,0.69326,1,O
18,DET,Javonte Green,18.000685,4.5,7.189793,-0.435538,0.417547,0,-
17,DET,Jalen Duren,29.606968,16.5,16.889198,-1.701639,0.580347,1,-
15,DET,Isaiah Stewart,22.499912,8.5,9.550665,0.417884,0.364008,0,-
13,DET,Cade Cunningham,37.03727,23.5,23.702768,-2.143488,0.618103,1,-
12,DET,Ausar Thompson,27.626585,9.5,11.229294,-0.27126,0.47118,0,-


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj,Res_PTS_proj,pred_prob,pred_class,O/U
2,BRK,Michael Porter Jr.,33.649582,21.5,22.710592,0.624953,0.638102,1,O
1,BRK,Drake Powell,25.389494,6.5,9.122299,0.823421,0.526678,1,O
0,BRK,Day'Ron Sharpe,19.446617,7.5,8.140172,0.43299,0.535156,1,O
4,BRK,Terance Mann,25.219486,6.5,8.242704,-0.238319,0.426538,0,-
3,BRK,Nolan Traore,21.453074,7.5,7.052244,-0.281604,0.505434,1,-


../tables/2025/gmday_preds_PTS.csv saved!


In [31]:
# tds_picks['PTS_proj_mag'] = tds_picks.PTS_proj - tds_picks.PTS_line
# print('O:', tds_picks[tds_picks.PTS_proj_mag > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.PTS_proj_mag < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'PTS_proj_mag'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()
# tds_picks = tds_picks.drop('PTS_proj_mag', axis=1)

In [32]:
# print('O:',tds_picks[tds_picks.Res_PTS_proj > 0].shape[0] / tds_picks.shape[0])
# print('U:',tds_picks[tds_picks.Res_PTS_proj < 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'Res_PTS_proj'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

In [33]:
# print('O:', tds_picks[tds_picks.pred_class == 1].shape[0] / tds_picks.shape[0])
# print('U:', tds_picks[tds_picks.pred_class == 0].shape[0] / tds_picks.shape[0])

# plt.figure(figsize=(10,6))
# hist_col = 'pred_prob'
# plt.hist(tds_picks[hist_col], bins=30, color='skyblue', edgecolor='black')
# plt.title(f'Histogram of {hist_col}')
# plt.xlabel(hist_col)
# plt.ylabel('Frequency')
# plt.grid(axis='y', alpha=0.75)
# plt.show()

# Misc.

In [34]:
# # Historical Percentages
# df_yesterday = pd.read_csv(f'../tables/{YEAR}/gmday_preds_{tgt_stat}.csv')
# df_yesterday['Date'] = pd.to_datetime(df_yesterday.Date)
# df_yesterday = df_yesterday.rename(columns={"MP": "MP_proj"})

# df_gms = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
# df_gms['Date'] = pd.to_datetime(df_gms.Date)

# df_yesterday = df_yesterday.merge(df_gms[['Date', 'Team', 'Player', tgt_stat, 'MP']], on=['Date', 'Team', 'Player'])
# df_yesterday = df_yesterday[['Date', 'Team', 'Player', 'MP', 'MP_proj', f'{tgt_stat}_line', f'{tgt_stat}_proj', tgt_stat]][df_yesterday.MP > 0]

# df_yesterday['Diff'] = df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[f'{tgt_stat}_line']
# df_yesterday['Diff2'] = abs(df_yesterday[f'{tgt_stat}_proj'] - df_yesterday[tgt_stat])
# df_yesterday['Act_Res'] = np.where(df_yesterday[tgt_stat] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['Pred_Res'] = np.where(df_yesterday[f'{tgt_stat}_proj'] > df_yesterday[f'{tgt_stat}_line'], 'O', 'U')
# df_yesterday['ParlayHit'] = np.where(df_yesterday['Act_Res'] == df_yesterday['Pred_Res'], 1, 0)
# df_yesterday['Diff3'] = abs(df_yesterday['MP_proj'] - df_yesterday['MP'])
# df_yesterday['InRMSE_Range'] = np.where(df_yesterday['Diff3'] <= 5, 1, 0)

# for day in df_gms.Date.unique():
#     df_temp = df_yesterday[df_yesterday.Date == day]
#     if df_temp.shape[0] > 0:
#         print(f"{day.date()} Total PTS Accuracy:", f"{(df_temp.ParlayHit == 1).sum()}/{df_temp.shape[0]}", ((df_temp.ParlayHit == 1).sum() / df_temp.shape[0]))
#         print(f"{day.date()} Total MP Accuracy:", f"{(df_temp.InRMSE_Range == 1).sum()}/{df_temp.shape[0]}", ((df_temp.InRMSE_Range == 1).sum() / df_temp.shape[0]), "\n")

In [35]:
# df_mins = df.copy()
# df_mins = setup_df_mins(con, df_mins)

# train_summary = df_mins.drop(['Season', 'Date', 'MP'], axis=1).describe().T
# gameday_summary = df_prediction_mins[df_prediction_mins.Date == now].drop(['Season', 'Date', 'MP'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [36]:
# df_main = df.copy()
# df_main = setup_df_main(df_main, tgt_stat)

# train_summary = df_main.drop(['Season', 'Team', 'Opp', 'Player', 'Pos', 'Date', 'PTS'], axis=1).describe().T
# gameday_summary = df_prediction[(~df_prediction.PTS_line.isnull())].drop(['Pos', 'PTS_proj', 'Res_PTS_proj', 'pred_prob', 'pred_class', 'PTS_line'], axis=1).describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [37]:
# df_res = df[(~df.PTS_line.isnull())].copy()
# df_res = setup_df_res(df_res)

# train_summary = df_res.drop(['Date', 'Res_PTS'], axis=1).describe().T
# gameday_summary = df_prediction2[(~df_prediction2.PTS_line.isnull())].describe().T
# display(train_summary[['mean','std']])
# display(gameday_summary[['mean','std']])

In [38]:
# for col in mins_train_df.drop(['Season', 'Date', 'MP'], axis=1).columns:
#     if col not in ['Team', 'Player', 'Opp', 'Pos']:
#         PartialDependenceDisplay.from_estimator(
#             mins_model,
#             mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#             features=[col],
#             grid_resolution=25
#         )
        
# ##################
# features = [('role')]
# PartialDependenceDisplay.from_estimator(
#     mins_model,
#     mins_train_df.drop(['Season', 'Date', 'MP'], axis=1),
#     features=features,
#     grid_resolution=25
# )