# To do:

 - Figure out how to signal injuries
 - Create model that generates minutes projections
 - To df_lines add the real spread result and total result

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings

import xgboost as xgb
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)

Today's date: 2025-12-16


In [2]:
%run ./common_utils.ipynb

# Feature Engineering

In [3]:
def feature_importance(model):
    importance = model.get_booster().get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
import math

def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [5]:
def create_baseline_model(df, pred_col, train_df, val_df, test_df):

    print('Train:', len(train_df), '/ Validation:', len(val_df), '/ Test:', len(test_df))

    feature_cols = [col for col in df.columns 
                    if col not in ['Date', pred_col]]

    X_train, y_train = train_df[feature_cols], train_df[pred_col]
    X_val,   y_val   = val_df[feature_cols],   val_df[pred_col]
    X_test,  y_test  = test_df[feature_cols],  test_df[pred_col]


    model = XGBRegressor(
        enable_categorical=True,
        n_estimators=300,    # number of trees
        learning_rate=0.05,  # step size
        max_depth=6,         # complexity
        subsample=0.8,       # row sampling
        colsample_bytree=0.8,
        objective='reg:squarederror',
        random_state=42,
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R²:", r2)
    
    return model, (X_train, y_train, X_val, y_val, X_test,  y_test)

In [6]:
def hyperparam_tuning(model, splits):

    X_train = splits[0]
    y_train = splits[1]
    X_val = splits[2]
    y_val = splits[3]
    X_test = splits[4]
    y_test = splits[5]
    
    param_dist = {
        "n_estimators": randint(300, 2000),
        "learning_rate": uniform(0.005, 0.05),
        "max_depth": randint(3, 8),
        "min_child_weight": randint(1, 15),
        "subsample": uniform(0.7, 0.3),
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 10),
        "reg_lambda": uniform(0, 10),
        "reg_alpha": uniform(0, 5),
        "max_leaves": randint(10, 80)
    }

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,
        scoring='neg_mean_squared_error',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    random_search.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    best_model = random_search.best_estimator_
    print("Best params:", random_search.best_params_, "\n")

    preds = best_model.predict(X_test)
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))
    
    return best_model, preds

In [7]:
df = pd.DataFrame()
df2 = pd.DataFrame()
for i in [2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_gms = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_gms['Date'] = pd.to_datetime(df_gms.Date)
    df2 = pd.concat([df2, df_gms])

df['Date'] = pd.to_datetime(df.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)]

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()

df2['Date'] = pd.to_datetime(df2.Date)
df_mtch = df2[['Date', 'AwayABV', 'HomeABV', 'AwayB2B', 'HomeB2B', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Home'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Date','Team', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Date','Team', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Away'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Date', 'Team', 'Team_type', 'cup_gm', 'pstszn_gm']]
df = df.merge(df_mtch, on=['Date', 'Team'])

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
print('base df created', datetime.now())

base df created 2025-12-16 18:28:45.564076


# Minutes Projection Model

In [8]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'Team', 'Team_type', 'Opp', 'Player', 'B2B', 'MP', 'cup_gm', 'pstszn_gm']]
    df = df.sort_values(['Player', 'Season', 'Date']).reset_index(drop=True)
    
    df['MP_lst_gm'] = (
        df
        .groupby(['Player', 'Season'])['MP']
        .shift(1)
    )

    df['MP_last_5_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=5, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    df['MP_last_10_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=10, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
        
    # EXPERIMENTAL
    df['Team_mins'] = (
        df.groupby(['Team', 'Date'])['MP']
          .transform('sum')
    )
    df['Team_mins'] = np.where(df.Team_mins < 240, 240, df.Team_mins)
    df['Lst5_TmMins'] = (
        df.groupby(['Team', 'Season'])['Team_mins']
          .rolling(window=5, min_periods=1)
          .sum()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    df['Lst5_PlyrMins'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=5, min_periods=1)
          .sum()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    df['TmMinsPct_Lst5'] = df.Lst5_PlyrMins / df.Lst5_TmMins
    
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
    
    # 2.) EXPERIMENT: take full season team mins and take full season players MP and get pct that way, then rank player 
    # usage rate per team    
    # 3.) Add games missed column
    # 4.) Add coming back from injury column
    
    # In attempt to keep minutes realistic
    # 5.) Add column that signals max career reg games mins
    # 6.) Add column that signals max career OT mins too?
   
    
    df = df.drop(['Team_mins', 'Lst5_TmMins', 'Lst5_PlyrMins', 'reserve_td', 'bench_td', 'starter_td'], axis=1)    

    
    return df

In [9]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
display(df_mins)

n = len(df_mins)
train_end = int(0.7 * n)
val_end   = int(0.85 * n)
mins_train_df = df_mins.iloc[:train_end]
mins_val_df   = df_mins.iloc[train_end:val_end]
mins_test_df  = df_mins.iloc[val_end:]

mins_model, mins_splits = create_baseline_model(df_mins, "MP", mins_train_df, mins_val_df, mins_test_df)
mins_model, mins_preds = hyperparam_tuning(mins_model, mins_splits)
# feature_importance(mins_model)

Unnamed: 0,Season,Date,Team,Team_type,Opp,Player,B2B,MP,cup_gm,pstszn_gm,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,games_last_7_days,prev_team_mins_pct,TmMinsPct_Lst5,reserve,bench,starter
0,2023,2023-10-26,16,0,22,0,0,0.00,0,0,,,,0,,,1,0,0
1,2023,2023-10-29,16,0,0,0,0,5.62,0,0,0.00,0.000000,0.000000,1,0.000000,0.000000,1,0,0
2,2023,2023-10-30,16,0,15,0,0,15.35,0,0,5.62,2.810000,2.810000,2,0.023417,0.011708,1,0,0
3,2023,2023-11-01,16,1,27,0,0,5.52,0,0,15.35,6.990000,6.990000,3,0.063958,0.029124,1,0,0
4,2023,2023-11-03,16,0,19,0,0,0.00,1,0,5.52,6.622500,6.622500,4,0.023000,0.027593,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74542,2025,2025-12-14,18,1,3,735,0,26.62,0,0,30.73,30.280000,30.586667,3,0.128042,0.124972,0,0,1
74543,2024,2025-01-30,14,0,10,736,0,0.00,0,0,,29.784000,30.190000,1,,0.124098,1,0,0
74544,2024,2025-02-03,14,0,26,736,0,0.82,0,0,0.00,0.000000,0.000000,1,0.000000,0.000000,1,0,0
74545,2024,2025-02-08,14,0,20,736,1,0.82,0,0,0.82,0.410000,0.410000,2,0.003417,0.000683,1,0,0


Train: 52182 / Validation: 11182 / Test: 11183
RMSE: 6.800306711638951
MAE: 5.177618500720012
R²: 0.7294910720793739
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'colsample_bytree': np.float64(0.8803345035229626), 'gamma': np.float64(7.080725777960454), 'learning_rate': np.float64(0.006029224714790123), 'max_depth': 4, 'max_leaves': 39, 'min_child_weight': 6, 'n_estimators': 685, 'reg_alpha': np.float64(0.9091248360355031), 'reg_lambda': np.float64(1.8340450985343382), 'subsample': np.float64(0.7912726728878613)} 

RMSE: 6.525438465018873
MAE: 4.911355334641117
R²: 0.7509170512080121


# Main Model

In [10]:
def setup_df_main(df):
    df = df.sort_values(['Player', 'Date'])
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)

    stat_cols = []
    for stat in categories:
        for col in [f'Off_{stat}', f'Off_L5_{stat}', f'Def_{stat}', f'Def_L5_{stat}']:
            stat_cols.append(col)

    final_cols = ['Date', 'Team', 'Team_type', 'B2B', 'cup_gm', 'pstszn_gm', 'Player', 'Opp', 'MP',
                  'DaysLstGm', 'travel_km', 'travel_hours', 'PrevLocation', 'is_long_trip', 'same_arena', 
                  tgt_stat] + stat_cols
    df = df[final_cols]
    
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0)

    return df

In [11]:
tgt_stat = "PTS"
df_main = df.copy()
df_main = setup_df_main(df_main)
display(df_main)

n = len(df)
train_end = int(0.7 * n)
val_end   = int(0.85 * n)
main_train_df = df_main.iloc[:train_end]
main_val_df   = df_main.iloc[train_end:val_end]
main_test_df  = df_main.iloc[val_end:]

stat_model, main_splits = create_baseline_model(df_main, tgt_stat, main_train_df, main_val_df, main_test_df)
stat_model, stat_preds = hyperparam_tuning(stat_model, main_splits)

Unnamed: 0,Date,Team,Team_type,B2B,cup_gm,pstszn_gm,Player,Opp,MP,DaysLstGm,travel_km,travel_hours,PrevLocation,is_long_trip,same_arena,PTS,Off_PTS,Off_L5_PTS,Def_PTS,Def_L5_PTS,Off_AST,Off_L5_AST,Def_AST,Def_L5_AST,Off_REB,Off_L5_REB,Def_REB,Def_L5_REB,Off_PR,Off_L5_PR,Def_PR,Def_L5_PR,Off_PA,Off_L5_PA,Def_PA,Def_L5_PA,Off_RA,Off_L5_RA,Def_RA,Def_L5_RA,Off_PRA,Off_L5_PRA,Def_PRA,Def_L5_PRA,Off_TPM,Off_L5_TPM,Def_TPM,Def_L5_TPM,Off_STL,Off_L5_STL,Def_STL,Def_L5_STL,Off_BLK,Off_L5_BLK,Def_BLK,Def_L5_BLK,Off_STL_BLK,Off_L5_STL_BLK,Def_STL_BLK,Def_L5_STL_BLK
387,2023-10-26,16,0,0,0,0,0,22,0.00,0,0.0,0.0,0.0,0,0,0,0.000000,0.000000,5.000000,5.00,0.000000,0.000000,3.000000,3.00,0.000000,0.000000,4.000000,4.0,0.000000,0.000000,9.000000,9.00,0.000000,0.0,8.000000,8.0,0.000000,0.00,7.000000,7.00,0.000000,0.000000,12.000000,12.0,0.000000,0.000000,1.000000,1.00,0.000000,0.0,2.000000,2.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,2.000000,2.0
1138,2023-10-29,16,0,0,0,0,0,0,5.62,3,0.0,0.0,22.0,0,0,7,0.000000,0.000000,16.600000,16.60,0.000000,0.000000,4.000000,4.00,0.000000,0.000000,3.400000,3.4,0.000000,0.000000,20.000000,20.00,0.000000,0.0,20.600000,20.6,0.000000,0.00,7.400000,7.40,0.000000,0.000000,24.000000,24.0,0.000000,0.000000,2.200000,2.20,0.000000,0.0,0.600000,0.6,0.000000,0.0,0.200000,0.2,0.000000,0.0,0.800000,0.8
1362,2023-10-30,16,0,0,0,0,0,15,15.35,1,0.0,0.0,0.0,0,0,0,3.500000,3.500000,14.750000,14.75,0.000000,0.000000,3.250000,3.25,0.500000,0.500000,4.000000,4.0,4.000000,4.000000,18.750000,18.75,3.500000,3.5,18.000000,18.0,0.500000,0.50,7.250000,7.25,4.000000,4.000000,22.000000,22.0,1.000000,1.000000,2.750000,2.75,0.000000,0.0,0.500000,0.5,0.000000,0.0,1.500000,1.5,0.000000,0.0,2.000000,2.0
1738,2023-11-01,16,1,0,0,0,0,27,5.52,2,0.0,0.0,15.0,0,0,2,2.333333,2.333333,9.857143,7.80,0.666667,0.666667,2.428571,2.40,0.333333,0.333333,7.000000,6.4,2.666667,2.666667,16.857143,14.20,3.000000,3.0,12.285714,10.2,1.000000,1.00,9.428571,8.80,3.333333,3.333333,19.285714,16.6,0.666667,0.666667,1.285714,0.80,0.000000,0.0,1.000000,1.0,0.000000,0.0,0.714286,1.0,0.000000,0.0,1.714286,2.0
2136,2023-11-03,16,0,0,1,0,0,19,0.00,2,0.0,0.0,27.0,0,0,0,2.250000,2.250000,20.166667,21.80,0.750000,0.750000,3.833333,4.20,0.500000,0.500000,4.833333,4.6,2.750000,2.750000,25.000000,26.40,3.000000,3.0,24.000000,26.0,1.250000,1.25,8.666667,8.80,3.500000,3.500000,28.833333,30.6,0.500000,0.500000,2.333333,2.60,0.000000,0.0,1.500000,1.4,0.000000,0.0,0.500000,0.4,0.000000,0.0,2.000000,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74403,2025-12-14,18,1,0,0,0,735,3,26.62,15,0.0,0.0,9.0,0,0,18,20.090909,21.400000,19.500000,20.20,3.636364,3.400000,2.857143,2.80,5.090909,4.400000,6.178571,5.4,25.181818,25.800000,25.678571,25.60,23.727273,24.8,22.357143,23.0,8.727273,7.80,9.035714,8.20,28.818182,29.200000,28.535714,28.4,0.000000,0.000000,2.071429,3.20,1.454545,1.2,0.857143,0.8,0.454545,0.8,0.821429,0.2,1.909091,2.0,1.678571,1.0
33760,2025-01-30,14,0,0,0,0,736,10,0.00,0,0.0,0.0,0.0,0,0,0,0.000000,0.000000,13.171717,20.40,0.000000,0.000000,2.979798,4.80,0.000000,0.000000,3.818182,6.0,0.000000,0.000000,16.989899,26.40,0.000000,0.0,16.151515,25.2,0.000000,0.00,6.797980,10.80,0.000000,0.000000,19.969697,31.2,0.000000,0.000000,1.818182,1.80,0.000000,0.0,1.010101,1.8,0.000000,0.0,0.373737,0.6,0.000000,0.0,1.383838,2.4
51907,2025-02-03,14,0,0,0,0,736,26,0.82,4,0.0,0.0,10.0,0,0,0,0.000000,0.000000,13.930000,9.80,0.000000,0.000000,3.250000,2.00,0.000000,0.000000,3.890000,3.8,0.000000,0.000000,17.820000,13.60,0.000000,0.0,17.180000,11.8,0.000000,0.00,7.140000,5.80,0.000000,0.000000,21.070000,15.6,0.000000,0.000000,2.010000,0.60,0.000000,0.0,1.160000,0.2,0.000000,0.0,0.260000,0.2,0.000000,0.0,1.420000,0.4
52917,2025-02-08,14,0,1,0,0,736,20,0.82,5,0.0,0.0,26.0,0,0,0,0.000000,0.000000,12.083333,16.20,0.000000,0.000000,2.675926,2.40,0.000000,0.000000,3.907407,4.4,0.000000,0.000000,15.990741,20.60,0.000000,0.0,14.759259,18.6,0.000000,0.00,6.583333,6.80,0.000000,0.000000,18.666667,23.0,0.000000,0.000000,1.657407,1.20,0.000000,0.0,0.787037,0.6,0.000000,0.0,0.370370,0.4,0.000000,0.0,1.157407,1.0


Train: 52182 / Validation: 11182 / Test: 11183
RMSE: 4.834145949004272
MAE: 3.239393711090088
R²: 0.7343387603759766
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'colsample_bytree': np.float64(0.8896917491780738), 'gamma': np.float64(6.335297107608947), 'learning_rate': np.float64(0.03178873420373792), 'max_depth': 3, 'max_leaves': 68, 'min_child_weight': 6, 'n_estimators': 763, 'reg_alpha': np.float64(0.20387570777381958), 'reg_lambda': np.float64(5.908929431882418), 'subsample': np.float64(0.9032693085526846)} 

RMSE: 4.808500733703831
MAE: 3.2306954860687256
R²: 0.7371499538421631


In [12]:
rmse = np.sqrt(mean_squared_error(main_splits[5], stat_preds)) # splits[5] = y_test
pred_col = f'{tgt_stat}_Pred'
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)
df_lines = df_lines[['Date', 'Team', 'Player', f'{tgt_stat}_line']]
main_test_df[pred_col] = stat_preds
main_test_df['Team'] = team_encoder.inverse_transform(main_test_df["Team"])
main_test_df['Player'] = player_encoder.inverse_transform(main_test_df["Player"])

df_test = main_test_df.merge(df_lines, on=['Date', 'Team', 'Player'])
df_test = df_test[[c for c in df_test.columns if c != pred_col] + [pred_col]]

df_test['Diff'] = df_test[f'{tgt_stat}_Pred'] - df_test[f'{tgt_stat}_line']
df_test['Act_Res'] = np.where(df_test[tgt_stat] > df_test[f'{tgt_stat}_line'], 'O', 'U')
df_test['Pred_Res'] = np.where(df_test[pred_col] > df_test[f'{tgt_stat}_line'], 'O', 'U')
df_test['ParlayHit'] = np.where(df_test['Act_Res'] == df_test['Pred_Res'], 1, 0)

df_test['Diff2'] = abs(df_test[f'{tgt_stat}_Pred'] - df_test[tgt_stat])
df_test['InRMSE_Range'] = np.where(df_test['Diff2'] <= rmse, 1, 0)

print("Total Accuracy (ParlayHit):", ((df_test.ParlayHit == 1).sum() / df_test.shape[0]))
print((df_test.ParlayHit == 1).sum(), "/", df_test.shape[0])

print("\nTotal Accuracy (InRMSE_Range):", ((df_test.InRMSE_Range == 1).sum() / df_test.shape[0]))
print((df_test.InRMSE_Range == 1).sum(), "/", df_test.shape[0])

Total Accuracy (ParlayHit): 0.6440129449838188
199 / 309

Total Accuracy (InRMSE_Range): 0.6116504854368932
189 / 309


In [13]:
df_ystrday = df_test[(df_test.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
            [['Team', 'Player', tgt_stat, f'{tgt_stat}_line', f'{tgt_stat}_Pred', 'ParlayHit', 'Diff', 'InRMSE_Range', 'Diff2']]\
            .sort_values(f'{tgt_stat}_line', ascending=False)

print("Yesterday's Results:")
print("Total Accuracy (ParlayHit):", ((df_ystrday.ParlayHit == 1).sum() / df_ystrday.shape[0]))
print("Total Accuracy (InRMSE_Range):", ((df_ystrday.InRMSE_Range == 1).sum() / df_ystrday.shape[0]))
if df_ystrday.shape[0] >= 50:
    for tm in df_ystrday.Team.unique():
        display(df_ystrday[df_ystrday.Team == tm])
else:
    display(df_ystrday)

Yesterday's Results:
Total Accuracy (ParlayHit): 0.7
Total Accuracy (InRMSE_Range): 0.8


Unnamed: 0,Team,Player,PTS,PTS_line,PTS_Pred,ParlayHit,Diff,InRMSE_Range,Diff2
191,MIA,Tyler Herro,0,22.5,0.988694,1,-21.511306,1,0.988694
67,TOR,Scottie Barnes,17,21.5,17.363161,1,-4.136839,1,0.363161
56,MEM,Santi Aldama,3,15.5,12.997926,1,-2.502074,0,9.997926
154,DET,Tobias Harris,13,13.5,15.630841,0,2.130841,1,2.630841
145,DEN,Tim Hardaway Jr.,13,12.5,20.879818,1,8.379818,0,7.879818
47,TOR,Sandro Mamukelashvili,11,8.5,7.814708,0,-0.685292,1,3.185292
36,BOS,Sam Hauser,0,7.5,-0.264129,1,-7.764129,1,0.264129
113,UTA,Svi Mykhailiuk,0,7.5,0.221365,1,-7.278635,1,0.221365
103,HOU,Steven Adams,3,5.5,3.787588,1,-1.712412,1,0.787588
231,MEM,Vince Williams Jr.,8,5.5,4.310775,0,-1.189225,1,3.689225


### Today's predictions

In [14]:
df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
df_pred['Date'] = pd.to_datetime(df_pred.Date)
df_pred['Season'] = 2025
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred[col].astype('category')
df_pred = df_pred.merge(df_mtch, on=['Date', 'Team'])
df_pred[tgt_stat] = 0

# Predict minutes
df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP']], on=['Date', 'Team', 'Player'], how='left')
df_pred_mins = setup_df_mins(con, df_pred)

df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
df_pred['MP'] = mins_model.predict(df_pred_mins)

df_pred = setup_df_main(df_pred)
feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
df_pred = df_pred[df_pred.Date == now][feature_cols]

# Predict stat
df_pred["Team"] = team_encoder.transform(df_pred["Team"])
df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
df_pred = df_pred[~(df_pred.PrevLocation.isnull())] # Filters out players who are debuting on the year
df_pred["PrevLocation"] = team_encoder.transform(df_pred["PrevLocation"])
df_pred["Player"] = player_encoder.transform(df_pred["Player"])
df_pred['Team_type'] = team_type_encoder.transform(df_pred['Team_type'])
df_pred[f"{tgt_stat}_proj"] = stat_model.predict(df_pred)
df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
df_lines['Date'] = pd.to_datetime(df_lines.Date)

df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])

df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

for tm in df_pred.Team.unique():
    display(df_pred[(df_pred.Team == tm) & ~(df_pred[f'{tgt_stat}_line'].isnull())]\
            [['Team', 'Player', 'MP', f'{tgt_stat}_line', f'{tgt_stat}_proj']]\
            .sort_values(f'{tgt_stat}_proj', ascending=False))

Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj
15,SAS,Victor Wembanyama,29.786194,19.5,19.42193
9,SAS,Keldon Johnson,31.16754,7.5,15.249816
7,SAS,Julian Champagnie,31.476618,7.5,15.032214
3,SAS,Harrison Barnes,30.162277,10.5,13.778195
10,SAS,Luke Kornet,29.909546,5.5,13.196087
1,SAS,Devin Vassell,23.775856,13.5,11.612605
0,SAS,De'Aaron Fox,20.44474,20.5,9.624804
2,SAS,Dylan Harper,18.689646,10.5,9.222927
14,SAS,Stephon Castle,21.430927,17.5,9.133797


Unnamed: 0,Team,Player,MP,PTS_line,PTS_proj
8,NYK,Karl-Anthony Towns,29.541214,20.5,19.979771
6,NYK,Josh Hart,34.349819,13.5,14.532132
13,NYK,OG Anunoby,29.670664,17.5,13.595518
4,NYK,Jalen Brunson,20.114212,30.5,12.893822
12,NYK,Mitchell Robinson,29.630777,3.5,11.776099
5,NYK,Jordan Clarkson,22.168364,6.5,11.264559
11,NYK,Mikal Bridges,24.910011,16.5,10.507944
