# To do:

 - Figure out how to signal injuries
 - Learn basic PyTorch

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import duckdb
import warnings
import math         # haversine_km()
import os

import ray
from ray import tune
from ray import train
import torch
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# import xgboost as xgb
# from xgboost import XGBRegressor
# from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import warnings
from datetime import datetime, timedelta
from haversine import haversine

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

categories = ['PTS', 'AST', 'REB', 'PR', 'PA', 'RA', 'PRA', 'TPM', 'STL', 'BLK', 'STL_BLK']
con = duckdb.connect(database=":memory:")
now = str(datetime.now().date())
print(f"Today's date:", now)

Using device: cuda
Today's date: 2025-12-20


In [2]:
%run ./common_utils.ipynb

# ML Functions

In [3]:
def feature_importance(model):
    importance = model.get_score(importance_type='gain')

    # Convert to table
    df_importance = (
        pd.DataFrame({
            'feature': list(importance.keys()),
            'importance': list(importance.values())
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )

    df_importance['pct'] = df_importance.importance.cumsum() / df_importance.importance.sum()
    display(df_importance)

    xgb.plot_importance(model)
    plt.show()

In [4]:
class BaselineRegression(nn.Module):
    def __init__(self, input_dim, hidden_units=None):
        super().__init__()
        
        if hidden_units:  # if hidden_units is provided, create a hidden layer
            self.network = nn.Sequential(
                nn.Linear(input_dim, hidden_units),
                nn.ReLU(),                 # activation for hidden layer
                nn.Linear(hidden_units, 1) # output layer
            )
        else:  # if no hidden_units, just a simple linear regression
            self.network = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.network(x)

In [25]:
def create_baseline_model(df, pred_col):

    if pred_col == 'MP':
        print('Minutes Model')
        feature_cols = [
            'Team', 'Player', 
            'MP', 'MP_lst_gm', 'MP_last_5_avg', 'MP_last_10_avg',
            'starter', 'bench', 'reserve'
        ]
    else:
        print('Stats Model')
        feature_cols = [
            tgt_stat,
            'MP_lst_gm',
            'MP_last_5_avg',
            'MP_last_10_avg',
            f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}',
            f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}',
            'DaysLstGm'
        ]

    X = df[feature_cols].values
    y = df[pred_col].values

    X = torch.tensor(X, dtype=torch.float32).to(device)
    y = torch.tensor(y, dtype=torch.float32).unsqueeze(1).to(device)

    print("tensor X", X.shape)
    print("tensor y", y.shape)

    N = len(X)
    split = int(0.8 * N)

    X_train, X_val = X[:split], X[split:]
    y_train, y_val = y[:split], y[split:]

    model = BaselineRegression(input_dim=X.shape[1]).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(1000):
        model.train()

        optimizer.zero_grad()
        preds = model(X_train)
        loss = criterion(preds, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                val_preds = model(X_val)
                val_loss = criterion(val_preds, y_val)

            print(
                f"Epoch {epoch:03d} | "
                f"Train MSE: {loss.item():.4f} | "
                f"Val MSE: {val_loss.item():.4f}"
            )

In [28]:
def train_mdl(model, X_train, y_train, num_epochs=50, lr=0.001):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        preds = model(X_train)
        loss = criterion(preds, y_train)
        loss.backward()
        optimizer.step()
        
    return loss.item()
    
def eval_mdl(model, X_val, y_val):
    model.eval()
    with torch.no_grad():
        preds = model(X_val)
        criterion = torch.nn.MSELoss()
        val_loss = criterion(preds, y_val)
    return val_loss.item()
def train_and_eval(config):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    model = BaselineRegression(input_dim=config["input_dim"], hidden_units=config["hidden_units"]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    criterion = nn.MSELoss()
    
    X_train_device = config["X_train"].to(device)
    y_train_device = config["y_train"].to(device)
    X_val_device = config["X_val"].to(device)
    y_val_device = config["y_val"].to(device)
    
    train_loss = train_mdl(model, X_train_device, y_train_device, num_epochs=1000, lr=config["lr"])
    val_loss = eval_mdl(model, X_val_device, y_val_device)
    
    tune.report({"train_loss": train_loss, "val_loss": val_loss})

def hyperparam_tuning(X, y, num_samples):

    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    N = len(X)
    train_end = int(0.7 * N)
    val_end = int(0.85 * N)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    search_space = {
        "lr": tune.loguniform(1e-4, 1e-2),
        "hidden_units": tune.choice([8,16,32]),
        "input_dim": X.shape[1],
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val
    }

    abs_path = os.path.abspath("../ray_temp_outputs")  # convert to absolute path
    analysis = tune.run(train_and_eval, config=search_space, num_samples=num_samples, storage_path=abs_path)
    best_trial = analysis.get_best_trial(
        metric="val_loss",  # the metric you want to optimize
        mode="min",         # 'min' because lower val_loss is better
        scope="all"         # look across all reported steps
    )
    # Select device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Move final model to device
    best_config = best_trial.config
    final_model = BaselineRegression(input_dim=X.shape[1], 
                                     hidden_units=best_config["hidden_units"]).to(device)

    # Combine training + validation data optionally (common practice for final training)
    X_trainval = torch.cat([X_train, X_val], dim=0).to(device)
    y_trainval = torch.cat([y_train, y_val], dim=0).to(device)

    # Train final model with best hyperparameters
    train_mdl(final_model, X_trainval, y_trainval, num_epochs=100, lr=best_config["lr"])

    # Evaluate on test set
    X_test_device = X_test.to(device)
    y_test_device = y_test.to(device)
    test_loss = eval_mdl(final_model, X_test_device, y_test_device)
    print("Final test loss:", test_loss)

    # Get predictions (e.g., on test or new data)
    final_model.eval()
    with torch.no_grad():
        preds = final_model(X_test_device)
        preds_np = preds.cpu().numpy()
    print(preds_np)

### Create Base df

In [7]:
df = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
for i in [2022, 2023, 2024, 2025]:
    df_actuals = pd.read_csv(f"../tables/{i}/parlay_actuals.csv")
    df_actuals['Season'] = i
    df = pd.concat([df, df_actuals])

    df_schd = pd.read_csv(f"../tables/{i}/nba_schedule.csv")
    df_schd['Season'] = i
    df2 = pd.concat([df2, df_schd])
    
    df_gms = pd.read_csv(f"../tables/{i}/season_gamelogs.csv")
    df_gms['Season'] = i
    df3 = pd.concat([df3, df_gms])

df['Date'] = pd.to_datetime(df.Date)
df2['Date'] = pd.to_datetime(df2.Date)
df3['Date'] = pd.to_datetime(df3.Date)

df['Tms'] = df['game_id'].apply(lambda x: x.split("_")[1:3])
df['WrngTm'] = df.apply(lambda row: 0 if row['Team'] in row['Tms'] else 1, axis=1)
df['WrngOpp'] = df.apply(lambda row: 0 if row['Opp'] in row['Tms'] else 1, axis=1)
df = df[(df.WrngTm == 0) & (df.WrngOpp == 0)].drop(['WrngTm', 'WrngOpp', 'Tms'], axis=1)

df3 = df3[['game_id', 'Date', 'Team', 'Player', 'FG', 'FGA', 'FG%', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TOV', 'PF', '+/-']]\
        .rename(columns={"3PA": "TPA", "3P%": "TP%"})
df3 = df3[~df3[['Date', 'Team', 'Player']].duplicated(keep='last')]
df = df.merge(df3, on=['game_id', 'Date', 'Team', 'Player'])

df_mtch = df2[['Season', 'Date', 'AwayABV', 'HomeABV', 'AwayPTS', 'HomePTS', 'AwayB2B', 'HomeB2B', 'cup_gm', 'pstszn_gm']]
df_mtch['Team_type'] = 'Away'
df_mtch = df_mtch.rename(columns={"AwayABV": "Team", "HomeABV": "Opp", "AwayB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm', 'Team_type']]
df_mtch2 = df_mtch.copy().rename(columns={"Team": "Opp", "Opp": "Team", "HomeB2B": "B2B"})[['Season', 'Date', 'Team', 'AwayPTS', 'HomePTS', 'Opp', 'B2B', 'cup_gm', 'pstszn_gm']]
df_mtch2['Team_type'] = 'Home'
df_mtch = pd.concat([df_mtch, df_mtch2])
df_mtch = df_mtch[['Season', 'Date', 'Team', 'Team_type', 'AwayPTS', 'HomePTS', 'cup_gm', 'pstszn_gm']]
df_mtch = df_mtch.sort_values(["Team", "Date"])
df_mtch['team_game_num'] = df_mtch.groupby(["Team", "Season"]).cumcount() + 1
df_mtch['Spread'] = np.where(df_mtch.Team_type == 'Home', df_mtch.HomePTS - df_mtch.AwayPTS, df_mtch.AwayPTS - df_mtch.HomePTS)
df_mtch['Total'] = df_mtch.AwayPTS + df_mtch.HomePTS
df_mtch['is_Win'] = np.where(df_mtch.Spread > 0, 1, 0)
df_mtch['Szn_Wins'] = df_mtch.groupby(['Season', 'Team'])['is_Win'].cumsum()
df = df.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])

team_encoder = LabelEncoder()
player_encoder = LabelEncoder()
team_type_encoder = LabelEncoder()
position_encoder = LabelEncoder()

# Encode string cols
team_encoder.fit(pd.concat([df["Team"], df["Opp"]], axis=0))
df["Team"] = team_encoder.transform(df["Team"])
df["Opp"] = team_encoder.transform(df["Opp"])
df["Player_name"] = df.Player
df["Player"] = player_encoder.fit_transform(df["Player"])
df["Pos"] = position_encoder.fit_transform(df["Pos"])
df['Team_type'] = team_type_encoder.fit_transform(df['Team_type'])
df = df.sort_values(['Season', 'Date', 'Team', 'Player']).reset_index(drop=True)
print('base df created', datetime.now())

base df created 2025-12-20 13:51:26.510959


# Minutes Projection Model

In [8]:
def setup_df_mins(con, df):
    
    df = df[['Season', 'Date', 'game_id', 'Team', 'Team_type', 'Opp', 'Player', 'Pos', 'B2B', 'MP', 'TOV', 'PF', '+/-',
             'Spread', 'Total', 'team_game_num', 'Szn_Wins', 'cup_gm', 'pstszn_gm']]
    
    for col in ['MP', 'TOV', 'PF', '+/-']:
        df[f'{col}_lst_gm'] = (
            df
            .groupby(['Player', 'Season'])[col]
            .shift(1)
        )

        df[f'{col}_last_5_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=5, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

        df[f'{col}_last_10_avg'] = (
            df.groupby(['Player', 'Season'])[col]
              .rolling(window=10, min_periods=1)
              .mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )

    games_last_7_days = df.groupby(['Player', 'Season']).rolling('7D', on='Date')['MP'].count().shift(1).to_frame(name='games_last_7_days').reset_index()
    df = df.merge(games_last_7_days, on=['Player', 'Season', 'Date'])
    df['games_last_7_days'] = df.games_last_7_days.fillna(0).astype(int)
    
    df['prev_team_mins_pct'] = (df.groupby(['Player', 'Season'])['MP'].shift(1)) / 240
        
    
    df['reserve_td'] = (df.MP < 8).astype(int)
    df['bench_td']   = ((df.MP >= 8) & (df.MP <= 25)).astype(int)
    df['starter_td'] = (df.MP > 25).astype(int)
    role_counts = df.groupby(['Season', 'Player'])[['reserve_td', 'bench_td', 'starter_td']].sum()
    role_counts['most_common_role'] = role_counts[['reserve_td', 'bench_td', 'starter_td']].idxmax(axis=1)
    role_counts['reserve'] = (role_counts['most_common_role'] == 'reserve_td').astype(int)
    role_counts['bench']   = (role_counts['most_common_role'] == 'bench_td').astype(int)
    role_counts['starter'] = (role_counts['most_common_role'] == 'starter_td').astype(int)
    df = df.merge(role_counts[['reserve', 'bench', 'starter']], on=['Season', 'Player'], how='left')
      
    df['missed_games'] = (
        df.groupby(['Player', 'Team', 'Season'])['team_game_num']      
          .diff()
          .sub(1)
          .fillna(0)
          .astype(int)
    )

#     df["career_min_mins"] = (
#         df.assign(MP_nonzero=df["MP"].replace(0, np.nan))
#           .groupby("Player")["MP_nonzero"]
#           .cummin()
#           .shift(1)
#     )
#     df["career_max_mins"] = df.groupby("Player")["MP"].cummax().shift(1)
    
    df['blowout'] = np.where(abs(df.Spread >= 15), 1, 0)
    
#     df['Szn_Wins'] = df.groupby(['Player', 'Season', 'Team'])['Szn_Wins'].shift(1).fillna(0)
#     df['Win_Pct'] = df.Szn_Wins / df.team_game_num
    
    # Have to derive OppSzn_Wins and then add it to the df
#     df['OppSzn_Wins'] = df.groupby(['Player', 'Season', 'Opp'])['Szn_Wins'].shift(1).fillna(0)
    
    df = df.drop(['reserve_td', 'bench_td', 'starter_td', 'Szn_Wins', 'TOV', 'PF', '+/-'], axis=1)    
    
    return df

In [30]:
df_mins = df.copy()
df_mins = setup_df_mins(con, df_mins)
df_mins = df_mins.dropna() # TEMP SOLUTION TO NULLS
display(df_mins)

create_baseline_model(df_mins, 'MP')
df_mins = df_mins.drop(['Season', 'Date', 'game_id'], axis=1)
for col in df_mins.columns:
    if not pd.api.types.is_float_dtype(df_mins[col]):
        df_mins[col] = df_mins[col].astype(float)

X = df_mins.drop('MP', axis=1).values
y = df_mins['MP'].values
hyperparam_tuning(X, y, 10)

Unnamed: 0,Season,Date,game_id,Team,Team_type,Opp,Player,Pos,B2B,MP,Spread,Total,team_game_num,cup_gm,pstszn_gm,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,TOV_lst_gm,TOV_last_5_avg,TOV_last_10_avg,PF_lst_gm,PF_last_5_avg,PF_last_10_avg,+/-_lst_gm,+/-_last_5_avg,+/-_last_10_avg,games_last_7_days,prev_team_mins_pct,reserve,bench,starter,missed_games,blowout
281,2022,2022-10-22,20221022_BOS_ORL,1,0,21,61,0,1,9.55,6.0,246.0,3,0,0,0.00,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,1,0,0,0,0
282,2022,2022-10-22,20221022_BOS_ORL,1,0,21,188,4,1,36.95,6.0,246.0,3,0,0,24.65,24.650,24.650,1.0,1.0,1.0,4.0,4.0,4.0,1.0,1.0,1.0,1,0.102708,0,0,1,0,0
283,2022,2022-10-22,20221022_BOS_ORL,1,0,21,256,1,1,31.05,6.0,246.0,3,0,0,25.40,25.400,25.400,1.0,1.0,1.0,1.0,1.0,1.0,15.0,15.0,15.0,1,0.105833,0,0,1,0,0
284,2022,2022-10-22,20221022_BOS_ORL,1,0,21,291,4,1,0.00,6.0,246.0,3,0,0,0.00,0.000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,1,0,0,0,0
285,2022,2022-10-22,20221022_BOS_ORL,1,0,21,356,3,1,37.50,6.0,246.0,3,0,0,34.02,34.020,34.020,3.0,3.0,3.0,5.0,5.0,5.0,-4.0,-4.0,-4.0,1,0.141750,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95151,2025,2025-12-19,20251219_SAS_ATL,26,0,0,423,3,1,18.32,28.0,224.0,28,0,0,25.85,27.014,29.439,1.0,0.6,0.6,1.0,1.6,1.9,9.0,5.0,-0.8,3,0.107708,0,0,1,0,1
95152,2025,2025-12-19,20251219_SAS_ATL,26,0,0,448,3,1,16.13,28.0,224.0,28,0,0,21.03,17.904,21.556,1.0,0.4,0.8,2.0,1.8,2.0,15.0,0.4,1.5,3,0.087625,0,1,0,0,1
95153,2025,2025-12-19,20251219_SAS_ATL,26,0,0,514,0,1,17.62,28.0,224.0,28,0,0,23.18,25.790,26.485,0.0,0.2,0.4,2.0,1.8,2.1,12.0,-0.6,1.9,3,0.096583,0,0,1,0,1
95154,2025,2025-12-19,20251219_SAS_ATL,26,0,0,702,2,1,23.92,28.0,224.0,28,0,0,25.13,28.670,29.341,3.0,3.6,3.0,1.0,3.4,3.8,15.0,4.4,2.8,3,0.104708,0,0,1,0,1


Minutes Model
tensor X torch.Size([93072, 9])
tensor y torch.Size([93072, 1])
Epoch 000 | Train MSE: 11076.9277 | Val MSE: 10075.1406
Epoch 010 | Train MSE: 3285.9463 | Val MSE: 2788.0454
Epoch 020 | Train MSE: 942.7075 | Val MSE: 865.5678
Epoch 030 | Train MSE: 884.6129 | Val MSE: 878.0828
Epoch 040 | Train MSE: 655.2648 | Val MSE: 619.5065
Epoch 050 | Train MSE: 398.7229 | Val MSE: 375.4204
Epoch 060 | Train MSE: 290.1543 | Val MSE: 274.0856
Epoch 070 | Train MSE: 197.8013 | Val MSE: 186.6168
Epoch 080 | Train MSE: 130.8905 | Val MSE: 125.6978
Epoch 090 | Train MSE: 88.1014 | Val MSE: 85.5687
Epoch 100 | Train MSE: 58.6246 | Val MSE: 57.8217
Epoch 110 | Train MSE: 40.2801 | Val MSE: 40.5992
Epoch 120 | Train MSE: 28.8693 | Val MSE: 29.7940
Epoch 130 | Train MSE: 22.0539 | Val MSE: 23.2633
Epoch 140 | Train MSE: 18.0371 | Val MSE: 19.3578
Epoch 150 | Train MSE: 15.6645 | Val MSE: 17.0067
Epoch 160 | Train MSE: 14.2153 | Val MSE: 15.5186
Epoch 170 | Train MSE: 13.2613 | Val MSE: 14.507

2025-12-20 14:33:03,639	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-12-20 14:33:42
Running for:,00:00:39.06
Memory:,24.6/63.9 GiB

Trial name,status,loc,hidden_units,lr,iter,total time (s),train_loss,val_loss
train_and_eval_d908c_00000,TERMINATED,127.0.0.1:108076,32,0.00176008,1,8.50392,42.0637,43.2967
train_and_eval_d908c_00001,TERMINATED,127.0.0.1:95192,8,0.000849076,1,8.59816,43.9458,44.6765
train_and_eval_d908c_00002,TERMINATED,127.0.0.1:88984,32,0.000910551,1,8.78912,41.9536,43.3941
train_and_eval_d908c_00003,TERMINATED,127.0.0.1:106832,16,0.000631155,1,9.42549,43.8237,44.648
train_and_eval_d908c_00004,TERMINATED,127.0.0.1:100564,8,0.000523327,1,8.97413,45.5291,46.2146
train_and_eval_d908c_00005,TERMINATED,127.0.0.1:79888,16,0.000322189,1,9.41954,43.9111,45.0026
train_and_eval_d908c_00006,TERMINATED,127.0.0.1:77012,8,0.000336309,1,8.93495,45.9197,46.2198
train_and_eval_d908c_00007,TERMINATED,127.0.0.1:80884,8,0.00192269,1,8.89725,43.7446,44.7099
train_and_eval_d908c_00008,TERMINATED,127.0.0.1:72220,16,0.000106302,1,9.2261,99.3026,95.8402
train_and_eval_d908c_00009,TERMINATED,127.0.0.1:88996,8,0.00288276,1,8.80549,41.9752,43.3611


Trial name,train_loss,val_loss
train_and_eval_d908c_00000,42.0637,43.2967
train_and_eval_d908c_00001,43.9458,44.6765
train_and_eval_d908c_00002,41.9536,43.3941
train_and_eval_d908c_00003,43.8237,44.648
train_and_eval_d908c_00004,45.5291,46.2146
train_and_eval_d908c_00005,43.9111,45.0026
train_and_eval_d908c_00006,45.9197,46.2198
train_and_eval_d908c_00007,43.7446,44.7099
train_and_eval_d908c_00008,99.3026,95.8402
train_and_eval_d908c_00009,41.9752,43.3611


2025-12-20 14:33:42,715	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Rodolfo/Jupyter_files/FantasyBasketball/ray_temp_outputs/train_and_eval_2025-12-20_14-33-03' in 15.8442s.
2025-12-20 14:33:42,744	INFO tune.py:1041 -- Total run time: 39.10 seconds (23.22 seconds for the tuning loop).


Final test loss: 52.954341888427734
[[33.27912 ]
 [ 8.136247]
 [24.285643]
 ...
 [23.02535 ]
 [26.874697]
 [25.17738 ]]


In [11]:
# rmse = np.sqrt(mean_squared_error(mins_splits[5], mins_preds)) # splits[5] = y_test
# mins_test_df['MP_pred'] = mins_preds
# df_test = mins_test_df.copy()

# df_test['Team'] = team_encoder.inverse_transform(df_test['Team'])
# df_test['Opp'] = team_encoder.inverse_transform(df_test['Opp'])
# df_test['Player'] = player_encoder.inverse_transform(df_test['Player'])
# df_test['Pos'] = position_encoder.inverse_transform(df_test['Pos'])

# df_test['Diff'] = abs(df_test['MP_pred'] - df_test['MP'])
# df_test['InRMSE_Range'] = np.where(df_test['Diff'] <= rmse, 1, 0)

# print("Total Accuracy (InRMSE_Range):", ((df_test.InRMSE_Range == 1).sum() / df_test.shape[0]))
# print((df_test.InRMSE_Range == 1).sum(), "/", df_test.shape[0])

# df_ystrday = df_test[(df_test.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d"))]\
#             [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_pred', 'InRMSE_Range', 'Diff', 'Spread']]
# print("\nYesterday's Results:")
# print("Total Accuracy (InRMSE_Range):", ((df_ystrday.InRMSE_Range == 1).sum() / df_ystrday.shape[0]))
# if df_ystrday.shape[0] >= 50:
#     for tm in df_ystrday.Team.unique():
#         display(df_ystrday[df_ystrday.Team == tm])
# else:
#     display(df_ystrday)

# Main Model

In [12]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def travel_km_from_row(row):
    prev = row['PrevLocation']
    cur  = row['Location']
    # missing prev => first game => no travel
    if pd.isna(prev) or pd.isna(cur):
        return 0.0
    # same arena => 0
    if prev == cur:
        return 0.0
    # lookup coords
    prev_coords = arenas.get(prev)
    cur_coords  = arenas.get(cur)
    if not prev_coords or not cur_coords:
        # fallback if code not found
        return 0.0
    return haversine_km(prev_coords[0], prev_coords[1], cur_coords[0], cur_coords[1])

In [13]:
def setup_df_main(df):
    
    # Minutes based Features
    df['MP_lst_gm'] = (
        df
        .groupby(['Player', 'Season'])['MP']
        .shift(1)
    )

    df['MP_last_5_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=5, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    df['MP_last_10_avg'] = (
        df.groupby(['Player', 'Season'])['MP']
          .rolling(window=10, min_periods=1)
          .mean()
          .shift(1)
          .reset_index(level=[0, 1], drop=True)
    )
    
    # Location based features
    df["PrevOpp"] = df.groupby("Player")["Opp"].shift(1)
    df["DaysLstGm"] = (df.groupby("Player")["Date"].diff().dt.days).fillna(0).astype(int)
    df['Location'] = df.apply(lambda r: r['Team'] if r['Team_type'] == 'Home' else r['Opp'], axis=1)
    df['PrevLocation'] = df.groupby('Player')['Location'].shift(1)
    df['travel_km'] = df.apply(travel_km_from_row, axis=1).fillna(0)
    df['travel_hours'] = df['travel_km'] / 800.0      # approximate flight hours
    df['is_long_trip'] = (df['travel_km'] > 1500).astype(int)
    df['same_arena'] = (df['PrevLocation'] == df['Location']).astype(int)
    
    # Efficiency metrics
    df['three_rate_raw'] =  np.where(df.FGA > 0, df['TPA'] / df['FGA'], 0)
    df['ft_rate_raw']    =  np.where(df.FGA > 0, df['FTA'] / df['FGA'], 0)
    df['eFG_raw'] = (df['FG'] + 0.5 * df['TPM']) / df['FGA']
    df['TS_raw'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))    
    df['usage_proxy_raw'] =  np.where(df.MP > 0, (df['FGA'] + 0.44 * df['FTA']) / df['MP'], 0)
    eff_cols = []
    for w in [3, 5, 10]:
        for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
            col = f"{metric}_L{w}"
            df[col] = (
                df.groupby(['Player','Season'])[f'{metric}_raw']
                  .rolling(w, min_periods=1)
                  .mean()
                  .shift(1)
                  .reset_index(level=[0,1], drop=True)
            )
            eff_cols.append(col)
    for metric in ['three_rate', 'ft_rate', 'eFG', 'TS', 'usage_proxy']:
        col = f'{metric}_weighted'
        df[col] = (
            0.6 * df[f'{metric}_L3'] +
            0.3 * df[f'{metric}_L5'] +
            0.1 * df[f'{metric}_L10']
        )
        eff_cols.append(col)
    
    df['FGA_L5_avg'] = df.groupby(['Player', 'Season'])['FGA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['TPA_L5_avg'] = df.groupby(['Player', 'Season'])['TPA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    df['FTA_L5_avg'] = df.groupby(['Player', 'Season'])['FTA'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0, 1], drop=True)
    
    stat_cols = []
    for col in [f'Off_{tgt_stat}', f'Off_L5_{tgt_stat}', f'Def_{tgt_stat}', f'Def_L5_{tgt_stat}']:
        stat_cols.append(col)

    final_cols = ['Date', 'Team', 'Team_type', 'B2B', 'cup_gm', 'pstszn_gm', 'Player', 'Pos', 'Opp', 
                  'MP', 'MP_lst_gm', 'MP_last_5_avg', 'MP_last_10_avg', 'Spread', 'Total',
                  'DaysLstGm', 'travel_km', 'travel_hours', 'PrevLocation', 'is_long_trip', 'same_arena', 
                  'FGA_L5_avg', 'TPA_L5_avg', 'FTA_L5_avg',
                  tgt_stat] + stat_cols + eff_cols
    df = df[final_cols]
    
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0)
        
    # PRA features
    if tgt_stat == 'PTS':
        pass
    elif tgt_stat == 'PRA':
        df['PRA_per_min'] = np.where(df.MP > 0, df.PRA / df['MP'], 0)
        df['PRA_last_5_per_min_avg'] = df.groupby('Player')['PRA_per_min'].rolling(5, min_periods=1).mean().shift(1).reset_index(level=[0,1], drop=True)
        df = df.drop(['PRA_per_min'], axis=1)
        
    return df

In [27]:
tgt_stat = "PRA"
df_main = df.copy()
df_main = setup_df_main(df_main)
df_main = df_main.dropna() # TEMP SOLUTION TO NULLS
display(df_main)

create_baseline_model(df_main, tgt_stat)

df_main = df_main.drop(['Date'], axis=1)
for col in df_main.columns:
    if not pd.api.types.is_float_dtype(df_main[col]):
        df_main[col] = df_main[col].astype(float)

X = df_main.drop(tgt_stat, axis=1).values
y = df_main[tgt_stat].values
hyperparam_tuning(X, y, 10)

Unnamed: 0,Date,Team,Team_type,B2B,cup_gm,pstszn_gm,Player,Pos,Opp,MP,MP_lst_gm,MP_last_5_avg,MP_last_10_avg,Spread,Total,DaysLstGm,travel_km,travel_hours,PrevLocation,is_long_trip,same_arena,FGA_L5_avg,TPA_L5_avg,FTA_L5_avg,PRA,Off_PRA,Off_L5_PRA,Def_PRA,Def_L5_PRA,three_rate_L3,ft_rate_L3,eFG_L3,TS_L3,usage_proxy_L3,three_rate_L5,ft_rate_L5,eFG_L5,TS_L5,usage_proxy_L5,three_rate_L10,ft_rate_L10,eFG_L10,TS_L10,usage_proxy_L10,three_rate_weighted,ft_rate_weighted,eFG_weighted,TS_weighted,usage_proxy_weighted,PRA_last_5_per_min_avg
1,2022-10-21,0,1,0,0,0,5,2,21,14.37,0.00,24.286,26.067000,10.0,206.0,0,0.0,0.0,0.0,0,0,11.4,4.4,4.2,7,6.000000,6.0,29.00,29.0,0.559259,0.829630,0.416667,0.519801,0.616138,0.508283,0.691717,0.475758,0.563433,0.552491,0.479785,0.587601,0.523390,0.606242,0.509510,0.536019,0.764053,0.445066,0.541535,0.586381,0.448430
2,2022-10-21,0,1,0,0,0,120,0,21,31.62,0.00,0.934,3.465000,10.0,206.0,0,0.0,0.0,0.0,0,0,0.2,0.2,0.0,15,10.000000,10.0,23.50,23.5,0.333333,0.000000,1.500000,1.500000,0.071378,0.200000,0.000000,1.500000,1.500000,0.042827,0.300000,0.000000,0.600000,0.600000,0.088741,0.290000,0.000000,1.410000,1.410000,0.064549,0.420937
3,2022-10-21,0,1,0,0,0,171,3,21,32.53,0.00,29.318,30.218000,10.0,206.0,0,0.0,0.0,0.0,0,0,12.8,5.6,2.8,19,26.000000,26.0,17.00,17.0,0.411422,0.320513,0.337558,0.394174,0.462433,0.434304,0.245641,0.446849,0.485416,0.477099,0.403221,0.308395,0.509975,0.558581,0.554476,0.417466,0.296839,0.387587,0.437987,0.476037,0.280624
4,2022-10-21,0,1,0,0,0,178,4,21,39.62,0.00,29.280,28.101000,10.0,206.0,0,0.0,0.0,0.0,0,0,9.4,0.0,1.0,38,36.000000,36.0,26.00,26.0,0.000000,0.134680,0.821549,0.799477,0.334001,0.000000,0.098990,0.705051,0.694954,0.338075,0.000000,0.175991,0.721115,0.718415,0.332501,0.000000,0.128104,0.776556,0.760014,0.335073,0.210468
5,2022-10-21,0,1,0,0,0,235,0,21,0.00,0.00,1.858,1.327143,10.0,206.0,0,0.0,0.0,0.0,0,0,0.4,0.0,0.4,0,0.000000,0.0,23.50,23.5,0.000000,0.333333,0.500000,0.520833,0.149300,0.000000,0.200000,0.500000,0.520833,0.089580,0.000000,0.142857,0.500000,0.520833,0.063986,0.000000,0.274286,0.500000,0.520833,0.122853,0.168375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95151,2025-12-19,26,0,1,0,0,423,3,0,18.32,25.85,27.014,29.439000,28.0,224.0,1,0.0,0.0,29.0,0,0,7.2,6.2,0.8,13,0.000000,0.0,0.00,0.0,0.933333,0.000000,0.422222,0.422222,0.245345,0.860000,0.094444,0.595000,0.605578,0.277445,0.800000,0.159722,0.557500,0.591225,0.300730,0.898000,0.044306,0.487583,0.494129,0.260513,0.988909
95152,2025-12-19,26,0,1,0,0,448,3,0,16.13,21.03,17.904,21.556000,28.0,224.0,1,0.0,0.0,29.0,0,0,6.4,2.8,1.8,18,0.000000,0.0,0.00,0.0,0.555556,0.000000,0.479167,0.479167,0.286266,0.469444,0.205556,0.569444,0.563364,0.386907,0.419420,0.358382,0.594132,0.612289,0.415916,0.516109,0.097505,0.517746,0.517738,0.329423,0.965009
95153,2025-12-19,26,0,1,0,0,514,0,0,17.62,23.18,25.790,26.485000,28.0,224.0,1,0.0,0.0,29.0,0,0,5.6,0.0,2.2,17,0.000000,0.0,0.00,0.0,0.000000,0.111111,0.592593,0.612618,0.271872,0.000000,0.666667,0.507937,0.596964,0.259860,0.000000,0.907143,0.540278,0.618587,0.232770,0.000000,0.357381,0.561964,0.608519,0.264358,0.000000
95154,2025-12-19,26,0,1,0,0,702,2,0,23.92,25.13,28.670,29.341000,28.0,224.0,1,0.0,0.0,29.0,0,0,14.2,4.2,5.6,26,31.277778,32.8,34.85,36.8,0.322348,0.273990,0.557828,0.584616,0.500496,0.305790,0.399632,0.572316,0.615000,0.596182,0.297835,0.466582,0.536654,0.578113,0.521046,0.314930,0.330942,0.560057,0.593081,0.531256,0.000000


Stats Model
tensor X torch.Size([95155, 9])
tensor y torch.Size([95155, 1])
Epoch 000 | Train MSE: 861.3660 | Val MSE: 808.6998
Epoch 010 | Train MSE: 231.2836 | Val MSE: 210.0199
Epoch 020 | Train MSE: 72.4779 | Val MSE: 72.8319
Epoch 030 | Train MSE: 73.4215 | Val MSE: 72.8823
Epoch 040 | Train MSE: 58.1481 | Val MSE: 59.2394
Epoch 050 | Train MSE: 47.1639 | Val MSE: 50.2533
Epoch 060 | Train MSE: 41.0898 | Val MSE: 42.8874
Epoch 070 | Train MSE: 36.0477 | Val MSE: 37.2972
Epoch 080 | Train MSE: 32.0550 | Val MSE: 33.2334
Epoch 090 | Train MSE: 28.6002 | Val MSE: 29.9551
Epoch 100 | Train MSE: 25.7607 | Val MSE: 27.0787
Epoch 110 | Train MSE: 23.2669 | Val MSE: 24.3989
Epoch 120 | Train MSE: 21.0936 | Val MSE: 22.0952
Epoch 130 | Train MSE: 19.1497 | Val MSE: 20.0661
Epoch 140 | Train MSE: 17.3940 | Val MSE: 18.2200
Epoch 150 | Train MSE: 15.7923 | Val MSE: 16.5227
Epoch 160 | Train MSE: 14.3235 | Val MSE: 14.9706
Epoch 170 | Train MSE: 12.9736 | Val MSE: 13.5489
Epoch 180 | Train MS

In [16]:
# rmse = np.sqrt(mean_squared_error(main_splits[5], stat_preds)) # splits[5] = y_test
# mae = mean_absolute_error(main_splits[5], stat_preds)
# pred_col = f'{tgt_stat}_Pred'
# df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)
# df_lines = df_lines[['Date', 'Team', 'Player', f'{tgt_stat}_line']]
# main_test_df[pred_col] = stat_preds
# main_test_df['Team'] = team_encoder.inverse_transform(main_test_df["Team"])
# main_test_df['Player'] = player_encoder.inverse_transform(main_test_df["Player"])

# df_test = main_test_df.merge(df_lines, on=['Date', 'Team', 'Player'])
# df_test = df_test[[c for c in df_test.columns if c != pred_col] + [pred_col]]

# df_test['Diff'] = df_test[f'{tgt_stat}_Pred'] - df_test[f'{tgt_stat}_line']
# df_test['Act_Res'] = np.where(df_test[tgt_stat] > df_test[f'{tgt_stat}_line'], 'O', 'U')
# df_test['Pred_Res'] = np.where(df_test[pred_col] > df_test[f'{tgt_stat}_line'], 'O', 'U')
# df_test['ParlayHit'] = np.where(df_test['Act_Res'] == df_test['Pred_Res'], 1, 0)

# df_test['Diff2'] = abs(df_test[f'{tgt_stat}_Pred'] - df_test[tgt_stat])
# df_test['InRMSE_Range'] = np.where(df_test['Diff2'] <= rmse, 1, 0)

# print("Total Accuracy (ParlayHit):", ((df_test.ParlayHit == 1).sum() / df_test.shape[0]))
# print((df_test.ParlayHit == 1).sum(), "/", df_test.shape[0])

# print("\nTotal Accuracy (InRMSE_Range):", ((df_test.InRMSE_Range == 1).sum() / df_test.shape[0]))
# print((df_test.InRMSE_Range == 1).sum(), "/", df_test.shape[0])

# df_ystrday = df_test[(df_test.Date == (datetime.strptime(now, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")) & ~(df_test[f'{tgt_stat}_line'].isnull())]\
#             [['Team', 'Player', tgt_stat, f'{tgt_stat}_line', f'{tgt_stat}_Pred', 'ParlayHit', 'Diff', 'InRMSE_Range', 'Diff2']]\
#             .sort_values(f'{tgt_stat}_line', ascending=False)

# print("\nYesterday's Results:")
# print("Total Accuracy (ParlayHit):", ((df_ystrday.ParlayHit == 1).sum() / df_ystrday.shape[0]))
# print("Total Accuracy (InRMSE_Range):", ((df_ystrday.InRMSE_Range == 1).sum() / df_ystrday.shape[0]))
# if df_ystrday.shape[0] >= 50:
#     for tm in df_ystrday.Team.unique():
#         display(df_ystrday[df_ystrday.Team == tm])
# else:
#     display(df_ystrday)

### Today's predictions

In [17]:
# df_lines = pd.read_csv(f"../tables/2025/parlay_lines.csv")
# df_lines['Date'] = pd.to_datetime(df_lines.Date)

# df_pred = pd.read_csv("../tables/2025/parlay_stats.csv")
# df_pred['Date'] = pd.to_datetime(df_pred.Date)
# df_pred['Season'] = 2025
# for col in df_pred.select_dtypes(include='object').columns:
#     df_pred[col] = df_pred[col].astype('category')
# df_pred = df_pred.drop(['Spread', 'Total'], axis=1).merge(df_mtch, on=['Season', 'Date', 'Team'])
# df_pred[tgt_stat] = 0

# # Predict minutes
# df_act_mins = pd.read_csv("../tables/2025/parlay_actuals.csv")
# df_act_mins['Date'] = pd.to_datetime(df_act_mins.Date)
# df_pred = df_pred[df_pred.Player.isin(df.Player_name.unique())].merge(df_act_mins[['Date', 'Team', 'Player', 'MP', 'TPM']], on=['Date', 'Team', 'Player'], how='left')
# df_pred = df_pred.merge(df3[['Date', 'Team', 'Player', 'TOV', 'PF', '+/-', 'FGA', 'FG', 'TPA', 'FT', 'FTA']], on=['Date', 'Team', 'Player'], how='left')

# df_pred = df_pred.merge(df_lines, on=['Date', 'Team', 'Player'], how='left')
# df_pred['Spread_x'] = np.where(df_pred.Spread_x.isnull(), df_pred.Spread_y, df_pred.Spread_x)
# df_pred['Total_x'] = np.where(df_pred.Total_x.isnull(), df_pred.Total_y, df_pred.Total_x)
# df_pred = df_pred.rename(columns={"Spread_x": "Spread", "Total_x": "Total"}).drop(['Spread_y', 'Total_y'], axis=1)
# df_pred_mins = setup_df_mins(con, df_pred)

# df_pred_mins = df_pred_mins.drop(['Date', 'MP'], axis=1)
# df_pred_mins["Team"] = team_encoder.transform(df_pred_mins["Team"])
# df_pred_mins["Opp"] = team_encoder.transform(df_pred_mins["Opp"])
# df_pred_mins['Team_type'] = team_type_encoder.transform(df_pred_mins['Team_type'])
# df_pred_mins["Player"] = player_encoder.transform(df_pred_mins["Player"])
# df_pred_mins["Pos"] = position_encoder.transform(df_pred_mins["Pos"])
# DM_mins = xgb.DMatrix(df_pred_mins)
# df_pred['MP'] = mins_model.predict(DM_mins)
# df_pred['N_TPM'] = df_pred.FG - df_pred.TPM
# df_pred['PTS'] = (df_pred.FT * 1) + (df_pred.N_TPM * 2) + (df_pred.TPM * 3)
# df_pred = setup_df_main(df_pred)
# feature_cols = [col for col in df_pred.columns if col not in ['Date', tgt_stat]]
# df_pred = df_pred[df_pred.Date == now][feature_cols]

# # Predict stat
# df_pred["Team"] = team_encoder.transform(df_pred["Team"])
# df_pred["Opp"] = team_encoder.transform(df_pred["Opp"])
# df_pred = df_pred[~(df_pred.PrevLocation.isnull())] # Filters out players who are debuting on the year
# df_pred["PrevLocation"] = team_encoder.transform(df_pred["PrevLocation"])
# df_pred["Player"] = player_encoder.transform(df_pred["Player"])
# df_pred["Pos"] = position_encoder.transform(df_pred["Pos"])
# df_pred['Team_type'] = team_type_encoder.transform(df_pred['Team_type'])
# DM_stats = xgb.DMatrix(df_pred)
# df_pred[f"{tgt_stat}_proj"] = stat_model.predict(DM_stats)

# df_pred['Team'] = team_encoder.inverse_transform(df_pred["Team"])
# df_pred['Opp'] = team_encoder.inverse_transform(df_pred["Opp"])
# df_pred['Player'] = player_encoder.inverse_transform(df_pred["Player"])
# df_pred['Pos'] = position_encoder.inverse_transform(df_pred["Pos"])

# df_lines = df_lines[df_lines.Date == now][['Team', 'Player', f'{tgt_stat}_line']]
# df_pred = df_pred.merge(df_lines, on=['Team', 'Player'])

# tds_picks = df_pred[~(df_pred[f'{tgt_stat}_line'].isnull())]\
#             [['Team', 'Player', 'Pos', 'Opp', 'MP', 'MP_last_5_avg', f'{tgt_stat}_line', f'{tgt_stat}_proj']]
# tds_picks['Diff'] = abs((df_pred[f'{tgt_stat}_line'] - df_pred[f'{tgt_stat}_proj']))
# tds_picks['Diff2'] = abs((df_pred['MP'] - df_pred['MP_last_5_avg']))
# tds_picks = tds_picks[(tds_picks.Diff >= mae) & (tds_picks.Diff2 <= 5)].sort_values('Diff', ascending=False).drop(['Diff', 'Diff2'], axis=1)
# display(tds_picks)
# tds_picks.insert(0, 'Date', pd.to_datetime(now))
# partition_save_df(tds_picks, f"../tables/2025/gmday_preds_{tgt_stat}.csv")