In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import optuna

In [3]:
import os
import random
import numpy as np
import torch

# Set a fixed seed value
seed_value = 10

# 1. Set PYTHONHASHSEED environment variable for reproducibility in hashing
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Set the seed for Python's built-in random module
random.seed(10)

# 3. Set the seed for NumPy
np.random.seed(10)

# 4. Set the seed for PyTorch
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)

# 5. Force PyTorch to use deterministic algorithms (may impact performance)
torch.use_deterministic_algorithms(True, warn_only=True)

# 6. Optionally limit the number of threads used by OMP and MKL (helps reduce non-determinism)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

print("Seeds and environment variables set for reproducibility.")

Seeds and environment variables set for reproducibility.


In [5]:
nba_series = nba_series = pd.read_csv("TEAM_MATCHUP_DATA_CLUSTER.csv") # updated data set with CLUSTERING PROPORTIONS (CHECK BOX)

nba_series['SEASON_YEAR'] = nba_series['SEASON'].str.split('-').str[0].astype(int)
nba_series['SEASON'] = (nba_series['SEASON_YEAR'] + 1).astype(int)
nba_series = nba_series.drop(columns=['SEASON_YEAR'])

In [7]:
# Drop columns that are not needed
nba_series = nba_series.drop(columns = ['SERIES_ID', 'SEASON_ID', 'TEAM_1_ID', 'TEAM_2_ID', 'CLUSTER_TEAM_1', 'CLUSTER_TEAM_2'])

# Create a mask to flip half of the rows
flip_mask = np.random.rand(len(nba_series)) < 0.5

# Columns to swap
team1_stat_cols = [col for col in nba_series.columns if '_TEAM_1' in col]
team2_stat_cols = [col.replace('_TEAM_1', '_TEAM_2') for col in team1_stat_cols]

# Include team name columns for flipping
stat_swap_cols = team1_stat_cols + team2_stat_cols + ['TEAM_1', 'TEAM_2']

# Create deep copies of swapped and non-swapped rows
swapped = nba_series.loc[flip_mask].copy()
not_swapped = nba_series.loc[~flip_mask].copy()

# Flip stats
swapped[team1_stat_cols] = nba_series.loc[flip_mask, team2_stat_cols].values
swapped[team2_stat_cols] = nba_series.loc[flip_mask, team1_stat_cols].values

# Flip team names
swapped['TEAM_1'] = nba_series.loc[flip_mask, 'TEAM_2'].values
swapped['TEAM_2'] = nba_series.loc[flip_mask, 'TEAM_1'].values

# Recalculate TEAM_1_W based on new TEAM_1 vs SERIES_WINNER
swapped['TEAM_1_W'] = (swapped['SERIES_WINNER'] == swapped['TEAM_1']).astype(int)
not_swapped['TEAM_1_W'] = (not_swapped['SERIES_WINNER'] == not_swapped['TEAM_1']).astype(int)

# Combine flipped and unflipped
nba_series_balanced = pd.concat([swapped, not_swapped], ignore_index=True)

# Optional: Shuffle the final DataFrame
nba_series_balanced = nba_series_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# test_data_preserved is created for displaying results later on
test_data_preserved = nba_series_balanced[['SEASON', 'TEAM_1', 'TEAM_2', 'SERIES_WINNER']]
test_data_preserved = test_data_preserved

  swapped['TEAM_1_W'] = (swapped['SERIES_WINNER'] == swapped['TEAM_1']).astype(int)


In [9]:
# Automatically extract stat bases by checking for matching suffixes
diff_df = pd.DataFrame()

# Grab all columns ending in _TEAM_1
team1_cols = [col for col in nba_series_balanced.columns if col.endswith('_TEAM_1')]

for col1 in team1_cols:
    # Get the base stat name (e.g., 'AST', 'FG_PCT')
    stat_base = col1.replace('_TEAM_1', '')
    col2 = f'{stat_base}_TEAM_2'
    
    # Only compute diff if TEAM_2 version exists
    if col2 in nba_series_balanced.columns:
        diff_df[f'{stat_base}_DIFF'] = nba_series_balanced[col1] - nba_series_balanced[col2]

# Add label and season columns
diff_df['TEAM_1_W'] = nba_series_balanced['TEAM_1_W']
diff_df['SEASON'] = nba_series_balanced['SEASON']

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, log_loss
import numpy as np

# Tracking
all_y_true = []
all_y_prob_blended = []
all_y_pred_blended = []

# Per-year metrics
yearly_metrics = []

years = np.arange(2020, 2025)

# 84% accuracy set (2020-2025 test), 0.883 AUC, threshold = 0.5, seed = 8 (fit, optuna)
params_a = {
    'n_d': 56,
    'n_a': 64,
    'n_steps': 6,
    'gamma': 1.3930814539055605,
    'lambda_sparse': 0.0029833605183872134,
    'momentum': 0.15891701764952307,
    'mask_type': 'entmax'
}

opt_params_a = {
    'lr': 0.0076661428463103455,
    'weight_decay': 2.501056095511525e-05
}

# 85% accuracy set (2020-2025 test), 0.89 AUC, threshold = 0.51, seed = 10 (fit, optuna)
params_b = {
    'n_d': 48,
    'n_a': 48,
    'n_steps': 8,
    'gamma': 1.0950455922432034,
    'lambda_sparse': 0.000365814411562923,
    'momentum': 0.3675391603570311,
    'mask_type': 'sparsemax'
}

opt_params_b = {
    'lr': 0.002071649487926403,
    'weight_decay': 4.2583833531711615e-06
}

# 79% accuracy set (2005-2005 test), 0.869 AUC, threshold = 0.5, seed = 5 (fit, optuna) 
params_c = {
    'n_d': 48,
    'n_a': 32,
    'n_steps': 5,
    'gamma': 1.1885687695822824,
    'lambda_sparse': 0.0019929526182699384,
    'momentum': 0.25773242436863386,
    'mask_type': 'entmax'
}

opt_params_c = {
    'lr': 0.01601995480535204,
    'weight_decay': 0.00015417059849810502,
}


for year in years:
    print(f"\n📆 Evaluating year: {year}")
    train_data = diff_df[diff_df['SEASON'] < year]
    test_data = diff_df[diff_df['SEASON'] == year]
    
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data.drop(columns=['TEAM_1_W', 'SEASON'])
    y_train = train_data['TEAM_1_W']
    X_test = test_data.drop(columns=['TEAM_1_W', 'SEASON'])
    y_test = test_data['TEAM_1_W']

    smote = SMOTE(random_state=10)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # -------- Model A --------
    model_a = TabNetClassifier(
        **params_a,
        optimizer_params=opt_params_a,
        seed=8,
        device_name='cpu',
        verbose=0
    )
    model_a.fit(
        X_train_resampled.values, y_train_resampled.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_metric=['auc'],
        max_epochs=200, patience=20,
        batch_size=64, virtual_batch_size=32
    )
    probs_a = model_a.predict_proba(X_test.values)[:, 1]

    # -------- Model B --------
    model_b = TabNetClassifier(
        **params_b,
        optimizer_params=opt_params_b,
        seed=10,
        device_name='cpu',
        verbose=0
    )
    model_b.fit(
        X_train_resampled.values, y_train_resampled.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_metric=['auc'],
        max_epochs=200, patience=20,
        batch_size=64, virtual_batch_size=32
    )
    probs_b = model_b.predict_proba(X_test.values)[:, 1]

    '''
    # -------- Model C --------
    model_c = TabNetClassifier(
        **params_c,
        optimizer_params=opt_params_c,
        seed=5,
        device_name='cpu',
        verbose=0
    )
    model_c.fit(
        X_train_resampled.values, y_train_resampled.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_metric=['auc'],
        max_epochs=200, patience=20,
        batch_size=64, virtual_batch_size=32
    )
    probs_c = model_c.predict_proba(X_test.values)[:, 1]
    '''

    # -------- Blend Predictions --------
    blended_probs = (probs_a + probs_b) / 2
    blended_preds = (blended_probs >= 0.505).astype(int)

    bce_loss = log_loss(y_test, blended_probs)

    # Metrics for this year
    auc = roc_auc_score(y_test, blended_probs)
    acc = accuracy_score(y_test, blended_preds)
    f1 = f1_score(y_test, blended_preds)
    cm = confusion_matrix(y_test, blended_preds)

    print(f"AUC: {auc:.4f} | Accuracy: {acc:.4f} | F1: {f1:.4f}")
    print(f"Binary Cross-Entropy Loss: {bce_loss:.4f}")
    print("Confusion Matrix:")
    print(cm)

    yearly_metrics.append({
        "year": year,
        "auc": auc,
        "accuracy": acc,
        "f1": f1,
        "confusion_matrix": cm
    })

    # Global tracking
    all_y_true.extend(y_test)
    all_y_prob_blended.extend(blended_probs)
    all_y_pred_blended.extend(blended_preds)

# ------ Overall Metrics ------
overall_auc = roc_auc_score(all_y_true, all_y_prob_blended)
overall_acc = accuracy_score(all_y_true, all_y_pred_blended)
overall_f1 = f1_score(all_y_true, all_y_pred_blended)
overall_prec = precision_score(all_y_true, all_y_pred_blended)
overall_rec = recall_score(all_y_true, all_y_pred_blended)
overall_bce_loss = log_loss(all_y_true, all_y_prob_blended)

print(f"\n🔁 Overall Blended Model Performance:")
print(f"AUC: {overall_auc:.4f}, Accuracy: {overall_acc:.4f}, F1: {overall_f1:.4f}, Precision: {overall_prec:.4f}, Recall: {overall_rec:.4f}")
print(f"Overall Binary Cross-Entropy Loss: {overall_bce_loss:.4f}")


📆 Evaluating year: 2020

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.875





Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.92857
AUC: 0.8750 | Accuracy: 0.9333 | F1: 0.9333
Binary Cross-Entropy Loss: 0.4935
Confusion Matrix:
[[7 0]
 [1 7]]

📆 Evaluating year: 2021





Early stopping occurred at epoch 30 with best_epoch = 10 and best_val_0_auc = 0.88889





Early stopping occurred at epoch 41 with best_epoch = 21 and best_val_0_auc = 0.7963




AUC: 0.9444 | Accuracy: 0.8667 | F1: 0.8333
Binary Cross-Entropy Loss: 0.4931
Confusion Matrix:
[[8 1]
 [1 5]]

📆 Evaluating year: 2022
