**<font size="4" color="red">IMPORT MODULES</font>**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

**<font size="4" color="red">SETTING SEEDS (IMPORTANT)</font>**

In [None]:
import os
import random
import numpy as np
import torch

# Single seed shared by every random number generator seeded in this cell
seed_value = 10

# Hash seed, exported as a string.
# NOTE(review): Python normally reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches processes launched afterwards — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Python's built-in RNG and NumPy's legacy global RNG
random.seed(seed_value)
np.random.seed(seed_value)

# PyTorch: the default generator always, the CUDA generators only when a GPU is visible
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)

# Request deterministic algorithms (may impact performance);
# warn_only=True is passed so the call is made in warning mode rather than strict mode.
torch.use_deterministic_algorithms(True, warn_only=True)

# Ask OpenMP and MKL for a single thread each (helps reduce non-determinism)
for thread_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
    os.environ[thread_var] = "1"

print("Seeds and environment variables set for reproducibility.")

**<font size="4" color="red">READ IN MATCHUP DATA</font>**

In [None]:
# Load the matchup table (updated data set with the clustering-proportion columns)
nba_series = pd.read_csv("TEAM_MATCHUP_DATA_CLUSTER.csv")

# SEASON is read as a '-'-separated string; take the integer before the first '-'
# and relabel the season as that year plus one.
season_start_year = nba_series['SEASON'].str.split('-').str[0].astype(int)
nba_series['SEASON'] = (season_start_year + 1).astype(int)

**<font size="4" color="red">DATA CLEANING PT.1 (RANDOMIZING AND SHUFFLING)</font>**

In [None]:
# Drop identifier and cluster-label columns that are not needed downstream
nba_series = nba_series.drop(columns = ['SERIES_ID', 'SEASON_ID', 'TEAM_1_ID', 'TEAM_2_ID', 'CLUSTER_TEAM_1', 'CLUSTER_TEAM_2'])

# Boolean mask choosing (on average) half of the rows to flip.
# It is drawn from NumPy's global RNG, so the mask depends on the RNG state at the
# moment this cell runs; re-running the cell on its own yields a different mask.
flip_mask = np.random.rand(len(nba_series)) < 0.5

# Stat columns to swap: every column containing '_TEAM_1' and its '_TEAM_2' counterpart
team1_stat_cols = [col for col in nba_series.columns if '_TEAM_1' in col]
team2_stat_cols = [col.replace('_TEAM_1', '_TEAM_2') for col in team1_stat_cols]

# Full list of swapped columns, team names included (not referenced again in this cell)
stat_swap_cols = team1_stat_cols + team2_stat_cols + ['TEAM_1', 'TEAM_2']

# Independent copies of the rows to flip and the rows to keep as they are
swapped = nba_series.loc[flip_mask].copy()
not_swapped = nba_series.loc[~flip_mask].copy()

# Flip stats: values are read from the untouched `nba_series`, so the two
# assignments cannot overwrite each other's source data
swapped[team1_stat_cols] = nba_series.loc[flip_mask, team2_stat_cols].values
swapped[team2_stat_cols] = nba_series.loc[flip_mask, team1_stat_cols].values

# Flip team names the same way
swapped['TEAM_1'] = nba_series.loc[flip_mask, 'TEAM_2'].values
swapped['TEAM_2'] = nba_series.loc[flip_mask, 'TEAM_1'].values

# Recalculate the label: 1 when the (possibly new) TEAM_1 is the SERIES_WINNER, else 0
swapped['TEAM_1_W'] = (swapped['SERIES_WINNER'] == swapped['TEAM_1']).astype(int)
not_swapped['TEAM_1_W'] = (not_swapped['SERIES_WINNER'] == not_swapped['TEAM_1']).astype(int)

# Combine flipped and unflipped rows under a fresh 0..n-1 index
nba_series_balanced = pd.concat([swapped, not_swapped], ignore_index=True)

# Shuffle the combined frame (fixed random_state) and renumber the index
nba_series_balanced = nba_series_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Matchup identity columns kept for displaying results later on.
# An explicit copy makes this frame independent of `nba_series_balanced`, so columns
# can be added to it (or to slices of it) without touching the source frame.
test_data_preserved = nba_series_balanced[['SEASON', 'TEAM_1', 'TEAM_2', 'SERIES_WINNER']].copy()

**<font size="4" color="red">DATA CLEANING PT.2 (DIFFERENCE IN STATS)</font>**

In [None]:
# Build TEAM_1 minus TEAM_2 difference features for every stat that has both columns

# Every column whose name ends in _TEAM_1
team1_cols = [col for col in nba_series_balanced.columns if col.endswith('_TEAM_1')]

# Collect the difference Series first, then build the frame in one go
diff_columns = {}
for col1 in team1_cols:
    # Base stat name (e.g., 'AST', 'FG_PCT') and the matching TEAM_2 column name
    stat_base = col1.replace('_TEAM_1', '')
    col2 = f'{stat_base}_TEAM_2'

    # Stats without a TEAM_2 counterpart are skipped
    if col2 not in nba_series_balanced.columns:
        continue

    diff_columns[f'{stat_base}_DIFF'] = nba_series_balanced[col1] - nba_series_balanced[col2]

diff_df = pd.DataFrame(diff_columns)

# Carry over the label and the season used for the temporal splits
diff_df['TEAM_1_W'] = nba_series_balanced['TEAM_1_W']
diff_df['SEASON'] = nba_series_balanced['SEASON']

**<font size="4" color="red">TABNET NEURAL NETWORK WITH TEMPORAL SPLITS AND OPTUNA HYPERPARAMETER TUNING</font>**

In [None]:
import optuna
import numpy as np
import pandas as pd
import os
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from pytorch_tabnet.tab_model import TabNetClassifier
import warnings
# Make sure the folder that receives saved models is present
os.makedirs("saved_models", exist_ok=True)

# Suppress every warning for the rest of the session and blank out the visible CUDA device list
warnings.filterwarnings("ignore")
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

# Seasons used as test folds: 2020 through 2024 (the stop value 2025 is excluded)
years = np.arange(2020, 2025)

def objective_tabnet(trial):
    """Optuna objective: pooled ROC-AUC of TabNet over the temporal season splits.

    For each season in the module-level ``years``, a TabNetClassifier is trained on
    the rows of ``diff_df`` from earlier seasons (resampled with SMOTE) and scored on
    that season's rows. Labels, hard predictions (cut-off 0.5) and probabilities from
    all seasons are pooled and global AUC / accuracy / F1 are computed once.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the TabNet and optimizer hyperparameters.

    Returns
    -------
    float
        Pooled ROC-AUC over all evaluated seasons, or 0.0 when no season had both
        training and test rows.

    Side effects
    ------------
    Prints the trial's metrics. When the pooled AUC is >= 0.90 the most recently
    fitted classifier is saved under ``saved_models/``.
    """
    # Search space
    n_d = trial.suggest_int("n_d", 8, 64, step=8)
    n_a = trial.suggest_int("n_a", 8, 64, step=8)
    n_steps = trial.suggest_int("n_steps", 3, 10)
    gamma = trial.suggest_float("gamma", 1.0, 2.0)
    lambda_sparse = trial.suggest_float("lambda_sparse", 0.00001, 0.01, log=True)
    momentum = trial.suggest_float("momentum", 0.01, 0.4)
    lr = trial.suggest_float("lr", 1e-3, 2e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    mask_type = trial.suggest_categorical("mask_type", ['sparsemax', 'entmax'])

    # Alternative, narrower search space (disabled; swap in for the ranges above):
    #   n_d, n_a:       40..56, step 4
    #   n_steps:        6..10
    #   gamma:          1.0..1.2
    #   lambda_sparse:  1e-5..1e-3 (log)
    #   momentum:       0.3..0.4
    #   lr:             0.001..0.003 (log)
    #   weight_decay:   1e-6..1e-5 (log)
    #   mask_type:      'sparsemax' | 'entmax'

    # Pooled labels / predictions / probabilities across all test seasons
    all_y_true = []
    all_y_pred = []
    all_y_prob = []

    for year in years:
        train_data = diff_df[diff_df['SEASON'] < year]
        test_data = diff_df[diff_df['SEASON'] == year]

        # A fold needs both earlier seasons to train on and rows to score
        if train_data.empty or test_data.empty:
            continue

        X_train = train_data.drop(columns=['TEAM_1_W', 'SEASON'])
        y_train = train_data['TEAM_1_W']
        X_test = test_data.drop(columns=['TEAM_1_W', 'SEASON'])
        y_test = test_data['TEAM_1_W']

        # Oversampling is applied to the training rows only
        smote = SMOTE(random_state=10)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        clf = TabNetClassifier(
            n_d=n_d, n_a=n_a, n_steps=n_steps,
            gamma=gamma, lambda_sparse=lambda_sparse,
            momentum=momentum, mask_type=mask_type,
            optimizer_params={"lr": lr, "weight_decay": weight_decay},
            seed=342221, verbose=0, device_name='cpu'
        )

        # NOTE(review): the eval_set used for early stopping (patience=20) is the test
        # season itself, so the stopping point is chosen on the data that is scored
        # below; the resulting metrics are presumably optimistic. Consider holding
        # out part of the training seasons for early stopping instead.
        clf.fit(
            X_train_resampled.values, y_train_resampled.values,
            eval_set=[(X_test.values, y_test.values)],
            eval_metric=['auc'],
            max_epochs=200, patience=20,
            batch_size=64, virtual_batch_size=32
        )

        y_probs = clf.predict_proba(X_test.values)[:, 1]
        y_pred = (y_probs >= 0.5).astype(int)

        # Collect for global metrics
        all_y_true.extend(y_test.values)
        all_y_pred.extend(y_pred)
        all_y_prob.extend(y_probs)

    if len(all_y_true) == 0:
        # No season could be evaluated: report the worst score for this trial
        return 0.0

    # Compute global metrics on the pooled predictions
    global_auc = roc_auc_score(all_y_true, all_y_prob)
    global_accuracy = accuracy_score(all_y_true, all_y_pred)
    global_f1 = f1_score(all_y_true, all_y_pred)

    print(f"Trial {trial.number}: Global AUC = {global_auc:.4f}, Accuracy = {global_accuracy:.4f}, F1 = {global_f1:.4f}")

    # ✅ Save models with high AUC
    if global_auc >= 0.90:
        # TabNet's save_model() appends ".zip" to the path it is given, so the path is
        # passed WITHOUT the extension; otherwise the archive ends up as "*.zip.zip".
        model_stem = f"tabnet_trial{trial.number}_auc{global_auc:.4f}"
        filename = f"{model_stem}.zip"
        # NOTE(review): `clf` is the classifier of the last evaluated season only,
        # while the AUC in the file name is pooled over all seasons.
        clf.save_model(os.path.join("saved_models", model_stem))
        print(f"✅ Saved model for Trial {trial.number} → {filename}")

    return global_auc

# Launch the search: a TPE sampler with a fixed seed, maximising the objective's return value
tpe_sampler = optuna.samplers.TPESampler(seed=342221)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(objective_tabnet, n_trials=50)

# Report the best trial's score and hyperparameters
print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)

**<font size="4" color="red">TABNET THRESHOLD TUNING WITH BEST MODEL</font>**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score,
    confusion_matrix, roc_curve, auc
)
from imblearn.over_sampling import SMOTE
from pytorch_tabnet.tab_model import TabNetClassifier
import seaborn as sns
import os
import warnings
# Suppress every warning and blank out the visible CUDA device list
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Define rolling years: each of 2020..2024 is used once as the test season
years = np.arange(2020, 2025)

# Decision thresholds to compare: 0.40, 0.45, 0.50, 0.55 and 0.60.
# np.arange(0.4, 0.6, 0.05) excludes the stop value, so 0.60 was never evaluated;
# np.linspace with an explicit count includes both end points. Rounding to two
# decimals keeps the printed thresholds free of floating-point noise.
thresholds = np.round(np.linspace(0.4, 0.6, 5), 2)

# Best found parameters from Optuna

# 84% accuracy set (2020-2025 test), 0.895 AUC, threshold = 0.5, seed = 10
best_params = {
    'n_d': 48,
    'n_a': 48,
    'n_steps': 8,
    'gamma': 1.0950455922432034,
    'lambda_sparse': 0.000365814411562923,
    'momentum': 0.3675391603570311,
    'mask_type': 'sparsemax'
}

# Passed to TabNetClassifier separately as its `optimizer_params` argument
optimizer_params = {
    'lr': 0.002071649487926403,
    'weight_decay': 4.2583833531711615e-06
}

# Store ROC values (one entry appended per threshold by the evaluation loop)
all_fpr = []
all_tpr = []
all_auc = []

# One summary dict per threshold: threshold, auc, mean accuracy, mean F1
threshold_results = []

# ---- Stage 1: fit one model per test season and cache its outputs ----
# The fit uses fixed seeds and does not involve the decision threshold, so it is
# done once per season here instead of being repeated for every threshold. All
# thresholds are then compared on exactly the same predicted probabilities.
season_folds = []

for year in years:
    train_data = diff_df[diff_df['SEASON'] < year]
    test_data = diff_df[diff_df['SEASON'] == year]

    # A fold needs both earlier seasons to train on and rows to score
    if train_data.empty or test_data.empty:
        continue

    X_train = train_data.drop(columns=['TEAM_1_W', 'SEASON'])
    y_train = train_data['TEAM_1_W']
    X_test = test_data.drop(columns=['TEAM_1_W', 'SEASON'])
    y_test = test_data['TEAM_1_W']

    # Oversampling is applied to the training rows only
    smote = SMOTE(random_state=10)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    clf = TabNetClassifier(
        **best_params, optimizer_params=optimizer_params, verbose=0, seed=10, device_name='cpu'
    )

    # NOTE(review): early stopping (patience=20) monitors the test season itself, so
    # the stopping epoch is chosen on the data scored below; metrics are presumably
    # optimistic. Consider a validation split taken from the training seasons.
    clf.fit(
        X_train_resampled.values, y_train_resampled.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_metric=['auc'],
        max_epochs=200, patience=20,
        batch_size=64, virtual_batch_size=32
    )

    season_folds.append({
        "year": year,
        "y_test": y_test,
        "y_probs": clf.predict_proba(X_test.values)[:, 1],
        # Matchup identity rows for this season, used only for display
        "matchups": test_data_preserved[test_data_preserved['SEASON'] == year],
    })

if not season_folds:
    raise ValueError("No season in `years` has both training and test rows; nothing to evaluate.")

# Pooled labels / probabilities across seasons. The ROC curve and its AUC are
# computed from the probabilities alone, so they are the same for every threshold.
total_y_true = [label for fold in season_folds for label in fold["y_test"].values]
total_y_probs = [prob for fold in season_folds for prob in fold["y_probs"]]
fpr, tpr, _ = roc_curve(total_y_true, total_y_probs)
pooled_auc = auc(fpr, tpr)

# ---- Stage 2: apply each threshold to the cached probabilities ----
for threshold in thresholds:
    print(f"\n--- Threshold: {threshold:.4f} ---")
    store_acc = []
    store_f1 = []

    for fold in season_folds:
        year, y_test, y_probs = fold["year"], fold["y_test"], fold["y_probs"]
        y_pred = (y_probs >= threshold).astype(int)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        store_acc.append(acc)
        store_f1.append(f1)

        print(f"Season {year} — Accuracy: {acc:.4f}, F1: {f1:.4f}")
        print("Confusion Matrix:")
        print(cm)

        # .assign builds a new frame rather than writing into a filtered slice;
        # y_test is aligned on the index, the two arrays are assigned positionally
        display_test = fold["matchups"].assign(
            TEAM_1_W=y_test,
            PRED_TEAM_1_W=y_pred,
            PRED_PROB=y_probs,
        )

        print(display_test)

    # Keep one entry per threshold so the per-threshold containers stay index-aligned
    all_fpr.append(fpr)
    all_tpr.append(tpr)
    all_auc.append(pooled_auc)

    threshold_results.append({
        "threshold": threshold,
        "auc": pooled_auc,
        "accuracy": np.mean(store_acc),
        "f1": np.mean(store_f1)
    })

    print(f"Threshold {threshold}: AUC = {pooled_auc:.4f}, Accuracy = {np.mean(store_acc):.4f}, F1 = {np.mean(store_f1):.4f}")

# Final ROC plot.
# A ROC curve is obtained by sweeping the cut-off over the predicted probabilities,
# so it does not depend on which decision threshold is chosen. Drawing one curve
# per threshold only stacks copies of the same line; instead the pooled curve is
# drawn once and each candidate threshold is marked as a point (FPR, TPR) on it.
pooled_true = np.asarray(total_y_true)
pooled_prob = np.asarray(total_y_probs)
roc_fpr, roc_tpr, _ = roc_curve(pooled_true, pooled_prob)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(roc_fpr, roc_tpr, label=f"Pooled ROC (AUC = {auc(roc_fpr, roc_tpr):.2f})")

is_positive = pooled_true == 1
is_negative = ~is_positive
for thr in thresholds:
    flagged = pooled_prob >= thr
    # Share of negatives flagged = FPR, share of positives flagged = TPR
    ax.scatter(
        flagged[is_negative].mean(), flagged[is_positive].mean(),
        zorder=3, label=f"Threshold {thr:.2f}"
    )

ax.plot([0, 1], [0, 1], 'k--', label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Pooled ROC Curve with Threshold Operating Points")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()