In [None]:
!pip install -q catboost optuna

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import math
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
import optuna

# Input and output directories used by every cell below.
DATA_DIR = "/kaggle/input/playground-series-s6e2"
OUTPUT_DIR = "/kaggle/working"

# Settings for the embedding network's training loop.
CONFIG = dict(
    batch_size=1024,
    lr=1e-3,
    weight_decay=1e-4,
    epochs=50,
    patience=10,
    device=torch.device("cpu"),
    seed=42,
)
print('Using device:', CONFIG['device'])

# Column names grouped by how the model consumes them, plus the label column.
FEATURES = dict(
    continuous=['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression'],
    ordinal=['Chest pain type', 'EKG results', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
    binary=['Sex', 'FBS over 120', 'Exercise angina'],
    target='Heart Disease',
)

In [None]:
class PeriodicEmbedding(nn.Module):
    """Embed a scalar column through trainable sine/cosine frequencies.

    The input is scaled by ``frequency_num`` learnable coefficients, passed
    through sin and cos, and the concatenated result is projected to
    ``output_dim`` values by a linear layer followed by ReLU.

    Args:
        frequency_num: Number of learnable frequency coefficients.
        output_dim: Size of the embedding produced per input row.
        sigma: Scale applied to the standard-normal initial coefficients.
    """

    def __init__(self, frequency_num=16, output_dim=8, sigma=0.1):
        super().__init__()
        self.k = frequency_num
        # Coefficients start as sigma-scaled standard-normal draws and are trained.
        self.c = nn.Parameter(sigma * torch.randn(frequency_num))
        self.linear = nn.Linear(2 * frequency_num, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the embedding of ``x``.

        ``x`` is broadcast against the coefficient vector and the sin/cos
        halves are joined along dim 1, so a (batch, 1) input yields a
        (batch, output_dim) result.
        """
        phase = (2 * math.pi) * self.c * x
        waves = torch.cat((phase.sin(), phase.cos()), dim=1)
        return self.relu(self.linear(waves))

class PiecewiseLinearEmbedding(nn.Module):
    """Encode a scalar by its fractional position inside each bin, then project.

    Args:
        bin_edges: 1-D tensor of bin boundaries; ``len(bin_edges) - 1`` bins
            are formed from consecutive pairs.
        output_dim: Size of the embedding produced per input row.
    """

    def __init__(self, bin_edges, output_dim=4):
        super().__init__()
        # Stored as a buffer: part of the module state but not a trainable parameter.
        self.register_buffer('bin_edges', bin_edges)
        self.linear = nn.Linear(len(bin_edges) - 1, output_dim)

    def forward(self, x):
        """Return the projected per-bin encoding of ``x``.

        For every bin the encoding is 0 below the bin's lower edge, 1 above
        its upper edge and (approximately) linear in between.
        """
        left = self.bin_edges[:-1]
        span = self.bin_edges[1:] - left
        # The small epsilon keeps the division finite for zero-width bins.
        fraction = ((x - left) / (span + 1e-6)).clamp(0.0, 1.0)
        return self.linear(fraction)

class TabularHeartModel(nn.Module):
    """Per-feature embeddings followed by a small MLP that outputs one logit.

    Continuous columns get a ``PeriodicEmbedding`` (8 outputs each), ordinal
    columns get a ``PiecewiseLinearEmbedding`` (4 outputs each), and binary
    columns are passed through unchanged. Column names and order come from
    the module-level ``FEATURES`` dict.

    Args:
        ordinal_edges_dict: Maps each ordinal feature name to the tensor of
            bin edges used by its piecewise-linear embedding.
    """

    def __init__(self, ordinal_edges_dict):
        super().__init__()
        cont_dim, ord_dim = 8, 4

        self.cont_embeddings = nn.ModuleDict({
            name: PeriodicEmbedding(frequency_num=16, output_dim=cont_dim, sigma=0.1)
            for name in FEATURES['continuous']
        })
        self.ord_embeddings = nn.ModuleDict({
            name: PiecewiseLinearEmbedding(bin_edges=ordinal_edges_dict[name], output_dim=ord_dim)
            for name in FEATURES['ordinal']
        })

        # Width of the concatenated vector built in forward().
        input_dim = (
            cont_dim * len(FEATURES['continuous'])
            + ord_dim * len(FEATURES['ordinal'])
            + len(FEATURES['binary'])
        )

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, x_cont, x_ord, x_bin):
        """Return ``(features, logits)``.

        ``features`` is the concatenation (along dim 1) of every column
        embedding plus the raw binary inputs; ``logits`` is the MLP output
        computed from it. Column ``i`` of ``x_cont`` / ``x_ord`` is matched to
        the ``i``-th name in the corresponding ``FEATURES`` list.
        """
        pieces = [
            self.cont_embeddings[name](x_cont[:, col:col + 1])
            for col, name in enumerate(FEATURES['continuous'])
        ]
        pieces.extend(
            self.ord_embeddings[name](x_ord[:, col:col + 1])
            for col, name in enumerate(FEATURES['ordinal'])
        )
        pieces.append(x_bin)

        features = torch.cat(pieces, dim=1)
        return features, self.mlp(features)

In [None]:
class HeartDataset(Dataset):
    """Torch dataset that serves a DataFrame's feature groups as float32 tensors.

    Args:
        df: Source frame holding the feature columns and, optionally, the
            target column.
        feature_groups: Dict with the keys ``'continuous'``, ``'ordinal'`` and
            ``'binary'`` (lists of column names) and ``'target'`` (one column
            name).

    When the target column is absent (e.g. a test frame), all-zero placeholder
    labels are used so every item has the same structure.
    """

    def __init__(self, df, feature_groups):
        self.df = df
        self.feats = feature_groups
        self.cont_data = df[self.feats['continuous']].values.astype(np.float32)
        self.ord_data = df[self.feats['ordinal']].values.astype(np.float32)
        self.bin_data = df[self.feats['binary']].values.astype(np.float32)

        target = self.feats['target']
        if target in df.columns:
            self.labels = df[target].values.astype(np.float32).reshape(-1, 1)
        else:
            # Keep placeholders float32 so labelled and unlabelled datasets
            # yield label tensors of the same dtype.
            self.labels = np.zeros((len(df), 1), dtype=np.float32)

    def __len__(self):
        """Return the number of rows in the source frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dict of tensors keyed by group name."""
        return {
            'cont': torch.tensor(self.cont_data[idx]),
            'ord': torch.tensor(self.ord_data[idx]),
            'bin': torch.tensor(self.bin_data[idx]),
            'label': torch.tensor(self.labels[idx])
        }

def prepare_data():
    """Load the CSVs, encode the target, split train/validation and build bin edges.

    Reads ``train.csv`` and ``test.csv`` from ``DATA_DIR``, maps the target
    column from ``'Absence'``/``'Presence'`` to 0/1, and makes a stratified
    80/20 split seeded with ``CONFIG['seed']``. For every ordinal feature the
    bin edges are the de-duplicated 9-point quantiles of the training part.

    Returns:
        Tuple ``(train_df, val_df, test_df, ordinal_edges)`` where
        ``ordinal_edges`` maps each ordinal feature name to a float32 tensor
        of bin edges.

    Raises:
        ValueError: If the target column holds a value other than
            ``'Absence'`` or ``'Presence'`` (including missing values).
    """
    target = FEATURES['target']

    train_full = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

    encoded = train_full[target].map({'Absence': 0, 'Presence': 1})
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly instead of letting NaN labels flow into the split and the loss.
        unexpected = sorted(train_full.loc[unmapped, target].astype(str).unique())
        raise ValueError(f"Unexpected values in '{target}': {unexpected}")
    train_full[target] = encoded

    train_df, val_df = train_test_split(
        train_full,
        test_size=0.2,
        random_state=CONFIG['seed'],
        stratify=train_full[target],
    )

    ordinal_edges = {}
    for col in FEATURES['ordinal']:
        quantiles = np.quantile(train_df[col].dropna(), np.linspace(0, 1, 9))
        # Low-cardinality columns repeat quantile values; keep each edge once.
        edges = np.unique(quantiles)
        ordinal_edges[col] = torch.tensor(edges, dtype=torch.float32)

    return train_df, val_df, test_df, ordinal_edges

# Load the frames and the per-feature bin edges once for the whole notebook.
train_df, val_df, test_df, ordinal_edges = prepare_data()

# One dataset per split, all using the same feature grouping.
train_dataset, val_dataset, test_dataset = (
    HeartDataset(frame, FEATURES) for frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=shuffle)
    for dataset, shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [None]:
def extract_embeddings(loader, model, device):
    """Run ``model`` over ``loader`` and collect its feature vectors and labels.

    Args:
        loader: Iterable of batches; each batch is a dict with the tensor
            entries ``'cont'``, ``'ord'``, ``'bin'`` and ``'label'``.
        model: Module called as ``model(cont, ord, bin)`` whose first return
            value is the feature tensor to collect.
        device: Device the three inputs are moved to before the forward pass.

    Returns:
        Tuple ``(features, labels)``: the row-stacked feature array and the
        flattened label array, both in the order the loader yielded them.
    """
    model.eval()
    feature_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in loader:
            inputs = [batch[key].to(device) for key in ('cont', 'ord', 'bin')]
            features, _ = model(*inputs)
            feature_chunks.append(features.cpu().numpy())
            label_chunks.append(batch['label'].numpy())

    stacked_features = np.vstack(feature_chunks)
    flat_labels = np.vstack(label_chunks).ravel()
    return stacked_features, flat_labels

# Cache files for the learned feature representations and their labels.
emb_train_path = os.path.join(OUTPUT_DIR, 'X_train_emb.npy')
emb_val_path = os.path.join(OUTPUT_DIR, 'X_val_emb.npy')
emb_test_path = os.path.join(OUTPUT_DIR, 'X_test_emb.npy')
y_train_path = os.path.join(OUTPUT_DIR, 'y_train_emb.npy')
y_val_path = os.path.join(OUTPUT_DIR, 'y_val_emb.npy')
cache_paths = [emb_train_path, emb_val_path, emb_test_path, y_train_path, y_val_path]

# Use the cache only when every file loaded below exists; a partially written
# cache falls through to retraining instead of failing on np.load.
if all(os.path.exists(path) for path in cache_paths):
    print("Loading cached embeddings...")
    X_train_emb = np.load(emb_train_path)
    X_val_emb = np.load(emb_val_path)
    X_test_emb = np.load(emb_test_path)
    y_train_emb = np.load(y_train_path)
    y_val_emb = np.load(y_val_path)
else:
    print("Training Embeddings...")
    # Seed torch so weight initialisation and the shuffled batch order repeat across runs.
    torch.manual_seed(CONFIG['seed'])

    model = TabularHeartModel(ordinal_edges).to(CONFIG['device'])
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

    best_val_loss = float('inf')
    patience_counter = 0

    model_save_path = os.path.join(OUTPUT_DIR, 'best_model_cat.pth')

    for epoch in range(CONFIG['epochs']):
        # ---- one pass over the training data ----
        model.train()
        for batch in train_loader:
            b_cont = batch['cont'].to(CONFIG['device'])
            b_ord = batch['ord'].to(CONFIG['device'])
            b_bin = batch['bin'].to(CONFIG['device'])
            labels = batch['label'].to(CONFIG['device'])

            optimizer.zero_grad()
            _, logits = model(b_cont, b_ord, b_bin)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # ---- validation loss (mean of the per-batch means) ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                b_cont = batch['cont'].to(CONFIG['device'])
                b_ord = batch['ord'].to(CONFIG['device'])
                b_bin = batch['bin'].to(CONFIG['device'])
                labels = batch['label'].to(CONFIG['device'])
                _, logits = model(b_cont, b_ord, b_bin)
                val_loss += criterion(logits, labels).item()

        avg_val_loss = val_loss / len(val_loader)

        # Early stopping: checkpoint on improvement, stop after `patience` stale epochs.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), model_save_path)
        else:
            patience_counter += 1
            if patience_counter >= CONFIG['patience']:
                break

    # Restore the best checkpoint before extracting features.
    model.load_state_dict(torch.load(model_save_path))

    # train_loader shuffles, but features and labels are read from the same
    # batches, so the two arrays stay aligned row by row.
    X_train_emb, y_train_emb = extract_embeddings(train_loader, model, CONFIG['device'])
    X_val_emb, y_val_emb = extract_embeddings(val_loader, model, CONFIG['device'])
    X_test_emb, _ = extract_embeddings(test_loader, model, CONFIG['device'])

    np.save(emb_train_path, X_train_emb)
    np.save(emb_val_path, X_val_emb)
    np.save(emb_test_path, X_test_emb)
    np.save(y_train_path, y_train_emb)
    np.save(y_val_path, y_val_emb)
    print("Embedding Training Complete and Cached.")

# NOTE(review): the embedding network was fit on the training labels, so
# cross-validation scores computed on X_full downstream may be optimistic.
X_full = np.concatenate([X_train_emb, X_val_emb], axis=0)
y_full = np.concatenate([y_train_emb, y_val_emb], axis=0)

In [None]:
def get_search_space(trial, stage_params=None):
    """Suggest CatBoost hyper-parameters for one Optuna trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int``,
            ``suggest_categorical`` and a ``params`` mapping.
        stage_params: ``None`` for the broad first-stage search; otherwise the
            previous stage's best parameters, around which narrowed ranges
            are built.

    Returns:
        Dict of parameter name to value. In the broad search the bootstrap
        option that does not apply (``bagging_temperature`` or ``subsample``)
        is present with value ``None``. In the narrowed search
        ``bootstrap_type`` is copied from ``stage_params`` rather than
        suggested, and only the applicable bootstrap option is included.
    """
    if stage_params is not None:
        prev = stage_params
        bootstrap = prev["bootstrap_type"]

        narrowed = {}
        narrowed["learning_rate"] = trial.suggest_float(
            "learning_rate",
            max(1e-4, prev["learning_rate"] * 0.8),
            min(0.3, prev["learning_rate"] * 1.2),
            log=True,
        )
        narrowed["depth"] = trial.suggest_int(
            "depth",
            max(3, prev["depth"] - 1),
            min(12, prev["depth"] + 1),
        )
        narrowed["l2_leaf_reg"] = trial.suggest_float(
            "l2_leaf_reg",
            max(0.1, prev["l2_leaf_reg"] * 0.8),
            min(20, prev["l2_leaf_reg"] * 1.2),
        )
        narrowed["bootstrap_type"] = bootstrap
        narrowed["random_strength"] = trial.suggest_float(
            "random_strength",
            max(1e-9, prev["random_strength"] * 0.8),
            min(10.0, prev["random_strength"] * 1.2),
            log=True,
        )

        if bootstrap == "Bayesian":
            center = prev.get("bagging_temperature", 1)
            narrowed["bagging_temperature"] = trial.suggest_float(
                "bagging_temperature",
                max(0.0, center * 0.8),
                min(10.0, center * 1.2),
            )
        else:
            center = prev.get("subsample", 0.8)
            narrowed["subsample"] = trial.suggest_float(
                "subsample",
                max(0.1, center * 0.9),
                min(1.0, center * 1.1),
            )
        return narrowed

    # Broad search. The suggestion order is kept fixed because the bootstrap
    # type must be drawn before the option that depends on it.
    space = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
    }
    space["bootstrap_type"] = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    space["random_strength"] = trial.suggest_float("random_strength", 1e-8, 10.0, log=True)

    is_bayesian = trial.params.get("bootstrap_type") == "Bayesian"
    if is_bayesian:
        space["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 10.0)
        space["subsample"] = None
    else:
        space["bagging_temperature"] = None
        space["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)
    return space

def objective(trial, stage_params=None):
    """Optuna objective: mean best validation AUC of CatBoost over 3 stratified folds.

    Args:
        trial: The Optuna trial to draw hyper-parameters from and report to.
        stage_params: Forwarded to ``get_search_space``; ``None`` selects the
            broad search, a dict selects the narrowed search.

    Returns:
        The mean over folds of each model's best validation AUC.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner rejects the trial while
            the first fold's AUC curve is being reported.
    """
    suggested = get_search_space(trial, stage_params)
    fixed = {
        "eval_metric": "AUC",
        "loss_function": "Logloss",
        "od_type": "Iter",
        "od_wait": 50,
        "verbose": False,
        "allow_writing_files": False,
        "random_seed": 42,
        "thread_count": 2,
        "task_type": "CPU",
        "iterations": 500,
    }
    # Options that do not apply to the chosen bootstrap type come back as None.
    params = {name: value for name, value in suggested.items() if value is not None}
    params.update(fixed)

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = []

    for fold_i, (train_idx, val_idx) in enumerate(splitter.split(X_full, y_full)):
        model = CatBoostClassifier(**params)
        model.fit(
            X_full[train_idx],
            y_full[train_idx],
            eval_set=(X_full[val_idx], y_full[val_idx]),
            use_best_model=True,
        )

        if fold_i == 0:
            # Replay the first fold's AUC curve to the pruner; a pruned trial
            # skips the remaining folds.
            auc_curve = model.get_evals_result()["validation"]["AUC"]
            for step, auc in enumerate(auc_curve):
                trial.report(auc, step)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

        fold_aucs.append(model.get_best_score()['validation']['AUC'])

    return np.mean(fold_aucs)

In [None]:
# Stage 1: broad search over the full hyper-parameter ranges.
study_stage1 = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(
        multivariate=True,
        n_startup_trials=50,
        gamma=lambda n: int(0.25 * n),
        # Seed the sampler so the sequence of suggested parameters is repeatable.
        seed=CONFIG['seed'],
    ),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=20,
        n_warmup_steps=30
    )
)

print("Starting Stage 1")
study_stage1.optimize(lambda t: objective(t, stage_params=None), n_trials=100)
print(f"Stage 1 Best AUC: {study_stage1.best_value}")

In [None]:
# Stage 2: narrowed search around the best stage-1 parameters.
study_stage2 = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(
        multivariate=True,
        n_startup_trials=20,
        gamma=lambda n: int(0.08 * n),
        # Seed the sampler so the sequence of suggested parameters is repeatable.
        seed=CONFIG['seed'],
    ),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,
        n_warmup_steps=20
    )
)

# Carry the stage-1 history over, so this study's best trial may be a stage-1 trial.
study_stage2.add_trials(study_stage1.trials)

# Stage 1 is finished, so its best parameters are looked up once rather than per trial.
stage1_best_params = study_stage1.best_params

print("Starting Stage 2")
study_stage2.optimize(lambda t: objective(t, stage_params=stage1_best_params), n_trials=80)
print(f"Stage 2 Best AUC: {study_stage2.best_value}")

In [None]:
final_params = study_stage2.best_params.copy()

# Stage-2 trials fix bootstrap_type instead of suggesting it, so it is missing
# from their recorded params. Restore the value those trials were trained with
# so the final models do not fall back to CatBoost's default bootstrap.
final_params.setdefault("bootstrap_type", study_stage1.best_params["bootstrap_type"])

final_params.update({
    "iterations": 8000,
    "verbose": 1000,
    "eval_metric": "AUC",
    "loss_function": "Logloss",
    "od_type": "Iter",
    "od_wait": 200,
    "allow_writing_files": False,
    "thread_count": 2,
    "task_type": "CPU"
})

seeds = [42, 2024, 999]
n_folds = 5
test_preds_ensemble = np.zeros(len(test_df))
oof_preds = np.zeros(len(X_full))
oof_targets = y_full

for seed in seeds:
    final_params["random_seed"] = seed
    print(f"--> Processing Seed: {seed}")

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    seed_oof = np.zeros(len(X_full))
    seed_test_preds = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_full, y_full)):
        X_tr, y_tr = X_full[train_idx], y_full[train_idx]
        X_va, y_va = X_full[val_idx], y_full[val_idx]

        clf = CatBoostClassifier(**final_params)
        clf.fit(X_tr, y_tr, eval_set=(X_va, y_va), use_best_model=True)

        # Each row is in exactly one validation fold, so this fills seed_oof once per row.
        val_p = clf.predict_proba(X_va)[:, 1]
        seed_oof[val_idx] = val_p

        # Test predictions are averaged over the folds of this seed.
        test_p = clf.predict_proba(X_test_emb)[:, 1]
        seed_test_preds += (test_p / n_folds)

    # Average the per-seed results into the ensemble.
    oof_preds += (seed_oof / len(seeds))
    test_preds_ensemble += (seed_test_preds / len(seeds))

submission = pd.DataFrame({
    'id': test_df['id'],
    'Heart Disease': test_preds_ensemble
})
submission.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)
print("Ensemble Submission Saved")

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Name the hue levels explicitly so seaborn builds the legend itself; labels
# passed to plt.legend() without handles are paired with artists only by
# implicit order and can end up on the wrong class.
absence_label, presence_label = "Absence (0)", "Presence (1)"
target_names = np.where(oof_targets == 1, presence_label, absence_label)

sns.histplot(
    x=oof_preds,
    hue=target_names,
    hue_order=[absence_label, presence_label],
    bins=50,
    kde=True,
    palette={absence_label: "blue", presence_label: "red"},
    alpha=0.5,
    ax=ax,
)

legend = ax.get_legend()
if legend is not None:
    legend.set_title("Actual Target")

ax.set_title(f"Predicted Probabilities vs Actual Labels (OOF AUC: {roc_auc_score(oof_targets, oof_preds):.5f})")
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("Count")
ax.grid(True, alpha=0.3)
plt.show()