In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from catboost import CatBoostClassifier

# Training hyperparameters and the runtime device shared by the cells below.
CONFIG = dict(
    batch_size=1024,
    lr=1e-3,
    weight_decay=1e-4,
    epochs=50,
    patience=10,  # epochs without validation improvement tolerated before stopping
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
print(f"Using device: {CONFIG['device']}")

# Input columns grouped by the way the model consumes them, plus the label column.
FEATURES = dict(
    continuous=['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression'],
    ordinal=['Chest pain type', 'EKG results', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
    binary=['Sex', 'FBS over 120', 'Exercise angina'],
    target='Heart Disease',
)

Using device: cuda


In [16]:
class PeriodicEmbedding(nn.Module):
    """Embed a scalar feature via learnable sine/cosine frequencies.

    The input is multiplied by ``2 * pi * c`` (``c`` holds ``frequency_num``
    learnable frequencies), the sine and cosine of the result are concatenated
    along dim 1, and the concatenation goes through a linear layer and a ReLU.

    Args:
        frequency_num: Number of learnable frequencies.
        output_dim: Width of the produced embedding.
        sigma: Scale applied to the standard-normal initial frequencies.
    """

    def __init__(self, frequency_num=16, output_dim=8, sigma=0.1):
        super().__init__()
        self.k = frequency_num
        # Frequencies start as standard-normal draws scaled by sigma.
        self.c = nn.Parameter(sigma * torch.randn(frequency_num))
        # Sine and cosine halves are concatenated, hence twice the frequencies.
        self.linear = nn.Linear(2 * frequency_num, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the embedding of ``x``.

        ``x`` must broadcast against the ``(frequency_num,)`` frequency vector
        and have at least two dims (concatenation is on dim 1); the model in
        this notebook passes ``(batch, 1)`` slices.
        """
        phase = (2 * math.pi) * self.c * x
        periodic = torch.cat((phase.sin(), phase.cos()), dim=1)
        return self.relu(self.linear(periodic))

class PiecewiseLinearEmbedding(nn.Module):
    """Embed a scalar feature through a piecewise-linear bin encoding.

    For each pair of consecutive edges the input is mapped to its fractional
    position inside that bin, clamped to ``[0, 1]``; the per-bin vector is then
    projected by a linear layer.

    Args:
        bin_edges: 1-D tensor of bin boundaries, registered as a buffer.
        output_dim: Width of the produced embedding.
    """

    def __init__(self, bin_edges, output_dim=4):
        super().__init__()
        self.register_buffer('bin_edges', bin_edges)
        # One encoded value per bin, i.e. one fewer than the number of edges.
        self.linear = nn.Linear(len(bin_edges) - 1, output_dim)

    def forward(self, x):
        """Encode ``x`` (broadcast against the per-bin lower edges) and project it."""
        left = self.bin_edges[:-1]
        span = self.bin_edges[1:] - left
        # The small constant in the denominator guards against zero-width bins.
        position = ((x - left) / (span + 1e-6)).clamp(0.0, 1.0)
        return self.linear(position)

In [17]:
class HeartDataset(Dataset):
    """Torch dataset serving the grouped feature columns of a DataFrame.

    Each item is a dict with float32 tensors under ``'cont'``, ``'ord'``,
    ``'bin'`` and ``'label'``. When the frame has no target column (e.g. the
    test split) the label is a zero placeholder.

    Args:
        df: Source DataFrame containing every column named in ``feature_groups``
            (the target column is optional).
        feature_groups: Dict with ``'continuous'``, ``'ordinal'`` and
            ``'binary'`` column lists and a ``'target'`` column name.
    """

    def __init__(self, df, feature_groups):
        self.df = df
        self.feats = feature_groups

        self.cont_data = df[self.feats['continuous']].values.astype(np.float32)
        self.ord_data = df[self.feats['ordinal']].values.astype(np.float32)
        self.bin_data = df[self.feats['binary']].values.astype(np.float32)

        if self.feats['target'] in df.columns:
            self.labels = df[self.feats['target']].values.astype(np.float32).reshape(-1, 1)
        else:
            # Placeholder labels must be float32 like the real ones; np.zeros
            # defaults to float64, which would give unlabeled splits a different
            # tensor dtype than labeled ones.
            self.labels = np.zeros((len(df), 1), dtype=np.float32)

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the feature groups and label of row ``idx`` as tensors."""
        return {
            'cont': torch.tensor(self.cont_data[idx]),
            'ord': torch.tensor(self.ord_data[idx]),
            'bin': torch.tensor(self.bin_data[idx]),
            'label': torch.tensor(self.labels[idx])
        }

def prepare_data(train_path=None, test_path=None, test_size=0.2, random_state=42, n_bins=8):
    """Load the competition CSVs, split train/validation and build bin edges.

    Args:
        train_path: Path of the training CSV. ``None`` falls back to the
            original author's local location, so existing zero-argument calls
            keep working; pass an explicit path on any other machine.
        test_path: Path of the test CSV, with the same fallback rule.
        test_size: Fraction of the training data held out for validation.
        random_state: Seed for the stratified split.
        n_bins: Number of quantile bins requested per ordinal feature
            (``n_bins + 1`` quantile edges before de-duplication).

    Returns:
        ``(train_df, val_df, test_df, ordinal_edges)`` where ``ordinal_edges``
        maps each ordinal column name to a float32 tensor of unique, sorted
        quantile edges computed on the training split only.

    Raises:
        ValueError: If the target column holds values other than
            ``'Absence'`` / ``'Presence'`` (they would otherwise become NaN
            labels silently).
    """
    if train_path is None:
        train_path = r'C:\Users\Saswat Balyan\dev\Predicting-Heart-Disease-Playground-Series-S6ep2\playground-series-s6e2\train.csv'
    if test_path is None:
        test_path = r'C:\Users\Saswat Balyan\dev\Predicting-Heart-Disease-Playground-Series-S6ep2\playground-series-s6e2\test.csv'

    train_full = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    target = FEATURES['target']
    train_full[target] = train_full[target].map({'Absence': 0, 'Presence': 1})
    if train_full[target].isna().any():
        raise ValueError(
            f"Column {target!r} contains values other than 'Absence'/'Presence'"
        )

    train_df, val_df = train_test_split(
        train_full,
        test_size=test_size,
        random_state=random_state,
        stratify=train_full[target],
    )

    # Edges come from the training split only, so validation data does not
    # influence the encoding.
    quantile_levels = np.linspace(0, 1, n_bins + 1)
    ordinal_edges = {}
    for col in FEATURES['ordinal']:
        # Features with few distinct levels yield repeated quantiles;
        # np.unique collapses them and keeps the edges sorted.
        edges = np.unique(np.quantile(train_df[col].dropna(), quantile_levels))
        ordinal_edges[col] = torch.tensor(edges, dtype=torch.float32)

    return train_df, val_df, test_df, ordinal_edges

train_df, val_df, test_df, ordinal_edges = prepare_data()

# Wrap every split in the same dataset class; the test split has no target
# column, so its labels are placeholders.
train_dataset, val_dataset, test_dataset = (
    HeartDataset(split, FEATURES) for split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order so
# predictions line up with the source frames.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG['batch_size'])
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=CONFIG['batch_size'])
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=CONFIG['batch_size'])

In [18]:
class TabularHeartModel(nn.Module):
    """Per-feature embeddings followed by a small MLP classifier.

    Continuous features go through ``PeriodicEmbedding``, ordinal features
    through ``PiecewiseLinearEmbedding``, and binary features are passed
    through unchanged. The concatenated vector feeds a two-layer MLP that
    outputs a single logit.

    Args:
        ordinal_edges_dict: Mapping from every name in ``FEATURES['ordinal']``
            to its 1-D tensor of bin edges.
        cont_dim: Embedding width per continuous feature.
        ord_dim: Embedding width per ordinal feature.
        hidden_dim: Width of the MLP hidden layer.
    """

    def __init__(self, ordinal_edges_dict, cont_dim=8, ord_dim=4, hidden_dim=256):
        super().__init__()

        self.cont_embeddings = nn.ModuleDict()
        for feat in FEATURES['continuous']:
            self.cont_embeddings[feat] = PeriodicEmbedding(frequency_num=16, output_dim=cont_dim, sigma=0.1)

        self.ord_embeddings = nn.ModuleDict()
        for feat in FEATURES['ordinal']:
            edges = ordinal_edges_dict[feat]
            self.ord_embeddings[feat] = PiecewiseLinearEmbedding(bin_edges=edges, output_dim=ord_dim)

        # Width of the concatenated feature vector; derived from the same
        # widths used to build the embeddings so the two cannot drift apart.
        self.input_dim = (
            len(FEATURES['continuous']) * cont_dim
            + len(FEATURES['ordinal']) * ord_dim
            + len(FEATURES['binary'])
        )

        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_cont, x_ord, x_bin):
        """Return ``(features, logits)`` for one batch.

        Column ``i`` of ``x_cont`` / ``x_ord`` is taken to be the ``i``-th
        entry of ``FEATURES['continuous']`` / ``FEATURES['ordinal']``.
        ``features`` is the concatenated embedding vector (reused later as
        input for the gradient-boosted model); ``logits`` is the MLP output.
        """
        embeddings = []
        for i, feat_name in enumerate(FEATURES['continuous']):
            # Slice with i:i+1 to keep a trailing feature dim of size 1.
            embeddings.append(self.cont_embeddings[feat_name](x_cont[:, i:i+1]))

        for i, feat_name in enumerate(FEATURES['ordinal']):
            embeddings.append(self.ord_embeddings[feat_name](x_ord[:, i:i+1]))

        embeddings.append(x_bin)
        x = torch.cat(embeddings, dim=1)
        return x, self.mlp(x)

In [19]:
# Single source of truth for the checkpoint written during training and
# reloaded afterwards.
CHECKPOINT_PATH = 'best_model_cat.pth'

model = TabularHeartModel(ordinal_edges).to(CONFIG['device'])
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

best_val_loss = float('inf')
patience_counter = 0

print("Training Embeddings...")

for epoch in range(CONFIG['epochs']):
    # ---- one optimisation pass over the training split ----
    model.train()
    for batch in train_loader:
        b_cont = batch['cont'].to(CONFIG['device'])
        b_ord = batch['ord'].to(CONFIG['device'])
        b_bin = batch['bin'].to(CONFIG['device'])
        labels = batch['label'].to(CONFIG['device'])

        optimizer.zero_grad()
        _, logits = model(b_cont, b_ord, b_bin)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    # ---- validation: average of the per-batch losses ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            b_cont = batch['cont'].to(CONFIG['device'])
            b_ord = batch['ord'].to(CONFIG['device'])
            b_bin = batch['bin'].to(CONFIG['device'])
            labels = batch['label'].to(CONFIG['device'])
            _, logits = model(b_cont, b_ord, b_bin)
            val_loss += criterion(logits, labels).item()

    avg_val_loss = val_loss / len(val_loader)

    # ---- early stopping: checkpoint on improvement, stop after `patience` misses ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), CHECKPOINT_PATH)
    else:
        patience_counter += 1
        if patience_counter >= CONFIG['patience']:
            break

# Restore the best checkpoint. weights_only=True restricts unpickling to
# tensors/primitive containers (and silences the torch.load FutureWarning);
# map_location places the tensors on the configured device.
model.load_state_dict(
    torch.load(CHECKPOINT_PATH, map_location=CONFIG['device'], weights_only=True)
)
print("Embedding Training Complete.")

Training Embeddings...
Embedding Training Complete.


  model.load_state_dict(torch.load('best_model_cat.pth'))


In [20]:
def extract_embeddings(loader, model, device):
    """Run ``model`` over ``loader`` and collect its feature vectors and labels.

    Args:
        loader: Iterable of batches, each a dict with ``'cont'``, ``'ord'``,
            ``'bin'`` and ``'label'`` tensors.
        model: Module whose forward returns ``(features, logits)``; only the
            features are kept.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``(features, labels)`` as NumPy arrays: features stacked row-wise over
        all batches, labels stacked and flattened to 1-D.
    """
    model.eval()
    feature_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in loader:
            inputs = [batch[key].to(device) for key in ('cont', 'ord', 'bin')]
            features, _ = model(*inputs)
            feature_chunks.append(features.cpu().numpy())
            # Labels never leave the CPU, so they convert directly.
            label_chunks.append(batch['label'].numpy())

    stacked_features = np.vstack(feature_chunks)
    flat_labels = np.vstack(label_chunks).ravel()
    return stacked_features, flat_labels

# Turn every split into (embedding matrix, labels); the test labels are
# placeholders and are discarded.
(X_train_emb, y_train_emb), (X_val_emb, y_val_emb), (X_test_emb, _) = (
    extract_embeddings(loader, model, CONFIG['device'])
    for loader in (train_loader, val_loader, test_loader)
)

# Baseline CatBoost settings for the model trained on the learned embeddings.
baseline_cat_params = dict(
    iterations=6000,
    learning_rate=0.015,
    depth=6,
    l2_leaf_reg=6,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=200,
)
cat_model = CatBoostClassifier(**baseline_cat_params)

# Validation embeddings drive early stopping.
cat_model.fit(
    X_train_emb,
    y_train_emb,
    eval_set=(X_val_emb, y_val_emb),
    early_stopping_rounds=200,
)

0:	test: 0.9393826	best: 0.9393826 (0)	total: 32.5ms	remaining: 3m 14s
200:	test: 0.9540443	best: 0.9540443 (200)	total: 6.9s	remaining: 3m 19s
400:	test: 0.9550659	best: 0.9550659 (400)	total: 13.7s	remaining: 3m 10s
600:	test: 0.9553925	best: 0.9553925 (600)	total: 20.1s	remaining: 3m
800:	test: 0.9555636	best: 0.9555636 (800)	total: 26.5s	remaining: 2m 52s
1000:	test: 0.9557644	best: 0.9557644 (1000)	total: 33.1s	remaining: 2m 45s
1200:	test: 0.9558933	best: 0.9558933 (1200)	total: 39.5s	remaining: 2m 37s
1400:	test: 0.9559561	best: 0.9559561 (1400)	total: 46.3s	remaining: 2m 31s
1600:	test: 0.9559908	best: 0.9559908 (1600)	total: 52.8s	remaining: 2m 25s
1800:	test: 0.9560244	best: 0.9560250 (1796)	total: 59.6s	remaining: 2m 19s
2000:	test: 0.9560524	best: 0.9560524 (2000)	total: 1m 6s	remaining: 2m 12s
2200:	test: 0.9560702	best: 0.9560702 (2200)	total: 1m 12s	remaining: 2m 5s
2400:	test: 0.9560830	best: 0.9560830 (2400)	total: 1m 19s	remaining: 1m 59s
2600:	test: 0.9560956	best: 0

<catboost.core.CatBoostClassifier at 0x16f5fbcd850>

In [7]:
# Column 1 of predict_proba is used as the submitted score.
preds = cat_model.predict_proba(X_test_emb)[:, 1]

# test_loader does not shuffle, so predictions follow the row order of test_df.
submission = pd.DataFrame({'id': test_df['id']}).assign(**{'Heart Disease': preds})

submission.to_csv('submission_catboost.csv', index=False)
print("CatBoost Submission Saved!")
print(submission.head())

CatBoost Submission Saved!
       id  Heart Disease
0  630000       0.947585
1  630001       0.005743
2  630002       0.989294
3  630003       0.003266
4  630004       0.174639


In [14]:
import random
from catboost import Pool

# Search budget and seed: a dedicated, seeded RNG makes the sampled trials
# reproducible across kernel restarts (the global `random` module is unseeded).
N_TRIALS = 5
SEARCH_SEED = 42

# Candidate values for each tuned CatBoost hyperparameter.
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7],
    'random_strength': [1, 2]
}

# Build the pools once so each trial reuses them.
train_pool = Pool(X_train_emb, y_train_emb)
val_pool = Pool(X_val_emb, y_val_emb)

best_auc = 0
best_params = {}
search_rng = random.Random(SEARCH_SEED)

print("Starting Manual Hyperparameter Search...")

for i in range(N_TRIALS):
    # Draw one value per hyperparameter independently (random search).
    params = {k: search_rng.choice(v) for k, v in param_grid.items()}

    # Short runs only: the goal is to rank configurations, not to train fully.
    test_model = CatBoostClassifier(
        iterations=500,
        **params,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        verbose=False
    )

    test_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

    current_auc = test_model.get_best_score()['validation']['AUC']
    print(f"Trial {i+1}: AUC = {current_auc:.4f} with {params}")

    # Updated after every trial, so an interrupted search still leaves the
    # best configuration seen so far in `best_params`.
    if current_auc > best_auc:
        best_auc = current_auc
        best_params = params

print(f"\nBest AUC found: {best_auc:.4f}")
print(f"Best Parameters: {best_params}")

Starting Manual Hyperparameter Search...
Trial 1: AUC = 0.9562 with {'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 5, 'random_strength': 2}
Trial 2: AUC = 0.9552 with {'learning_rate': 0.01, 'depth': 8, 'l2_leaf_reg': 5, 'random_strength': 1}
Trial 3: AUC = 0.9548 with {'learning_rate': 0.01, 'depth': 6, 'l2_leaf_reg': 7, 'random_strength': 2}


KeyboardInterrupt: 

In [12]:
# Final model: the hyperparameters picked by the search plus a long iteration
# budget. dict(...) raises TypeError on a duplicated keyword, exactly as
# passing **best_params straight to the constructor would.
final_cat_params = dict(
    iterations=8000,
    **best_params,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    use_best_model=True,
    verbose=500,
)
final_cat_model = CatBoostClassifier(**final_cat_params)

# Validation embeddings drive early stopping and best-iteration selection.
final_cat_model.fit(
    X_train_emb,
    y_train_emb,
    eval_set=(X_val_emb, y_val_emb),
    early_stopping_rounds=300,
)

# Column 1 of predict_proba is used as the submitted score.
final_preds = final_cat_model.predict_proba(X_test_emb)[:, 1]
submission_optimized = pd.DataFrame({'id': test_df['id']}).assign(
    **{'Heart Disease': final_preds}
)
submission_optimized.to_csv('submission_cat_optimized.csv', index=False)
print("\nOptimized CatBoost Submission Saved!")

0:	test: 0.9364097	best: 0.9364097 (0)	total: 36.3ms	remaining: 4m 50s
500:	test: 0.9561880	best: 0.9561880 (500)	total: 16.4s	remaining: 4m 4s
1000:	test: 0.9562405	best: 0.9562451 (847)	total: 32.3s	remaining: 3m 46s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.956245118
bestIteration = 847

Shrink model to first 848 iterations.

Optimized CatBoost Submission Saved!
