In [3]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from xgboost import XGBClassifier

# Hyper-parameters and runtime settings shared by the cells below.
CONFIG = {
    'seed': 42,            # RNG seed applied right after the config is defined
    'batch_size': 1024,    # mini-batch size used by every DataLoader
    'lr': 1e-3,            # AdamW learning rate
    'weight_decay': 1e-4,  # AdamW weight decay
    'epochs': 50,          # upper bound on training epochs
    'patience': 10,        # epochs without validation improvement before stopping
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

# Column groups of the input table; each group is routed differently by the
# model: periodic embeddings for 'continuous', piecewise-linear embeddings for
# 'ordinal', and 'binary' columns are concatenated unchanged.
FEATURES = {
    'continuous': ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression'],
    'ordinal': ['Chest pain type', 'EKG results', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
    'binary': ['Sex', 'FBS over 120', 'Exercise angina'],
    'target': 'Heart Disease'
}

# Weight initialisation and DataLoader shuffling draw from torch's global RNG;
# seeding here makes a Restart & Run All reproduce the same run.
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])

In [4]:
class PeriodicEmbedding(nn.Module):
    """Embed a scalar feature through learnable sine/cosine frequencies.

    The input is multiplied by ``2 * pi * c`` (``c`` being ``frequency_num``
    trainable coefficients initialised as ``randn * sigma``), expanded into
    sine and cosine components, and projected to ``output_dim`` by a linear
    layer followed by ReLU.
    """

    def __init__(self, frequency_num=16, output_dim=8, sigma=0.1):
        super().__init__()
        self.k = frequency_num
        # Trainable frequency coefficients; created before the linear layer so
        # the order of random draws is fixed.
        self.c = nn.Parameter(sigma * torch.randn(frequency_num))
        # sin and cos halves are concatenated, hence twice the frequency count.
        self.linear = nn.Linear(2 * frequency_num, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the ReLU-activated projection of the periodic features of ``x``.

        ``x`` is broadcast against the frequency vector and the sin/cos parts
        are joined along dim 1 (the model in this file passes a
        ``(batch, 1)`` column).
        """
        phase = 2 * math.pi * self.c * x
        periodic = torch.cat((phase.sin(), phase.cos()), dim=1)
        return self.relu(self.linear(periodic))

class PiecewiseLinearEmbedding(nn.Module):
    """Encode a scalar by how far it fills each bin, then project linearly.

    ``bin_edges`` is stored as a buffer; consecutive edges define
    ``len(bin_edges) - 1`` bins. For every bin the input is mapped to
    ``(x - left_edge) / (width + 1e-6)`` clamped to ``[0, 1]``, and the
    resulting vector is passed through a linear layer of size ``output_dim``.
    """

    def __init__(self, bin_edges, output_dim=4):
        super().__init__()
        self.register_buffer('bin_edges', bin_edges)
        self.linear = nn.Linear(len(bin_edges) - 1, output_dim)

    def forward(self, x):
        """Return the linear projection of the per-bin fill fractions of ``x``."""
        left, right = self.bin_edges[:-1], self.bin_edges[1:]
        # The small epsilon keeps the division finite for zero-width bins.
        fill = (x - left) / ((right - left) + 1e-6)
        return self.linear(fill.clamp(0.0, 1.0))

In [5]:
class HeartDataset(Dataset):
    """Dataset serving one table row as a dict of float32 tensors.

    Args:
        df: DataFrame containing the columns named in ``feature_groups``.
        feature_groups: Mapping with the keys ``'continuous'``, ``'ordinal'``
            and ``'binary'`` (lists of column names) and ``'target'`` (a single
            column name).

    When the target column is absent from ``df`` (e.g. a test set), labels are
    all-zero placeholders so every item still has the same structure.
    """

    def __init__(self, df, feature_groups):
        self.df = df
        self.feats = feature_groups

        # Materialise each feature group once as a float32 array.
        self.cont_data = df[self.feats['continuous']].values.astype(np.float32)
        self.ord_data = df[self.feats['ordinal']].values.astype(np.float32)
        self.bin_data = df[self.feats['binary']].values.astype(np.float32)

        target = self.feats['target']
        if target in df.columns:
            self.labels = df[target].values.astype(np.float32).reshape(-1, 1)
        else:
            # Explicit float32: np.zeros defaults to float64, which would give
            # unlabelled data a different label dtype than labelled data.
            self.labels = np.zeros((len(df), 1), dtype=np.float32)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as tensors keyed 'cont', 'ord', 'bin', 'label'."""
        return {
            'cont': torch.tensor(self.cont_data[idx]),
            'ord': torch.tensor(self.ord_data[idx]),
            'bin': torch.tensor(self.bin_data[idx]),
            'label': torch.tensor(self.labels[idx])
        }

def prepare_data(data_dir=None):
    """Load the CSVs, split train/validation and derive ordinal bin edges.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``. When
            ``None`` the original hard-coded location is used, so existing
            calls without arguments behave as before.

    Returns:
        Tuple ``(train_df, val_df, test_df, ordinal_edges)`` where
        ``ordinal_edges`` maps each ordinal column name to a float32 tensor of
        strictly increasing bin edges computed on the training split only.

    Raises:
        ValueError: If the target column holds a value other than
            ``'Absence'`` or ``'Presence'`` (including missing values).
    """
    if data_dir is None:
        # Legacy default kept for backward compatibility; pass data_dir to run
        # the notebook on another machine.
        data_dir = 'C:\\Users\\Saswat Balyan\\dev\\Predicting-Heart-Disease-Playground-Series-S6ep2\\playground-series-s6e2'
    data_dir = Path(data_dir)

    train_full = pd.read_csv(data_dir / 'train.csv')
    test_df = pd.read_csv(data_dir / 'test.csv')

    target = FEATURES['target']
    encoded = train_full[target].map({'Absence': 0, 'Presence': 1})
    if encoded.isna().any():
        # .map() turns unknown values into NaN; fail loudly instead of
        # training on NaN labels.
        unexpected = sorted(train_full.loc[encoded.isna(), target].astype(str).unique())
        raise ValueError(f"Unexpected values in target column {target!r}: {unexpected}")
    train_full[target] = encoded

    train_df, val_df = train_test_split(
        train_full, test_size=0.2, random_state=42, stratify=train_full[target]
    )

    ordinal_edges = {}
    for col in FEATURES['ordinal']:
        # Nine quantile points describe up to eight bins; np.unique removes the
        # duplicates that appear when many rows share a value.
        edges = np.unique(np.quantile(train_df[col].dropna(), np.linspace(0, 1, 9)))
        if len(edges) < 2:
            # A constant column collapses to one edge, i.e. zero bins; widen it
            # to a single unit-width bin so the embedding keeps one input.
            edges = np.array([edges[0], edges[0] + 1.0])
        ordinal_edges[col] = torch.tensor(edges, dtype=torch.float32)

    return train_df, val_df, test_df, ordinal_edges

# Load the raw tables and the per-column bin edges for the ordinal features.
train_df, val_df, test_df, ordinal_edges = prepare_data()

# Wrap each split in a Dataset that serves float32 tensors per row.
train_dataset = HeartDataset(df=train_df, feature_groups=FEATURES)
val_dataset = HeartDataset(df=val_df, feature_groups=FEATURES)
test_dataset = HeartDataset(df=test_df, feature_groups=FEATURES)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
)

In [6]:
class TabularHeartModel(nn.Module):
    """Per-feature embeddings followed by a small MLP producing one logit.

    Continuous columns each get a ``PeriodicEmbedding``, ordinal columns each
    get a ``PiecewiseLinearEmbedding`` built from their bin edges, and binary
    columns are concatenated unchanged. Column names come from the
    module-level ``FEATURES`` mapping.

    Args:
        ordinal_edges_dict: Mapping from ordinal column name to its bin-edge
            tensor; must contain every name in ``FEATURES['ordinal']``.
        cont_dim: Output width of each continuous embedding.
        ord_dim: Output width of each ordinal embedding.
        hidden_dim: Width of the MLP hidden layer.
    """

    def __init__(self, ordinal_edges_dict, cont_dim=8, ord_dim=4, hidden_dim=256):
        super().__init__()

        self.cont_embeddings = nn.ModuleDict()
        for feat in FEATURES['continuous']:
            self.cont_embeddings[feat] = PeriodicEmbedding(frequency_num=16, output_dim=cont_dim, sigma=0.1)

        self.ord_embeddings = nn.ModuleDict()
        for feat in FEATURES['ordinal']:
            edges = ordinal_edges_dict[feat]
            self.ord_embeddings[feat] = PiecewiseLinearEmbedding(bin_edges=edges, output_dim=ord_dim)

        # Derived from the same widths handed to the embeddings above, so the
        # MLP input size cannot drift out of sync with them.
        input_dim = (len(FEATURES['continuous']) * cont_dim) + \
                    (len(FEATURES['ordinal']) * ord_dim) + \
                    len(FEATURES['binary'])

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_cont, x_ord, x_bin):
        """Return ``(features, logits)`` for a batch.

        Column ``i`` of ``x_cont`` / ``x_ord`` is matched to the ``i``-th name
        in ``FEATURES['continuous']`` / ``FEATURES['ordinal']``. ``features``
        is the concatenation of all embeddings and ``x_bin`` along dim 1;
        ``logits`` is the MLP output for those features.
        """
        embeddings = []
        for i, feat_name in enumerate(FEATURES['continuous']):
            # Slice with i:i+1 to keep the column dimension for broadcasting.
            val = x_cont[:, i:i+1]
            embeddings.append(self.cont_embeddings[feat_name](val))

        for i, feat_name in enumerate(FEATURES['ordinal']):
            val = x_ord[:, i:i+1]
            embeddings.append(self.ord_embeddings[feat_name](val))

        embeddings.append(x_bin)
        x = torch.cat(embeddings, dim=1)
        return x, self.mlp(x)

In [7]:
# Train the embedding network with early stopping on validation loss.
model = TabularHeartModel(ordinal_edges).to(CONFIG['device'])
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

best_val_loss = float('inf')
patience_counter = 0

print("Training Embeddings...")

for epoch in range(CONFIG['epochs']):
    model.train()
    for batch in train_loader:
        b_cont = batch['cont'].to(CONFIG['device'])
        b_ord = batch['ord'].to(CONFIG['device'])
        b_bin = batch['bin'].to(CONFIG['device'])
        labels = batch['label'].to(CONFIG['device'])

        optimizer.zero_grad()
        _, logits = model(b_cont, b_ord, b_bin)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0.0
    n_val_samples = 0
    with torch.no_grad():
        for batch in val_loader:
            b_cont = batch['cont'].to(CONFIG['device'])
            b_ord = batch['ord'].to(CONFIG['device'])
            b_bin = batch['bin'].to(CONFIG['device'])
            labels = batch['label'].to(CONFIG['device'])
            _, logits = model(b_cont, b_ord, b_bin)
            # The criterion returns a per-batch mean; weight it by batch size
            # so a short final batch does not count as much as a full one.
            batch_size = labels.size(0)
            val_loss += criterion(logits, labels).item() * batch_size
            n_val_samples += batch_size

    # max(..., 1) only guards against an empty validation loader.
    avg_val_loss = val_loss / max(n_val_samples, 1)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= CONFIG['patience']:
            break

# Restore the best checkpoint only if this run actually wrote one; otherwise a
# stale file from an earlier run could be loaded (or the load could fail).
if best_val_loss < float('inf'):
    # weights_only=True restricts unpickling to tensors/containers, which is
    # all a state_dict holds; map_location keeps the load device-independent.
    model.load_state_dict(
        torch.load('best_model.pth', map_location=CONFIG['device'], weights_only=True)
    )
print("Embedding Training Complete.")

Training Embeddings...
Embedding Training Complete.


  model.load_state_dict(torch.load('best_model.pth'))


In [9]:
def extract_embeddings(loader, model, device):
    """Run ``model`` over ``loader`` and collect its feature output and labels.

    The model is switched to eval mode and called without gradient tracking.
    Each batch must provide the keys 'cont', 'ord', 'bin' and 'label'; the
    first element of the model's return value is kept as the feature matrix.

    Returns:
        ``(features, labels)`` as numpy arrays: features stacked row-wise over
        all batches, labels stacked and flattened to 1-D.
    """
    model.eval()
    feature_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in loader:
            # Move the three input groups to the target device, in call order.
            inputs = [batch[key].to(device) for key in ('cont', 'ord', 'bin')]
            features, _ = model(*inputs)
            feature_chunks.append(features.cpu().numpy())
            label_chunks.append(batch['label'].numpy())

    stacked_features = np.vstack(feature_chunks)
    flat_labels = np.vstack(label_chunks).ravel()
    return stacked_features, flat_labels

# Turn every split into the network's concatenated feature vectors.
# Features and labels are collected from the same batches, so they stay
# aligned even though train_loader shuffles.
X_train_emb, y_train_emb = extract_embeddings(
    loader=train_loader, model=model, device=CONFIG['device']
)
X_val_emb, y_val_emb = extract_embeddings(
    loader=val_loader, model=model, device=CONFIG['device']
)
X_test_emb, _ = extract_embeddings(
    loader=test_loader, model=model, device=CONFIG['device']
)

# Gradient-boosted trees on the extracted features, early-stopped on the
# validation log-loss.
# NOTE(review): the recorded run printed best_iteration 999, the last of the
# 1000 rounds, which suggests early stopping never fired; consider a larger
# n_estimators.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    early_stopping_rounds=50,
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
)

xgb_model.fit(
    X_train_emb,
    y_train_emb,
    eval_set=[(X_val_emb, y_val_emb)],
    verbose=False,
)

print(f"XGBoost Best Iteration: {xgb_model.best_iteration}")

XGBoost Best Iteration: 999


In [10]:
# Column 1 of predict_proba is the probability of the class encoded as 1.
preds = xgb_model.predict_proba(X_test_emb)[:, 1]

# Two-column submission: the test ids plus the predicted probability.
submission = test_df[['id']].assign(**{'Heart Disease': preds})

submission.to_csv('submission_XGB.csv', index=False)
print(submission.head())

       id  Heart Disease
0  630000       0.929111
1  630001       0.009582
2  630002       0.981713
3  630003       0.006812
4  630004       0.201415
