In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, QuantileTransformer
from sklearn.metrics import roc_auc_score
import copy
import gc

# Run on the GPU when PyTorch reports one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

def set_seed(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator,
    and PyTorch (plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import keeps this cell runnable without touching the import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

Using device: cpu


In [3]:
# Single place to repoint the notebook at a different copy of the competition files.
DATA_DIR = '/home/saswat-balyan/devStuff/DiseasePredict/playground-series-s6e2'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Encode the string target as 0/1. Series.map yields NaN for any value that is
# not a key of the mapping, so fail loudly here instead of carrying NaN targets
# into training.
target_mapping = {'Absence': 0, 'Presence': 1}
encoded_target = train_df['Heart Disease'].map(target_mapping)
if encoded_target.isna().any():
    unexpected_labels = train_df.loc[encoded_target.isna(), 'Heart Disease'].unique()
    raise ValueError(
        f"Unexpected values in 'Heart Disease' (cannot map to 0/1): {list(unexpected_labels)}"
    )
train_df['Heart Disease'] = encoded_target

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (630000, 15)
Test shape: (270000, 14)


In [7]:
def feature_engineering(df, log_shifts=None):
    """Return a copy of ``df`` with ratio, product, log and age-bin features added.

    Added columns:
        * ``MaxHR_Age_Ratio``, ``Chol_Age_Ratio``, ``BP_Age_Ratio``: the vital
          divided by ``Age`` (a small epsilon guards against division by zero).
        * ``Rate_Pressure_Product``: ``Max HR * BP``.
        * ``Log_<col>`` for Cholesterol, BP, Max HR and ST depression:
          ``log1p(col + shift)``.
        * ``Age_Bin``: 0 for Age <= 45, 1 for 45 < Age <= 60, 2 for Age > 60.

    Args:
        df: DataFrame containing the columns 'Age', 'Max HR', 'Cholesterol',
            'BP' and 'ST depression'. It is not modified.
        log_shifts: Optional dict mapping a column name to the shift to add
            before ``log1p``. Columns missing from the dict (or all columns when
            this is None) use a shift derived from this frame's own minimum:
            ``abs(min) + 1`` when the minimum is <= 0, else 0. Because that
            default is data-dependent, pass the shifts computed on the training
            frame when transforming other frames if the two must match exactly.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df.copy()

    age_denominator = df['Age'] + 1e-5
    df['MaxHR_Age_Ratio'] = df['Max HR'] / age_denominator
    df['Chol_Age_Ratio'] = df['Cholesterol'] / age_denominator
    df['BP_Age_Ratio'] = df['BP'] / age_denominator

    df['Rate_Pressure_Product'] = df['Max HR'] * df['BP']

    for col in ['Cholesterol', 'BP', 'Max HR', 'ST depression']:
        if log_shifts is not None and col in log_shifts:
            shift = log_shifts[col]
        else:
            min_val = df[col].min()
            shift = abs(min_val) + 1 if min_val <= 0 else 0
        df[f'Log_{col}'] = np.log1p(df[col] + shift)

    # Open-ended outer edges: with the previous [0, 45, 60, 100] bins any age
    # above 100 (or <= 0) fell outside every bin, became NaN, and made the
    # integer cast raise. Bin membership for ages in (0, 100] is unchanged.
    df['Age_Bin'] = pd.cut(
        df['Age'], bins=[-np.inf, 45, 60, np.inf], labels=[0, 1, 2]
    ).astype(int)

    return df

# Engineer features on train and test separately (train first, then test).
train_eng, test_eng = (feature_engineering(frame) for frame in (train_df, test_df))

# Everything except the row id and the label is a model input.
target_col = 'Heart Disease'
ignore_cols = ['id', target_col]
feature_cols = list(filter(lambda name: name not in ignore_cols, train_eng.columns))

# Split inputs by cardinality in the training frame: fewer than 10 distinct
# values is treated as categorical, everything else as numerical.
cat_cols, num_cols = [], []
for col in feature_cols:
    if train_eng[col].nunique() < 10:
        cat_cols.append(col)
    else:
        num_cols.append(col)

print(f"Categorical features ({len(cat_cols)}): {cat_cols}")
print(f"Numerical features ({len(num_cols)}): {num_cols}")

Categorical features (9): ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results', 'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Age_Bin']
Numerical features (13): ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression', 'MaxHR_Age_Ratio', 'Chol_Age_Ratio', 'BP_Age_Ratio', 'Rate_Pressure_Product', 'Log_Cholesterol', 'Log_BP', 'Log_Max HR', 'Log_ST depression']


In [8]:
# Stack train and test feature rows so the encoders and the scaler are fitted
# on the union of both frames. The first `n_train` rows are the training rows.
# NOTE: train_eng / test_eng are overwritten in place below, so re-running this
# cell fits on already-transformed values.
n_train = len(train_eng)
all_data = pd.concat([train_eng[feature_cols], test_eng[feature_cols]], axis=0)

# Integer-encode each categorical column and remember its number of classes
# (used later to size the embedding tables).
cat_dims = []
for col in cat_cols:
    le = LabelEncoder()
    codes = le.fit_transform(all_data[col])
    all_data[col] = codes
    train_eng[col] = codes[:n_train]
    test_eng[col] = codes[n_train:]
    cat_dims.append(len(le.classes_))

# Map every numerical column onto a normal distribution via its quantiles.
scaler = QuantileTransformer(output_distribution='normal', random_state=42)
scaled = scaler.fit_transform(all_data[num_cols])
all_data[num_cols] = scaled

train_eng[num_cols] = scaled[:n_train]
test_eng[num_cols] = scaled[n_train:]

# Training tensors: integer category codes, float features, and a column-vector target.
X_cat = torch.tensor(train_eng[cat_cols].values, dtype=torch.long)
X_num = torch.tensor(train_eng[num_cols].values, dtype=torch.float32)
y = torch.tensor(train_eng[target_col].values, dtype=torch.float32).unsqueeze(1)

# Test tensors, built the same way (no target available).
X_test_cat = torch.tensor(test_eng[cat_cols].values, dtype=torch.long)
X_test_num = torch.tensor(test_eng[num_cols].values, dtype=torch.float32)

In [9]:
class TabM_Mini(nn.Module):
    """MLP over embedded categorical and raw numerical features with k ensemble members.

    All members share the input projection, backbone and head. They differ only
    through a learned per-member scale and bias applied to the projected input.

    Args:
        cat_dims: Number of classes for each categorical column, in column order.
        num_features: Number of numerical input columns.
        k: Number of ensemble members.
        d_model: Hidden width of the projection and backbone.
        depth: Number of Linear -> ReLU -> BatchNorm1d -> Dropout stages.
        dropout: Dropout probability used in every backbone stage.
    """

    def __init__(self, cat_dims, num_features, k=16, d_model=128, depth=3, dropout=0.1):
        super().__init__()
        self.k = k

        # One embedding table per categorical column; width is half the
        # cardinality (rounded up), capped at 50.
        tables = []
        for cardinality in cat_dims:
            width = min(50, (cardinality + 1) // 2)
            tables.append(nn.Embedding(cardinality, width))
        self.cat_embeddings = nn.ModuleList(tables)
        cat_emb_size = sum(table.embedding_dim for table in self.cat_embeddings)

        self.input_proj = nn.Linear(cat_emb_size + num_features, d_model)
        self.bn_input = nn.BatchNorm1d(d_model)

        # Per-member affine adapters, initialised to the identity (scale 1, bias 0).
        self.ensemble_scale = nn.Parameter(torch.ones(1, k, d_model))
        self.ensemble_bias = nn.Parameter(torch.zeros(1, k, d_model))

        stages = []
        for _ in range(depth):
            stages.extend([
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.BatchNorm1d(d_model),
                nn.Dropout(dropout),
            ])
        self.backbone = nn.Sequential(*stages)

        self.head = nn.Linear(d_model, 1)

    def forward(self, x_cat, x_num):
        """Return one logit per ensemble member, shaped (batch, k).

        Args:
            x_cat: Integer category codes; column i is looked up in embedding table i.
            x_num: Numerical features, concatenated after the embeddings along dim 1.
        """
        n_rows = x_num.size(0)

        looked_up = [table(x_cat[:, idx]) for idx, table in enumerate(self.cat_embeddings)]
        embedded = torch.cat(looked_up, dim=1)
        features = torch.cat([embedded, x_num], dim=1)

        hidden = self.bn_input(self.input_proj(features))

        # Broadcast (batch, 1, d) against the (1, k, d) adapters -> (batch, k, d).
        members = hidden.unsqueeze(1) * self.ensemble_scale + self.ensemble_bias

        # Fold the member axis into the batch axis so the shared backbone
        # processes every (row, member) pair as an independent sample.
        stacked = members.reshape(n_rows * self.k, -1)
        logits = self.head(self.backbone(stacked))

        return logits.view(n_rows, self.k)

In [10]:
def _predict_mean_proba(model, loader):
    """Return the member-averaged sigmoid probability for every row served by ``loader``.

    The first two tensors of each batch are used as (categorical, numerical)
    inputs; any further tensors in the batch (e.g. targets) are ignored. The
    model is switched to eval mode and gradients are disabled.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            bc, bn = batch[0].to(device), batch[1].to(device)
            chunks.append(torch.sigmoid(model(bc, bn)).mean(dim=1).cpu().numpy())
    if not chunks:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(chunks)


def train_tabm(X_cat, X_num, y, X_test_cat, X_test_num, k_ensemble=16, folds=5,
               epochs=50, patience=10, batch_size=256, verbose=True):
    """Train TabM_Mini with stratified K-fold CV and return OOF and test predictions.

    For each fold a fresh model is trained with AdamW and BCE-with-logits loss
    (the target is broadcast to all ensemble members), the learning rate is
    halved when validation AUC plateaus, and training stops early after
    ``patience`` epochs without improvement. The best-AUC weights are restored
    before predicting on the test set.

    Reads the module-level names ``cat_dims`` (embedding cardinalities) and
    ``device``.

    Args:
        X_cat: Long tensor of categorical codes for the training rows.
        X_num: Float tensor of numerical features for the training rows.
        y: Float target tensor; it is expanded along dim 1, so it is expected
            to have shape (n, 1).
        X_test_cat: Categorical codes for the test rows.
        X_test_num: Numerical features for the test rows.
        k_ensemble: Number of ensemble members in the model.
        folds: Number of stratified folds.
        epochs: Maximum number of epochs per fold.
        patience: Early-stopping patience, in epochs without AUC improvement.
        batch_size: Training mini-batch size.
        verbose: When True, print train loss and validation AUC every epoch.

    Returns:
        (oof_preds, test_preds): arrays of shape (n_train, 1) and (n_test, 1).
        ``oof_preds`` holds each row's prediction from the fold that held it
        out; ``test_preds`` is the mean over folds.

    Raises:
        RuntimeError: If no epoch in a fold produced a usable validation AUC
            (e.g. ``epochs`` is 0 or the AUC is NaN).
    """
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    test_preds = np.zeros((len(X_test_num), 1))
    oof_preds = np.zeros((len(X_num), 1))

    test_loader = DataLoader(TensorDataset(X_test_cat, X_test_num), batch_size=2048, shuffle=False)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_num, y)):
        print(f"\n--- Fold {fold+1} / {folds} ---")

        train_dataset = TensorDataset(X_cat[train_idx], X_num[train_idx], y[train_idx])
        val_dataset = TensorDataset(X_cat[val_idx], X_num[val_idx], y[val_idx])

        # BatchNorm1d raises in training mode when it is given a single row, so
        # drop the trailing batch only in the one case where it would hold 1 row.
        drop_tail = len(train_dataset) % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_tail)
        val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

        # The validation loader is not shuffled, so its targets can be taken
        # once, in order, instead of being re-collected every epoch.
        y_val = y[val_idx].numpy().ravel()

        model = TabM_Mini(
            cat_dims=cat_dims,
            num_features=X_num.shape[1],
            k=k_ensemble,
            d_model=256,
            depth=3,
            dropout=0.2
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
        criterion = nn.BCEWithLogitsLoss()

        # Start below any reachable AUC so the first valid epoch is always kept.
        best_auc = -np.inf
        best_model_state = None
        best_val_preds = None
        counter = 0

        for epoch in range(epochs):
            model.train()
            total_loss = 0.0
            n_batches = 0

            for bc, bn, by in train_loader:
                bc, bn, by = bc.to(device), bn.to(device), by.to(device)

                optimizer.zero_grad()

                preds_k = model(bc, bn)
                # Every ensemble member is trained against the same label.
                loss = criterion(preds_k, by.expand(-1, k_ensemble))

                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                n_batches += 1

            val_preds = _predict_mean_proba(model, val_loader)
            val_auc = roc_auc_score(y_val, val_preds)

            scheduler.step(val_auc)

            if verbose:
                mean_loss = total_loss / max(n_batches, 1)
                print(f"Epoch {epoch+1:02d} | train loss {mean_loss:.4f} | val AUC {val_auc:.5f}")

            if val_auc > best_auc:
                best_auc = val_auc
                best_model_state = copy.deepcopy(model.state_dict())
                # Keep this epoch's predictions: they are exactly what the
                # restored best model would produce on the validation rows.
                best_val_preds = val_preds
                counter = 0
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break

        if best_model_state is None:
            raise RuntimeError(
                f"Fold {fold+1}: no epoch produced a valid validation AUC; nothing to restore."
            )

        print(f"Fold {fold+1} Best AUC: {best_auc:.4f}")

        model.load_state_dict(best_model_state)

        oof_preds[val_idx] = best_val_preds.reshape(-1, 1)
        test_preds += _predict_mean_proba(model, test_loader).reshape(-1, 1) / folds

    return oof_preds, test_preds

# Train with 32 ensemble members, then score the out-of-fold predictions
# against the full training target.
oof, test_pred_final = train_tabm(
    X_cat, X_num, y,
    X_test_cat, X_test_num,
    k_ensemble=32,
)
overall_cv_auc = roc_auc_score(y.numpy(), oof)
print(f"Overall CV AUC: {overall_cv_auc:.5f}")


--- Fold 1 / 5 ---


KeyboardInterrupt: 

In [None]:
# Pair every test id with its fold-averaged predicted probability.
submission = pd.DataFrame({'id': test_df['id']})
submission['Heart Disease'] = test_pred_final.flatten()

# Write the file without the DataFrame index, then show the first rows.
submission.to_csv('submission_tabm.csv', index=False)
print("Submission file saved as 'submission_tabm.csv'")
print(submission.head())