In [None]:
!pip install pytabkit -q
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from pytabkit import TabM_D_Classifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# ---------------------------------------------------------------------------
# Run configuration and data loading.
# ---------------------------------------------------------------------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
RANDOM_STATE = 42
N_FOLDS = 5

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

target_col = 'Heart Disease'

# Textual class labels and the integer codes they are encoded to.
TARGET_LABELS = {'Presence': 1, 'Absence': 0}


def _encode_target(labels, source):
    """Return ``labels`` as integer codes, leaving numeric input untouched.

    Args:
        labels: Target column as a pandas Series.
        source: Short name of the data set, used only in the diagnostic
            message.

    Returns:
        The original Series when it is already numeric, otherwise an ``int``
        Series built from ``TARGET_LABELS``. Values that are missing or not in
        ``TARGET_LABELS`` are encoded as 0, and their count is printed so the
        fallback does not go unnoticed.
    """
    # is_numeric_dtype (rather than ``dtype == 'object'``) also routes pandas
    # string / category dtypes through the label mapping.
    if pd.api.types.is_numeric_dtype(labels):
        return labels

    encoded = labels.map(TARGET_LABELS)
    n_unknown = int(encoded.isna().sum())
    if n_unknown:
        print(
            f"[warn] {source}: {n_unknown} target value(s) missing or not in "
            f"{sorted(TARGET_LABELS)}; encoded as 0"
        )
    return encoded.fillna(0).astype(int)


train[target_col] = _encode_target(train[target_col], "train.csv")

# The original (non-synthetic) data set is optional: when it cannot be used,
# ``original`` is an empty frame.
try:
    original = pd.read_csv("Heart_Disease_Prediction.csv")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    # Only "file unavailable / unreadable" is treated as "no reference data";
    # anything else (including KeyboardInterrupt) propagates.
    print(f"[info] Heart_Disease_Prediction.csv not used: {err}")
    original = pd.DataFrame()
else:
    if target_col in original.columns:
        original[target_col] = _encode_target(
            original[target_col], "Heart_Disease_Prediction.csv"
        )
    else:
        # Without the target column no target statistics can be derived.
        print(
            f"[info] Heart_Disease_Prediction.csv has no '{target_col}' column; "
            "ignoring it"
        )
        original = pd.DataFrame()

# Every training column except the target and the row identifier.
base_features = [col for col in train.columns if col not in [target_col, 'id']]

def add_engineered_features_safe(df, original, base_features, target=None,
                                 max_cols=3, fill_value=-999):
    """Append per-value target means learned from ``original`` to a copy of ``df``.

    For each selected column, the mean of the target is computed per distinct
    value of that column in ``original`` and looked up for every row of
    ``df``. The result is stored in a new column named ``orig_<col>_mean``.

    Args:
        df: Frame to enrich. It is not modified.
        original: Reference frame that contains the target column.
        base_features: Candidate column names, in priority order. Only those
            that are numeric in ``original`` are considered.
        target: Name of the target column in ``original``. ``None`` (the
            default) falls back to the module-level ``target_col``.
        max_cols: Maximum number of candidate columns to encode; ``None``
            encodes all of them.
        fill_value: Replacement for every missing value in the returned
            frame, including values of ``df`` unseen in ``original``.

    Returns:
        A new DataFrame with the index of ``df``, the added ``orig_*_mean``
        columns appended in candidate order, and no missing values.

    Raises:
        KeyError: If ``target`` is not a column of ``original`` (raised by the
            group-by) or a selected column is missing from ``df``.
    """
    if target is None:
        # Backward-compatible default for existing three-argument callers.
        target = target_col

    df_temp = df.copy()
    orig_numeric = set(original.select_dtypes(include=[np.number]).columns)
    base_numeric = [col for col in base_features if col in orig_numeric]

    for col in base_numeric[:max_cols]:
        # A column with a single distinct value yields one group only and
        # therefore carries no information about the target.
        if original[col].nunique(dropna=False) <= 1:
            continue

        means = original.groupby(col)[target].mean()
        # Series.map keeps the caller's index and row order, and overwrites
        # the feature on a repeated call instead of creating _x/_y duplicates.
        df_temp[f"orig_{col}_mean"] = df_temp[col].map(means)

    return df_temp.fillna(fill_value)

# Enrich both splits with statistics from the reference data, but only when
# that data actually contains rows.
if original.shape[0] > 0:
    train, test = (
        add_engineered_features_safe(train, original, base_features),
        add_engineered_features_safe(test, original, base_features),
    )

# Model inputs: every training column except the row id and the target.
non_feature_names = {'id', target_col}
feature_cols = [name for name in train.columns if name not in non_feature_names]

# Force each feature to a numeric dtype; anything unparseable or missing
# becomes the -999 sentinel.
for frame in (train, test):
    for name in feature_cols:
        as_number = pd.to_numeric(frame[name], errors='coerce')
        frame[name] = as_number.fillna(-999)

X, y = train[feature_cols], train[target_col]
X_test = test[feature_cols]

# Keyword arguments forwarded unchanged to every TabM_D_Classifier instance.
tabm_params = dict(
    # Architecture.
    arch_type='tabm-mini-normal',
    tabm_k=16,
    num_emb_type='pwl',
    d_embedding=12,
    # Optimisation.
    batch_size=256,
    lr=1e-3,
    n_epochs=80,
    # Network size and regularisation.
    dropout=0.1,
    d_block=128,
    n_blocks=4,
    weight_decay=1e-3,
    # Logging and stopping.
    verbosity=0,
    patience=15,
)

# ---------------------------------------------------------------------------
# Stratified K-fold training: collect out-of-fold (OOF) probabilities for the
# training rows and the fold-averaged probabilities for the test rows.
# ---------------------------------------------------------------------------
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
fold_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Seed the model as well as the splitter so that reruns reproduce the same
    # scores; offsetting by the fold number gives each fold its own seed.
    model = TabM_D_Classifier(
        device=DEVICE,
        random_state=RANDOM_STATE + fold,
        **tabm_params,
    )
    model.fit(X_tr, y_tr)

    # Column 1 of predict_proba is taken as the probability of class 1.
    val_probs = model.predict_proba(X_val)[:, 1]
    test_probs = model.predict_proba(X_test)[:, 1]

    oof_preds[val_idx] = val_probs
    # Equal-weight average of the per-fold test predictions.
    test_preds += test_probs / N_FOLDS

    score = roc_auc_score(y_val, val_probs)
    fold_scores.append(score)
    print(f"Fold {fold + 1}/{N_FOLDS} AUC: {score:.5f}")

    if DEVICE == 'cuda':
        torch.cuda.empty_cache()

# Mean of the per-fold AUCs versus the AUC of the pooled OOF predictions.
cv_mean = np.mean(fold_scores)
oof_auc = roc_auc_score(y, oof_preds)
print(f"CV mean AUC: {cv_mean:.5f} (std {np.std(fold_scores):.5f}) | OOF AUC: {oof_auc:.5f}")

# NOTE(review): clipping to [0.001, 0.999] turns every prediction beyond the
# bounds into a tie; for a rank-based metric such as ROC AUC that cannot help
# and may cost a little - confirm it is intended.
submission = pd.DataFrame({
    'id': test['id'],
    target_col: np.clip(test_preds, 0.001, 0.999)
})
submission.to_csv('submission_tabm_clean.csv', index=False)

# Unclipped OOF probabilities, keyed by training row id.
oof_df = pd.DataFrame({
    'id': train['id'],
    'Heart_Disease_prob': oof_preds
})
oof_df.to_csv('oof_tabm_clean.csv', index=False)
