[PyTorch Frame](https://github.com/pyg-team/pytorch-frame) is a deep learning extension for PyTorch, designed for heterogeneous tabular data with different column types. [Trompt](https://arxiv.org/abs/2305.18446) is a novel tabular neural-network architecture inspired by prompt learning in language models.

## Setup Libraries

In [None]:
%%time
!pip install -q pytorch_frame

In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

import torch
import torch.optim as optim
import torch.nn.functional as F

import torch_frame
from torch_frame import stype
from torch_frame.data.loader import DataLoader
from torch_frame.nn.models.trompt import Trompt
from torch_frame.data.stats import compute_col_stats
from torch_frame.data import DataFrameToTensorFrameConverter

# Report the installed library versions alongside the notebook output.
print("PyTorch       version:", torch.__version__)
print("PyTorch Frame version:", torch_frame.__version__)

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy, and PyTorch
    (CPU and all CUDA devices) with ``seed``, then switches cuDNN to its
    deterministic mode with auto-tuning disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding entry point takes the same integer.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Favor reproducible cuDNN kernels over auto-tuned ones.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


seed_everything(seed=42)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Load Data

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv"),
    pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv"),
)

# Quick sanity check on the (rows, columns) of each table.
print("Train shape:", train.shape)
print("Test shape :", test.shape)

## Preprocess Features

In [None]:
%%time
# Name of the label column.
TARGET = 'loan_paid_back'

# Separate ids, labels and features; the raw frames are not needed afterwards.
train_id = train['id']
y = train[TARGET]
X = train.drop(columns=['id', TARGET])
X_test = test.drop(columns=['id'])
del train, test
print("X      shape:", X.shape)
print("X_test shape:", X_test.shape, "\n")

# object-dtype columns are treated as categorical, all remaining columns as numerical.
cat_cols = list(X.select_dtypes(include=['object']).columns)
num_cols = list(X.select_dtypes(exclude=['object']).columns)
print("len(cat_cols):", len(cat_cols))
print("len(num_cols):", len(num_cols), "\n")

# Semantic type of every column handed to PyTorch Frame. The target is
# declared numerical; the training loop casts it to integer class ids itself.
col_to_stype = {
    **dict.fromkeys(num_cols, stype.numerical),
    **dict.fromkeys(cat_cols, stype.categorical),
    TARGET: stype.numerical,
}

# Min-max scale the numerical features using ranges learned from the training
# features only, then apply the same transform to the test features.
# NOTE(review): the scaler is fit on the full training set before the
# cross-validation split, so validation folds contribute to the ranges.
scaler = MinMaxScaler().fit(X[num_cols])
for frame in (X, X_test):
    frame[num_cols] = scaler.transform(frame[num_cols]).astype(np.float32)

## Config

In [None]:
class CFG:
    """Hyper-parameters shared by the cross-validation and training code."""

    # Cross-validation / schedule
    FOLDS = 5        # number of StratifiedKFold splits
    EPOCHS = 4       # training epochs per fold
    BATCH = 512      # batch size for the train / validation / test loaders

    # Optimizer
    LR = 0.001       # AdamW learning rate and OneCycleLR peak learning rate
    WD = 0.0001      # AdamW weight decay
    WARMUP = 0.2     # OneCycleLR pct_start (fraction of steps spent ramping up)

    # Trompt architecture
    CHANNELS = 64    # hidden channel width
    PROMPTS = 8      # number of prompts
    LAYERS = 4       # number of Trompt layers

## Train K-Fold

In [None]:
%%time
def _predict_pos_proba(model, loader):
    """Return the positive-class probability for every row served by ``loader``.

    The model output is averaged over its layer axis (dim=1) before the
    softmax, and column 1 of the result is taken as the positive class.
    Rows come back in loader order, so ``loader`` must not shuffle if the
    caller needs to align the result with the source rows.

    Returns:
        1-D NumPy array with one probability per row.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device, non_blocking=True)
            logits = model(batch).mean(dim=1)           # Average layers
            probs = torch.softmax(logits, dim=-1)[:, 1]  # Positive class prob
            chunks.append(probs.cpu().numpy())
    return np.concatenate(chunks)


skf = StratifiedKFold(n_splits=CFG.FOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{CFG.FOLDS} ---")

    # skf.split yields positional indices, so select rows with .iloc on both
    # X and y; label-based y[...] is only correct for a default RangeIndex.
    train_df = pd.concat([X.iloc[tr_idx].reset_index(drop=True),
                          y.iloc[tr_idx].rename(TARGET).reset_index(drop=True)], axis=1)
    val_df   = pd.concat([X.iloc[val_idx].reset_index(drop=True),
                          y.iloc[val_idx].rename(TARGET).reset_index(drop=True)], axis=1)
    # Validation labels in the same (unshuffled) order the val loader serves them.
    val_targets = y.iloc[val_idx].to_numpy()

    # Column statistics come from the training part of the fold only.
    col_stats = {c: compute_col_stats(train_df[c], stype=col_to_stype[c]) for c in train_df.columns if c in col_to_stype}

    converter = DataFrameToTensorFrameConverter(col_stats=col_stats, target_col=TARGET, col_to_stype=col_to_stype)
    tf_train = converter(train_df)
    tf_val   = converter(val_df)
    tf_test  = converter(X_test)

    train_loader = DataLoader(tf_train, batch_size=CFG.BATCH, shuffle=True, pin_memory=True)
    val_loader   = DataLoader(tf_val, batch_size=CFG.BATCH, shuffle=False, pin_memory=True)
    test_loader  = DataLoader(tf_test, batch_size=CFG.BATCH, shuffle=False, pin_memory=True)

    # A fresh model, optimizer and schedule for every fold.
    model = Trompt(
        channels=CFG.CHANNELS,
        out_channels=2,
        num_prompts=CFG.PROMPTS,
        num_layers=CFG.LAYERS,
        col_stats=col_stats,
        col_names_dict=converter.col_names_dict
    ).to(device)

    # The scheduler is stepped once per batch, so it spans every batch of every epoch.
    total_steps = len(train_loader) * CFG.EPOCHS
    optimizer = optim.AdamW(model.parameters(), lr=CFG.LR, weight_decay=CFG.WD)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=CFG.LR, total_steps=total_steps,
                                                    pct_start=CFG.WARMUP, cycle_momentum=False)

    for epoch in range(1, CFG.EPOCHS + 1):
        model.train()
        total_loss = 0.0
        total_samples = 0

        for batch in train_loader:
            batch = batch.to(device, non_blocking=True)
            optimizer.zero_grad()

            # Compute layer-wise supervised loss
            out = model(batch)
            batch_size, num_layers, num_classes = out.size()
            # [batch_size * num_layers, num_classes]
            pred = out.view(-1, num_classes)
            # [batch_size * num_layers] (ints); each label is repeated once
            # per layer so it lines up with the flattened predictions.
            y_rep = batch.y.long().repeat_interleave(num_layers)
            loss = F.cross_entropy(pred, y_rep)

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Weight by batch size so the epoch average is per sample.
            total_loss += loss.item() * batch_size
            total_samples += batch_size
        avg_loss = total_loss / (total_samples + 1e-12)

        # Validation: average across layers, then compute positive-class prob
        val_probs = _predict_pos_proba(model, val_loader)
        val_auc = roc_auc_score(val_targets, val_probs)
        print(f"    Epoch {epoch:02d} - train_loss: {avg_loss:.5f} val_auc: {val_auc:.5f}")

    # The last epoch's validation pass already used the final weights, so its
    # probabilities are this fold's OOF predictions; no second pass is needed.
    oof_preds[val_idx] = val_probs

    # Predict test set inside fold and average across folds
    test_preds += _predict_pos_proba(model, test_loader) / CFG.FOLDS

oof_df = pd.DataFrame({'id': train_id, 'oof_pred': oof_preds})
oof_df.to_csv('oof_preds.csv', index=False)
print("\nFinal OOF AUC:", np.round(roc_auc_score(y, oof_preds),5), "\n")

## Create Submission

In [None]:
# Fill the sample submission's target column with the fold-averaged test
# predictions, write it out, and show the first rows as the cell output.
sub = pd.read_csv(
    "/kaggle/input/playground-series-s5e11/sample_submission.csv"
).assign(**{TARGET: test_preds})
sub.to_csv("submission.csv", index=False)
sub.head()