In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file path below the Kaggle input directory and print each one.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv
/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv
/kaggle/input/equity-post-HCT-survival-predictions/train.csv
/kaggle/input/equity-post-HCT-survival-predictions/test.csv


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedKFold
import random
import os
from tqdm import tqdm

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Helper function for setting random seeds for reproducibility
def set_random_seeds(seed_value):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and CUDA), exports ``PYTHONHASHSEED`` to the environment,
    and asks cuDNN to use deterministic algorithms.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)

    # The remaining seeders all take the seed as their only argument.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed_value)

    torch.backends.cudnn.deterministic = True

# Read the competition's train and test tables into DataFrames.
train_data, test_data = map(
    pd.read_csv,
    (
        '/kaggle/input/equity-post-HCT-survival-predictions/train.csv',
        '/kaggle/input/equity-post-HCT-survival-predictions/test.csv',
    ),
)

In [4]:
# Columns that must never be used as model inputs: the row identifier, the
# event indicator, the event time and the derived training target.
exclude_columns = ["ID", "efs", "efs_time", "y"]

# Keep handles on the two demographic columns.
race_column = train_data['race_group']
ethnicity_column = train_data['ethnicity']  # Treating ethnicity similarly as race_group

# The feature list is built BEFORE the combined stratification column is added
# below, so that helper column is not part of the model inputs.
features = [
    name
    for name in train_data.columns
    if name not in exclude_columns
]

# Add a combined "<race_group>_<ethnicity>" label to both frames; it is used for stratification.
for _frame in (train_data, test_data):
    _frame['race_ethnicity_group'] = _frame['race_group'].astype(str) + '_' + _frame['ethnicity'].astype(str)

In [5]:
# Decide which features are categorical. Text columns get their missing values
# replaced by the literal "NAN"; every other column whose name does not contain
# 'age' is cast to string and treated as categorical as well.
categorical_columns = []

for column in features:
    is_text = train_data[column].dtype == "object"
    if not is_text and 'age' in column:
        # Non-text columns with 'age' in the name stay numerical.
        continue

    for frame in (train_data, test_data):
        if is_text:
            frame[column] = frame[column].fillna("NAN")
        else:
            frame[column] = frame[column].astype("str")
    categorical_columns.append(column)

# Encode categorical columns as integer codes and standardise numerical ones.
# Train and test are stacked so both share one code book and one set of
# mean/std statistics.
category_sizes = []
embedding_dims = []
numerical_columns = []
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
n_train_rows = len(train_data)

for column in features:
    if column in categorical_columns:
        codes, _ = combined_data[column].factorize()
        codes = codes - codes.min()  # shift so the smallest code is 0
        combined_data[column] = codes.astype("int32")

        unique_values = combined_data[column].nunique()
        category_sizes.append(unique_values)
        # Embedding width: ceil(sqrt(number of categories)).
        embedding_dims.append(int(np.ceil(np.sqrt(unique_values))))
        continue

    # Numerical column: downcast 64-bit storage, then z-score.
    values = combined_data[column]
    if values.dtype == "float64":
        values = values.astype("float32")
    elif values.dtype == "int64":
        values = values.astype("int32")

    mean = values.mean()
    std = values.std()
    # Missing values (and 0/0 results) become 0, i.e. the column mean after scaling.
    combined_data[column] = ((values - mean) / std).fillna(0)

    numerical_columns.append(column)

# Split the stacked frame back into its train and test parts.
train_data = combined_data.iloc[:n_train_rows].copy()
test_data = combined_data.iloc[n_train_rows:].reset_index(drop=True).copy()

In [6]:
# Build the regression target "y" from efs_time and efs:
#   1. shift the times of rows with efs == 0 by (max time among efs == 1) - (min time among efs == 0),
#   2. replace the values by their ranks,
#   3. push the efs == 0 rows a further 2 * len(train_data) up,
#   4. scale by the maximum, take the log, centre on the mean and flip the sign.
is_event = train_data["efs"] == 1
is_censored = train_data["efs"] == 0

max_value = train_data.loc[is_event, "efs_time"].max()
min_value = train_data.loc[is_censored, "efs_time"].min()

target = train_data["efs_time"].copy()
target[is_censored] += max_value - min_value
target = target.rank()
target[is_censored] += 2 * len(train_data)
target = np.log(target / target.max())
target = (target - target.mean()) * -1.0

train_data["y"] = target

# Dataset class
class CustomDataset(Dataset):
    """In-memory dataset of categorical codes, numerical features and an optional target.

    All inputs are converted to float32 tensors at construction time.

    Args:
        X_categorical: Array-like of categorical codes, one row per sample.
        X_numerical: Array-like of numerical features, one row per sample.
        y: Optional array-like of targets, one value per sample.
    """

    @staticmethod
    def _to_float_tensor(array):
        """Copy `array` into a new float32 tensor."""
        return torch.tensor(array).to(torch.float32)

    def __init__(self, X_categorical, X_numerical, y=None):
        self.X_categorical = self._to_float_tensor(X_categorical)
        self.X_numerical = self._to_float_tensor(X_numerical)
        self.y = None if y is None else self._to_float_tensor(y)

    def __len__(self):
        """Number of samples (rows of the categorical matrix)."""
        return self.X_categorical.size(0)

    def __getitem__(self, idx):
        """Return ``(categorical, numerical)`` or, when targets exist, ``(categorical, numerical, target)``.

        The target is returned with an extra leading dimension of size 1.
        """
        sample = (self.X_categorical[idx], self.X_numerical[idx])
        if self.y is None:
            return sample
        return sample + (self.y[idx].unsqueeze(0),)


In [7]:
# Model architecture
class ResidualBlock(nn.Module):
    """Two-layer fully connected block with a skip connection.

    The inner stack is Linear -> BatchNorm -> Dropout -> LeakyReLU ->
    Linear -> BatchNorm -> Dropout; its output is added to the block input
    and passed through a final LeakyReLU.

    Args:
        input_dim: Width of the block's input and output.
        hidden_dim: Width of the intermediate layer.
        p: Dropout probability used in both dropout layers.
    """

    def __init__(self, input_dim, hidden_dim, p=0.1):
        super().__init__()

        expand = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p),
            nn.LeakyReLU(),
        ]
        project = [
            nn.Linear(hidden_dim, input_dim),
            nn.BatchNorm1d(input_dim),
            nn.Dropout(p),
        ]
        self.fc = nn.Sequential(*expand, *project)

        self.activation = nn.LeakyReLU()

    def forward(self, x):
        """Return ``activation(fc(x) + x)``; the output has the same shape as `x`."""
        return self.activation(self.fc(x) + x)

class KaplanMeierModel(nn.Module):
    """Embedding + residual MLP that maps tabular features to one scalar per row.

    Each categorical column gets its own embedding table. The embeddings are
    concatenated with the numerical features, projected to `hidden_dim`,
    passed through `num_blocks` residual blocks and reduced to a single output.

    Args:
        category_sizes: Number of distinct codes per categorical column.
        embedding_dims: Embedding width per categorical column (same order).
        numerical_features: Sequence of numerical feature names; only its length is used.
        hidden_dim: Width of the hidden representation.
        num_blocks: Number of residual blocks.
        p: Dropout probability for the input projection and the residual blocks.
    """

    def __init__(self, category_sizes, embedding_dims, numerical_features, hidden_dim, num_blocks, p=0.3):
        super().__init__()

        embedding_tables = []
        for cat_size, emb_dim in zip(category_sizes, embedding_dims):
            embedding_tables.append(nn.Embedding(cat_size, emb_dim))
        self.embeddings = nn.ModuleList(embedding_tables)

        # Width of the concatenated embedding + numerical feature vector.
        input_dim = sum(embedding_dims) + len(numerical_features)

        self.input_fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p),
            nn.LeakyReLU(),
        )

        self.residual_blocks = nn.ModuleList(
            ResidualBlock(hidden_dim, hidden_dim, p) for _ in range(num_blocks)
        )

        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, X_categorical, X_numerical):
        """Compute one prediction per row.

        Args:
            X_categorical: Tensor whose column ``i`` holds the codes for the
                ``i``-th embedding table; it is cast to ``long`` before lookup.
            X_numerical: Tensor of numerical features, concatenated after the embeddings.

        Returns:
            The output of the final ``Linear(hidden_dim, 1)`` layer.
        """
        codes = X_categorical.long()
        pieces = [
            table(codes[:, position]).flatten(start_dim=1)
            for position, table in enumerate(self.embeddings)
        ]
        pieces.append(X_numerical)

        hidden = self.input_fc(torch.cat(pieces, dim=-1))
        for block in self.residual_blocks:
            hidden = block(hidden)

        return self.output_layer(hidden)

In [8]:
# Training function
def _predict_batches(model, loader):
    """Run `model` in eval mode over `loader` and return a 1-D array of predictions.

    Only the first two elements of every batch (categorical and numerical
    inputs) are used, so loaders with or without a target both work.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            X_categorical_batch, X_numerical_batch = batch[0], batch[1]
            output = model(X_categorical_batch.to(DEVICE), X_numerical_batch.to(DEVICE))
            chunks.append(output.cpu().numpy())
    # reshape(-1) rather than squeeze() so a single-row loader still gives a 1-D array.
    return np.concatenate(chunks).reshape(-1)


def train_model(fold_count=5, repeat_count=3, epochs=15, learning_rate=1e-3, batch_size=32):
    """Train the network with repeated stratified K-fold CV and return averaged predictions.

    Relies on module-level state that must exist before the call:
    `train_data`, `test_data`, `train_dataset`, `test_loader`,
    `category_sizes`, `embedding_dims`, `numerical_columns` and `DEVICE`.

    Args:
        fold_count: Number of folds per repeat.
        repeat_count: Number of times the K-fold split is repeated (each with a different shuffle seed).
        epochs: Training epochs per fold.
        learning_rate: Initial AdamW learning rate; decayed by a factor 0.9 each epoch.
        batch_size: Batch size for the train and validation loaders.

    Returns:
        Tuple ``(oof_predictions, test_predictions)``:
        out-of-fold predictions for every training row averaged over the repeats,
        and test predictions averaged over all ``fold_count * repeat_count`` models.
    """
    set_random_seeds(42)

    oof_predictions = np.zeros(len(train_data))
    test_predictions = np.zeros(len(test_data))
    criterion = nn.MSELoss()

    for repeat in range(repeat_count):
        print(f"### Repeat {repeat + 1} ###")

        kfold = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=42 + repeat)

        # Use combined 'race_ethnicity_group' for stratification
        for fold, (train_idx, val_idx) in enumerate(kfold.split(train_data, train_data['race_ethnicity_group'])):
            print(f"   Fold {fold + 1}/{fold_count}")

            train_subset = Subset(train_dataset, train_idx)
            val_subset = Subset(train_dataset, val_idx)

            train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

            # A fresh model, optimizer and scheduler for every fold.
            model = KaplanMeierModel(category_sizes, embedding_dims, numerical_columns, 256, 10, 0.15).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)
            scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.9 ** epoch)

            for epoch in range(epochs):
                model.train()
                train_loss = 0
                for X_categorical_batch, X_numerical_batch, y_batch in train_loader:
                    optimizer.zero_grad()
                    output = model(X_categorical_batch.to(DEVICE), X_numerical_batch.to(DEVICE))
                    loss = criterion(output, y_batch.to(DEVICE))
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()
                train_loss /= len(train_loader)

                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for X_categorical_batch, X_numerical_batch, y_batch in val_loader:
                        output = model(X_categorical_batch.to(DEVICE), X_numerical_batch.to(DEVICE))
                        loss = criterion(output, y_batch.to(DEVICE))
                        val_loss += loss.item()
                val_loss /= len(val_loader)

                # get_last_lr() is the supported accessor; calling get_lr() here emits a UserWarning.
                print(f"      train_loss: {train_loss}  val_loss: {val_loss}  Learning Rate: {scheduler.get_last_lr()[0]:.6f}")
                scheduler.step()

            # Accumulate (not overwrite): every row is validated once per repeat,
            # so the division by repeat_count below yields the mean over repeats.
            oof_predictions[val_idx] += _predict_batches(model, val_loader)
            test_predictions += _predict_batches(model, test_loader)

    oof_predictions /= repeat_count
    test_predictions /= (fold_count * repeat_count)

    return oof_predictions, test_predictions

# Wrap the encoded feature matrices (and, for training, the target) in torch datasets.
train_dataset = CustomDataset(
    X_categorical=train_data[categorical_columns].values,
    X_numerical=train_data[numerical_columns].values,
    y=train_data['y'].values,
)
test_dataset = CustomDataset(
    X_categorical=test_data[categorical_columns].values,
    X_numerical=test_data[numerical_columns].values,
)

# The test loader keeps row order (no shuffling) so predictions line up with test_data.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [9]:
# Train and get predictions
# train_model() returns a pair: out-of-fold predictions for the training rows
# and averaged predictions for the test rows.
oof, test_preds = train_model()

### Repeat 1 ###
   Fold 1/5


  _warn_get_lr_called_within_step(self)


      train_loss: 2.389321134487788  val_loss: 2.0829470223850675  Learning Rate: 0.001000
      train_loss: 2.0335757730735673  val_loss: 1.9947398861249288  Learning Rate: 0.000900
      train_loss: 1.9610512465238572  val_loss: 2.0959809008571835  Learning Rate: 0.000810
      train_loss: 1.8963669230540594  val_loss: 1.9319616622394986  Learning Rate: 0.000729
      train_loss: 1.860611065145996  val_loss: 1.8789061619175804  Learning Rate: 0.000656
      train_loss: 1.8254302915599612  val_loss: 1.8938949743906657  Learning Rate: 0.000590
      train_loss: 1.799934014512433  val_loss: 1.930923150976499  Learning Rate: 0.000531
      train_loss: 1.7751924260622924  val_loss: 1.867977135711246  Learning Rate: 0.000478
      train_loss: 1.734413973407613  val_loss: 1.86563896867964  Learning Rate: 0.000430
      train_loss: 1.7081831917994552  val_loss: 1.9029126425584157  Learning Rate: 0.000387
      train_loss: 1.6633104326824346  val_loss: 1.8840005358060201  Learning Rate: 0.000

  _warn_get_lr_called_within_step(self)


      train_loss: 2.354656480749448  val_loss: 2.018621286418703  Learning Rate: 0.001000
      train_loss: 2.0252319660451676  val_loss: 1.9385137730174595  Learning Rate: 0.000900
      train_loss: 1.9550073835584851  val_loss: 1.9063912673128975  Learning Rate: 0.000810
      train_loss: 1.9088856102691756  val_loss: 1.8740088433027267  Learning Rate: 0.000729
      train_loss: 1.8615449654559295  val_loss: 1.9044641339116626  Learning Rate: 0.000656
      train_loss: 1.8372328345146445  val_loss: 1.9057079838381874  Learning Rate: 0.000590
      train_loss: 1.8084106830259163  val_loss: 1.8771160705222023  Learning Rate: 0.000531
      train_loss: 1.7774008482694625  val_loss: 1.895076286792755  Learning Rate: 0.000478
      train_loss: 1.735729813741313  val_loss: 1.905738667315907  Learning Rate: 0.000430
      train_loss: 1.7251389857795503  val_loss: 1.912169842918714  Learning Rate: 0.000387
      train_loss: 1.6744410461021795  val_loss: 1.955433529946539  Learning Rate: 0.00

  _warn_get_lr_called_within_step(self)


      train_loss: 2.461884361671077  val_loss: 2.0870539949999913  Learning Rate: 0.001000
      train_loss: 2.037975462608867  val_loss: 1.9958598805798424  Learning Rate: 0.000900
      train_loss: 1.950287614762783  val_loss: 1.9718318939208985  Learning Rate: 0.000810
      train_loss: 1.8844356292651758  val_loss: 1.9157340741819806  Learning Rate: 0.000729
      train_loss: 1.8408661145302985  val_loss: 2.0074347290727825  Learning Rate: 0.000656
      train_loss: 1.8106925804581908  val_loss: 1.899376357263989  Learning Rate: 0.000590
      train_loss: 1.7870721149775717  val_loss: 1.8960167189439139  Learning Rate: 0.000531
      train_loss: 1.7661518324580459  val_loss: 1.9048923747407065  Learning Rate: 0.000478
      train_loss: 1.721812855783436  val_loss: 1.9142819063531027  Learning Rate: 0.000430
      train_loss: 1.6881269902818732  val_loss: 1.9412181890673108  Learning Rate: 0.000387
      train_loss: 1.637750076254209  val_loss: 1.9122910492950016  Learning Rate: 0.0

In [10]:
# Pair each test ID with its prediction and write the result to submission.csv (no index column).
submission_columns = {'ID': test_data['ID'], 'prediction': test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv(path_or_buf='submission.csv', index=False)

In [11]:
# Display the submission frame as this cell's output for a quick visual check.
submission

Unnamed: 0,ID,prediction
0,28800,-1.133949
1,28801,0.415221
2,28802,-1.294203
