Before starting, you will need to install some packages to reproduce the baseline.

In [1]:
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import wandb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

## Data loading

In [2]:
# put your own path to the data root directory (see example in `Data architecture` section)
data_dir = Path("data/")

# load the training and testing data sets
train_features_dir = data_dir / "train_input" / "moco_features"
test_features_dir = data_dir / "test_input" / "moco_features"
df_train_val = pd.read_csv(data_dir  / "supplementary_data" / "train_metadata.csv")
df_test = pd.read_csv(data_dir  / "supplementary_data" / "test_metadata.csv")

# concatenate y_train_val and df_train_val
y_train_val = pd.read_csv(data_dir  / "train_output.csv")
df_train_val = df_train_val.merge(y_train_val, on="Sample ID")

print(f"Training data dimensions: {df_train_val.shape}")  # (344, 4)
df_train_val.head()

Training data dimensions: (344, 4)


Unnamed: 0,Sample ID,Patient ID,Center ID,Target
0,ID_001.npy,P_001,C_1,0
1,ID_002.npy,P_002,C_2,1
2,ID_005.npy,P_005,C_5,0
3,ID_006.npy,P_006,C_5,0
4,ID_007.npy,P_007,C_2,1


## Data processing

We now load the features matrices $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$ for $s=1,...,344$ and perform slide-level averaging. This operation should take at most 5 minutes on your laptop.

In [3]:
X_train_val = []
y_train_val= []
centers_train_val = []
patients_train_val = []

for sample, label, center, patient in tqdm(
    df_train_val[["Sample ID", "Target", "Center ID", "Patient ID"]].values
):
    # load the coordinates and features (1000, 3+2048)
    _features = np.load(train_features_dir / sample)
    # get coordinates (zoom level, tile x-coord on the slide, tile y-coord on the slide)
    # and the MoCo V2 features
    coordinates, features = _features[:, :3], _features[:, 3:]  # Ks
    # slide-level averaging
    X_train_val.append(np.mean(features, axis=0))
    y_train_val.append(label)
    centers_train_val.append(center)
    patients_train_val.append(patient)

# convert to numpy arrays
X_train_val = np.array(X_train_val)
y_train_val = np.array(y_train_val)
centers_train_val = np.array(centers_train_val)
patients_train_val = np.array(patients_train_val)

100%|██████████| 344/344 [00:01<00:00, 190.41it/s]


In [4]:
patients_unique = np.unique(patients_train_val)
y_unique = np.array(
    [np.mean(y_train_val[patients_train_val == p]) for p in patients_unique]
)

print(
    "Training set specifications\n"
    "---------------------------\n"
    f"{len(X_train_val)} unique samples\n"
    f"{len(patients_unique)} unique patients\n"
)

Training set specifications
---------------------------
344 unique samples
305 unique patients



In [5]:
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
# split is performed at the patient-level
for train_idx_, val_idx_ in kfold.split(patients_unique, y_unique):
    # retrieve the indexes of the samples corresponding to the
    # patients in `train_idx_` and `val_idx_`
    train_idx = np.arange(len(X_train_val))[
        pd.Series(patients_train_val).isin(patients_unique[train_idx_])
    ]
    val_idx = np.arange(len(X_train_val))[
        pd.Series(patients_train_val).isin(patients_unique[val_idx_])
    ]
    # set the training and validation folds
    X_train = X_train_val[train_idx]
    y_train = y_train_val[train_idx]
    X_val = X_train_val[val_idx]
    y_val = y_train_val[val_idx]
    break

## Model training

In [6]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron.
  '''
  def __init__(self, num_layers=1, inside_dim=0):
    super().__init__()

    self.inside_dim = inside_dim
    self.num_layers = num_layers
    if num_layers==1:
      self.layers = nn.Sequential(
        nn.Linear(2048, 1),
        nn.Sigmoid(),
        )

    else:
      list_layers = [nn.Linear(2048, inside_dim),nn.ReLU()]
      for _ in range(num_layers-2):
        list_layers.append(nn.Linear(inside_dim, inside_dim))
        list_layers.append(nn.ReLU())
      list_layers.append(nn.Linear(inside_dim, 1))
      list_layers.append(nn.Sigmoid())
      self.layers = nn.Sequential(*list_layers)


  def forward(self, x):
    '''Forward pass'''
    return self.layers(x)

In [7]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [8]:
def train(X_train, y_train, X_val, y_val, model, lr=1e-6, batch_size=64, nb_epochs=5000, criterion=nn.BCELoss(), use_wandb=False):
    train_set = CustomDataset(X_train, y_train)
    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=False, sampler=None,
                                batch_sampler=None, num_workers=0, collate_fn=None,
                                pin_memory=False, drop_last=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.cuda()
    
    if use_wandb:
        config = {
            "learning_rate": lr,
            "batch_size": batch_size,
            "inside_dim": model.inside_dim,
            "num_layers": model.num_layers
            }

        wandb.init(project="OWKNIW", name=f"MLP", config=config)

    for epoch in range(nb_epochs):
        model.train()
        train_losses = []
        for x, y in train_dataloader:
            y_predict = model(x.to("cuda"))
            loss = criterion(y_predict.cpu(), y.unsqueeze(dim=1).float())
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        train_loss = np.array(train_losses).mean()

        model.eval()
        train_roc_auc_score = roc_auc_score(y_train, model(torch.Tensor(X_train).cuda()).cpu().detach())
        val_auc_roc_auc_score = roc_auc_score(y_val, model(torch.Tensor(X_val).cuda()).cpu().detach())

        if use_wandb:
            to_log = {}
            to_log["main/train_loss"] = train_loss
            to_log["main/train_roc_auc_score"] = train_roc_auc_score
            to_log["main/val_auc_roc_auc_score"] = val_auc_roc_auc_score
            wandb.log(to_log)

        elif epoch % (nb_epochs//10) == 1:
            print(f"epoch {epoch}: loss={'{:.3f}'.format(train_loss)}, train_roc_auc_score={'{:.3f}'.format(train_roc_auc_score)}, val_auc_roc_auc_score={'{:.3f}'.format(val_auc_roc_auc_score)}")

    if use_wandb:
        wandb.finish()


In [15]:
lr = 1e-5
batch_size = 64
nb_epochs = 5000


model = MLP(num_layers=1, inside_dim=0)
train(X_train, y_train, X_val, y_val, model, lr=lr, batch_size=batch_size, nb_epochs=nb_epochs, use_wandb=True)

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.323294…

0,1
main/train_loss,██▇▇▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
main/train_roc_auc_score,▁▂▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
main/val_auc_roc_auc_score,▁▁▃▄▅▆▆▇▇▇▇▇████████████████████████████

0,1
main/train_loss,0.43874
main/train_roc_auc_score,0.86032
main/val_auc_roc_auc_score,0.71443


In [21]:
lr = 1e-6
batch_size = 64
nb_epochs = 5000

model = MLP(num_layers=3, inside_dim=512)
train(X_train, y_train, X_val, y_val, model, lr=lr, batch_size=batch_size, nb_epochs=nb_epochs, use_wandb=False)

epoch 0: loss=0.697, train_roc_auc_score=0.459, val_auc_roc_auc_score=0.351
epoch 500: loss=0.628, train_roc_auc_score=0.715, val_auc_roc_auc_score=0.569
epoch 1000: loss=0.533, train_roc_auc_score=0.814, val_auc_roc_auc_score=0.718
epoch 1500: loss=0.449, train_roc_auc_score=0.856, val_auc_roc_auc_score=0.727
epoch 2000: loss=0.383, train_roc_auc_score=0.886, val_auc_roc_auc_score=0.725
epoch 2500: loss=0.315, train_roc_auc_score=0.915, val_auc_roc_auc_score=0.734
epoch 3000: loss=0.256, train_roc_auc_score=0.936, val_auc_roc_auc_score=0.722
epoch 3500: loss=0.206, train_roc_auc_score=0.950, val_auc_roc_auc_score=0.722
epoch 4000: loss=0.157, train_roc_auc_score=0.959, val_auc_roc_auc_score=0.724
epoch 4500: loss=0.112, train_roc_auc_score=0.965, val_auc_roc_auc_score=0.723


In [9]:
lr = 1e-6
batch_size = 64
nb_epochs = 5000

inside_dim_tab = [512, 1024, 2048]
num_layers_tab = [2,3]

for inside_dim in inside_dim_tab:
    for num_layers in num_layers_tab:
        model = MLP(inside_dim=inside_dim, num_layers=num_layers)
        train(X_train, y_train, X_val, y_val, model, lr=lr, batch_size=batch_size, nb_epochs=nb_epochs, use_wandb=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstanislasdozias[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.306245…

0,1
main/train_loss,██▇▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
main/train_roc_auc_score,▁▁▁▁▂▂▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████
main/val_auc_roc_auc_score,▃▃▂▂▂▁▁▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
main/train_loss,0.63157
main/train_roc_auc_score,0.71104
main/val_auc_roc_auc_score,0.56719


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.305915…

0,1
main/train_loss,██▇▇▇▇▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁
main/train_roc_auc_score,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
main/val_auc_roc_auc_score,▂▁▂▃▄▅▅▆▇▇▇▇▇███████████████████████████

0,1
main/train_loss,0.38551
main/train_roc_auc_score,0.88491
main/val_auc_roc_auc_score,0.72036


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.305981…

0,1
main/train_loss,████▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁
main/train_roc_auc_score,▁▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇████████████████
main/val_auc_roc_auc_score,▁▁▂▄▅▆▇█████████████████████████████████

0,1
main/train_loss,0.0334
main/train_roc_auc_score,0.97209
main/val_auc_roc_auc_score,0.72826


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.323073…

0,1
main/train_loss,███▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁
main/train_roc_auc_score,▁▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
main/val_auc_roc_auc_score,▁▁▃▄▅▆▇▇▇███████████████████████████████

0,1
main/train_loss,0.27776
main/train_roc_auc_score,0.93085
main/val_auc_roc_auc_score,0.72727


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.305651…

0,1
main/train_loss,███▇▇▆▆▅▅▄▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
main/train_roc_auc_score,▁▁▂▃▅▅▆▆▆▇▇▇▇███████████████████████████
main/val_auc_roc_auc_score,▁▂▅▆████████████████████████████████████

0,1
main/train_loss,0.00017
main/train_roc_auc_score,0.96837
main/val_auc_roc_auc_score,0.71492


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.305897…

0,1
main/train_loss,██▇▇▇▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁
main/train_roc_auc_score,▁▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
main/val_auc_roc_auc_score,▁▂▄▅▇▇▇█████████████████████████████████

0,1
main/train_loss,0.17401
main/train_roc_auc_score,0.95727
main/val_auc_roc_auc_score,0.72431


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

0,1
main/train_loss,██▇▆▅▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
main/train_roc_auc_score,▁▂▄▅▆▇▇▇████████████████████████████████
main/val_auc_roc_auc_score,▁▃▆████████████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇

0,1
main/train_loss,0.0
main/train_roc_auc_score,0.97106
main/val_auc_roc_auc_score,0.65415


## Inference for submission

In [22]:
X_test = []

# load the data from `df_test` (~ 1 minute)
for sample in tqdm(df_test["Sample ID"].values):
    _features = np.load(test_features_dir / sample)
    coordinates, features = _features[:, :3], _features[:, 3:]
    X_test.append(np.mean(features, axis=0))

X_test = np.array(X_test)

100%|██████████| 149/149 [00:00<00:00, 177.66it/s]


In [23]:
y_test = model(torch.Tensor(X_test).cuda()).cpu().detach().squeeze().numpy()

In [25]:
submission = pd.DataFrame(
    {"Sample ID": df_test["Sample ID"].values, "Target": y_test}
).sort_values(
    "Sample ID"
)  # extra step to sort the sample IDs

# sanity checks
assert all(submission["Target"].between(0, 1)), "`Target` values must be in [0, 1]"
assert submission.shape == (149, 2), "Your submission file must be of shape (149, 2)"
assert list(submission.columns) == [
    "Sample ID",
    "Target",
], "Your submission file must have columns `Sample ID` and `Target`"

# save the submission as a csv file
submission.to_csv(data_dir / "test_outputs/MLP.csv", index=None)
submission.head()

Unnamed: 0,Sample ID,Target
0,ID_003.npy,0.712358
1,ID_004.npy,0.999987
2,ID_008.npy,0.082272
3,ID_009.npy,0.999751
4,ID_010.npy,0.006201
