Before starting, you will need to install some packages to reproduce the baseline.

In [40]:
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import MiniBatchKMeans

## Data loading

In [3]:
# put your own path to the data root directory (see example in `Data architecture` section)
data_dir = Path("data/")

# load the training and testing data sets
train_features_dir = data_dir / "train_input" / "moco_features"
test_features_dir = data_dir / "test_input" / "moco_features"
df_train = pd.read_csv(data_dir  / "supplementary_data" / "train_metadata.csv")
df_test = pd.read_csv(data_dir  / "supplementary_data" / "test_metadata.csv")

# concatenate y_train and df_train
y_train = pd.read_csv(data_dir  / "train_output.csv")
df_train = df_train.merge(y_train, on="Sample ID")

print(f"Training data dimensions: {df_train.shape}")  # (344, 4)
df_train.head()

Training data dimensions: (344, 4)


Unnamed: 0,Sample ID,Patient ID,Center ID,Target
0,ID_001.npy,P_001,C_1,0
1,ID_002.npy,P_002,C_2,1
2,ID_005.npy,P_005,C_5,0
3,ID_006.npy,P_006,C_5,0
4,ID_007.npy,P_007,C_2,1


## Data processing

We now load the features matrices $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$ for $s=1,...,344$ and perform slide-level averaging. This operation should take at most 5 minutes on your laptop.

In [36]:
X_train = []
y_train = []

features_list = []

for sample, label, center, patient in tqdm(
    df_train[["Sample ID", "Target", "Center ID", "Patient ID"]].values
):
    # load the coordinates and features (1000, 3+2048)
    _features = np.load(train_features_dir / sample)
    # get coordinates (zoom level, tile x-coord on the slide, tile y-coord on the slide)
    # and the MoCo V2 features
    coordinates, features = _features[:, :3], _features[:, 3:]  # Ks
    features_list.append(features)
    y_train.append(label)
    
y_train = np.array(y_train)

100%|██████████| 344/344 [00:05<00:00, 57.90it/s]


In [37]:
len(features_list)

344

In [38]:
features_tab = np.concatenate(features_list, axis=0)

In [39]:
features_tab.shape

(344000, 2048)

In [41]:
kmeans = MiniBatchKMeans(n_clusters=2,
                          random_state=0,
                          batch_size=10000,
                          max_iter=10,
                          n_init="auto",
                          verbose=0)

In [42]:
kmeans.fit(features_tab)

In [63]:
kmeans.cluster_centers_.shape

(2, 2048)

In [47]:
y_predict = kmeans.predict(features_tab)

In [67]:
squueeze_y_predict = np.zeros_like(y_train)
for i, y in enumerate(y_predict):
    squueeze_y_predict[i//1000] += y

In [70]:
squueeze_y_predict.max()
squueeze_y_predict.min()

0

In [80]:
aucs = dict()
for threshold in np.linspace(squueeze_y_predict.min(), squueeze_y_predict.max(), 100):
    aucs[threshold] =  max(roc_auc_score(y_train, squueeze_y_predict < threshold), roc_auc_score(y_train, squueeze_y_predict > threshold) )

In [83]:
max(aucs, key=aucs.get)

487.8484848484849

In [84]:
aucs[max(aucs, key=aucs.get)]

0.548900462962963