To make data ready for CLAM, we need:

- in dataset_csv: 
a csv file called p53_classification_clean.csv like this:

case_id,slide_id,label
patient_0,slide_0,subtype_1
patient_1,slide_1,subtype_3

- in splits/task_3_p53_classification_custom: 
a csv file for each fold called splits_{fold_nr}.csv like this:

,train,val,test
0,slide_110,slide_403,slide_13
1,slide_112,slide_336,slide_353
2,slide_113,slide_423,slide_59
The columns generally differ in length; the shorter columns are padded with empty cells.

- in {data_root_dir}/p53_classification_retccl_features/pt_files 
one pt file per slide_id containing a tensor of patch features, assumed to be of shape (n_patches, n_features)

In [4]:
import pandas as pd
import os

from load_data import DATA_DIR

# Root of the CLAM checkout, resolved relative to the data directory:
# two levels above DATA_DIR, then code/CLAM. All CLAM inputs are written below it.
CLAM_DIR = os.path.join(DATA_DIR, '..','..','code','CLAM')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Smoke test: each of these packages must import cleanly in the kernel that
# will run CLAM. The imported names are not used any further in this cell.
import matplotlib
import numpy
import cv2
import pandas
import PIL
import torch
import scipy
import sklearn
# Python module names are case-sensitive: the package installed by
# `pip install tensorboardX` is imported as `tensorboardX`, not `tensorboardx`.
import tensorboardX
import torchvision
# `topk` is not published on PyPI (pip reports "No matching distribution found
# for topk"); it is provided by the smooth-topk repository -- see CLAM's
# installation guide for the install command.
from topk.svm import SmoothTop1SVM

In [2]:
!pip install tensorboardx



ERROR: Could not find a version that satisfies the requirement topk (from versions: none)
ERROR: No matching distribution found for topk


# Labels

In [14]:
# When True, samples of the 'doubleclone' class (raw label code 3) are dropped
# from the label CSV and filtered out when the splits are built.
DISABLE_DOUBLECLONE = True

In [15]:
# Convert the raw biopsy labels into the label CSV that CLAM reads.
#
# Input  (biopsy_labels_anon_s1.0.csv): columns `id`, `label` (integer class code)
# Output (dataset_csv/p53_classification_clean.csv):
#   case_id,slide_id,label
#   patient_0,slide_0,wildtype
#   patient_1,slide_1,overexpression
LABEL_NAMES = {0: 'wildtype', 1: 'overexpression', 2: 'nullmutation', 3: 'doubleclone'}

labels_raw = pd.read_csv(os.path.join(DATA_DIR, 'biopsy_labels_anon_s1.0.csv'))

# An unknown class code would otherwise be mapped to NaN and written out silently.
unmapped_codes = labels_raw.loc[~labels_raw['label'].isin(list(LABEL_NAMES)), 'label'].unique()
assert len(unmapped_codes) == 0, f'unmapped label codes: {unmapped_codes.tolist()}'

# Build the result from the raw frame in one pass (no in-place mutation), so
# re-running the cell always yields the same output. There is one slide per
# patient, hence case_id and slide_id are both derived from `id`.
labels = (
    labels_raw
    .drop(columns='label')
    .assign(
        case_id='patient_' + labels_raw['id'].astype(str),
        slide_id='slide_' + labels_raw['id'].astype(str),
        label=labels_raw['label'].map(LABEL_NAMES),
    )
    .drop(columns='id')
)

if DISABLE_DOUBLECLONE:
    labels = labels[labels['label'] != 'doubleclone']

# Show distribution of labels
print(labels['label'].value_counts())

# Create the target directory first, as the splits and features cells do for theirs.
dataset_csv_dir = os.path.join(CLAM_DIR, 'dataset_csv')
os.makedirs(dataset_csv_dir, exist_ok=True)
labels.to_csv(os.path.join(dataset_csv_dir, 'p53_classification_clean.csv'), index=False)

label
wildtype          1140
overexpression     266
nullmutation       111
Name: count, dtype: int64


# Splits

In [16]:
# Write one split file per cross-validation fold as
# CLAM_DIR/splits/task_3_p53_classification_custom/splits_{fold_nr}.csv
#
# File format (first column is the row index; shorter columns are padded with
# empty cells because train/val/test differ in length):
#   ,train,val,test
#   0,slide_110,slide_403,slide_13
#   1,slide_112,slide_336,slide_353
#   2,slide_113,slide_423,slide_59

from sklearn.model_selection import KFold

# First we need to get the train and test indices
train_labels = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_labels = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Filter label 3 (doubleclone) out of BOTH sets, so that no split refers to a
# slide that was dropped from the label CSV.
# Assumes train.csv has the same `id`/`label` columns as test.csv.
if DISABLE_DOUBLECLONE:
    train_labels = train_labels[train_labels['label'] != 3]
    test_labels = test_labels[test_labels['label'] != 3]

train_indices = train_labels['id'].values
test_indices = test_labels['id'].values

print(len(train_indices), len(test_indices))

# Now we can create the splits
# NOTE(review): KFold ignores the class labels; with classes this imbalanced a
# StratifiedKFold may give more even validation folds -- left unchanged so the
# existing splits stay reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

split_dir = os.path.join(CLAM_DIR, 'splits', 'task_3_p53_classification_custom')
os.makedirs(split_dir, exist_ok=True)


def _slide_column(ids):
    """Return the given ids as an object Series of 'slide_<id>' strings."""
    return pd.Series(['slide_' + str(i) for i in ids], dtype=object)


# The test set is the same for every fold, so build its column once.
test_slides = _slide_column(test_indices)

for fold_nr, (train_index, val_index) in enumerate(kf.split(train_indices)):
    # Concatenating along axis=1 aligns the columns on their row index and
    # fills the shorter ones with NaN, which to_csv writes as empty cells.
    split = pd.concat(
        [
            _slide_column(train_indices[train_index]),
            _slide_column(train_indices[val_index]),
            test_slides,
        ],
        axis=1,
        keys=['train', 'val', 'test'],
    )
    split.to_csv(os.path.join(split_dir, f'splits_{fold_nr}.csv'))

1472 45


# Features

In [10]:
import torch
import os
from load_data import DATA_DIR
# CLAM checkout, located relative to the data directory.
CLAM_DIR = os.path.join(DATA_DIR, '..','..','code','CLAM')

# Dictionary of per-slide patch features: one tensor per slide identifier,
# noted by the author as having shape (n_patches, 2048).
latents_path = os.path.join(DATA_DIR, 'bag_latents_gs256_retccl.pt')
bag_latents = torch.load(latents_path)

# Sanity check: list every slide whose bag holds zero patches.
empty_bags = [
    slide_id
    for slide_id, latent in bag_latents.items()
    if latent.shape[0] == 0
]
display(empty_bags)

[]

In [11]:
# Export each bag to its own file:
# CLAM_DIR/data/p53_classification_retccl_features/pt_files/slide_{id}.pt
pt_dir = os.path.join(CLAM_DIR, 'data', 'p53_classification_retccl_features', 'pt_files')
os.makedirs(pt_dir, exist_ok=True)

for slide_id, patch_latents in bag_latents.items():
    pt_path = os.path.join(pt_dir, f'slide_{slide_id}.pt')
    # squeeze(1) removes dimension 1 only if it has size 1; otherwise the tensor is unchanged.
    torch.save(patch_latents.squeeze(1), pt_path)

# Code

In [None]:
# Dataset initialization
# Fragment, not runnable in this notebook: `args` and `Generic_MIL_Dataset` are
# not defined here -- presumably this is pasted into CLAM's task-selection code.
# It points CLAM at the label CSV and the feature directory prepared by the cells above.
    # NOTE(review): n_classes and label_dict still include 'doubleclone', while this
    # notebook drops that class when DISABLE_DOUBLECLONE is True -- confirm whether
    # 3 classes are intended in that configuration.
    args.n_classes=4
    dataset = Generic_MIL_Dataset(csv_path = 'dataset_csv/p53_classification_clean.csv',
                            data_dir= os.path.join(args.data_root_dir, 'p53_classification_retccl_features'),
                            shuffle = False, 
                            seed = args.seed, 
                            print_info = True,
                            label_dict = {'wildtype':0, 'overexpression':1, 'nullmutation':2, 'doubleclone':3},
                            patient_strat=False,
                            ignore=[])