# Lunit + ABMIL Inference for Ovarian Cancer

This is an adapted version of my competion inference notebook, replacing CLAM with ABMIL, as the open-source code of CLAM was licensed under GPL-3, but with a clause excluding commercial use.

The featuere extraction, model training and inference code are located in the dataset "[abmil-for-ovarian-cancer](https://www.kaggle.com/datasets/dantee/abmil-for-ovarian-cancer-3rd-place)" later referenced as ABMIL.

1. Feature extraction via [Lunit-Dioo](https://github.com/lunit-io/benchmark-ssl-pathology) in the function ABMIL.extract_png_features . 
2. Training code of the [ABMIL](https://github.com/AMLab-Amsterdam/AttentionDeepMIL) model in ABMIL.main. This notebook does not run the training. It rather uses the weights from my local training run, located in the dataset .
3. Inference code of the [ABMIL](https://github.com/AMLab-Amsterdam/AttentionDeepMIL) model is the ABMIL.eval.eval_utils and uses the model weights in the dataset [traind-abmil-for-ovarian-cancer](https://www.kaggle.com/datasets/dantee/trained--abmil-for-ovarian-cancer). There are 5 models each trained on 80% of my training data as described in the [3rd place model summary](https://www.kaggle.com/competitions/UBC-OCEAN/discussion/465527).
   

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
if os.path.isdir('/kaggle/input'):
    !cp /kaggle/input/library-fastkaggle/fastkaggle-0.0.8-py3-none-any.whl /kaggle/working 
    !pip install  /kaggle/working/fastkaggle-0.0.8-py3-none-any.whl -q
from fastkaggle import setup_comp, iskaggle
!pip install /kaggle/input/library-histomicstk/wheelhouse/histomicstk-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --no-index --find-links /kaggle/input/library-histomicstk/wheelhouse -q
!pip install /kaggle/input/xformers-wheel/xformers/xformers-0.0.22.post7+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-index --find-links /kaggle/input/xformers-wheel/xformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
aiobotocore 2.11.2 requires botocore<1.34.35,>=1.33.2, but you have botocore 1.32.1 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.11.0 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
dask-cudf 23.8.0 requires dask==2023.7.1, but you have dask 2023.11.0 which 

In [3]:
comp_slug = 'UBC-OCEAN'
comp_path = setup_comp(comp_slug)
comp_path

Path('../input/UBC-OCEAN')

In [4]:
import os
import sys
import re
import json
from pathlib import Path
import time
# import psutil
import gc
import ctypes
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

In [5]:
os.environ['VIPS_CONCURRENCY'] = '4'
os.environ['VIPS_DISC_THRESHOLD'] = '15gb'

In [6]:
# Make Juptyter allow multi-processing in the data loader
# also seems to save memory
from multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError as ex:
    print(ex)

In [7]:
# torch.set_num_threads(4)

if iskaggle:
    abmil_path = '/kaggle/input/abmil-for-ovarian-cancer-3rd-place'    
else:
    abmil_path = '../abmil'
sys.path.append(abmil_path)

from ABMIL.extract_png_features import extract_png_features
from ABMIL.datasets.dataset_generic import Generic_WSI_Classification_Dataset
from ABMIL.utils.eval_utils import initiate_model

  warn(


In [8]:
ckpt_folder = Path('/kaggle/input/trained--abmil-for-ovarian-cancer')
with open(ckpt_folder/'settings.json', 'r') as file:
    settings = json.load(file)
print(settings)

model_path = '/kaggle/input/lunit-dino-weights/dino_vit_small_patch16_ep200.torch'
model = settings['FEATURE_EXTRACT_MODEL']
use_fp16 = settings['USE_FP16_FOR_FEATURE_EXTRACTION']
tile_size = settings['TILE_SIZE']
model_size = settings['MODEL_SIZE']
tma_megapixel_threshold = settings['TMA_MEGAPIXEL_THRESHOLD']
    
# Kaggle Submission
if len(os.listdir(comp_path/'test_images')) != 1:
    is_submission = True
    test_meta = pd.read_csv(comp_path/'test.csv')
    train_test = 'test'
    test_folder = comp_path/f'test_images'
# Kaggle Test Run
else:
    is_submission = False
    train_test = 'train'
    test_meta = pd.read_csv(comp_path/f'{train_test}.csv')
    # test_meta = test_meta[test_meta['image_id'].isin([39728, 39872, 39880, 29084, 44232, 34247, 42125, 5264])]
    # test_meta = test_meta[test_meta['image_id'].isin([45630, 36678, 8713, 14424])] # four largest images
    test_meta = test_meta.sample(5)
    test_folder = comp_path/f'{train_test}_images'
project_root = Path('/tmp')

file_sizes = []
formatted_sizes = []
for image_id in test_meta['image_id']:
    size = os.path.getsize(test_folder/f'{image_id}.png')
    file_sizes.append(size)
    formatted_sizes.append(f"{size / 1024**3:.2f} GB")
test_meta['file_gb'] = np.array(file_sizes) / 1024 **3
test_meta['formatted_file_size'] = formatted_sizes
test_meta['n_mega_pixels'] = (test_meta['image_width'] * test_meta['image_height'] // 1e6).astype(int)
n_classes = 6

{'COMP_DATA_DIR': './data/UBC-OCEAN', 'WORKING_DIR': './lunit_abmil', 'FEATURE_EXTRACT_MODEL': 'lunit_dino_16', 'FEATURE_EXTRACT_MODEL_PATH': 'dino_vit_small_patch16_ep200.torch', 'CHECKPOINT_DIR': './trained_lunit_ambil', 'TMA_MEGAPIXEL_THRESHOLD': 50, 'TILE_SIZE': 224, 'MODEL_SIZE': 'big', 'USE_FP16_FOR_FEATURE_EXTRACTION': True, 'TMA_HALF_RES': False}


In [9]:
test_meta.shape[0]

5

In [10]:
batch_size = 512
start = time.time()
extract_png_features(test_meta,
                     comp_path,
                     project_root,
                     train_or_test=train_test,
                     tma_megapixel_threshold=tma_megapixel_threshold,
                     model=model,
                     model_path=model_path,
                     use_fp16=use_fp16,
                     num_workers=4,
                     prefetch_factor=2,
                     tile_size=tile_size,
                     batch_size=batch_size,
                     print_every_batches=5,
                     tissue_threshold=0.05,
                     gc_after_batch=False,
                     print_memory=False,
                     skip_existing=False)
print(f'Finished in {(time.time()-start) // 60:.0f} min {(time.time()-start) % 60:.1f} s')

  warn(
  warn(
  warn(
  warn(


Start of batch 5: processed 0/5 images in 0 min 25 s
Start of batch 10: processed 0/5 images in 0 min 32 s
Start of batch 15: processed 0/5 images in 0 min 37 s
Start of batch 20: processed 0/5 images in 0 min 42 s
Start of batch 25: processed 0/5 images in 0 min 49 s
Start of batch 30: processed 1/5 images in 0 min 55 s
Start of batch 35: processed 1/5 images in 1 min 0 s
Start of batch 40: processed 1/5 images in 1 min 5 s
Start of batch 45: processed 1/5 images in 1 min 10 s
Start of batch 50: processed 1/5 images in 1 min 15 s
Start of batch 55: processed 1/5 images in 1 min 21 s
Start of batch 60: processed 1/5 images in 1 min 26 s
Start of batch 65: processed 1/5 images in 1 min 30 s
Start of batch 70: processed 2/5 images in 1 min 35 s
Start of batch 75: processed 2/5 images in 1 min 46 s
Start of batch 80: processed 3/5 images in 1 min 52 s
Start of batch 85: processed 3/5 images in 1 min 59 s
Start of batch 90: processed 3/5 images in 2 min 5 s
Start of batch 95: processed 3/5

## Load Model Checkpoints

In [11]:
labels = pd.read_csv(ckpt_folder/'label_mapping.csv', header=None)
labels.columns=['label', 'idx']
labels

Unnamed: 0,label,idx
0,HGSC,0
1,LGSC,1
2,EC,2
3,CC,3
4,MC,4
5,Other,5


In [12]:
drop_out=0.7
use_inst_predictions=False

re_checkpoint = re.compile(r's_\d+_checkpoint.pt')
ckpt_files = [path for path in os.listdir(ckpt_folder) if re_checkpoint.match(path)]
label_dict = {row['label']: row['idx'] for i, row in labels.iterrows()}
models = [initiate_model(label_dict, 
                         'abmil', 
                         os.path.join(ckpt_folder, ckpt_file),
                         model_size=model_size,
                         drop_out=drop_out,
                         feature_dim=384,
                         use_inst_predictions=use_inst_predictions) for ckpt_file in ckpt_files]

In [13]:
test_meta

Unnamed: 0,image_id,label,image_width,image_height,is_tma,file_gb,formatted_file_size,n_mega_pixels
367,43815,HGSC,38935,39255,False,1.992378,1.99 GB,1528
42,4877,CC,57724,42954,False,2.585611,2.59 GB,2479
250,30539,HGSC,58522,24345,False,0.513355,0.51 GB,1424
405,47984,HGSC,47165,17046,False,0.684987,0.68 GB,803
193,23523,MC,74723,45387,False,2.510577,2.51 GB,3391


In [14]:
bag_weight = 0.7

default_prediction = 'HGSC'
predictions = {}

not_found_count = 0
for i, row in test_meta.iterrows():
    image_id = row['image_id']
    is_tma = (row['n_mega_pixels'] <= 50)
    feature_path = project_root/'features'/f'{image_id}.pt'
    try:
        features = torch.load(feature_path, map_location='cuda')
    except FileNotFoundError:
        not_found_count += 1
        continue

    probs = []
    for model in models:
        model.eval()
        with torch.no_grad():
            result = model(features, bag_weight, is_tma)[0]
        probs.append(F.softmax(result, dim=1).cpu().numpy())
    model_avg = np.array(probs).mean(axis=0)

    predictions[image_id] = labels['label'].iloc[model_avg.argmax()]

In [15]:
predictions = pd.DataFrame(predictions.items(), columns=['image_id', 'label'])
if not_found_count > 0:
    print(f'Could not find {not_found_count} feature files.')

In [16]:
print(f"Replacing {predictions['label'].isnull().sum()} null labels")
predictions['label'].fillna(default_prediction, inplace=True)
print(f"Replacing {(~(predictions['label'].isin(labels['label']))).sum()} invalid labels")
predictions[~predictions['label'].isin(labels['label'])] = default_prediction
print(f"Fill default prediction for {(~test_meta['image_id'].isin(predictions['image_id'])).sum()} misisng image_ids.")
missing_ids = test_meta[~test_meta['image_id'].isin(predictions['image_id'])][['image_id']]
missing_ids['label'] = default_prediction
predictions = pd.concat([predictions, missing_ids])
predictions.sort_values('image_id').to_csv('/kaggle/working/submission.csv', index=False)

Replacing 0 null labels
Replacing 0 invalid labels
Fill default prediction for 0 misisng image_ids.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  predictions['label'].fillna(default_prediction, inplace=True)


In [17]:
!cat submission.csv

image_id,label
4877,CC
23523,MC
30539,HGSC
43815,HGSC
47984,LGSC


In [18]:
# while "submission.csv" not in os.listdir("/kaggle/working"):
#     predictions.to_csv('/kaggle/working/submission.csv', index=False)