# Train FPN-MIL with extracted features (PI-CAI) — no imgaug

Same as `kaggle_train_fpn_mil.ipynb` but **does not install imgaug** (avoids NumPy 2.0 issue). The repo's `dataset_concepts` imports imgaug; we stub it so the import succeeds. Offline MIL only uses pre-extracted features, so the stubs are never used.

**On Kaggle:**  
1. Add as **input** the dataset with `picai_extracted_features/multi_scale/` and `picai_labels.csv`.  
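2. Set the data paths (`args.data_dir`, `args.feat_dir`, `args.csv_file`) in the configuration cell; this notebook does not define a separate `FEAT_INPUT_PATH` variable.  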
3. Run all cells. Checkpoints under `/kaggle/working`.

**Local:** Set `REPO_DIR`, `args.data_dir`, and `args.output_dir` to local paths as needed (the notebook does not define `FEAT_INPUT_PATH` or `DATA_ROOT`).

In [None]:
import sys
import types
from importlib.machinery import ModuleSpec

# ---- imgaug stub (repo imports it even if you don't use augmentations) ----
# The stubs are registered as genuine module objects (with a spec and, for
# packages, a ``__path__``) rather than arbitrary class instances, so that
# ``import imgaug.augmentables.bbs``, ``from ... import BoundingBox`` and
# tools that introspect ``sys.modules`` (e.g. ``importlib.util.find_spec``)
# treat them like a real package.


class BoundingBox:
    """Empty placeholder exposed as ``imgaug.augmentables.bbs.BoundingBox``."""


class BoundingBoxesOnImage:
    """Empty placeholder exposed as ``imgaug.augmentables.bbs.BoundingBoxesOnImage``."""


def _make_stub_module(name, is_package=False, **attributes):
    """Create an in-memory module named ``name`` that exposes ``attributes``.

    Args:
        name: Fully qualified module name (e.g. ``"imgaug.augmentables"``).
        is_package: When true, the module gets an empty ``__path__`` so the
            import system regards it as a package that can have submodules.
        **attributes: Names to set on the module object.

    Returns:
        The newly created ``types.ModuleType`` instance (not yet registered
        in ``sys.modules``).
    """
    module = types.ModuleType(name)
    module.__spec__ = ModuleSpec(name, loader=None, is_package=is_package)
    if is_package:
        module.__path__ = []
    for attr_name, attr_value in attributes.items():
        setattr(module, attr_name, attr_value)
    return module


# Build the hierarchy bottom-up so each parent references the very same
# child object that is registered in sys.modules.
_bbs_stub = _make_stub_module(
    "imgaug.augmentables.bbs",
    BoundingBox=BoundingBox,
    BoundingBoxesOnImage=BoundingBoxesOnImage,
)
_augmentables_stub = _make_stub_module(
    "imgaug.augmentables", is_package=True, bbs=_bbs_stub
)
_imgaug_stub = _make_stub_module(
    "imgaug", is_package=True, augmentables=_augmentables_stub
)

sys.modules["imgaug"] = _imgaug_stub
sys.modules["imgaug.augmentables"] = _augmentables_stub
sys.modules["imgaug.augmentables.bbs"] = _bbs_stub

print("imgaug stubbed.")

imgaug stubbed.


In [2]:
import os
import sys
from pathlib import Path

# Directory expected to contain the repo's code; the cell stops here if it
# is missing so the later import error is not the first symptom.
REPO_DIR = Path("/kaggle/working/Multi-scale-Attention-based-MIL")
assert REPO_DIR.exists(), f"Repo not found at {REPO_DIR}"

# Put the repo at the front of sys.path (only once, so re-running the cell
# does not keep growing the list) to make its top-level modules importable.
_repo_dir_str = str(REPO_DIR)
if _repo_dir_str not in sys.path:
    sys.path.insert(0, _repo_dir_str)

import main as repo_main

print("Imported repo_main from:", repo_main.__file__)

2026-02-24 11:07:14.458176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771931234.479819     448 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771931234.486605     448 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771931234.504035     448 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771931234.504054     448 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771931234.504057     448 computation_placer.cc:177] computation placer alr

Imported repo_main from: /kaggle/working/Multi-scale-Attention-based-MIL/main.py


In [3]:
import sys

# Build the repo's default argument namespace. sys.argv is swapped out for the
# duration of the call and restored afterwards.
_argv = sys.argv
sys.argv = [""]          # prevent argparse from reading notebook args
try:
    args = repo_main.config()  # IMPORTANT: this repo's config() takes NO args
finally:
    # Restore the kernel's original argv even if config() raises, so a failed
    # run does not leave sys.argv clobbered for later cells.
    sys.argv = _argv

print("Args built. Keys (first 25):", list(vars(args).keys())[:25])

Args built. Keys (first 25): ['output_dir', 'data_dir', 'clip_chk_pt_path', 'csv_file', 'feat_dir', 'img_dir', 'train', 'evaluation', 'eval_set', 'img_size', 'dataset', 'data_frac', 'label', 'num_classes', 'n_runs', 'start_run', 'val_split', 'n_folds', 'start_fold', 'mean', 'std', 'model_type', 'arch', 'swin_encoder', 'pretrained_swin_encoder']


In [4]:
# Notebook-specific overrides for `args`, applied in the order listed.
_arg_overrides = {
    # Run mode
    "train": True,
    "evaluation": False,
    "roi_eval": False,
    # Dataset / label
    "dataset": "PI_CAI",
    "label": "cs_pca",
    # Input locations
    "data_dir": "/kaggle/working",
    "csv_file": "labels_with_split.csv",   # use the one that exists in /kaggle/working
    "feat_dir": "picai_extracted_features",
    "feature_extraction": "offline",
    # MIL setup
    "mil_type": "instance",
    "pooling_type": "attention",
    # Where the run writes its outputs
    "output_dir": "/kaggle/working/MIL_runs",
}
for _arg_name, _arg_value in _arg_overrides.items():
    setattr(args, _arg_name, _arg_value)

# Optional: make it quick to test
# args.epochs = 1

# Echo the settings that matter most for this run; `epochs` is not set above,
# so it falls back to None when the namespace does not define it.
for _arg_name in ("train", "epochs", "csv_file", "feat_dir", "output_dir"):
    print(f"{_arg_name}:", getattr(args, _arg_name, None))

train: True
epochs: 9
csv_file: labels_with_split.csv
feat_dir: picai_extracted_features
output_dir: /kaggle/working/MIL_runs


In [5]:
from pathlib import Path

# Root directory that is searched for the pre-extracted feature files.
feat_root = Path(args.data_dir) / args.feat_dir
print("feat_root:", feat_root)
print("exists:", feat_root.exists())

# Walk the directory tree once and reuse the result for both the count and
# the examples (the tree was previously scanned twice).
_c4_feature_files = list(feat_root.rglob("C4_patch_features.pt"))

# count a known file type quickly
n = len(_c4_feature_files)
print("Num C4_patch_features.pt under feat_root:", n)

# show a couple examples
examples = _c4_feature_files[:3]
for e in examples:
    print("example:", e)

feat_root: /kaggle/working/picai_extracted_features
exists: True
Num C4_patch_features.pt under feat_root: 519
example: /kaggle/working/picai_extracted_features/multi_scale/11371_1001394/11371_1001394/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/10274_1000279/10274_1000279/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/11040_1001060/11040_1001060/C4_patch_features.pt


In [6]:
# Call the repo's main() entry point with the `args` namespace configured in
# the earlier cells.
repo_main.main(args)


torch.cuda.current_device(): 0

Using device: cuda
output_path: /kaggle/working/MIL_runs/MIL_experiments/PI_CAI_data_frac_1.0/cs_pca/offline_feature_extraction/single_scale-patch_size_16/instance/encoder_mlp-dim_256-dropout_0.0/pooling_attention-dropout_0.0-softmax/2026-02-24
df shape: (519, 5)
Index(['patient_id', 'image_id', 'cs_pca', 'fold', 'split'], dtype='object')


In [7]:
from utils.generic_utils import seed_all
from MIL.MIL_experiment import do_experiments
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

# seed_all is a repo helper (presumably seeds the RNGs — confirm in
# utils/generic_utils); it is called before the experiment starts.
seed_all(args.seed)

do_experiments(args, device)

# The repo prints output_path during the run; also print what args contains:
print("args.output_dir:", args.output_dir)
# output_path may not be present on args, hence the None fallback.
print("args.output_path (if exists):", getattr(args, "output_path", None))

Using device: cuda
df shape: (519, 5)
Index(['patient_id', 'image_id', 'cs_pca', 'fold', 'split'], dtype='object')
args.output_dir: /kaggle/working/MIL_runs
args.output_path (if exists): /kaggle/working/MIL_runs/MIL_experiments/PI_CAI_data_frac_1.0/cs_pca/offline_feature_extraction/single_scale-patch_size_16/instance/encoder_mlp-dim_256-dropout_0.0/pooling_attention-dropout_0.0-softmax/2026-02-24


In [8]:
from pathlib import Path

# Prefer the run-specific output_path when args has one; otherwise inspect
# the top-level output_dir.
out_path = Path(getattr(args, "output_path", args.output_dir))
print("Checking out_path:", out_path)
print("Exists?", out_path.exists())

if out_path.exists():
    # Every regular file below out_path, in sorted order.
    files = sorted(filter(Path.is_file, out_path.rglob("*")))
    print("Num files under out_path:", len(files))
    print("First 30 files:")
    for _listed_file in files[:30]:
        print(" -", _listed_file)

    # Anything matching *.pth is treated as a checkpoint.
    ckpts = list(out_path.rglob("*.pth"))
    print("\nNum .pth checkpoints:", len(ckpts))
    for _ckpt in ckpts[:10]:
        print(" -", _ckpt)

Checking out_path: /kaggle/working/MIL_runs/MIL_experiments/PI_CAI_data_frac_1.0/cs_pca/offline_feature_extraction/single_scale-patch_size_16/instance/encoder_mlp-dim_256-dropout_0.0/pooling_attention-dropout_0.0-softmax/2026-02-24
Exists? True
Num files under out_path: 1
First 30 files:
 - /kaggle/working/MIL_runs/MIL_experiments/PI_CAI_data_frac_1.0/cs_pca/offline_feature_extraction/single_scale-patch_size_16/instance/encoder_mlp-dim_256-dropout_0.0/pooling_attention-dropout_0.0-softmax/2026-02-24/args.yaml

Num .pth checkpoints: 0


In [9]:
# Quick sanity check: are we accidentally doing 0 work?
# Names that may or may not exist on `args`; only the ones present are shown.
keys_to_check = [
    "train",
    "n_runs",
    "epochs",
    "n_epochs",
    "max_epochs",
    "fold",
    "csv_file",
    "data_dir",
    "feat_dir",
    "feature_extraction",
    "mil_type",
    "pooling_type",
    "output_dir",
]

# Sentinel distinguishes "attribute absent" from "attribute set to None".
_absent = object()
for _key in keys_to_check:
    _value = getattr(args, _key, _absent)
    if _value is not _absent:
        print(f"{_key}: {_value}")

train: True
n_runs: 1
epochs: 9
csv_file: labels_with_split.csv
data_dir: /kaggle/working
feat_dir: picai_extracted_features
feature_extraction: offline
mil_type: instance
pooling_type: attention
output_dir: /kaggle/working/MIL_runs


In [10]:
from pathlib import Path
import glob

# Same feature root as used earlier: <data_dir>/<feat_dir>.
feat_root = Path(args.data_dir, args.feat_dir)
print("feat_root:", feat_root)
print("exists:", feat_root.exists())

# Collect every C4_patch_features.pt at any depth below feat_root.
pt_files = [*feat_root.rglob("C4_patch_features.pt")]
print("Num C4_patch_features.pt under feat_root:", len(pt_files))

# Print the first five hits as a spot check of the directory layout.
for _example in pt_files[:5]:
    print("example:", _example)

feat_root: /kaggle/working/picai_extracted_features
exists: True
Num C4_patch_features.pt under feat_root: 519
example: /kaggle/working/picai_extracted_features/multi_scale/11371_1001394/11371_1001394/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/10274_1000279/10274_1000279/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/11040_1001060/11040_1001060/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/10643_1000659/10643_1000659/C4_patch_features.pt
example: /kaggle/working/picai_extracted_features/multi_scale/11342_1001365/11342_1001365/C4_patch_features.pt


In [15]:
import pandas as pd
from pathlib import Path

# The CSV is read from <data_dir>/<csv_file>.
csv_path = Path(args.data_dir, args.csv_file)
print("Loading:", csv_path)

df = pd.read_csv(csv_path)
print("Loaded df shape:", df.shape)
print("Columns:", list(df.columns))

Loading: /kaggle/working/labels_with_split.csv
Loaded df shape: (519, 5)
Columns: ['patient_id', 'image_id', 'cs_pca', 'fold', 'split']


In [16]:
from pathlib import Path

# Upper bound on how many CSV rows are spot-checked against the feature tree.
MAX_IDS_TO_CHECK = 200

# detect if you have a "multi_scale" subdir
feat_root = Path(args.data_dir) / args.feat_dir
ms = feat_root / "multi_scale"
print("multi_scale exists:", ms.exists(), "-", ms)

# use multi_scale if present
base = ms if ms.exists() else feat_root

# check how many CSV ids have a matching feature file; the expected layout is
# <base>/<image_id>/<image_id>/C4_patch_features.pt
ids_to_check = df["image_id"].astype(str).tolist()[:MAX_IDS_TO_CHECK]
missing = [
    img_id
    for img_id in ids_to_check
    if not (base / img_id / img_id / "C4_patch_features.pt").exists()
]
found = len(ids_to_check) - len(missing)

# Report the number actually checked: the CSV may hold fewer rows than the
# limit, in which case a hard-coded "200" would be misleading.
print(f"Checked {len(ids_to_check)} image_ids")
print("Found:", found)
print("Missing:", len(missing))
print("First 10 missing:", missing[:10])

multi_scale exists: True - /kaggle/working/picai_extracted_features/multi_scale
Checked 200 image_ids
Found: 200
Missing: 0
First 10 missing: []


In [17]:
from pathlib import Path

# Look for a run_0 directory directly under output_dir.
run0 = Path(args.output_dir) / "run_0"
print("run0:", run0, "exists:", run0.exists())

if run0.exists():
    # Walk the tree once and reuse the result, so the listing and the count
    # always agree. Note: rglob("*") yields directories as well as files.
    _run0_entries = list(run0.rglob("*"))
    print("All files:", _run0_entries[:50])
    print("Num total files:", len(_run0_entries))

run0: /kaggle/working/MIL_runs/run_0 exists: True
All files: []
Num total files: 0


In [18]:
import inspect
import MIL.MIL_experiment as me

# Display the first 2000 characters of do_experiments' source as currently
# loaded from the repo module.
_do_experiments_src = inspect.getsource(me.do_experiments)
print(_do_experiments_src[:2000])

def do_experiments(args, device):
        
    args.n_class = 1 # Binary classification setup (single output neuron)
        
    # Define class labels based on selected task
    if args.label.lower() == 'mass':
        class0 = 'not_mass'
        class1 = 'mass'
    elif args.label.lower() == 'suspicious_calcification':
        class0 = 'not_calcification'
        class1 = 'calcification'   

    # FALLBACK_CLASS_NAMES

    if 'class0' not in locals() or 'class1' not in locals():

        class0 = 'not_cs_pca'

        class1 = 'cs_pca'


    label_dict = {class0: 0, class1: 1}

    ############################ Data Setup ############################
    args.data_dir = Path(args.data_dir)
    
    args.df = pd.read_csv(args.data_dir / args.csv_file)
    args.df = args.df.fillna(0)
    
    print(f"df shape: {args.df.shape}")
    print(args.df.columns)

    # Split data into dev (train+val) and test sets
    dev_df = args.df[args.df['split'].isin(["train", "val"])].reset_index(drop=Tr

In [19]:
from pathlib import Path

# In-place patch of the repo's dev-set filter: swap the == "training"
# comparison for an isin(["train", "val"]) check.
#
# NOTE(review): a module that is already imported in this kernel keeps running
# its old code; restart the kernel (or importlib.reload the module and
# re-import do_experiments) for the patch to take effect.
file_path = Path("/kaggle/working/Multi-scale-Attention-based-MIL/MIL/MIL_experiment.py")

_old_split_filter = "dev_df = args.df[args.df['split'] == \"training\"].reset_index(drop=True)"
_new_split_filter = "dev_df = args.df[args.df['split'].isin([\"train\", \"val\"])].reset_index(drop=True)"

text = file_path.read_text()

if _old_split_filter in text:
    # Only touch the file when there is something to change.
    text = text.replace(_old_split_filter, _new_split_filter)
    file_path.write_text(text)
    print("Updated MIL_experiment.py")
elif _new_split_filter in text:
    # Re-running the cell after a successful patch is a no-op.
    print("MIL_experiment.py already patched; nothing to do")
else:
    # Neither form is present: the repo code differs from what this patch
    # expects, so report it instead of claiming success.
    print("WARNING: expected split filter not found in MIL_experiment.py; file left unchanged")

Updated MIL_experiment.py
