In [1]:
# Mount Google Drive: the cohort CSV read later in this notebook lives under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [2]:
!apt-get update -qq
!apt-get install -y openslide-tools
!pip install openslide-python


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 103 not upgraded.
Need to get 104 kB of archives.
After this operation, 297 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopenslide0 amd64 3.4.1+dfsg-5build1 [89.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 openslide-tools amd64 3.4.1+dfsg-5build1 [13.8 kB]
Fetched 104 kB in 1s (105 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 121689 files and directories currentl

In [3]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from tqdm import tqdm


In [4]:
COHORT_PATH = "/content/drive/MyDrive/permanent_data_folder/cohort.csv"

# The file carries a saved index that pandas reads back as an "Unnamed: 0"
# column. Drop it so it does not leak into anything derived from cohort_df;
# errors="ignore" keeps this working if the CSV is later re-exported without
# that column.
cohort_df = pd.read_csv(COHORT_PATH).drop(columns=["Unnamed: 0"], errors="ignore")

print("Patients:", len(cohort_df))
cohort_df.head()


Patients: 193


Unnamed: 0,patient_id,time,event,rnaseq_path,slide_path
0,TCGA-A8-A07E,608.0,0,/content/drive/MyDrive/permanent_data_folder/T...,/content/drive/MyDrive/permanent_data_folder/T...
1,TCGA-D8-A146,643.0,0,/content/drive/MyDrive/permanent_data_folder/T...,/content/drive/MyDrive/permanent_data_folder/T...
2,TCGA-A8-A09K,912.0,0,/content/drive/MyDrive/permanent_data_folder/T...,/content/drive/MyDrive/permanent_data_folder/T...
3,TCGA-A2-A0YT,723.0,1,/content/drive/MyDrive/permanent_data_folder/T...,/content/drive/MyDrive/permanent_data_folder/T...
4,TCGA-C8-A130,370.0,0,/content/drive/MyDrive/permanent_data_folder/T...,/content/drive/MyDrive/permanent_data_folder/T...


In [5]:
# Side length (in pixels) of every patch handed to the network.
IMG_SIZE = 256

# Preprocessing applied to each PIL patch: resize to IMG_SIZE x IMG_SIZE,
# convert to a tensor, then normalise each channel with the mean/std below.
transform = transforms.Compose(
    [
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)


In [6]:
# Run on the GPU when Colab provides one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `pretrained=True` is deprecated in torchvision (>= 0.13). IMAGENET1K_V1 is the
# checkpoint it resolved to (resnet50-0676ba61.pth), so features stay comparable
# with anything already saved; `ResNet50_Weights.DEFAULT` would silently switch
# to a different checkpoint.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet.fc = nn.Identity()  # drop the classifier head: output = 2048-dim embedding
resnet = resnet.to(device)
resnet.eval()  # inference mode (fixed BatchNorm statistics)

# The network is used purely as a frozen feature extractor.
for param in resnet.parameters():
    param.requires_grad = False




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:01<00:00, 90.6MB/s]


In [7]:
import openslide
import random

def extract_random_patches(svs_path, num_patches=50, level=0, seed=None):
    """Sample random square patches from a whole-slide image.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        num_patches: Number of patches to draw (positions are sampled
            independently, so patches may overlap).
        level: Pyramid level the pixels are read from.
        seed: Optional seed for a private random generator so the sampled
            positions are reproducible. With ``None`` (the default) the global
            ``random`` state is used.

    Returns:
        A tensor stacking the ``transform``-ed patches along a new first
        dimension (one entry per patch).

    Raises:
        ValueError: If the requested level is smaller than one patch.
    """
    rng = random if seed is None else random.Random(seed)

    # The context manager closes the slide handle even when a read fails.
    with openslide.OpenSlide(svs_path) as slide:
        w, h = slide.level_dimensions[level]
        if w < IMG_SIZE or h < IMG_SIZE:
            raise ValueError(
                f"Level {level} of {svs_path} is {w}x{h}, smaller than a "
                f"{IMG_SIZE}x{IMG_SIZE} patch"
            )

        # read_region() expects the top-left corner in level-0 coordinates,
        # while x/y below are sampled in the coordinate system of `level`.
        # The factor is 1.0 for level 0, so the default behaviour is unchanged.
        scale = slide.level_downsamples[level]

        patches = []
        for _ in range(num_patches):
            x = rng.randint(0, w - IMG_SIZE)
            y = rng.randint(0, h - IMG_SIZE)
            location = (int(x * scale), int(y * scale))
            patch = slide.read_region(location, level, (IMG_SIZE, IMG_SIZE))
            # read_region returns RGBA; the model expects 3-channel input.
            patches.append(transform(patch.convert("RGB")))

    return torch.stack(patches)


In [8]:
@torch.no_grad()
def embed_patient_slide_tokens(
    svs_path,
    num_patches=50,
    top_k=8
):
    """Embed random patches of one slide and keep the highest-norm features.

    Args:
        svs_path: Path of the slide, forwarded to ``extract_random_patches``.
        num_patches: Number of random patches to embed.
        top_k: Maximum number of patch embeddings ("tokens") to keep.

    Returns:
        NumPy array holding the selected rows of the ``resnet`` output, ordered
        by decreasing L2 norm. It has ``min(top_k, num_patches)`` rows.
    """
    patches = extract_random_patches(svs_path, num_patches)
    patches = patches.to(device)

    feats = resnet(patches)  # (num_patches, 2048)

    # Token selection (this replaces mean pooling): rank patches by the L2 norm
    # of their embedding and keep the strongest ones.
    norms = feats.norm(dim=1)
    # topk() raises if asked for more entries than exist, so clamp k to the
    # number of patches actually embedded.
    k = min(top_k, feats.shape[0])
    topk_idx = norms.topk(k).indices

    tokens = feats[topk_idx]  # (k, 2048)
    return tokens.cpu().numpy()


In [9]:
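# NOTE: legacy cell, kept for reference only. It calls `embed_patient_slide`,
# which is not defined anywhere in this notebook, and it writes to
# `image_embeddings`; the active pipeline is the token-extraction cell below,
# which writes to `image_tokens`.
# SAVE_DIR = "/content/drive/MyDrive/permanent_data_folder/TCGA_BRCA_data/image_embeddings"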
# os.makedirs(SAVE_DIR, exist_ok=True)

# def embedding_exists(pid):
#     return os.path.exists(os.path.join(SAVE_DIR, f"{pid}.npy"))

# completed = sum(embedding_exists(pid) for pid in cohort_df["patient_id"])
# print(f"Already completed: {completed} / {len(cohort_df)}")

# for _, row in tqdm(cohort_df.iterrows(), total=len(cohort_df)):
#     pid = row["patient_id"]
#     svs_path = row["slide_path"]

#     if embedding_exists(pid):
#         continue  # ✅ checkpoint skip

#     try:
#         emb = embed_patient_slide(svs_path, num_patches=50)
#         np.save(os.path.join(SAVE_DIR, f"{pid}.npy"), emb)
#     except Exception as e:
#         print(f"Failed for {pid}: {e}")


In [None]:
TOKEN_DIR = "/content/drive/MyDrive/permanent_data_folder/TCGA_BRCA_data/image_tokens"
os.makedirs(TOKEN_DIR, exist_ok=True)

failed = []  # (patient_id, error message) for every slide that could not be processed

for _, row in tqdm(cohort_df.iterrows(), total=len(cohort_df)):
    pid = row["patient_id"]
    out_path = os.path.join(TOKEN_DIR, f"{pid}.npy")

    # Checkpointing: a patient whose file already exists is skipped, so the
    # cell can be re-run after an interruption.
    if os.path.exists(out_path):
        continue

    # Write to a temporary file and rename it into place. Saving straight to
    # out_path could leave a truncated .npy behind if the run is interrupted
    # mid-write, and the existence check above would then skip it forever.
    tmp_path = out_path + ".tmp"
    try:
        tokens = embed_patient_slide_tokens(row["slide_path"])
        # Passing a file object stops np.save from appending ".npy" to the name.
        with open(tmp_path, "wb") as fh:
            np.save(fh, tokens)  # shape (K, 2048)
        os.replace(tmp_path, out_path)
    except Exception as e:
        # Best effort: one unreadable slide must not abort the whole run.
        print("Failed:", pid, e)
        failed.append((pid, str(e)))
    finally:
        # After a successful rename the temp file is gone; otherwise clean it up.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

print("Slides that failed:", len(failed))


 92%|█████████▏| 177/193 [1:57:40<47:48, 179.31s/it]

In [None]:
# Count only the .npy outputs so stray files in the folder are not reported as
# saved embeddings.
sum(name.endswith(".npy") for name in os.listdir(TOKEN_DIR)), "embeddings saved"


In [None]:
import os

# A patient counts as image-valid when the extraction step produced a file for
# them. The active extraction loop in this notebook writes to `image_tokens`;
# `image_embeddings` is only written by the disabled legacy cell, so listing it
# would report stale (or missing) results.
EMB_DIR = "/content/drive/MyDrive/permanent_data_folder/TCGA_BRCA_data/image_tokens"

image_pids = {
    # splitext removes only the trailing extension; str.replace would strip
    # every ".npy" occurrence in the name.
    os.path.splitext(f)[0]
    for f in os.listdir(EMB_DIR)
    if f.endswith(".npy")
}

print("Image-valid patients:", len(image_pids))


In [None]:
# Keep only the cohort rows whose patient_id is in image_pids. The copy makes
# cohort_img an independent frame rather than a view of cohort_df.
cohort_img = cohort_df.loc[cohort_df["patient_id"].isin(image_pids)].copy()

print("Final image-valid cohort:", cohort_img.shape[0])


In [None]:
# Destination of the filtered cohort table.
IMG_COHORT_PATH = "/content/drive/MyDrive/permanent_data_folder/TCGA_BRCA_data/cohort_image_valid.csv"

# index=False: write the data columns only, without the DataFrame index.
cohort_img.to_csv(path_or_buf=IMG_COHORT_PATH, index=False)
print("Saved:", IMG_COHORT_PATH)
