In [1]:
import os
import glob
import random
import cv2
import openslide
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from tqdm.auto import tqdm

In [None]:
# Patch extraction for the SR-Hist-Foundation dataset.
#
# Every *.svs slide under each tcga* project folder is tiled on a regular,
# non-overlapping grid. Grid cells are screened on the lowest-resolution
# pyramid level with an Otsu mask; cells that pass are randomly sub-sampled,
# read from level 0 and written as an HR PNG plus a 4x down-sampled LR PNG.
final_patch_size = 512  # HR patch edge length in pixels
tissue_threshold = 0.7  # minimum fraction of mask pixels that must be set
lr_x4_size = final_patch_size // 4  # LR-x4 edge length (128 for 512 px HR patches)

# level_of_interest -> (level-0 footprint multiplier, sampling rate).
# "Level 1" patches are not read from pyramid level 1: a 2x larger level-0
# region is read and then resized down to final_patch_size.
level_settings = {0: (1, 0.1), 1: (2, 0.4)}

dest = "/data/SR-Hist-Foundation/"
# The output folders are loop-invariant, so create them once up front.
os.makedirs(dest + "HR", exist_ok=True)
os.makedirs(dest + "LR-x4", exist_ok=True)

# Seed the sampler and sort the listings so that a re-run over the same files
# selects the same patches.
random.seed(42)
project_list = sorted(glob.glob("/data/SR-Hist-Foundation/tcga*"))

for level_of_interest, (footprint, sampling_rate) in level_settings.items():
    patch_size = final_patch_size * footprint
    step_size = patch_size  # stride == patch size, i.e. no overlap

    for project in project_list:
        print(project)
        project_name = project.split("/")[-1]
        flist = sorted(glob.glob(f"{project}/*.svs"))

        for f in tqdm(flist, leave=False):
            # File name up to the first "."; used as the slide part of the patch name.
            slide_name = f.split("/")[-1].split(".")[0]

            # The context manager closes the slide handle even when a read
            # fails; previously the handle was never closed.
            with openslide.OpenSlide(f) as slide:
                dimensions = slide.level_dimensions
                lowest_level = len(dimensions) - 1

                # Width ratio between level 0 and the lowest-resolution level.
                scaling_factor = round(dimensions[0][0] / dimensions[-1][0])
                # Clamp to >= 1 so range() below never receives a zero step.
                scaled_patch_size = max(1, round(patch_size / scaling_factor))
                scaled_step = max(1, round(step_size / scaling_factor))

                thumbnail = slide.read_region((0, 0), lowest_level, dimensions[-1])
                thumbnail = np.array(thumbnail).astype(np.uint8)
                # read_region yields an RGBA image, so use the RGBA conversion
                # code; COLOR_BGR2GRAY would swap the red/blue luminance weights.
                gray_image = cv2.cvtColor(thumbnail, cv2.COLOR_RGBA2GRAY)

                # Inverted Otsu threshold: pixels darker than the threshold become 255.
                _, binary_image = cv2.threshold(
                    gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
                )

                thumb_width, thumb_height = dimensions[-1]
                for row in range(0, thumb_height, scaled_step):
                    for col in range(0, thumb_width, scaled_step):
                        patch = binary_image[row:row + scaled_patch_size, col:col + scaled_patch_size]
                        tissue_ratio = patch.mean() / 255.

                        # Keep only tissue-rich cells, then sub-sample them.
                        if tissue_ratio < tissue_threshold or random.random() >= sampling_rate:
                            continue

                        # OpenSlide locations are (x, y) = (column, row) in level-0 pixels.
                        orig_coords = (col * scaling_factor, row * scaling_factor)

                        ROI = slide.read_region(orig_coords, 0, (patch_size, patch_size))
                        if ROI.size[0] != final_patch_size:
                            ROI = ROI.resize((final_patch_size, final_patch_size), Image.BICUBIC)

                        # The HR and LR files share one name so pairs can be matched later.
                        dest_fname = (
                            f"{project_name}_{slide_name}"
                            f"_Level-{level_of_interest}-{orig_coords[0]}-{orig_coords[1]}.png"
                        )
                        ROI.save(os.path.join(dest, "HR", dest_fname))

                        down_sampled_ROI = ROI.resize((lr_x4_size, lr_x4_size), Image.BICUBIC)
                        down_sampled_ROI.save(os.path.join(dest, "LR-x4", dest_fname))
                        # An "LR-x8" variant (32 px down-sample re-upsampled to 128 px)
                        # was sketched here earlier but is not generated.

In [None]:
# Inventory of the extracted HR patches: one row per PNG file.
hr_paths = glob.glob("/data/SR-Hist-Foundation/HR/*.png")

result = pd.DataFrame(hr_paths, columns=["fname"]).assign(
    # Second "_"-separated token of the full path, stored under "project".
    project=lambda frame: frame["fname"].map(lambda path: path.split("_")[1]),
    # 1 when "Level-1" occurs past the first character of the path, else 0.
    level=lambda frame: frame["fname"].map(lambda path: int(path.find("Level-1") > 0)),
)

# Number of patches per project (rows) and level (columns).
result.pivot_table(index="project", columns="level", aggfunc="count", fill_value=0)

In [5]:
# Stratified split: within every (project, level) group the file names are
# shuffled with a fixed seed and cut into valid / test / train partitions.
train_df, valid_df, test_df = [], [], []
valid_ratio, test_ratio = 0.05, 0.005

for project_id in pd.unique(result.project):
    for level_of_interest in [0, 1]:
        in_group = (result.project == project_id) & (result.level == level_of_interest)
        shuffled = result.loc[in_group].sample(frac=1, random_state=42)["fname"].tolist()
        total_len = len(shuffled)

        # round() may yield 0 for small groups, which then contribute only to train.
        valid_samples = round(total_len * valid_ratio)
        test_samples = round(total_len * test_ratio)
        test_end = valid_samples + test_samples

        # Positional, end-exclusive slices keep the three partitions disjoint.
        # The previous label-based `.loc[a:b]` slices were end-inclusive, so the
        # boundary rows were placed in two partitions at once (valid/test and
        # test/train leakage).
        valid_df.extend(shuffled[:valid_samples])
        test_df.extend(shuffled[valid_samples:test_end])
        train_df.extend(shuffled[test_end:])

# Every file must be assigned to exactly one partition.
assert len(train_df) + len(valid_df) + len(test_df) == len(result)
assert not (set(valid_df) & set(test_df))
assert not (set(train_df) & (set(valid_df) | set(test_df)))

In [None]:
# Number of file names assigned to the train, valid and test partitions.
tuple(map(len, (train_df, valid_df, test_df)))

In [None]:
import os 
import copy
import shutil
from tqdm import tqdm

def copy_files_to_split_folder(df, split, root="/data/SR-Hist-Foundation"):
    """Copy paired HR / LR-x4 patches into the folders of one dataset split.

    For every HR path in ``df`` the file is copied to ``<root>/HR_<split>/``
    and the LR file with the same name is copied from ``<root>/LR-x4/`` to
    ``<root>/LR-x4_<split>/``. Files that already exist at the destination
    are not copied again.

    Paths are built from directory components rather than with
    ``str.replace("HR", ...)`` on the whole path, which also rewrote any "HR"
    occurring inside a file name and produced a wrong destination.

    Args:
        df: Iterable of HR image paths (the notebook passes plain lists).
        split: One of "train", "valid" or "test".
        root: Dataset root holding the ``HR`` and ``LR-x4`` folders. The
            default is the location used throughout this notebook.

    Raises:
        AssertionError: If ``split`` is not one of the accepted names.
    """
    assert split in ["train", "valid", "test"]

    hr_split_dir = os.path.join(root, f"HR_{split}")
    lr_split_dir = os.path.join(root, f"LR-x4_{split}")
    os.makedirs(hr_split_dir, exist_ok=True)
    os.makedirs(lr_split_dir, exist_ok=True)

    for f in tqdm(df):
        fname = os.path.basename(f)
        HR_source = f
        HR_dest = os.path.join(hr_split_dir, fname)
        LR_x4_source = os.path.join(root, "LR-x4", fname)
        LR_x4_dest = os.path.join(lr_split_dir, fname)

        try:
            if not os.path.exists(HR_dest):
                shutil.copy(HR_source, HR_dest)
        except OSError:
            # Catch only file-system errors so Ctrl-C still interrupts the loop.
            print(f"HR file missing: {HR_source}")
            # Without the HR image an LR copy would be unpaired, so skip it.
            continue

        try:
            if not os.path.exists(LR_x4_dest):
                shutil.copy(LR_x4_source, LR_x4_dest)
        except OSError:
            print(f"LR-x4 file missing: {LR_x4_source}")
            # Keep the split paired: drop the HR copy that has no LR partner.
            # The existence check prevents a second error inside this handler.
            if os.path.exists(HR_dest):
                os.remove(HR_dest)

# Materialize the test partition on disk.
copy_files_to_split_folder(df=test_df, split="test")

In [None]:
# Sanity check: the HR and LR-x4 test folders should hold the same number of PNGs.
hr_test_files = glob.glob("/data/SR-Hist-Foundation/HR_test/*.png")
lr_x4_test_files = glob.glob("/data/SR-Hist-Foundation/LR-x4_test/*.png")
len(hr_test_files), len(lr_x4_test_files)

In [None]:
# Materialize the validation partition on disk.
copy_files_to_split_folder(df=valid_df, split="valid")

In [None]:
# Sanity check: the HR and LR-x4 validation folders should hold the same number of PNGs.
hr_valid_files = glob.glob("/data/SR-Hist-Foundation/HR_valid/*.png")
lr_x4_valid_files = glob.glob("/data/SR-Hist-Foundation/LR-x4_valid/*.png")
len(hr_valid_files), len(lr_x4_valid_files)

In [12]:
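# Copying the train partition is currently disabled; uncomment the call below to populate HR_train / LR-x4_train.
# copy_files_to_split_folder(train_df, "train")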

In [13]:
# File-count check for the train folders; disabled together with the train copy step.
# len(glob.glob("/data/SR-Hist-Foundation/HR_train/*.png")), len(glob.glob("/data/SR-Hist-Foundation/LR-x4_train/*.png"))