In [1]:
import os
import numpy as np
import cv2
import openslide
from PIL import Image
from tqdm import tqdm

In [2]:
# -----------------------------
# PARAMETERS
# -----------------------------
# Folder scanned for whole-slide images (only *.mrxs files are picked up).
WSI_DIR = "D:/data/images/bloc"
# Folder that receives the extracted PNG patches.
OUTPUT_DIR = "D:/data/patches_bloc_60"
# Pyramid level the patches are read from.
LEVEL = 0
# Side length, in pixels, of each square patch.
PATCH_SIZE = 224
# Minimum fraction of tissue pixels a patch needs in order to be kept
# (OpenMidnight threshold).
TISSUE_THRESHOLD = 0.6

# Create the patch folder up front; an already existing folder is fine.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [19]:

# -----------------------------
# TISSUE MASK FUNCTION
# -----------------------------
def compute_tissue_mask(rgb_img):
    """
    Build a binary tissue mask by thresholding an RGB image in HSV space.

    A pixel is labelled as tissue when all three HSV channels lie inside
    the inclusive ranges below (per the original author's note: taken from
    the OpenMidnight paper and adjusted to "safran"):

        Hue        in [90, 170]
        Saturation in [8, 245]
        Value      in [103, 245]

    NOTE(review): for 8-bit input OpenCV stores hue as degrees / 2
    (0-179), so the hue bounds are presumably expressed on that scale --
    assumes rgb_img is uint8; confirm against the caller.

    Args:
        rgb_img: Image array in RGB channel order (it is converted with
            cv2.COLOR_RGB2HSV).

    Returns:
        uint8 array holding 1 for tissue and 0 elsewhere, cleaned with a
        5x5 morphological opening followed by a 5x5 closing.
    """
    hsv_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2HSV)

    hsv_min = np.array([90, 8, 103], dtype=np.uint8)
    hsv_max = np.array([170, 245, 245], dtype=np.uint8)

    # inRange flags hits with 255; rescale so the mask only holds 0 and 1.
    in_range = cv2.inRange(hsv_img, hsv_min, hsv_max)
    mask = (in_range > 0).astype(np.uint8)

    # Opening removes isolated specks, closing then fills small holes.
    struct_elem = np.ones((5, 5), np.uint8)
    for morph_op in (cv2.MORPH_OPEN, cv2.MORPH_CLOSE):
        mask = cv2.morphologyEx(mask, morph_op, struct_elem)

    return mask

# -----------------------------
# PROCESS ALL WSI FILES
# -----------------------------
# Tile every MIRAX slide found in WSI_DIR into PATCH_SIZE x PATCH_SIZE
# patches and save those containing enough tissue to OUTPUT_DIR.
# Sorted so slides are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
wsi_files = sorted(
    f for f in os.listdir(WSI_DIR)
    if f.lower().endswith(".mrxs")
)

print(f"Found {len(wsi_files)} WSI files")

for wsi_file in wsi_files:
    wsi_path = os.path.join(WSI_DIR, wsi_file)
    wsi_name = os.path.splitext(wsi_file)[0]

    print(f"\nProcessing {wsi_file}")

    # The context manager closes the slide even when a read or a save fails.
    with openslide.OpenSlide(wsi_path) as slide:
        width, height = slide.level_dimensions[LEVEL]
        # read_region() takes its location in level-0 coordinates, so grid
        # positions computed at LEVEL must be scaled back (factor 1 at level 0).
        downsample = slide.level_downsamples[LEVEL]
        patch_id = 0

        # "+ 1" keeps the last full tile when the dimension is an exact
        # multiple of PATCH_SIZE; partial border tiles are still skipped.
        for y in tqdm(range(0, height - PATCH_SIZE + 1, PATCH_SIZE), desc=wsi_name):
            for x in range(0, width - PATCH_SIZE + 1, PATCH_SIZE):
                location = (
                    int(round(x * downsample)),
                    int(round(y * downsample)),
                )

                patch = slide.read_region(
                    location,
                    LEVEL,
                    (PATCH_SIZE, PATCH_SIZE)
                ).convert("RGB")

                patch_np = np.array(patch)

                # Fraction of pixels flagged as tissue (mask holds 0/1).
                tissue_mask = compute_tissue_mask(patch_np)
                tissue_ratio = tissue_mask.mean()

                if tissue_ratio >= TISSUE_THRESHOLD:
                    # Black out everything that is not tissue before saving.
                    masked_patch = patch_np.copy()
                    masked_patch[tissue_mask == 0] = 0

                    # x / y in the name are pixel coordinates at LEVEL;
                    # patch_id counts the saved patches of this slide.
                    out_name = (
                        f"x{x}_y{y}_patch_{patch_id}_{wsi_name}.png"
                    )
                    out_path = os.path.join(OUTPUT_DIR, out_name)

                    Image.fromarray(masked_patch).save(out_path)
                    patch_id += 1

print("\nAll WSIs processed.")


Found 18 WSI files

Processing 25P13027_5_2_2_IHC.mrxs


25P13027_5_2_2_IHC: 100%|██████████| 1007/1007 [04:09<00:00,  4.03it/s]



Processing 25P13027_5_2_6_HES.mrxs


25P13027_5_2_6_HES: 100%|██████████| 1007/1007 [10:22<00:00,  1.62it/s]



Processing 25P16493_8_2_4_IHC.mrxs


25P16493_8_2_4_IHC: 100%|██████████| 1007/1007 [04:20<00:00,  3.86it/s]



Processing 25P16493_8_4_1_HES.mrxs


25P16493_8_4_1_HES: 100%|██████████| 1007/1007 [12:46<00:00,  1.31it/s]



Processing 25P16544_2_1_1_HES.mrxs


25P16544_2_1_1_HES: 100%|██████████| 1007/1007 [03:57<00:00,  4.24it/s]



Processing 25P16544_2_1_2_IHC.mrxs


25P16544_2_1_2_IHC: 100%|██████████| 1007/1007 [04:10<00:00,  4.02it/s]



Processing 25P17513_7_3_1_HES.mrxs


25P17513_7_3_1_HES: 100%|██████████| 1007/1007 [06:15<00:00,  2.68it/s]



Processing 25P17513_7_3_2_IHC.mrxs


25P17513_7_3_2_IHC: 100%|██████████| 1007/1007 [04:53<00:00,  3.44it/s]



Processing 25P18147_7_4_1_HES.mrxs


25P18147_7_4_1_HES: 100%|██████████| 1007/1007 [17:22<00:00,  1.04s/it]



Processing 25P18147_7_4_2_IHC.mrxs


25P18147_7_4_2_IHC: 100%|██████████| 1007/1007 [04:05<00:00,  4.10it/s]



Processing 25P7216_6_1_HES.mrxs


25P7216_6_1_HES: 100%|██████████| 1007/1007 [07:39<00:00,  2.19it/s]



Processing 25P7216_6_2_IHC.mrxs


25P7216_6_2_IHC: 100%|██████████| 1007/1007 [04:10<00:00,  4.03it/s]



Processing 25P8317_2_1_HES.mrxs


25P8317_2_1_HES: 100%|██████████| 1007/1007 [17:27<00:00,  1.04s/it]



Processing 25P8317_2_8_IHC.mrxs


25P8317_2_8_IHC: 100%|██████████| 1007/1007 [03:32<00:00,  4.74it/s]



Processing 25P9801_7_1_HES.mrxs


25P9801_7_1_HES: 100%|██████████| 1007/1007 [10:16<00:00,  1.63it/s]



Processing 25P9801_7_3_IHC.mrxs


25P9801_7_3_IHC: 100%|██████████| 1007/1007 [03:55<00:00,  4.28it/s]



Processing 25P9947_6_1_HES.mrxs


25P9947_6_1_HES: 100%|██████████| 1007/1007 [08:44<00:00,  1.92it/s]



Processing 25P9947_6_3_IHC.mrxs


25P9947_6_3_IHC: 100%|██████████| 1007/1007 [03:58<00:00,  4.22it/s]


All WSIs processed.





In [3]:
import glob
from pathlib import Path

In [4]:
# Collect the bare file name (folder stripped) of every PNG patch on disk.
paths = glob.glob('D:/data/patches_bloc_60/' + '*.png')
files = [path_obj.name for path_obj in map(Path, paths)]


In [5]:
# Remove the running "_patch_<n>" counter from every file name so that only
# the coordinates and the slide name remain. Note that the shortened names
# no longer match the files on disk, which still carry the counter.
import re

patch_counter_re = re.compile(r"_patch_\d+")
files_cleaned = [patch_counter_re.sub("", file_name) for file_name in files]


In [7]:
# Turn the cleaned file names into a one-column table (column "patch_id").
import pandas as pd

patch_dict = dict(patch_id=files_cleaned)
patch_df = pd.DataFrame(data=patch_dict)
patch_df


Unnamed: 0,patch_id
0,x10080_y167776_25P8317_2_1_HES.png
1,x10080_y168896_25P8317_2_1_HES.png
2,x10080_y169120_25P8317_2_1_HES.png
3,x10080_y170688_25P8317_2_1_HES.png
4,x10080_y172256_25P8317_2_1_HES.png
...,...
317448,x9856_y35168_25P7216_6_1_HES.png
317449,x9856_y35392_25P7216_6_1_HES.png
317450,x9856_y35616_25P7216_6_1_HES.png
317451,x9856_y36512_25P7216_6_1_HES.png


In [8]:
# Regex that splits a cleaned patch name into coordinates, slide id and stain.
pattern = (
    r"(?P<patch_coords>x\d+_y\d+)_"   # x..._y...
    r"(?P<slide_id>\d+P\d+)"           # 25P13027
    r"(?:_[^_]*)*"                     # any number of extra _xxx parts
    r"_(?P<stain>IHC|HES)"              # stain
)

# One column per named group; names that do not match yield NaN in all three.
parsed_fields = patch_df["patch_id"].str.extract(pat=pattern)
patch_df[["patch_coords", "slide_id", "stain"]] = parsed_fields

patch_df

Unnamed: 0,patch_id,patch_coords,slide_id,stain
0,x10080_y167776_25P8317_2_1_HES.png,x10080_y167776,25P8317,HES
1,x10080_y168896_25P8317_2_1_HES.png,x10080_y168896,25P8317,HES
2,x10080_y169120_25P8317_2_1_HES.png,x10080_y169120,25P8317,HES
3,x10080_y170688_25P8317_2_1_HES.png,x10080_y170688,25P8317,HES
4,x10080_y172256_25P8317_2_1_HES.png,x10080_y172256,25P8317,HES
...,...,...,...,...
317448,x9856_y35168_25P7216_6_1_HES.png,x9856_y35168,25P7216,HES
317449,x9856_y35392_25P7216_6_1_HES.png,x9856_y35392,25P7216,HES
317450,x9856_y35616_25P7216_6_1_HES.png,x9856_y35616,25P7216,HES
317451,x9856_y36512_25P7216_6_1_HES.png,x9856_y36512,25P7216,HES


In [27]:
# First export of the patch table, written without the index column.
# The same file is written again by the last cell once the labels are merged in.
patch_df.to_csv(path_or_buf="patch_bloc_60.csv", index=False)


In [32]:
# Distinct slide ids parsed from the patch names, in order of first appearance.
patch_df["slide_id"].unique()

array(['25P8317', '25P17513', '25P7216', '25P16493', '25P18147',
       '25P16544', '25P13027', '25P9947', '25P9801'], dtype=object)

In [28]:
import os
import zipfile
from pathlib import Path

# Pack every file under SOURCE_DIR into numbered zip archives, starting a new
# archive whenever the size budget of the current one would be exceeded.
SOURCE_DIR = Path("D:/data/patches_bloc_60")
# Dedicated name for the archive folder: the tiling cell writes its patches
# to OUTPUT_DIR, so rebinding OUTPUT_DIR here would send patches to the zip
# folder if that cell were re-run afterwards.
ZIP_DIR = Path("D:/zips")
ZIP_DIR.mkdir(parents=True, exist_ok=True)

# Budget per archive. It is compared against the summed on-disk file sizes
# (st_size), not against the compressed archive size.
MAX_SIZE = 4 * 1024**3  # 4 GB


def _open_zip_part(index):
    """Open archive number `index` in ZIP_DIR for writing."""
    return zipfile.ZipFile(
        ZIP_DIR / f"images_part_{index}.zip", "w", zipfile.ZIP_DEFLATED
    )


zip_index = 1
current_size = 0
zipf = _open_zip_part(zip_index)

try:
    for img in SOURCE_DIR.rglob("*"):
        if not img.is_file():
            continue

        img_size = img.stat().st_size

        # Roll over only when the current archive already holds something;
        # otherwise a single oversized file would leave an empty archive behind.
        if current_size > 0 and current_size + img_size > MAX_SIZE:
            zipf.close()
            zip_index += 1
            current_size = 0
            zipf = _open_zip_part(zip_index)

        # Store paths relative to SOURCE_DIR so archives unpack without the
        # absolute folder prefix.
        zipf.write(img, arcname=img.relative_to(SOURCE_DIR))
        current_size += img_size
finally:
    # Always finalize the open archive, even if reading or writing failed.
    zipf.close()


In [42]:
# Load the slide-level label sheet from the Excel workbook.
slides_target = pd.read_excel(io="D:/data/wsi.xlsx")

In [43]:
# Show the slide-level label table that was just loaded.
slides_target

Unnamed: 0,slide_id,target
0,25P8317,negative
1,25P17513,positive
2,25P7216,positive
3,25P16493,positive
4,25P18147,positive
5,25P16544,positive
6,25P13027,positive
7,25P9947,positive
8,25P9801,positive


In [47]:
# Attach the slide-level label to every patch row (left join on slide_id, so
# patches of slides missing from the label sheet are kept with NaN labels).
# Label columns left over from an earlier run of this cell are dropped first;
# otherwise re-running would create suffixed duplicates (target_x / target_y).
label_columns = [col for col in slides_target.columns if col != "slide_id"]
patch_df = (
    patch_df
    .drop(columns=label_columns, errors="ignore")
    .merge(
        slides_target,
        on="slide_id",
        how="left",
        # Each slide may appear only once in the label sheet; a duplicate
        # would silently multiply the patch rows, so fail loudly instead.
        validate="many_to_one",
    )
)
patch_df

Unnamed: 0,patch_id,patch_coords,slide_id,stain,target
0,x10080_y167776_25P8317_2_1_HES.png,x10080_y167776,25P8317,HES,negative
1,x10080_y168896_25P8317_2_1_HES.png,x10080_y168896,25P8317,HES,negative
2,x10080_y169120_25P8317_2_1_HES.png,x10080_y169120,25P8317,HES,negative
3,x10080_y170688_25P8317_2_1_HES.png,x10080_y170688,25P8317,HES,negative
4,x10080_y172256_25P8317_2_1_HES.png,x10080_y172256,25P8317,HES,negative
...,...,...,...,...,...
317448,x9856_y35168_25P7216_6_1_HES.png,x9856_y35168,25P7216,HES,positive
317449,x9856_y35392_25P7216_6_1_HES.png,x9856_y35392,25P7216,HES,positive
317450,x9856_y35616_25P7216_6_1_HES.png,x9856_y35616,25P7216,HES,positive
317451,x9856_y36512_25P7216_6_1_HES.png,x9856_y36512,25P7216,HES,positive


In [49]:
# Final export: patch table including the merged slide-level label,
# written with a header row and without the index column.
patch_df.to_csv(
    path_or_buf="patch_bloc_60.csv",
    index=False,
    header=True,
)