### Code from create_poc_dataset.ipynb: exactly the same workflow, except that we now hold out one WSI for testing and use all the remaining WSIs for training. Note that after picking OTS_14684_3 in its entirety, only the tiles with compositions not in ECM, Fat, and White are chosen (from the Excel sheet).

In [1]:
### Main dataset: select every WSI except one to train the US2mask segmentation model. Build the US-mask pair dataset below and create the train and test DataFrames used for training/inference:
import pandas as pd
import numpy as np
import os
import cv2
from glob import glob
from PIL import Image
Image.MAX_IMAGE_PIXELS = None
from tqdm import tqdm

def calculate_tissue_composition(mask_image, num_classes=12):
    """Measure how much of a label mask each tissue class occupies.

    Parameters
    ----------
    mask_image : numpy.ndarray
        Array of integer class labels. Its ``.size`` (element count) is used as
        the denominator, so every element counts as one pixel.
    num_classes : int, default 12
        Labels ``1..num_classes`` are measured; label ``0`` is not reported.

    Returns
    -------
    composition : numpy.ndarray of float, shape ``(num_classes,)``
        Fraction of elements carrying each label, rounded to 3 decimals.
        Entry ``k`` describes label ``k + 1``.
    composition_freq : numpy.ndarray of int, shape ``(num_classes,)``
        ``1`` where the rounded fraction is positive, else ``0``. Because the
        flag is derived after rounding, a class covering less than 0.0005 of
        the mask is reported as absent.
    """
    n_elements = mask_image.size
    fractions = np.zeros(num_classes)

    for slot, class_id in enumerate(range(1, num_classes + 1)):
        # NumPy scalar division keeps the NaN-on-empty-mask behaviour.
        hits = np.sum(mask_image == class_id)
        fractions[slot] = hits / n_elements

    fractions = np.round(fractions, 3)
    present = (fractions > 0).astype('int')
    return fractions, present


def create_train_test_df(train_mask_src_list, train_US_src_list):
    """
    Build a DataFrame with one row per mask/US tile pair across the given WSI folders.

    Assumes the train/test split is already known: ``train_mask_src_list`` and
    ``train_US_src_list`` must be equal-length lists in which entry ``i`` of each
    is the mask folder and the US folder of the same WSI.

    Within a WSI, the ".png" files of both folders are sorted by path and paired
    by position. This presumes both folders hold tiles whose names sort in the
    same order (e.g. identical file names) -- verify against the data layout.

    Parameters
    ----------
    train_mask_src_list : list of str
        Folders containing the mask PNGs, one folder per WSI.
    train_US_src_list : list of str
        Folders containing the US PNGs, in the same WSI order.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (mask file name without extension), ``wsi_name`` (name of
        the mask's parent folder), ``image_path``, ``mask_path``, ``composition``
        and ``composition_freq`` (both stored as ``np.array2string`` text).
        The index restarts at 0 for every WSI, because the per-WSI frames are
        concatenated without re-indexing.

    Raises
    ------
    ValueError
        If the two folder lists differ in length, or if a WSI's mask and US
        folders contain a different number of PNG files.
    """
    columns = ["id", "wsi_name", "image_path", "mask_path", "composition", "composition_freq"]

    if len(train_mask_src_list) != len(train_US_src_list):
        raise ValueError(
            "train_mask_src_list and train_US_src_list must have the same length, got {} and {}".format(
                len(train_mask_src_list), len(train_US_src_list)))
    if len(train_mask_src_list) == 0:
        return pd.DataFrame(columns=columns)

    wsi_frames = []
    wsi_pairs = zip(train_mask_src_list, train_US_src_list)
    for train_mask_src, train_US_src in tqdm(wsi_pairs, total=len(train_mask_src_list),
                                             colour='red', desc='WSI Processed'):
        # os.listdir returns entries in arbitrary order, so sort both listings
        # to make the positional mask <-> US pairing deterministic.
        train_masklist = sorted(
            os.path.join(train_mask_src, x) for x in os.listdir(train_mask_src) if x.endswith(".png"))
        train_USlist = sorted(
            os.path.join(train_US_src, x) for x in os.listdir(train_US_src) if x.endswith(".png"))

        # A count mismatch means at least one tile would be paired with the wrong
        # partner (or index out of range), so stop instead of writing bad pairs.
        if len(train_USlist) != len(train_masklist):
            raise ValueError(
                "Recheck the mask and US pair, number of files in one of the pairs is not equal for {} and {}".format(
                    train_US_src, train_mask_src))

        records = []
        tile_pairs = zip(train_masklist, train_USlist)
        for mask_path, image_path in tqdm(tile_pairs, total=len(train_masklist),
                                          colour='red', desc="Masks Processed per WSI"):
            # Close each file right after decoding; thousands of tiles are read per WSI.
            with Image.open(mask_path) as mask_file:
                mask_img = np.array(mask_file)
            composition, composition_freq = calculate_tissue_composition(mask_img)
            records.append({
                # os.path helpers honour the platform separator instead of a hard-coded "\\".
                "id": os.path.splitext(os.path.basename(mask_path))[0],
                "wsi_name": os.path.basename(os.path.dirname(mask_path)),
                "image_path": image_path,
                "mask_path": mask_path,
                "composition": np.array2string(composition),
                "composition_freq": np.array2string(composition_freq),
            })
        wsi_frames.append(pd.DataFrame(records, columns=columns))

    # Concatenate once at the end; skip empty frames so pandas does not have to
    # guess dtypes from them.
    non_empty = [frame for frame in wsi_frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame(columns=columns)
    return pd.concat(non_empty, axis=0)

In [None]:
# Build the training table: every WSI folder except the held-out one.
# Root folders (UNC paths) for the mask tiles and the US tiles.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
allmasksrc = [os.path.join(masksrc, x) for x in os.listdir(masksrc)]
# The US folder paths are built from the entry names listed under masksrc (not USsrc),
# so both lists share the same names and order.
# NOTE(review): presumably intentional to keep the two lists aligned -- confirm that
# every entry under masksrc also exists under USsrc.
allUSsrc = [os.path.join(USsrc, x) for x in os.listdir(masksrc)]
# Hold out the entry at index 3 (intended to be OTS_14684_6; that will be test data).
# NOTE(review): index 3 depends on the order returned by os.listdir, which Python
# documents as arbitrary -- confirm it really is OTS_14684_6 and that the test cell
# picks the same entry.
del(allmasksrc[3])
del(allUSsrc[3])
poc_train_df = create_train_test_df(allmasksrc, allUSsrc)
# Bare expression that is not the last statement of the cell, so nothing is displayed.
poc_train_df
# Write the training table as train_df.xlsx into dst_src.
dst_src = r"\\shelter\Kyu\unstain2mask\main"
poc_train_df.to_excel(os.path.join(dst_src, "train_df.xlsx"))

WSI Processed:   0%|[31m          [0m| 0/5 [00:00<?, ?it/s]
Masks Processed per WSI:   0%|[31m          [0m| 0/12905 [00:00<?, ?it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 4/12905 [00:00<06:24, 33.55it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 9/12905 [00:00<05:25, 39.58it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 14/12905 [00:00<05:10, 41.48it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 19/12905 [00:00<05:08, 41.82it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 24/12905 [00:00<05:04, 42.28it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 29/12905 [00:00<04:56, 43.38it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 34/12905 [00:00<04:51, 44.22it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 39/12905 [00:00<04:47, 44.78it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 44/12905 [00:01<04:52, 44.04it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 49/12905 [00:01<05

In [None]:
# Do the same for inference to create test_df, using only the held-out WSI.
# Root folders (UNC paths) for the mask tiles and the US tiles.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
allmasksrc = [os.path.join(masksrc, x) for x in os.listdir(masksrc)]
# The US folder paths are built from the entry names listed under masksrc (not USsrc).
# NOTE(review): presumably intentional to keep the two lists aligned -- confirm that
# every entry under masksrc also exists under USsrc.
allUSsrc = [os.path.join(USsrc, x) for x in os.listdir(masksrc)]
# Pick the entry at index 3 as the test WSI (intended to be OTS_14684_6).
# NOTE(review): index 3 depends on the order returned by os.listdir, which Python
# documents as arbitrary -- confirm this is the same entry the training cell removed,
# otherwise the train and test sets overlap.
poc_masksrc = allmasksrc[3]
poc_USsrc = allUSsrc[3]
poc_test_df = create_train_test_df([poc_masksrc], [poc_USsrc])
# Bare expression that is not the last statement of the cell, so nothing is displayed.
poc_test_df
# Write the test table as test_df.xlsx into dst_src.
dst_src = r"\\shelter\Kyu\unstain2mask\main"
poc_test_df.to_excel(os.path.join(dst_src, "test_df.xlsx"))