### Proof-of-concept (POC) dataset: select all tiles from one whole-slide image (WSI) to train the US2mask (unstained-to-mask) segmentation model. The cells below build the US-mask pair dataset and create the train and test DataFrames used for training and inference:

In [2]:
import pandas as pd
import numpy as np
import os
import cv2
from glob import glob
from PIL import Image
Image.MAX_IMAGE_PIXELS = None
from tqdm import tqdm

 ### We want train_df and test_df to share the same layout, with the following columns:
 1. id (raw picture name)
 2. WSI it belongs to (important, as id doesn't contain that information)
 3. the US image file path
 4. the mask image file path
 5. the area (tissue comp) and frequency for each of the 12 class labels. (a 12 x 1 array)
 6. a one-hot encoding for each class label, and also generate image

Write a function that extracts this information, then save the resulting train and test DataFrames as Excel files. These DataFrames are used later in the pipeline by the Dataset and DataLoaders. The train/validation split is handled in the dataloaders, where stratification is applied.

In [3]:
# Load one example mask tile from the share as a NumPy array.
ex_masks = np.array(
    Image.open(r"\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\63073_35603xy4914.png")
)
def calculate_tissue_composition(mask_image, num_classes=12):
    """
    Measure how much of a label mask each class occupies.

    Parameters
    ----------
    mask_image : array-like
        Label mask; anything ``np.asarray`` accepts (ndarray, nested list, ...).
        Labels are taken to be the integers 1..num_classes. Elements with any
        other value (e.g. 0) are counted for no class but still contribute to
        the total element count used as the denominator.
    num_classes : int, default 12
        Number of class labels to report.

    Returns
    -------
    composition : np.ndarray of float, shape (num_classes,)
        Fraction of all mask elements carrying each label, rounded to 3 decimals.
    composition_freq : np.ndarray of int, shape (num_classes,)
        1 where the *rounded* fraction is greater than 0, otherwise 0. Because
        the flag is derived after rounding, a class whose fraction rounds down
        to 0.000 is reported as absent.

    An empty mask (zero elements) yields all-zero arrays instead of NaNs.
    """
    mask_image = np.asarray(mask_image)
    total_pixels = mask_image.size

    # Guard the division below: with no pixels there is nothing to measure.
    if total_pixels == 0:
        return np.zeros(num_classes), np.zeros(num_classes, dtype='int')

    # One count per label, in label order 1..num_classes.
    label_pixels = np.array(
        [np.count_nonzero(mask_image == label) for label in range(1, num_classes + 1)],
        dtype=float,
    )
    composition = np.round(label_pixels / total_pixels, 3)
    composition_freq = (composition > 0).astype('int')
    return composition, composition_freq

In [4]:
def create_train_test_df(train_mask_src_list, train_US_src_list):
    """
    Build one table describing every mask/US tile pair under the given WSI folders.

    Assumes the train/test split is already known to the caller:
    ``train_mask_src_list[i]`` and ``train_US_src_list[i]`` must be the mask and
    US tile directories of the same WSI, so both lists must have equal length.

    Parameters
    ----------
    train_mask_src_list : list of str
        Directories containing the mask ``.png`` tiles, one directory per WSI.
    train_US_src_list : list of str
        Directories containing the US ``.png`` tiles, in the same order.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` (mask file name without extension), ``wsi_name`` (name of
        the mask's parent directory), ``image_path``, ``mask_path``,
        ``composition`` and ``composition_freq`` (both stored as the string form
        produced by ``np.array2string``). The row index restarts at 0 for each
        WSI. An empty frame with these columns is returned when no tiles are found.

    Notes
    -----
    Masks and US tiles are paired by their position in the two directory
    listings. A count mismatch only prints a warning; if the US folder holds
    fewer tiles than the mask folder, indexing the US list raises ``IndexError``.
    """
    columns = ["id", "wsi_name", "image_path", "mask_path", "composition", "composition_freq"]
    wsi_frames = []  # one finished frame per WSI, concatenated once at the end

    for src_idx in tqdm(range(len(train_mask_src_list)), colour='red', desc='WSI Processed'):
        train_mask_src = train_mask_src_list[src_idx]
        train_US_src = train_US_src_list[src_idx]
        train_masklist = [os.path.join(train_mask_src, x)
                          for x in os.listdir(train_mask_src) if x.endswith(".png")]
        train_USlist = [os.path.join(train_US_src, x)
                        for x in os.listdir(train_US_src) if x.endswith(".png")]
        if len(train_USlist) != len(train_masklist):
            print("Recheck the mask and US pair, number of files in one of the pairs is not equal for {} and {}".format(train_US_src,train_mask_src))

        # Collect plain records and build the frame once per WSI; growing a
        # DataFrame row by row re-allocates it on every insertion.
        records = []
        for img_idx in tqdm(range(len(train_masklist)), colour='red', desc="Masks Processed per WSI"):
            masksrc = train_masklist[img_idx]
            # NOTE(review): positional pairing presumes both listings come back
            # in matching order -- confirm the US and mask tiles line up.
            imgsrc = train_USlist[img_idx]
            with Image.open(masksrc) as mask_file:
                mask_img = np.array(mask_file)
            composition, composition_freq = calculate_tissue_composition(mask_img)
            records.append({
                # os.path handles the platform's separators, unlike split("\\").
                "id": os.path.splitext(os.path.basename(masksrc))[0],
                "wsi_name": os.path.basename(os.path.dirname(masksrc)),
                "image_path": imgsrc,
                "mask_path": masksrc,
                "composition": np.array2string(composition),
                "composition_freq": np.array2string(composition_freq),
            })

        # Skip WSIs without tiles so no empty frame takes part in the concat.
        if records:
            wsi_frames.append(pd.DataFrame(records, columns=columns))

    if not wsi_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(wsi_frames, axis=0)

In [5]:
# Root folders holding one sub-folder of tiles per WSI.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
allmasksrc = [os.path.join(masksrc,x) for x in os.listdir(masksrc)]
# NOTE(review): the US folder paths are built from the mask root's folder names,
# which keeps both lists index-aligned but presumes every WSI folder exists
# under USsrc with the same name -- confirm.
allUSsrc = [os.path.join(USsrc,x) for x in os.listdir(masksrc)]
# Let's just choose OTS_14684_3!
# Select it by name: os.listdir() returns entries in arbitrary order, so a
# positional index is not guaranteed to land on this folder.
poc_wsi_name = "OTS_14684_3"
poc_masksrc = os.path.join(masksrc, poc_wsi_name)
poc_USsrc = os.path.join(USsrc, poc_wsi_name)
poc_train_df = create_train_test_df([poc_masksrc],[poc_USsrc])

WSI Processed:   0%|[31m          [0m| 0/1 [00:00<?, ?it/s]
Masks Processed per WSI:   0%|[31m          [0m| 0/7200 [00:00<?, ?it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 6/7200 [00:00<02:30, 47.93it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 12/7200 [00:00<02:18, 51.72it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 18/7200 [00:00<02:15, 53.18it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 24/7200 [00:00<02:13, 53.83it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 30/7200 [00:00<02:12, 54.20it/s][A
Masks Processed per WSI:   1%|[31m          [0m| 37/7200 [00:00<02:10, 54.82it/s][A
Masks Processed per WSI:   1%|[31m          [0m| 44/7200 [00:00<02:09, 55.25it/s][A
Masks Processed per WSI:   1%|[31m          [0m| 50/7200 [00:00<02:09, 55.15it/s][A
Masks Processed per WSI:   1%|[31m          [0m| 56/7200 [00:01<02:09, 55.00it/s][A
Masks Processed per WSI:   1%|[31m          [0m| 62/7200 [00:01<02:09, 55.01

In [6]:
# Preview the training table (the rendered output shows only the first and last rows).
poc_train_df

Unnamed: 0,id,wsi_name,image_path,mask_path,composition,composition_freq
0,44930_16605xy0001,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
1,44930_17629xy0002,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
2,44930_18653xy0003,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
3,44930_19677xy0004,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
4,44930_20701xy0005,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
...,...,...,...,...,...,...
7195,142210_89309xy7197,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
7196,142210_90333xy7198,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
7197,142210_91357xy7199,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
7198,142210_92381xy7200,OTS_14684_3,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_3\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]


In [7]:
# Destination folder for the POC tables.
dst_src = r"\\shelter\Kyu\unstain2mask\poc"
# Write the training table to an Excel workbook inside that folder.
poc_train_df.to_excel(excel_writer=os.path.join(dst_src, "train_df.xlsx"))

In [7]:
# do the same for inference to create test_df:
# Root folders holding one sub-folder of tiles per WSI.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
allmasksrc = [os.path.join(masksrc,x) for x in os.listdir(masksrc)]
# NOTE(review): the US folder paths are built from the mask root's folder names,
# which keeps both lists index-aligned but presumes every WSI folder exists
# under USsrc with the same name -- confirm.
allUSsrc = [os.path.join(USsrc,x) for x in os.listdir(masksrc)]
# Let's just choose OTS_14684_6!
# Select it by name: os.listdir() returns entries in arbitrary order, so a
# positional index is not guaranteed to land on this folder.
poc_wsi_name = "OTS_14684_6"
poc_masksrc = os.path.join(masksrc, poc_wsi_name)
poc_USsrc = os.path.join(USsrc, poc_wsi_name)
poc_test_df = create_train_test_df([poc_masksrc],[poc_USsrc])

WSI Processed:   0%|[31m          [0m| 0/1 [00:00<?, ?it/s]
Masks Processed per WSI:   0%|[31m          [0m| 0/13081 [00:00<?, ?it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 7/13081 [00:00<03:35, 60.74it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 14/13081 [00:00<03:45, 57.87it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 20/13081 [00:00<04:47, 45.37it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 26/13081 [00:00<04:29, 48.50it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 32/13081 [00:00<04:54, 44.38it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 39/13081 [00:00<04:20, 50.09it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 45/13081 [00:00<04:13, 51.43it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 51/13081 [00:01<04:08, 52.42it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 57/13081 [00:01<04:06, 52.89it/s][A
Masks Processed per WSI:   0%|[31m          [0m| 64/13081 [00:01<0

In [8]:
# Preview the test table (the rendered output shows only the first and last rows).
poc_test_df

Unnamed: 0,id,wsi_name,image_path,mask_path,composition,composition_freq
0,406_5016xy1031,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
1,406_6040xy1032,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
2,406_7064xy1033,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
3,406_8088xy1034,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
4,406_9112xy1035,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\4...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
...,...,...,...,...,...,...
13076,129430_105368xy14107,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
13077,129430_106392xy14108,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
13078,129430_107416xy14109,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]
13079,129430_108440xy14110,OTS_14684_6,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_6\1...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.],[0 0 0 0 0 0 0 0 0 0 0 1]


In [9]:
# Destination folder for the POC tables.
dst_src = r"\\shelter\Kyu\unstain2mask\poc"
# Write the test table to an Excel workbook inside that folder.
poc_test_df.to_excel(excel_writer=os.path.join(dst_src, "test_df.xlsx"))