### Code from create_poc_dataset.ipynb:

In [4]:
### Adapted from create_poc_dataset.ipynb: the workflow is exactly the same, except that one WSI is now left out for testing and all remaining WSIs are used for training. Note: OTS_14684_6 is used in its entirety; for the other training WSIs, tiles whose composition (taken from the Excel sheet) is mostly ECM, Fat, and White are not chosen.
### Main dataset: select all images except one WSI to train the US2mask segmentation model. The code below creates the US-mask pair dataset and the train and test DataFrames used in training/inference:
import pandas as pd
import numpy as np
import os
import cv2
from glob import glob
from PIL import Image

Image.MAX_IMAGE_PIXELS = None
from tqdm import tqdm


def calculate_tissue_composition(mask_image, num_classes=12):
    """
    Measure how much of a label mask is covered by each class.

    Args:
        mask_image: NumPy array of integer class labels. Labels 1..num_classes
            are counted; any other value (including 0) only contributes to the
            total element count.
        num_classes: Number of labels to report, starting at label 1.

    Returns:
        A tuple (composition, composition_freq):
        - composition: float array of length num_classes; entry i is the
          fraction of all mask elements equal to label i + 1, rounded to 3
          decimals.
        - composition_freq: int array of the same length; 1 where the rounded
          fraction is greater than 0, otherwise 0. Classes covering too little
          of the mask to survive the rounding are therefore reported as 0.
    """
    n_elements = mask_image.size
    # One element count per label, in label order 1..num_classes.
    per_label_counts = [
        np.count_nonzero(mask_image == class_label)
        for class_label in range(1, num_classes + 1)
    ]
    fractions = np.round(np.array(per_label_counts, dtype=float) / n_elements, 3)
    presence = (fractions > 0).astype('int')
    return fractions, presence


def create_train_test_df(train_mask_src_list, train_US_src_list):
    """
    Build one table describing every mask/US tile pair found in the given folders.

    Args:
        train_mask_src_list: List of folder paths; each folder holds the mask
            ".png" tiles of one WSI. The folder name is stored as "wsi_name".
        train_US_src_list: List of folder paths holding the matching US ".png"
            tiles, in the same order as train_mask_src_list.

    Returns:
        A DataFrame with the columns "id", "wsi_name", "image_path",
        "mask_path", "composition" and "composition_freq". "id" is the mask
        file name without its extension; the two composition columns hold the
        arrays returned by calculate_tissue_composition, converted to strings
        with np.array2string. The row index restarts at 0 for every folder.

    Notes:
        Within each folder pair the ".png" files are sorted and paired by
        position, because os.listdir gives no ordering guarantee. If the two
        folders hold a different number of files, a warning is printed and
        only as many pairs as the shorter listing allows are used.
    """
    columns = ["id", "wsi_name", "image_path", "mask_path", "composition", "composition_freq"]
    per_wsi_frames = []

    for src_idx in tqdm(range(len(train_mask_src_list)), colour='red', desc='WSI Processed'):
        train_mask_src = train_mask_src_list[src_idx]
        train_US_src = train_US_src_list[src_idx]
        # Sort both listings so that the positional pairing below is deterministic.
        train_masklist = sorted(
            os.path.join(train_mask_src, x) for x in os.listdir(train_mask_src) if x.endswith(".png"))
        train_USlist = sorted(
            os.path.join(train_US_src, x) for x in os.listdir(train_US_src) if x.endswith(".png"))
        if len(train_USlist) != len(train_masklist):
            print("Recheck the mask and US pair, number of files in one of the pairs is not equal for {} and {}".format(
                train_US_src, train_mask_src))

        tile_pairs = list(zip(train_masklist, train_USlist))
        records = []  # one dict per tile; turned into a DataFrame once per WSI
        for masksrc, imgsrc in tqdm(tile_pairs, colour='red', desc="Masks Processed per WSI"):
            # Close the file handle as soon as the pixel data has been copied out.
            with Image.open(masksrc) as mask_file:
                mask_img = np.array(mask_file)
            composition, composition_freq = calculate_tissue_composition(mask_img)
            records.append({
                # os.path handles whichever separator the platform uses.
                "id": os.path.splitext(os.path.basename(masksrc))[0],
                "wsi_name": os.path.basename(os.path.dirname(masksrc)),
                "image_path": imgsrc,
                "mask_path": masksrc,
                "composition": np.array2string(composition),
                "composition_freq": np.array2string(composition_freq),
            })
        per_wsi_frames.append(pd.DataFrame(records, columns=columns))

    if not per_wsi_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(per_wsi_frames, axis=0)


In [None]:
# Build train_df from every WSI folder except the held-out test WSI.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
test_wsi_name = "OTS_14684_3"  # this WSI is reserved for the test set

# os.listdir returns entries in arbitrary order, so list the folder once, sort it,
# and pick the held-out WSI by name instead of by list position.
wsi_folder_names = sorted(os.listdir(masksrc))
if test_wsi_name not in wsi_folder_names:
    raise ValueError("Held-out WSI folder {} not found in {}".format(test_wsi_name, masksrc))
train_wsi_folder_names = [x for x in wsi_folder_names if x != test_wsi_name]

allmasksrc = [os.path.join(masksrc, x) for x in train_wsi_folder_names]
# The US folders reuse the names from the mask listing, so both lists line up by WSI.
allUSsrc = [os.path.join(USsrc, x) for x in train_wsi_folder_names]

poc_train_df = create_train_test_df(allmasksrc, allUSsrc)
dst_src = r"\\shelter\Kyu\unstain2mask\main"
poc_train_df.to_excel(os.path.join(dst_src, "train_df.xlsx"))

In [None]:
# Build test_df from the single held-out WSI, using the same function as for train_df.
masksrc = r"\\shelter\Kyu\unstain2mask\masks"
USsrc = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
test_wsi_name = "OTS_14684_3"  # the WSI that was left out of the training set

# os.listdir returns entries in arbitrary order, so list the folder once, sort it,
# and pick the held-out WSI by name instead of by list position.
wsi_folder_names = sorted(os.listdir(masksrc))
if test_wsi_name not in wsi_folder_names:
    raise ValueError("Held-out WSI folder {} not found in {}".format(test_wsi_name, masksrc))

allmasksrc = [os.path.join(masksrc, x) for x in wsi_folder_names]
# The US folders reuse the names from the mask listing, so both lists line up by WSI.
allUSsrc = [os.path.join(USsrc, x) for x in wsi_folder_names]

poc_masksrc = os.path.join(masksrc, test_wsi_name)
poc_USsrc = os.path.join(USsrc, test_wsi_name)
poc_test_df = create_train_test_df([poc_masksrc], [poc_USsrc])
dst_src = r"\\shelter\Kyu\unstain2mask\main"
poc_test_df.to_excel(os.path.join(dst_src, "test_df.xlsx"))

In [None]:
# Prepare to edit train_df: OTS_14684_6 keeps all of its tiles, while the other
# training WSIs should not sample ECM, Fat, and Whitespace tiles.
dst_src = r"\\shelter\Kyu\unstain2mask\main"
saved_train_df_src = os.path.join(dst_src, "train_df.xlsx")
saved_train_df = pd.read_excel(saved_train_df_src)
wsi_names = np.unique(saved_train_df["wsi_name"])
# Select by name rather than by position in the sorted array, so the list stays
# correct if the set of training WSIs changes.
wsi_names_skip = [name for name in wsi_names if name != "OTS_14684_6"]
wsi_names_skip


In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Filter saved_train_df into new_train_df:
#   - rows whose wsi_name contains "OTS_14684_6" are always kept;
#   - for every other WSI, a row is dropped when the composition entries at
#     positions 9, 10 and 11 add up to more than 0.7.
# The rows are selected with one boolean mask instead of DataFrame.append, which
# was removed in pandas 2.0 and copied the whole frame on every call. The result
# keeps the columns of saved_train_df in their existing order.
number_pattern = r'(\d+\.\d+|\d+)'  # Regular expression pattern to match floating-point numbers

keep_row = []
row_values = zip(saved_train_df["wsi_name"], saved_train_df["composition"])
for wsi_name, string_array in tqdm(row_values, total=saved_train_df.shape[0]):
    if "OTS_14684_6" in wsi_name:
        keep_row.append(True)
        continue
    # The composition column stores the array as text, so parse the numbers back out.
    numpy_array = np.array([float(x) for x in re.findall(number_pattern, string_array)])
    keep_row.append(not (np.sum(numpy_array[9:12]) > 0.7))

new_train_df = saved_train_df.loc[np.array(keep_row, dtype=bool)].reset_index(drop=True)

dst_src = r"\\shelter\Kyu\unstain2mask\main"
saved_train_df_src = os.path.join(dst_src, "new_train_df.xlsx")
new_train_df.to_excel(saved_train_df_src)

### Trying to drop some of the whitespace tiles...

In [14]:
# Reload the filtered training table from new_train_df.xlsx and display it.
src = r"\\shelter\Kyu\unstain2mask\main"
saved_train_df_src = os.path.join(src, "new_train_df.xlsx")
saved_train_df = pd.read_excel(saved_train_df_src)
# NOTE(review): the displayed table carries "Unnamed: 0" / "Unnamed: 0.1" columns;
# presumably these are index columns written by earlier to_excel calls and read
# back as data here - confirm before relying on them.
saved_train_df

Unnamed: 0.1,Unnamed: 0,id,wsi_name,image_path,mask_path,composition,composition_freq
0,0,15969_41747xy0688,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.231 0.164 0. 0. 0.005 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 0 0 1]
1,1,15969_42771xy0689,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.36 0.13 0. 0. 0.007 0. 0. 0. ...,[1 1 0 0 1 0 0 0 0 0 0 1]
2,2,15969_43795xy0690,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.193 0.156 0. 0. 0.014 0. 0. 0.2...,[1 1 0 0 1 0 0 1 0 0 0 1]
3,3,15969_44819xy0691,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.33 0.167 0. 0. 0.02 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 1 0 1]
4,4,15969_45843xy0692,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.334 0.39 0. 0. 0.027 0. 0. 0.1...,[1 1 0 0 1 0 0 1 1 1 0 1]
...,...,...,...,...,...,...,...
14099,14099,137729_84016xy9191,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.188 0.232 0. 0.004 0.01 0. 0. 0.0...,[1 1 0 1 1 0 0 1 0 1 0 1]
14100,14100,138753_84016xy9283,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.176 0.176 0. 0.063 0.001 0. 0. 0.0...,[1 1 0 1 1 0 0 1 0 1 0 1]
14101,14101,139777_82992xy9374,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.108 0.186 0. 0.069 0.01 0. 0. 0.1...,[1 1 0 1 1 0 0 1 0 1 0 1]
14102,14102,140801_82992xy9466,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.159 0.345 0. 0. 0.004 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 1 0 1]


In [15]:
import random
def set_seed(seed = 42):
    """
    Seed the global random number generators used in this notebook.

    Seeds NumPy's legacy global generator and Python's `random` module with
    `seed`, then stores `seed` as text in the PYTHONHASHSEED environment
    variable.

    Args:
        seed: Value passed to both seeding functions (default 42).
    """
    # NumPy first, then Python's own generator (same order as before).
    for seed_generator in (np.random.seed, random.seed):
        seed_generator(seed)
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
    # so this assignment likely only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()

In [16]:
import pandas as pd
import numpy as np
import random
import re

# Thin out tiles whose last composition entry (position 11) is above 0.999:
# each such row is dropped with probability 0.8; all other rows are kept.
# The rows are selected with one boolean mask instead of dropping them in place
# while iterating over the same frame.
number_pattern = r'(\d+\.\d+|\d+)'  # Regular expression pattern to match floating-point numbers

keep_row = []
for string_array in saved_train_df["composition"]:
    # The composition column stores the array as text, so parse the numbers back out.
    numpy_array = np.array([float(x) for x in re.findall(number_pattern, string_array)])
    # random.random() is only drawn for rows above the threshold, in row order,
    # so a given seed selects the same rows as the previous row-by-row drop.
    drop_this_row = numpy_array[11] > 0.999 and random.random() < 0.8
    keep_row.append(not drop_this_row)

new_train_df = saved_train_df.loc[np.array(keep_row, dtype=bool)].reset_index(drop=True)
new_train_df


Unnamed: 0.1,Unnamed: 0,id,wsi_name,image_path,mask_path,composition,composition_freq
0,0,15969_41747xy0688,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.231 0.164 0. 0. 0.005 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 0 0 1]
1,1,15969_42771xy0689,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.36 0.13 0. 0. 0.007 0. 0. 0. ...,[1 1 0 0 1 0 0 0 0 0 0 1]
2,2,15969_43795xy0690,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.193 0.156 0. 0. 0.014 0. 0. 0.2...,[1 1 0 0 1 0 0 1 0 0 0 1]
3,3,15969_44819xy0691,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.33 0.167 0. 0. 0.02 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 1 0 1]
4,4,15969_45843xy0692,OTS_14684_1,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_1\1...,[0.334 0.39 0. 0. 0.027 0. 0. 0.1...,[1 1 0 0 1 0 0 1 1 1 0 1]
...,...,...,...,...,...,...,...
6598,14099,137729_84016xy9191,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.188 0.232 0. 0.004 0.01 0. 0. 0.0...,[1 1 0 1 1 0 0 1 0 1 0 1]
6599,14100,138753_84016xy9283,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.176 0.176 0. 0.063 0.001 0. 0. 0.0...,[1 1 0 1 1 0 0 1 0 1 0 1]
6600,14101,139777_82992xy9374,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.108 0.186 0. 0.069 0.01 0. 0. 0.1...,[1 1 0 1 1 0 0 1 0 1 0 1]
6601,14102,140801_82992xy9466,OTS_14684_8,\\shelter\Kyu\unstain2stain\tiles\registered_t...,\\shelter\Kyu\unstain2mask\masks\OTS_14684_8\1...,[0.159 0.345 0. 0. 0.004 0. 0. 0.1...,[1 1 0 0 1 0 0 1 0 1 0 1]


In [17]:
# Write the thinned-out training table (new_train_df) to new_train_df2.xlsx.
src = r"\\shelter\Kyu\unstain2mask\main"
saved_train_df_src = os.path.join(src, "new_train_df2.xlsx")
# to_excel is called with its defaults here, so the row index is written as well.
new_train_df.to_excel(saved_train_df_src)