### To create train dataset, we use already existing registered and unstained 1024 x 1024 tiles. Since our US tiles are already registered to the dimensions of HE, they should have similar coordinates to the HE mask, the only problem is the resolution difference. Let's tackle this problem:

In [1]:
import numpy as np
import os
from PIL import Image
Image.MAX_IMAGE_PIXELS = None
import cv2
from natsort import natsorted
import os
from matplotlib import pyplot as plt
from skimage.color import rgb2gray
from skimage.transform import warp
from skimage.registration import optical_flow_tvl1, optical_flow_ilk
import xml.etree.ElementTree as ET
import pandas as pd
import skimage.measure
import scipy.stats as stats
import time
import shutil
from tqdm import tqdm
from skimage.measure import label, regionprops_table
from natsort import natsorted

In [2]:
unstained_registered_tile_src = r"\\shelter\Kyu\unstain2stain\tiles\registered_tiles\US"
label_mask_src = r"\\shelter\Kyu\unstain2stain\unstain2stain_wsi\HE\1um\classification_v9_combined"
label_mask_list = [x for x in os.listdir(label_mask_src) if x.endswith(".png")]
label_mask_list = [x for x in label_mask_list if x[:3] == "OTS"]
label_mask_list = label_mask_list[2:]
del(label_mask_list[3]) # remove 'OTS_14684_5_he.png', poorly registrated
label_mask_list = natsorted(label_mask_list)
label_mask_name_list = [x.replace(".png","") for x in label_mask_list]
label_mask_name_list = [x.replace("_he","") for x in label_mask_name_list]
label_mask_src = [os.path.join(label_mask_src,x) for x in label_mask_list]
unstained_registered_tile_src = [os.path.join(unstained_registered_tile_src,x) for x in label_mask_name_list]

In [3]:
label_mask_name_list

['OTS_14684_1',
 'OTS_14684_2',
 'OTS_14684_3',
 'OTS_14684_6',
 'OTS_14684_7',
 'OTS_14684_8',
 'OTS_14684_9',
 'OTS_14832_1',
 'OTS_14832_2',
 'OTS_14832_5',
 'OTS_14832_8',
 'OTS_14832_9',
 'OTS_14832_10',
 'OTS_14832_11',
 'OTS_14832_12',
 'OTS_14832_13',
 'OTS_14832_14',
 'OTS_14832_15',
 'OTS_14832_16']

In [4]:
label_mask_src

['\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_1_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_2_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_3_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_6_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_7_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_8_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14684_9_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14832_1_he.png',
 '\\\\shelter\\Kyu\\unstain2stain\\unstain2stain_wsi\\HE\\1um\\classification_v9_combined\\OTS_14832_2_h

In [5]:
unstained_registered_tile_src

['\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_1',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_2',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_3',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_6',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_7',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_8',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14684_9',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14832_1',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14832_2',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14832_5',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14832_8',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\OTS_14832_9',
 '\\\\shelter\\Kyu\\unstain2stain\\tiles\\registered_tiles\\US\\

In [6]:
def calculate_offset(string):
    x1 = int(string.split("-")[0].split(" ")[0].split("[")[1])
    y1 = int(string.split("-")[0].split(" ")[1].split("]")[0])
    x2 = int(string.split("-")[1].split(" ")[1].split("[")[1])
    y2 = int(string.split("-")[1].split(" ")[2].split("]")[0])
    offset_x = x1 - x2
    offset_y = y1 - y2
    return (offset_x,offset_y)

offset_df_src = r"\\shelter\Kyu\unstain2stain\unstain2stain_wsi\wsi_list_230314.xlsx"
offset_df = pd.read_excel(offset_df_src)
offset_list = offset_df["offNSHE"].tolist()
offset_list = [x for x in offset_list if not isinstance(x,float)] #drop nan's
offset_list = [calculate_offset(x) for x in offset_list]
offset_list1 = []
for x in offset_list:
    if x not in offset_list1:
        offset_list1.append(x)

# delete the five images not used:
del offset_list1[15]
del offset_list1[14]
del offset_list1[12]
del offset_list1[11]
del offset_list1[3]
del offset_list1[3]

In [7]:
offset_list1

[(7675, -3411),
 (10441, 5631),
 (43804, 15479),
 (-10960, 3890),
 (27057, -6216),
 (35227, -1078),
 (-237, 4174),
 (24229, 19948),
 (47104, 3372),
 (35241, 3112),
 (23660, 2693),
 (34692, 19307),
 (44896, 4143),
 (41703, -1029),
 (81283, 13046),
 (35934, -1849),
 (21458, 14631),
 (31229, 15865),
 (54735, 7436)]

In [8]:
len(offset_list1)

19

In [9]:
# ### Let's first test the first WSI image:
# masksrc = label_mask_src[0]
# mask = Image.open(masksrc)
# maskra = np.array(mask)
# print(maskra.shape) # height x width (W x H in imagej and ndpi's)
# rsf = 9.0769 # this is the difference in size b/w the ndpi (the tiles are from ndpi so we have to look at ndpi 40x dimension) and the mask
# tile_size = round(1024/rsf)
# # Test the HE image "\\shelter\Kyu\unstain2stain\tiles\registered_tiles\HE\OTS_14684_1_he\37990_12390xy3324.png" to see if the mask does contain the right composition
# x,y = (41062,27750) # US x,y - offsetlist x,y = HE x,y, need to use HE x,y.
# x = round(x/rsf)
# y = round(y/rsf)
# corresp_mask = maskra[y:y+tile_size,x:x+tile_size]
# save corresp_mask and view!
### It works pretty well! Let's now run over all tiles and create the entire dataset:

In [10]:
rsf = 9.0769 # this is the difference in size b/w the ndpi (the tiles are from ndpi so we have to look at ndpi 40x dimension) and the mask
rsf_tile_size = round(1024/rsf)
dst_src = r'\\shelter\Kyu\unstain2mask\masks'

In [None]:
for wsi_idx in tqdm(range(len(label_mask_src)),colour='red',desc="WSIs Processed",total=len(label_mask_src)):
    label_mask = Image.open(label_mask_src[wsi_idx])
    label_mask = np.array(label_mask)
    US_tile_src = unstained_registered_tile_src[wsi_idx]
    all_tile_src = os.listdir(US_tile_src)
    all_tile_save_src = [os.path.join(US_tile_src,x) for x in all_tile_src if x.endswith(".png")]
    offset = offset_list1[wsi_idx]
    offset_x = offset[0]
    offset_y = offset[1]
    wsi_name = label_mask_name_list[wsi_idx]
    mask_save_src = os.path.join(dst_src,wsi_name)
    print("label mask size is",label_mask.shape)
    if not os.path.exists(mask_save_src):
        os.makedirs(mask_save_src)
    if os.path.exists(mask_save_src) and len(os.listdir(mask_save_src)) == len(all_tile_save_src):
        print("Mask for {} already made".format(wsi_name))
        continue
    print("Processing {}".format(wsi_name))
    for tile_idx in tqdm(range(len(all_tile_save_src)),colour='red',desc="Tiles Processed",total=len(all_tile_save_src)):
        tile_src = all_tile_save_src[tile_idx]
        tile = Image.open(tile_src)
        tile = np.array(tile)
        us_x = int(tile_src.split("xy")[0].split("\\")[-1].split("_")[0])
        us_y = int(tile_src.split("xy")[0].split("\\")[-1].split("_")[1])
        he_x = us_x - offset_x
        he_y = us_y - offset_y
        rsf_x = round(he_x/rsf)
        rsf_y = round(he_y/rsf)
        corresp_mask = label_mask[rsf_y:rsf_y+rsf_tile_size,rsf_x:rsf_x+rsf_tile_size]
        tile_save_src = os.path.join(mask_save_src,all_tile_src[tile_idx])
        Image.fromarray(corresp_mask).save(tile_save_src)

WSIs Processed:   0%|[31m          [0m| 0/19 [00:00<?, ?it/s]

label mask size is (10548, 16499)


WSIs Processed:   5%|[31m▌         [0m| 1/19 [00:05<01:40,  5.56s/it]

Mask for OTS_14684_1 already made
label mask size is (9307, 14384)


WSIs Processed:  11%|[31m█         [0m| 2/19 [00:08<01:13,  4.30s/it]

Mask for OTS_14684_2 already made
label mask size is (8687, 10999)


WSIs Processed:  16%|[31m█▌        [0m| 3/19 [00:12<00:59,  3.74s/it]

Mask for OTS_14684_3 already made
label mask size is (11789, 15653)
Processing OTS_14684_6



Tiles Processed:   0%|[31m          [0m| 0/13081 [00:00<?, ?it/s][A
Tiles Processed:   0%|[31m          [0m| 1/13081 [00:00<1:16:39,  2.84it/s][A
Tiles Processed:   0%|[31m          [0m| 3/13081 [00:00<53:35,  4.07it/s]  [A
Tiles Processed:   0%|[31m          [0m| 6/13081 [00:00<26:42,  8.16it/s][A
Tiles Processed:   0%|[31m          [0m| 9/13081 [00:01<18:44, 11.62it/s][A
Tiles Processed:   0%|[31m          [0m| 12/13081 [00:01<15:00, 14.51it/s][A
Tiles Processed:   0%|[31m          [0m| 15/13081 [00:01<13:12, 16.49it/s][A
Tiles Processed:   0%|[31m          [0m| 17/13081 [00:01<20:58, 10.38it/s][A
Tiles Processed:   0%|[31m          [0m| 20/13081 [00:01<16:59, 12.82it/s][A
Tiles Processed:   0%|[31m          [0m| 22/13081 [00:01<15:37, 13.93it/s][A
Tiles Processed:   0%|[31m          [0m| 25/13081 [00:02<13:42, 15.88it/s][A
Tiles Processed:   0%|[31m          [0m| 27/13081 [00:02<13:08, 16.55it/s][A
Tiles Processed:   0%|[31m          [0m| 30/13