### EMTOOLS -- Preparation of training data
This notebook loads manually labeled segmentations together with the corresponding raw EM images, contrast-enhances the raw images with CLAHE (see also pre_uni-em), downscales both by a factor of 4 and crops them to the region where the data is labeled. The resulting image/label pairs are then saved to a separate directory for training of a DNN.

**Author:** Philip Ruthig, Paul Flechsig Institute for Brain Research Leipzig

**Contact:** philip.ruthig@medizin.uni-leipzig.de // philip.ruthig@gmail.com

**Publication:**
Please contact me if you want to use this code for any publication.

In [None]:
import tifffile as tf
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
import scipy.ndimage as ndi
import skimage
from skimage import exposure
from skimage.transform import rescale, resize, downscale_local_mean
import tqdm

plot = True # toggle the sanity-check overlay plot that is shown for every training image
include_training_images = True # should the (complete, not cropped to labeled area) training images also be exported as test images?
edge_marker = True # add a third label class (green channel) that marks the edge band of the outer labels?
edge_width = 2 # number of binary-erosion iterations used to carve out the edge band; larger values give a wider edge

# lists of paired segmentations and raw data
# path_raw holds the raw EM images of the training set. Entry k belongs to entry k
# of path_seg (the processing loop indexes both tuples with the same i), so both
# tuples must keep the same length and order -- comment pairs in/out together.
path_raw = (
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/ROI2_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/RO4_IMG1_3031x.TIF", #check data here. weird behavior when reading. NOTE(review): file name says 'RO4' while the paired label is 'AM2_ROI4' -- possibly a typo for 'ROI4'; confirm against the file on disk
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/ROI5_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/ROI1_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/ROI2_IMG3_3150x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/ROI5_IMG2_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/ROI1_IMG2_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/ROI3_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/ROI5_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/M2/ROI6_IMG1_3150x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/M2/ROI8_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/ROI1_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/ROI3_IMG1_3031x.TIF",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/ROI5_IMG1_3031x.TIF",
            # the only raw path containing "18_06_CC": the processing loop uses that substring
            # to select the two-file (myelin + axon) label branch and to skip the rotate/flip step
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/18_06_CC/EM/S3/18_06_CC_S3_ROI2_2.TIF",
            # to be tested
            #r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\I2\I2.TIF",#double check
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\G1\G1.TIF",
            #r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\G2\G2.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\M3\M3.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\PG2\PG2.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\18_06\Histo\EM\S3\S3.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\AM2\ROI1_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\AM2\ROI6_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\AM2\ROI8_IMG2_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G1\ROI1_IMG3_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G1\ROI4_IMG2_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G1\ROI6_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G2\ROI1_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G2\ROI2_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\G2\ROI5_IMG2_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\M2\ROI2_IMG2_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\M2\ROI4_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\M2\ROI5_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\PM2\ROI1_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\PM2\ROI4_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\PM2\ROI6_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\S5\ROI1_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\S5\ROI4_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\processed\20_01\Histo\EM\S5\ROI5_IMG1_3031x.TIF",
            )

# path_seg holds the manual segmentations, in the same order as path_raw.
# A plain string entry is a single label image; the processing loop treats pixel
# value 2 as outer label and value 3 as inner label. The entry that belongs to
# the "18_06_CC" raw image is instead a (myelin, axons) tuple of two separate
# mask files, which the loop merges into the same 2/3 coding.
path_seg = (
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/AM2_ROI2_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/AM2_ROI4_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/AM2/AM2_ROI5_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/G1_ROI1_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/G1_ROI2_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G1/G1_ROI5_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/G2_ROI1_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/G2_ROI3_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/G2/G2_ROI5_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/M2/M2_ROI6_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/M2/M2_ROI8_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/PM1_ROI1_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/PM1_ROI3_ALL_filled.png",
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/PM1/PM1_ROI5_ALL_filled.png",
            # nested tuple: index 0 is read as the myelin mask, index 1 as the axon mask
            (r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/18_06_CC/EM/S3/Sara/S3_my_myelin.tif",#myelin
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/18_06_CC/EM/S3/Sara/S3_axons.tif"),#axons
            # to be tested
            #r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\18_06\I2\I2_Mascha_BP_ALL_filled.png", #double check
            r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/18_06/G1/G1_Mascha_BP_ALL_filled.png",
            #r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\18_06\G2\G2_Mascha_BP_ALL_filled.tif",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\18_06\M3\M3_Mascha_BP_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\18_06\PG2\PG2_Mascha_BP_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\18_06\S3\S3_Mascha_BP_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\AM2\AM2_ROI1_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\AM2\AM2_ROI6_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\AM2\AM2_ROI8_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G1\G1_ROI1_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G1\G1_ROI4_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G1\G1_ROI6_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G2\G2_ROI1_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G2\G2_ROI2_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\G2\G2_ROI5_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\M2\M2_ROI2_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\M2\M2_ROI4_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\M2\M2_ROI5_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\PM2\PM2_ROI1_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\PM2\PM2_ROI4_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\PM2\PM2_ROI6_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\S5\S5_ROI1_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\S5\S5_ROI4_ALL_filled.png",
            r"G:\AG_Morawski\Philip\EM\20230320_Mascha_complete\20_01\S5\S5_ROI5_ALL_filled.png",
            )

# Additional raw images that have no entry in path_seg. The test-image cell only
# contrast-enhances, downscales and exports them.
path_test = (
            #r"G:/AG_Morawski/Philip/EM/20230320_Mascha_complete/HH4/S3/S3_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\raw\20_01\Histo\EM\G1\ROI3_IMG1_3031x\ROI3_IMG1_3031x.TIF",
            r"G:\AG_Morawski\Philip\EM\20230411_SPP_all\raw\18_06_CC\Histo\EM\18_06_M3_ROI1.TIF",
            )

# Output root. The notebook writes into its "labels", "raw" and "test_img" subfolders.
path_results = r"G:/AG_Morawski/Philip/EM/training_data"

# Running count of inner labels (connected components) over all training images;
# incremented by the processing loop and printed in the last cell.
n_cells_total = 0

def resolve_undersegmentation(outer_labels,inner_labels):
    """
    Split touching ('kissing') cells that were labeled as one blob, as in
    Mascha's manually labeled myelin data, with a seeded watershed.

    The sum of both masks is distance-transformed; inside every connected
    component of ``inner_labels`` at most one distance maximum is picked as a
    seed, and a watershed on the negated distance map, restricted to the
    summed mask, grows one region per seed.

    outer_labels = outer labels with undersegmented kissing cells
    inner_labels = inner labels without undersegmented kissing cells

    Returns the watershed label image with every position where
    ``inner_labels == True`` reset to 0, so only the outer part of each cell
    keeps its id.
    """
    from skimage.feature import peak_local_max
    from skimage.segmentation import watershed

    # combined footprint of outer + inner, used both for the distance map and as watershed mask
    combined = outer_labels + inner_labels
    distance_map = ndi.distance_transform_edt(combined)

    # one candidate seed per connected inner component: its maximum of the distance map
    inner_components = ndi.label(inner_labels)[0]
    peak_coords = peak_local_max(
        distance_map,
        footprint=np.ones((3, 3)),
        labels=inner_components,
        num_peaks_per_label=1,
    )

    # turn the peak coordinates into an integer marker image
    seed_mask = np.zeros(distance_map.shape, dtype=bool)
    seed_mask[tuple(peak_coords.T)] = True
    seeds = ndi.label(seed_mask)[0]

    # flood from the seeds; the negated distance map makes cell centres the basins
    cell_labels = watershed(-distance_map, seeds, mask=combined)

    # remove the inner part again so that only the outer labels remain
    cell_labels[np.equal(inner_labels, True)] = 0
    return cell_labels


In [None]:
# Builds one (raw, label) training pair per entry of path_raw / path_seg:
# read -> CLAHE -> 4x downscale -> orient labels -> crop to labeled region ->
# split kissing cells -> optional edge class -> save as png.

# start from zero so that re-running this cell does not add to a previous run's count
n_cells_total = 0

# raw images and segmentations are paired by position, so the lists must match
if len(path_raw) != len(path_seg):
    raise ValueError("path_raw and path_seg must have the same number of entries")

# make sure the output folders exist before anything is written
os.makedirs(path_results + r"/labels", exist_ok=True)
os.makedirs(path_results + r"/raw", exist_ok=True)

for i in tqdm.tqdm(range(len(path_raw))):
    has_split_labels = "18_06_CC" in path_raw[i]  # 18_06_CC has two separate label tifs

    # -1 = cv2.IMREAD_UNCHANGED, so the stored bit depth is kept.
    # cv2.imread returns None instead of raising when a file cannot be read.
    raw = cv2.imread(path_raw[i], -1)
    if raw is None:
        raise FileNotFoundError("could not read raw image: " + path_raw[i])

    if has_split_labels:
        # merge the two masks into one label image with the same coding as the png labels
        seg_myelin = tf.imread(path_seg[i][0])
        seg_axon = tf.imread(path_seg[i][1])
        seg = np.zeros_like(seg_axon, dtype='uint8')
        seg[seg_myelin==True] = 2
        seg[seg_axon==True] = 3
    else:
        seg = cv2.imread(path_seg[i], -1)
        if seg is None:
            raise FileNotFoundError("could not read segmentation: " + path_seg[i])

    # contrast enhance raw image (CLAHE); the result is a float image
    raw = exposure.equalize_adapthist(raw, clip_limit=0.01, kernel_size=127)

    # downsample both images by a factor of 4
    raw = downscale_local_mean(raw, 4)
    # NOTE(review): mean-pooling a label image averages the class ids, so a 4x4
    # block that mixes classes gets a value that is not the id of its members.
    # The `== 2` / `== 3` tests below therefore only keep blocks whose mean is
    # exactly 2 or 3. Kept as-is so the generated labels do not change; consider
    # nearest-neighbour downsampling for the label image.
    seg = downscale_local_mean(seg, 4)

    if not has_split_labels:
        # the png labels are rotated and mirrored relative to the raw image:
        # rotate by 90 degrees, then mirror over the x axis
        seg = np.flipud(np.rot90(seg))

    # split segmentation into separate boolean arrays for inner and outer myelin
    seg_inner = seg == 3
    seg_outer = seg == 2

    # define segmented region, crop along outer boundaries
    coords = np.argwhere(seg_outer)
    if coords.size == 0:
        raise ValueError("no outer label (value 2) found in segmentation of " + str(path_raw[i]))
    x_min, y_min = coords.min(axis=0)
    x_max, y_max = coords.max(axis=0)
    seg_outer_cropped = seg_outer[x_min:x_max+1, y_min:y_max+1]
    seg_inner_cropped = seg_inner[x_min:x_max+1, y_min:y_max+1]
    raw_cropped = raw[x_min:x_max+1, y_min:y_max+1]

    # outer labels with touching cells separated; inner area is 0 in the result
    seg_both_cropped = resolve_undersegmentation(seg_outer_cropped, seg_inner_cropped)

    # uni-em needs uint8 images
    raw_cropped = (raw_cropped*255).astype('uint8')

    # label data; the outer labels are already labeled by the watershed
    seg_outer_labeled = seg_both_cropped.astype("uint16")
    seg_inner_labeled, n_inner = ndi.label(seg_inner_cropped)
    seg_inner_labeled = seg_inner_labeled.astype("uint16")
    n_cells_total += n_inner

    # Edge class: stays all zero when edge_marker is off, so the green channel
    # below is always defined.
    edges = np.zeros_like(seg_inner_labeled)
    if edge_marker:
        cell_ids = np.unique(seg_outer_labeled)
        cell_ids = cell_ids[cell_ids != 0]  # 0 is background, not a cell
        for cell_ID in cell_ids:
            # Select this cell FIRST, then fill its hole (the inner area).
            # binary_fill_holes returns a boolean array, so comparing its
            # result with cell_ID could only ever match cell 1.
            cell_mask = ndi.binary_fill_holes(seg_outer_labeled == cell_ID)
            eroded_cell_mask = ndi.binary_erosion(cell_mask, iterations=edge_width) # Increase iterations to make boundary wider!

            # the edge band is what the erosion removed; label it with the cell's ID
            edges[np.logical_xor(cell_mask, eroded_cell_mask)] = cell_ID

    # combine the three binary classes into one float image for multi-class labeling
    multiclass_rgb = cv2.merge(((seg_inner_labeled > 0).astype('float64'),   #R: inner labels
                                (edges > 0).astype('float64'),               #G: edge band (empty if edge_marker is off)
                                (seg_outer_labeled > 0).astype('float64')))  #B: outer labels

    # plot for sanity check
    if plot:
        plt.figure(figsize=(10,10))
        plt.axis('off')
        plt.imshow(raw_cropped[:2000,:2000], interpolation='none', cmap='gray')
        plt.imshow(np.ma.array(multiclass_rgb[:2000,:2000]), interpolation='none', cmap='tab20', alpha=0.15)
        plt.show()

    # re-save segmentation and downscaled raw image in correct orientation
    skimage.io.imsave(path_results + r"/labels/" + str(i) + r"_labels.png", (multiclass_rgb*255).astype('uint8'),)
    skimage.io.imsave(path_results + r"/raw/" + str(i) + r"_raw.png", raw_cropped)  # already uint8

In [None]:
# Exports CLAHE-enhanced, 4x downscaled grayscale-as-RGB pngs of the test images
# (and, if include_training_images is set, of all training raw images as well).
print("preparing Test Images")

# Build the work list in its own variable: rebinding path_test would append
# path_raw once more on every re-run of this cell.
test_paths = list(path_test)
if include_training_images:
    test_paths += list(path_raw)

# make sure the output folder exists before anything is written
os.makedirs(path_results + r"/test_img", exist_ok=True)

for idx, img_path in enumerate(tqdm.tqdm(test_paths), start=1):
    # -1 = cv2.IMREAD_UNCHANGED, so the stored bit depth is kept.
    # cv2.imread returns None instead of raising when a file cannot be read.
    test_img = cv2.imread(img_path, -1)
    if test_img is None:
        raise FileNotFoundError("could not read test image: " + img_path)

    test_img_clahe = exposure.equalize_adapthist(test_img, clip_limit=0.01, kernel_size=127)
    test_img_downscaled = downscale_local_mean(test_img_clahe, 4)

    # same gray image in all three channels
    test_img_rgb_png = cv2.merge((test_img_downscaled,   #R
                                  test_img_downscaled,   #G
                                  test_img_downscaled))  #B

    # Only the file stem goes into the output name: the full input path holds a
    # drive letter and separators and is not a valid file name. The running
    # index keeps names unique, since stems repeat across folders
    # (e.g. several "ROI1_IMG1_3031x").
    file_name = img_path.replace("\\", "/").rsplit("/", 1)[-1]
    file_stem = os.path.splitext(file_name)[0]
    skimage.io.imsave(path_results + r"/test_img/test_" + str(idx) + "_" + file_stem + ".png", (test_img_rgb_png*255).astype('uint8'))


In [None]:
# report how many inner labels (cells) were counted over all training images
print(f"Total number of cells: {n_cells_total}")