# README
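Run this notebook to create the synthetic dataset for mosaic training: it blends pairs of single-class training patches along random Bézier-curve boundaries and saves the resulting images together with their label masks.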

In [None]:
import random
import os

import numpy as np
from PIL import Image
import cv2

from tqdm import tqdm

from skimage import morphology


In [None]:
def get_patch_label(filename):
    """Parse the class-label vector embedded in a patch filename.

    The label is the text between the last ``[`` in ``filename`` and the
    first ``]`` that follows it. Three spellings are understood:

    * comma separated  -- ``[0, 0, 0, 1]``
    * space separated  -- ``[0 0 0 1]``
    * packed digits    -- ``[0001]``

    The text is parsed explicitly rather than passed to ``eval``: filenames
    come from the file system and must never be executed as code.

    Args:
        filename: File name (or path) containing a bracketed label.

    Returns:
        The label as a list of ints.

    Raises:
        ValueError: If the bracketed text is not made of integers.
    """
    inner = filename.split('[')[-1].split(']')[0]
    if ',' in inner or ' ' in inner:
        # Treat commas and any run of whitespace as separators.
        tokens = inner.replace(',', ' ').split()
    else:
        # Packed form: every character is one digit of the label.
        tokens = list(inner)
    try:
        return [int(token) for token in tokens]
    except ValueError:
        raise ValueError(
            f"cannot parse a label vector from filename {filename!r}"
        ) from None

def create_data(train_data):
    """Collect the training patches that carry exactly one class label.

    Every entry of the ``train_data`` directory is parsed with
    ``get_patch_label``. Entries whose label sums to 1 are routed to the
    list of the class whose flag equals 1 (index 0: tum, 1: nec, 2: lym,
    3: tas); all other entries are ignored.

    Args:
        train_data: Directory holding the labelled training patches.

    Returns:
        A 4-tuple of path lists ``(tum, nec, lym, tas)``; each path is
        ``train_data`` joined with the entry name.
    """
    buckets = ([], [], [], [])
    for entry in os.listdir(train_data):
        flags = get_patch_label(entry)
        if np.sum(flags) != 1:
            continue
        full_path = os.path.join(train_data, entry)
        # Route the patch to the first class whose flag is set.
        for position in range(4):
            if flags[position] == 1:
                buckets[position].append(full_path)
                break

    only_tum_list, only_nec_list, only_lym_list, only_tas_list = buckets
    return only_tum_list, only_nec_list, only_lym_list, only_tas_list

In [None]:
def get_background(region):
    """Return a 0/255 mask of the bright areas of an image.

    The image is converted to grayscale (``cv2.COLOR_RGB2GRAY``), thresholded
    so that pixels brighter than 200 become 255, and connected bright regions
    smaller than 50 pixels are removed.

    Args:
        region: Image array accepted by ``cv2.cvtColor`` with
            ``COLOR_RGB2GRAY``.

    Returns:
        A ``uint8`` array holding 255 where a bright region survived the
        clean-up and 0 elsewhere.
    """
    grayscale = cv2.cvtColor(region, cv2.COLOR_RGB2GRAY)
    _, thresholded = cv2.threshold(grayscale, 200, 255, cv2.THRESH_BINARY)
    bright = np.uint8(thresholded) == 255
    # Drop small bright specks so only sizeable regions remain.
    cleaned = morphology.remove_small_objects(bright, min_size=50, connectivity=1)
    return np.array(cleaned, dtype=np.uint8) * 255


In [None]:
import numpy as np
from scipy.special import binom
import matplotlib.pyplot as plt

def bernstein(n, k, t):
    """Evaluate the Bernstein basis polynomial ``B(k, n)`` at ``t``."""
    return binom(n, k) * t ** k * (1. - t) ** (n - k)


def bezier(points, num=200):
    """Sample a Bezier curve defined by 2-D control points.

    Args:
        points: Sequence of control points, each with two coordinates.
        num: Number of samples taken uniformly over ``t`` in ``[0, 1]``.

    Returns:
        Array of shape ``(num, 2)`` with the sampled curve.
    """
    degree = len(points) - 1
    params = np.linspace(0, 1, num=num)
    samples = np.zeros((num, 2))
    # Accumulate each control point weighted by its basis polynomial.
    for index, control in enumerate(points):
        samples += np.outer(bernstein(degree, index, params), control)
    return samples

# generate cubic bezier curves
class Segment:
    """One cubic Bezier segment between two points with given end tangents.

    Args:
        p1: Start point (array with two coordinates).
        p2: End point (array with two coordinates).
        angle1: Tangent angle in radians at ``p1``.
        angle2: Tangent angle in radians at ``p2``.
        **kw: Optional settings. ``numpoints`` (default 100) is the number of
            samples on the curve; ``r`` (default 0.3) is the control-point
            distance as a fraction of the distance between ``p1`` and ``p2``.
            Other keys are ignored.

    Attributes set by the constructor: ``p`` (the 4x2 control polygon),
    ``r`` (absolute control-point distance) and ``curve`` (sampled points).
    """

    def __init__(self, p1, p2, angle1, angle2, **kw):
        self.p1 = p1
        self.p2 = p2
        self.angle1 = angle1
        self.angle2 = angle2
        self.numpoints = kw.get("numpoints", 100)
        r = kw.get("r", 0.3)
        # Scale the relative radius by the chord length between the endpoints.
        d = np.sqrt(np.sum((self.p2 - self.p1) ** 2))
        self.r = r * d
        self.p = np.zeros((4, 2))
        self.p[0, :] = self.p1[:]
        self.p[3, :] = self.p2[:]
        self.calc_intermediate_points(self.r)

    def calc_intermediate_points(self, r=None):
        """Place the two inner control points and resample ``self.curve``.

        Args:
            r: Distance of each inner control point from its endpoint.
                Defaults to ``self.r``. (Previously this argument was
                accepted but silently ignored in favour of ``self.r``.)
        """
        if r is None:
            r = self.r
        self.p[1, :] = self.p1 + np.array([r * np.cos(self.angle1),
                                           r * np.sin(self.angle1)])
        # The second control point sits opposite to angle2 so that adjacent
        # segments sharing a tangent angle join with C1 continuity.
        self.p[2, :] = self.p2 + np.array([r * np.cos(self.angle2 + np.pi),
                                           r * np.sin(self.angle2 + np.pi)])
        self.curve = bezier(self.p, self.numpoints)


def get_curve(points, **kw):
    """Chain Bezier segments through consecutive rows of ``points``.

    Args:
        points: 2-D array whose rows are ``(x, y, angle)``.
        **kw: Forwarded unchanged to every ``Segment``.

    Returns:
        ``(segments, curve)`` where ``segments`` is the list of ``Segment``
        objects and ``curve`` is their sampled points concatenated in order.
    """
    segments = [
        Segment(start[:2], end[:2], start[2], end[2], **kw)
        for start, end in zip(points[:-1], points[1:])
    ]
    sampled = [segment.curve for segment in segments]
    return segments, np.concatenate(sampled)

def ccw_sort(p):
    """Order points by their angle around the centroid.

    The sort key for each point is ``arctan2(dx, dy)``, where ``(dx, dy)`` is
    its offset from the mean of all points.

    Args:
        p: Array of shape ``(n, 2)``.

    Returns:
        The rows of ``p`` reordered by ascending angle key.
    """
    offsets = p - np.mean(p, axis=0)
    angle_keys = np.arctan2(offsets[:, 0], offsets[:, 1])
    order = np.argsort(angle_keys)
    return p[order, :]

def get_bezier_curve(a, rad=0.2, edgy=0):
    """Create a closed, smooth curve through the points in ``a``.

    Args:
        a: Array of shape ``(n, 2)`` with the points to pass through.
        rad: Number between 0 and 1 steering the distance of the Bezier
            control points from the given points.
        edgy: Controls how "edgy" the curve is; ``edgy=0`` is smoothest.

    Returns:
        ``(x, y, nodes)`` where ``x`` and ``y`` are the sampled curve
        coordinates and ``nodes`` holds one ``(x, y, angle)`` row per point
        of the closed polygon (the first point is repeated at the end).
    """
    # Blend weight between outgoing and incoming edge direction; 0.5 at edgy=0.
    weight = np.arctan(edgy) / np.pi + .5

    # Sort the points around their centroid and close the polygon.
    closed = ccw_sort(a)
    closed = np.append(closed, np.atleast_2d(closed[0, :]), axis=0)

    # Direction of every polygon edge, mapped to the range [0, 2*pi).
    edges = np.diff(closed, axis=0)
    heading = np.arctan2(edges[:, 1], edges[:, 0])
    heading = (heading >= 0) * heading + (heading < 0) * (heading + 2 * np.pi)

    outgoing = heading
    incoming = np.roll(heading, 1)
    # Tangent at each point: weighted mix of the two adjacent edge directions,
    # shifted by pi when the two directions differ by more than pi.
    tangent = (weight * outgoing + (1 - weight) * incoming
               + (np.abs(incoming - outgoing) > np.pi) * np.pi)
    tangent = np.append(tangent, [tangent[0]])

    nodes = np.append(closed, np.atleast_2d(tangent).T, axis=1)  # x, y, angle
    _, curve = get_curve(nodes, r=rad, method="var")
    x, y = curve.T
    return x, y, nodes


def get_random_points(n=5, scale=0.8, mindst=None, rec=0):
    """Draw ``n`` random points in the unit square and scale them.

    Candidate sets are drawn until consecutive points (in the order produced
    by ``ccw_sort``) are at least ``mindst`` apart in Euclidean distance, or
    until the retry budget is used up, in which case the last candidate set
    is returned as is.

    Args:
        n: Number of points.
        scale: Factor applied to the accepted unit-square points.
        mindst: Minimum distance between consecutive sorted points in the
            unit square. A falsy value selects ``0.7 / n``.
        rec: Number of retries already spent; sampling stops once it
            reaches 200.

    Returns:
        Array of shape ``(n, 2)`` holding the (unsorted) points times
        ``scale``.
    """
    mindst = mindst or .7 / n
    while True:
        a = np.random.rand(n, 2)
        steps = np.diff(ccw_sort(a), axis=0)
        # Euclidean length of each step: square the components first, then
        # sum. Squaring after the sum would yield |dx + dy| instead.
        d = np.sqrt(np.sum(steps ** 2, axis=1))
        if np.all(d >= mindst) or rec >= 200:
            return a * scale
        rec += 1

In [None]:
def get_bezier_mask(n, scale, rad=0.2, edgy=0.05):
    """Rasterise a random closed Bezier shape into a binary mask.

    Args:
        n: Number of random points the curve passes through.
        scale: Side length of the square mask; also the factor applied to
            the random unit-square points.
        rad: Passed to ``get_bezier_curve``.
        edgy: Passed to ``get_bezier_curve``.

    Returns:
        ``uint8`` array of shape ``(scale, scale)`` with 1 inside the shape
        and 0 outside.
    """
    anchors = get_random_points(n=n, scale=scale)
    xs, ys, _ = get_bezier_curve(anchors, rad=rad, edgy=edgy)
    # Snap the curve samples to pixel coordinates before filling.
    outline = np.stack([np.round(xs), np.round(ys)], axis=1)
    canvas = np.zeros((scale, scale), dtype=np.uint8)
    return cv2.fillPoly(canvas, np.int32([outline]), 1)

def get_onelabel_mask(category, scale):
    """Return a square mask filled with the class index of ``category``.

    Args:
        category: ``"tum"``, ``"nec"`` or ``"lym"`` map to 0, 1 and 2;
            any other value maps to 3.
        scale: Side length of the mask.

    Returns:
        ``uint8`` array of shape ``(scale, scale)`` holding the class index.
    """
    class_index = {"tum": 0, "nec": 1, "lym": 2}.get(category, 3)
    return np.full((scale, scale), class_index, dtype=np.uint8)


In [None]:
# Directory holding the labelled training patches.
train_dir = "../data/LUAD-HistoSeg/train/"

# Paths of the patches that contain exactly one tissue class, per class.
only_tum_list, only_nec_list, only_lym_list, only_tas_list = create_data(train_dir)

# Class name -> list of single-class patch paths.
dataset_dict = dict(
    zip(
        ("tum", "nec", "lym", 'tas'),
        (only_tum_list, only_nec_list, only_lym_list, only_tas_list),
    )
)

In [None]:
def synthesize_one(n=12, rad=0.2, edgy=0.05, background_class="tum", foreground_class="nec"):
    """Blend two random single-class patches along a random Bezier shape.

    One patch path is picked at random from ``dataset_dict`` for each of the
    two classes, both are resized to 224x224, and the foreground patch is
    pasted over the background patch inside a random Bezier mask.

    Args:
        n: Number of random points defining the Bezier shape.
        rad: Passed to ``get_bezier_mask``.
        edgy: Passed to ``get_bezier_mask``.
        background_class: Key of ``dataset_dict`` used outside the shape.
        foreground_class: Key of ``dataset_dict`` used inside the shape.

    Returns:
        ``(synthesized_image, synthesized_mask)``: the blended image array
        and the 224x224 mask of class indices.
    """
    # Pick both source patches first, then load them.
    bg_path = random.choice(dataset_dict[background_class])
    fg_path = random.choice(dataset_dict[foreground_class])
    bg_pixels, fg_pixels = (
        np.array(Image.open(path).resize((224, 224)))
        for path in (bg_path, fg_path)
    )

    bg_labels = get_onelabel_mask(background_class, scale=224)
    fg_labels = get_onelabel_mask(foreground_class, scale=224)

    # 1 inside the random shape (foreground), 0 outside (background).
    inside = get_bezier_mask(n=n, scale=224, rad=rad, edgy=edgy)
    outside = 1 - inside

    synthesized_image = inside[:, :, np.newaxis] * fg_pixels + outside[:, :, np.newaxis] * bg_pixels
    synthesized_mask = inside * fg_labels + outside * bg_labels
    return synthesized_image, synthesized_mask


In [None]:
def synthesize_and_save(save_dir, i, n=12, rad=0.2, edgy=0.05, background_class="tum", foreground_class="nec"):
    """Synthesize one sample and write its image and palettised mask to disk.

    Both files are named ``{i:05d}-{label}.png``, where ``label`` is the
    4-element presence vector over ``(tum, nec, lym, tas)``. The image goes
    to ``save_dir/img`` and the mask to ``save_dir/mask``.

    Args:
        save_dir: Output directory containing ``img`` and ``mask`` folders.
        i: Sample index used in the file name.
        n: Passed to ``synthesize_one``.
        rad: Passed to ``synthesize_one``.
        edgy: Passed to ``synthesize_one``.
        background_class: Class name used for the background.
        foreground_class: Class name used for the foreground.
    """
    image_array, mask_array = synthesize_one(n, rad, edgy, background_class, foreground_class)
    image_out = Image.fromarray(image_array)

    # RGB colour of each mask index, three palette entries per class.
    palette = [
        205, 51, 51,      # Tumor epithelial (TE)
        0, 255, 0,        # Necrosis (NEC)
        65, 105, 225,     # Lymphocyte (LYM)
        255, 165, 0,      # Tumor-associated stroma (TAS)
        255, 255, 255,    # White background or exclude
    ]
    mask_out = Image.fromarray(np.uint8(mask_array), mode='P')
    mask_out.putpalette(palette)

    # Presence flag for every class that takes part in this sample.
    used_classes = [background_class, foreground_class]
    label = [1 if class_name in used_classes else 0
             for class_name in ('tum', 'nec', 'lym', 'tas')]

    file_name = f"{i:05d}-{label}.png"
    image_out.save(os.path.join(save_dir, 'img', file_name))
    mask_out.save(os.path.join(save_dir, 'mask', file_name))

In [None]:
# Generate one synthetic dataset per run.
for run in range(1):

    save_dir = f"../data/LUAD-HistoSeg/bezier224_5_0.2_0.05_1d1_run{run}"

    # Make sure the image and mask output folders exist.
    for subfolder in ('img', 'mask'):
        target_dir = os.path.join(save_dir, subfolder)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    N_train = 10_000
    for i in tqdm(range(N_train), total=N_train):
        # Two distinct classes: the first is the background, the second the foreground.
        background_class, foreground_class = np.random.choice(
            ['tum', 'nec', 'lym', 'tas'], size=2, replace=False
        )
        synthesize_and_save(
            save_dir,
            i,
            background_class=background_class,
            foreground_class=foreground_class,
        )
