In [None]:
import json
import os
import scipy.ndimage
import numpy as np
import matplotlib.path as mpltPath
from matplotlib.path import Path
from random import shuffle
from openslide import open_slide, ImageSlide
import scipy.io as sio
import pdb
import sys


In [None]:
# File paths and sampling configuration shared by the functions below.
# Directory whose entries are searched for slide files (see get_slide_path).
slide_path = '/mys3bucket/TCGA_LUSC'
# Directory listing taken once, when this cell runs; re-run the cell if the directory changes.
slides = os.listdir(slide_path)
# Root directory under which generate_data saves one archive per slide, inside a per-mode subfolder.
save_path = '/home/ubuntu/codebase/Semi-Supervised-GANs/dataset/patch_data'

# Number of points get_mask samples for each annotation polygon.
no_patches = 1000
# Not referenced by any code visible in this part of the notebook.
chunk_size = 20
# Intended patch edge length in pixels; read_patches uses half of it to offset the
# top-left corner of a patch from its sampled centre point.
img_size = 256

In [None]:
def split(data):
    """Cut ``data`` into a leading train part and a trailing test part.

    The cut index is ``ceil(0.8 * len(data))``. Order is preserved and nothing
    is shuffled, so both parts are plain slices of ``data``.

    Returns:
        ``(train, test)``: the first ~80% of the items and the remainder.
    """
    cutoff = int(np.ceil(0.8 * len(data)))
    return data[:cutoff], data[cutoff:]

In [None]:
def get_mask(coords):
    """Sample integer grid points that lie inside an annotation polygon.

    Args:
        coords: sequence of ``(x, y)`` polygon vertices, in drawing order.
            The sequence is not modified.

    Returns:
        ``(xs, ys)``: two NumPy integer arrays of equal length holding the x and
        y coordinates of ``no_patches`` points drawn (with replacement) from the
        grid points inside the polygon. Both arrays are empty when the polygon
        encloses no grid point.
    """
    # Work on a copy: the vertex order defines the polygon outline, so it must
    # not be disturbed (sorting the caller's list in place would scramble it).
    vertices = list(coords)
    empty = np.array([], dtype=int)
    if not vertices:
        return empty, empty.copy()

    xs = [v[0] for v in vertices]
    ys = [v[1] for v in vertices]
    # Integer bounding box of the polygon, one range per axis. Bounds are
    # half-open: points on the maximum edge can only lie on the outline.
    xmin, xmax = int(np.floor(min(xs))), int(np.ceil(max(xs)))
    ymin, ymax = int(np.floor(min(ys))), int(np.ceil(max(ys)))

    x, y = np.meshgrid(np.arange(xmin, xmax), np.arange(ymin, ymax))
    print("Done generating meshgrid!")
    x, y = x.flatten(), y.flatten()
    if x.size == 0:
        # Degenerate polygon (zero width or height): nothing to test.
        return empty, empty.copy()

    points = np.vstack((x, y)).T
    inside = Path(vertices).contains_points(points)

    # Indices into the *flattened* grid, so they can index x and y directly.
    inside_idxs = np.flatnonzero(inside)
    if inside_idxs.size == 0:
        return empty, empty.copy()

    sample_idxs = np.random.choice(inside_idxs, no_patches)
    return x[sample_idxs], y[sample_idxs]
    

In [None]:
def read_patches(x_coords,y_coords,slide_src,label):
    """Read square RGB patches centred on the given points of a slide.

    Args:
        x_coords: x coordinates of the patch centres.
        y_coords: y coordinates of the patch centres, paired with ``x_coords``.
        slide_src: path of the slide file, passed to ``open_slide``.
        label: value recorded once for every patch that is kept.

    Returns:
        ``(patches, labels)``: a list of ``img_size`` x ``img_size`` RGB arrays
        and a list of the same length holding ``label``. Patches whose pixels
        are all zero (entirely black) are dropped.
    """
    gen_dataX = []
    gen_dataY = []
    half = img_size // 2
    image = open_slide(slide_src)
    try:
        for cx, cy in zip(x_coords, y_coords):
            # read_region takes the top-left corner; cast to built-in int so
            # NumPy integer scalars are never handed to the slide library.
            top_left = (int(cx) - half, int(cy) - half)
            patch = image.read_region(top_left, 0, (img_size, img_size))
            patch = np.array(patch.convert("RGB"))
            # Skip patches that are completely black.
            if patch.any():
                gen_dataX.append(patch)
                gen_dataY.append(label)
    finally:
        # Release the slide handle even if a read fails part-way through.
        image.close()

    print("Generated patches!")
    return gen_dataX,gen_dataY

In [None]:
def get_slide_path(slideID):
    """Look up the file path of the slide whose name starts with ``slideID``.

    An entry of ``slides`` matches when the part of its name before the first
    underscore equals ``str(slideID)``. The first match wins.

    Returns:
        The matching entry joined onto ``slide_path``, or ``-1`` if none matches.
    """
    wanted = str(slideID)
    match = next(
        (name for name in slides if str(name.split('_')[0]) == wanted),
        None,
    )
    if match is None:
        return -1
    return os.path.join(slide_path, match)

In [None]:
def get_random_polygon(shape):
    """Return ``shape`` unchanged if it has more than one element, otherwise ``-1``."""
    return shape if len(shape) > 1 else -1

In [None]:
def generate_data(data,mode,ltype):
    """Extract patches for every annotated slide and save one archive per slide.

    Args:
        data: mapping of slide id -> list of polygons, each polygon being a
            sequence of ``(x, y)`` vertices.
        mode: name of the subfolder of ``save_path`` the archives are written to.
        ltype: ``'cancer'`` labels every patch 1; any other value labels it 0.

    For each slide the patches of all its polygons are collected and written with
    ``np.savez`` to ``save_path/mode/<slide>``; the patch array and the label
    array are stored positionally, in that order. Slides for which
    ``get_slide_path`` finds no file are skipped with a message.
    """
    # The label depends only on ltype, so compute it once.
    label = 1 if ltype == 'cancer' else 0

    # Make sure the target folder exists before any expensive extraction, so the
    # final save cannot fail on a missing directory.
    out_dir = os.path.join(save_path, mode)
    os.makedirs(out_dir, exist_ok=True)

    for slide in data:
        slide_src = get_slide_path(slide)
        if slide_src == -1:
            # get_slide_path signals "not found" with -1; nothing can be read.
            print(str(slide)+" has no matching slide file, skipping")
            continue

        DATAX = []
        DATAY = []
        print(str(slide)+" has "+ str(len(data[slide]))+" annotations")
        for count, polygon in enumerate(data[slide], start=1):
            coords = [tuple(x) for x in polygon]
            x_coords,y_coords = get_mask(coords)

            X,Y = read_patches(x_coords,y_coords,slide_src,label)
            print(len(Y))
            DATAX.extend(X)
            DATAY.extend(Y)

            print(">>>>"+str(count))

        outfile = os.path.join(out_dir,str(slide))
        np.savez(outfile,np.asarray(DATAX),np.asarray(DATAY))
        print("*****************************************************")
        

In [None]:
def get_statistics(data):
    """Group usable annotation polygons by slide, split into cancer / non-cancer.

    Args:
        data: iterable of annotation dicts with the keys ``'slideId'``,
            ``'shape'`` and ``'annotationSubstanceId'``.

    An annotation is used only if ``get_slide_path`` finds its slide and
    ``get_random_polygon`` accepts its shape. Annotations whose
    ``'annotationSubstanceId'`` is 330 or 331 go to the cancer mapping, all
    others to the non-cancer mapping.

    Returns:
        ``(cancer_dict, noncancer_dict)``: two dicts mapping slide id -> list of
        polygons, in the order the annotations were seen.
    """
    cancer_dict = {}
    noncancer_dict = {}
    for annotation in data:
        slide = annotation['slideId']
        slide_src = get_slide_path(slide)
        polygon = get_random_polygon(annotation['shape'])
        # Both helpers report failure with the sentinel -1.
        if slide_src == -1 or polygon == -1:
            continue

        is_cancer = annotation['annotationSubstanceId'] in (330, 331)
        bucket = cancer_dict if is_cancer else noncancer_dict
        bucket.setdefault(slide, []).append(polygon)
    return cancer_dict, noncancer_dict
        

In [None]:
# Load the annotation records. The shuffle below is disabled, so split() cuts
# the records in the order they appear in the file.
with open("/mys3bucket/Annotations/annotations.txt", encoding="utf-8") as f:
    # The with-block closes the file even if the JSON fails to parse.
    data = json.load(f)


#shuffle(data)
train,test = split(data)
cancer, noncancer = get_statistics(train)

In [None]:
# Extract and save patches for the cancerous training annotations (label 1).
generate_data(data=cancer, mode='train', ltype='cancer')
print("Train data generated!")

In [None]:
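# Disabled. This call does not match generate_data(data, mode, ltype), and `dev` /
# `no_dev_slides` are not defined in the cells shown here; update it before re-enabling.
#generate_data(dev, no_dev_slides,'dev')
#print("Dev data generated!")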

In [None]:
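# Disabled. This call does not match generate_data(data, mode, ltype): `test` is the raw
# annotation list returned by split(), not a per-slide mapping, and `no_test_slides` is not
# defined in the cells shown here; update it before re-enabling.
#generate_data(test, no_test_slides,'test')
#print("Test data generated! ")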