In [1]:
from zipfile import ZipFile
import shutil
import numpy as np
import os
import re
import random
from os.path import join
import tifffile
import cv2
import math
from osgeo import gdal

In [2]:
# Notebook-level IPython settings.
# Disabling Jedi is what the author needs to get tab-completion working.
%config Completer.use_jedi = False
# Render matplotlib figures inside the notebook output
%matplotlib inline

In [3]:
# Seed Python's `random` module so the random.sample draws below are repeatable
random.seed(a=42)

In [4]:
# Dataset sizes; the negative (0-set) draws further down use twice these values
large_size, medium_size, small_size = 400, 350, 300

# The padding cells bring both image sides up to a multiple of this value
model_divisor = 32
# Border colour handed to cv2.copyMakeBorder
BLACK = [0, 0, 0]

In [5]:
# Names used to build the trainset folder tree:
# three dataset sizes, each in a plain and a "_transform" variant ...
sizes = [
    base + suffix
    for suffix in ("", "_transform")
    for base in ("small", "medium", "large")
]

# ... and the two kinds of media stored for every size
media = ["annotations", "images"]

In [6]:
# Build the trainset folder tree: one sub-folder per (size, medium) pair
trainset_path = "data/trainset/"

for size in sizes:
    for med in media:
        path = f"{trainset_path}{size}/{med}"
        print(path)
        os.makedirs(path, exist_ok=True)

# The large set additionally keeps the original tifs
os.makedirs(f"{trainset_path}large/tifs/", exist_ok=True)

data/trainset/small/annotations
data/trainset/small/images
data/trainset/medium/annotations
data/trainset/medium/images
data/trainset/large/annotations
data/trainset/large/images
data/trainset/small_transform/annotations
data/trainset/small_transform/images
data/trainset/medium_transform/annotations
data/trainset/medium_transform/images
data/trainset/large_transform/annotations
data/trainset/large_transform/images


In [11]:
## Input folders
# annotated 1-set tifs and their raw annotation archives
tif_folder = "data/1_set_tiffs/"
ann_raw_folder = "data/1_set_raw_annotated/"

# 0-set tifs (these have no annotations)
tif_folder_0 = "data/0_set_tiffs/"


## Trainset folders
# large set: masks, original tifs, and the PNG images converted from the tifs
ann_out_path = "data/trainset/large/annotations/"
tif_out_path = "data/trainset/large/tifs/"
im_out_path = "data/trainset/large/images/"

# medium set (drawn from large): images and annotations
im_path_m = "data/trainset/medium/images/"
ann_path_m = "data/trainset/medium/annotations/"

# small set (drawn from medium): images and annotations
im_path_s = "data/trainset/small/images/"
ann_path_s = "data/trainset/small/annotations/"

# Create large train dataset

### Place annotations & add corresponding tiffs

In [6]:
# Collect the annotation archives (.zip) found in the raw annotation folder
ann_raw_paths = [
    join(ann_raw_folder, name)
    for name in os.listdir(ann_raw_folder)
    if name.endswith('.zip')
]

In [7]:
# Extract every archive member under "SegmentationClass" into the large-set
# annotation folder. The context manager closes each archive when done;
# previously the ZipFile handles were left open.
for filepath in ann_raw_paths:
    with ZipFile(filepath, 'r') as archive:
        for member in archive.namelist():
            if member.startswith("SegmentationClass"):
                archive.extract(member, path=ann_out_path)

In [8]:
# Move every extracted mask one level up, out of the SegmentationClass folder
segmentation_path = os.listdir(ann_out_path+"SegmentationClass")

for mask in segmentation_path:
    shutil.move(ann_out_path+"SegmentationClass/"+mask, ann_out_path+mask)

# Remove only the now-empty SegmentationClass folder. os.removedirs would also
# prune any empty parent folders (e.g. the annotation folder itself when the
# archives contained no masks), so os.rmdir is used instead.
os.rmdir(ann_out_path+"SegmentationClass")

In [9]:
# IDs of the annotated tifs = mask file names without the .png extension.
# - sorted(): os.listdir order is arbitrary and tif_id is passed to
#   random.sample later, so a stable order is needed for the seed to give
#   repeatable draws.
# - "negset_" masks are skipped so that re-running this cell after the 0-set
#   step still lists only tifs that exist in tif_folder.
tif_id = sorted(
    re.sub(r"\.png$", "", name)
    for name in os.listdir(ann_out_path)
    if name.endswith(".png") and not name.startswith("negset_")
)
# matching source tif and its destination in the large-set tif folder
tif_list = [tif_folder+t+".tif" for t in tif_id]
tif_out_list = [tif_out_path+t+".tif" for t in tif_id]

In [10]:
# Copy each annotated tif into the large-set tif folder
for src, dst in zip(tif_list, tif_out_list):
    shutil.copy(src, dst)

### Place random draw from 0-set tiffs and create empty annotations

In [11]:
# Draw large_size*2 (= 800) random tifs from the 0-set. The listing is sorted
# first: os.listdir returns names in arbitrary order, so without sorting the
# seeded draw is not repeatable across machines or runs.
neg_tiffs = sorted(os.listdir(tif_folder_0))
neg_tiffs = random.sample(neg_tiffs, large_size*2)

# New tif names with a "negset_" prefix, so they cannot clash with the
# annotated tifs and are recognisable as 0-set
neg_tiff_id = ["negset_"+l for l in neg_tiffs]

# Matching annotation names (.png instead of .tif)
neg_ann_id = [re.sub(r"\.tif$", ".png", l) for l in neg_tiff_id]

# Full source paths (unprefixed, in the 0-set folder) and destination paths
neg_full_in = [tif_folder_0+l for l in neg_tiffs]
neg_full_out = [tif_out_path+l for l in neg_tiff_id]

In [12]:
# Copy the sampled 0-set tifs into the large-set tif folder under their new names
for src, dst in zip(neg_full_in, neg_full_out):
    shutil.copy(src, dst)

In [13]:
# Create all-black masks for the 0-set tifs.

# The first 0-set tif serves as the template: every mask is written from this
# one array (assumes all 0-set tifs share its shape — TODO confirm)
neg_array = tifffile.imread(tif_out_path+neg_tiff_id[0])

# blank it so it can be written as an empty mask
neg_array.fill(0)

# mask path = annotation folder + tif name with a .png extension
# (raw string: "\." in a normal string literal is an invalid escape sequence)
neg_annotation_paths = [ann_out_path+re.sub(r"\.tif$", ".png", l) for l in neg_tiff_id]

for annotation in neg_annotation_paths:
    cv2.imwrite(annotation, neg_array)

## Pad images and annotations in large dataset

In [14]:
# Full paths of every tif and every mask currently in the large set
image_paths = [tif_out_path + name for name in os.listdir(tif_out_path)]
annotation_paths = [ann_out_path + name for name in os.listdir(ann_out_path)]

In [15]:
# Pad every large-set tif with a black border, split as evenly as possible
# between the two sides of each axis. Tifs whose sides are both already
# multiples of model_divisor are left untouched.
for im in image_paths:
    image = tifffile.imread(im)

    height, width = image.shape[0], image.shape[1]
    modulo_h = width % model_divisor
    modulo_v = height % model_divisor

    if modulo_h == 0 and modulo_v == 0:
        continue

    # pixels to add per axis (a full model_divisor when only that axis is aligned)
    padding_size_h = model_divisor - modulo_h
    padding_size_v = model_divisor - modulo_v

    # an odd amount puts the extra pixel on the right / bottom
    left = padding_size_h // 2
    right = padding_size_h - left
    top = padding_size_v // 2
    bottom = padding_size_v - top

    image_pad = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=BLACK)

    # overwrite the tif in place
    tifffile.imsave(im, image_pad)

In [16]:
# Pad every large-set mask exactly the way the tif padding loop pads its image,
# so that each image/mask pair keeps identical dimensions.
for ann in annotation_paths:
    # read mask
    mask = cv2.imread(ann)

    modulo_h = mask.shape[1] % model_divisor
    modulo_v = mask.shape[0] % model_divisor

    # The tif loop skips files whose sides are both multiples of model_divisor.
    # Without the same skip here, such masks were padded by a full
    # model_divisor per axis and no longer matched their image.
    if modulo_h == 0 and modulo_v == 0:
        continue

    # pixels to add per axis (same rule as the tif loop)
    padding_size_h = model_divisor - modulo_h
    padding_size_v = model_divisor - modulo_v

    # split per axis; an odd amount puts the extra pixel on the right / bottom
    left = padding_size_h // 2
    right = padding_size_h - left
    top = padding_size_v // 2
    bottom = padding_size_v - top

    # pad mask with black (background) pixels
    mask_pad = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=BLACK)

    # overwrite the mask in place
    cv2.imwrite(ann, mask_pad)

## Convert tif to png

In [30]:
# Convert every large-set tif to a PNG in the images folder using GDAL.
# The driver does not depend on the file, so it is looked up once.
driver = gdal.GetDriverByName('PNG')

for im in image_paths:
    ds = gdal.Open(im)

    # Same file name with a .png extension, placed in im_out_path. Building the
    # path from the base name avoids rewriting every "tifs" substring in the path.
    new_path = im_out_path + re.sub(r"\.tif$", ".png", os.path.basename(im))

    driver.CreateCopy(new_path, ds)

In [37]:
# Delete the aux files (names ending in "xml") from the large-set images folder
remove_xml = list(filter(lambda name: name.endswith('xml'), os.listdir("data/trainset/large/images/")))
for x in remove_xml:
    os.remove("data/trainset/large/images/"+x)

## Set every annotated (non-zero) pixel to the value 1

In [43]:
# Rewrite each mask in place with every non-zero pixel set to 1
for ann in annotation_paths:
    out = cv2.imread(ann).copy()
    out[out > 0] = 1
    cv2.imwrite(ann, out)

## Subsample for medium and small datasets

### positive subsample

In [45]:
# Positive subsample: the medium set is drawn from all annotated ids and the
# small set is drawn from the medium set, so the sets are nested.
# The sizes come from the config cell (medium_size = 350, small_size = 300)
# instead of repeating the numbers here.
random_m = random.sample(tif_id, medium_size)
random_s = random.sample(random_m, small_size)

In [64]:
# Source (large set) and destination (medium set) paths for the sampled positives
im_list_m = [f"{im_out_path}{tile}.png" for tile in random_m]
im_out_list_m = [f"{im_path_m}{tile}.png" for tile in random_m]
ann_list_m = [f"{ann_out_path}{tile}.png" for tile in random_m]
ann_out_list_m = [f"{ann_path_m}{tile}.png" for tile in random_m]

In [57]:
# Copy the sampled masks into the medium set
for src, dst in zip(ann_list_m, ann_out_list_m):
    shutil.copy(src, dst)

In [65]:
# Copy the sampled PNG images into the medium set
for src, dst in zip(im_list_m, im_out_list_m):
    shutil.copy(src, dst)

In [66]:
# Source (large set) and destination (small set) paths for the sampled positives
im_list_s = [f"{im_out_path}{tile}.png" for tile in random_s]
im_out_list_s = [f"{im_path_s}{tile}.png" for tile in random_s]
ann_list_s = [f"{ann_out_path}{tile}.png" for tile in random_s]
ann_out_list_s = [f"{ann_path_s}{tile}.png" for tile in random_s]

In [67]:
# Copy the sampled masks into the small set
for src, dst in zip(ann_list_s, ann_out_list_s):
    shutil.copy(src, dst)

In [70]:
# Copy the sampled PNG images into the small set
for src, dst in zip(im_list_s, im_out_list_s):
    shutil.copy(src, dst)

### negative subsample

In [12]:
# IDs of the 0-set masks in the large set: file names containing "negset",
# with the .png extension removed. Sorted because os.listdir order is
# arbitrary and neg_id is passed to random.sample next.
neg_id = sorted(re.sub(r"\.png$", "", f) for f in os.listdir(ann_out_path) if "negset" in f)

In [15]:
# Negative subsample: twice the medium size, then twice the small size drawn
# from that medium draw
random_m_neg = random.sample(neg_id, 2*medium_size)
random_s_neg = random.sample(random_m_neg, 2*small_size)

In [16]:
# Source (large set) and destination (medium set) paths for the sampled negatives
# (reuses the list names of the positive subsample)
im_list_m = [f"{im_out_path}{tile}.png" for tile in random_m_neg]
im_out_list_m = [f"{im_path_m}{tile}.png" for tile in random_m_neg]
ann_list_m = [f"{ann_out_path}{tile}.png" for tile in random_m_neg]
ann_out_list_m = [f"{ann_path_m}{tile}.png" for tile in random_m_neg]

In [21]:
# Copy the sampled negative masks into the medium set
for src, dst in zip(ann_list_m, ann_out_list_m):
    shutil.copy(src, dst)

In [22]:
# Copy the sampled negative PNG images into the medium set
for src, dst in zip(im_list_m, im_out_list_m):
    shutil.copy(src, dst)

In [29]:
# Source (large set) and destination (small set) paths for the sampled negatives
im_list_s = [f"{im_out_path}{tile}.png" for tile in random_s_neg]
im_out_list_s = [f"{im_path_s}{tile}.png" for tile in random_s_neg]
ann_list_s = [f"{ann_out_path}{tile}.png" for tile in random_s_neg]
ann_out_list_s = [f"{ann_path_s}{tile}.png" for tile in random_s_neg]

In [30]:
# Copy the sampled negative masks into the small set
for src, dst in zip(ann_list_s, ann_out_list_s):
    shutil.copy(src, dst)

In [31]:
# Copy the sampled negative PNG images into the small set
for src, dst in zip(im_list_s, im_out_list_s):
    shutil.copy(src, dst)

# Create test set

In [2]:
# Testset folder tree: images / annotations / tifs under both 0_set and 1_set
testset_path = "data/testset/"

for lev1 in ("0_set/", "1_set/"):
    for lev2 in ("images", "annotations", "tifs"):
        path = testset_path + lev1 + lev2
        os.makedirs(path, exist_ok=True)

In [3]:
# Testset sub-folders used by the cells below

# positive (1-set): PNG images and tifs
pos_test_im = f"{testset_path}1_set/images/"
pos_test_tifs = f"{testset_path}1_set/tifs/"

# negative (0-set): PNG images, tifs and annotations
neg_test_im = f"{testset_path}0_set/images/"
neg_test_tifs = f"{testset_path}0_set/tifs/"
neg_test_ann = f"{testset_path}0_set/annotations/"

In [12]:
# IDs (file names without the .tif extension) of every tif in the large trainset.
# Raw string: "\." in a normal string literal is an invalid escape sequence.
trainset = [re.sub(r"\.tif$", "", f) for f in os.listdir(tif_out_path)]

In [20]:
# Candidate test tifs: everything in the 1-set / 0-set that is not in the trainset.
# Both lists are sorted because os.listdir order is arbitrary and they are
# passed to random.sample next.
trainset_ids = set(trainset)  # set for fast membership tests

pos_tifs = sorted(
    f for f in os.listdir(tif_folder)
    if re.sub(r"\.tif$", "", f) not in trainset_ids
)

# 0-set tifs were copied into the trainset under a "negset_" prefix, so the
# prefixed id is what has to be looked up. Comparing the bare name never
# matched, which let trainset negatives be drawn into the testset as well.
neg_tifs = sorted(
    f for f in os.listdir(tif_folder_0)
    if "negset_" + re.sub(r"\.tif$", "", f) not in trainset_ids
)

In [38]:
# Draw 250 test tifs from each candidate pool (positives first, then negatives)
pos_draw, neg_draw = (random.sample(pool, 250) for pool in (pos_tifs, neg_tifs))

In [53]:
# Source and destination paths for the drawn test tifs.
pos_in = [tif_folder+f for f in pos_draw]
# neg_draw holds the names exactly as listed in the 0-set folder, so the source
# path must not carry the "negset_" prefix; with it, the path pointed at files
# that do not exist. Only the copy in the testset gets the prefix.
neg_in = [tif_folder_0+f for f in neg_draw]

pos_out = [pos_test_tifs+f for f in pos_draw]
neg_out = [neg_test_tifs+"negset_"+f for f in neg_draw]

In [None]:
# PNG output path for every drawn test tif: same file name with a .png
# extension (previously the .tif extension was kept for PNG output).
# Negatives carry the "negset_" prefix so they match the tif copies in neg_out.
pos_im_out = [pos_test_im+re.sub(r"\.tif$", ".png", f) for f in pos_draw]
neg_im_out = [neg_test_im+"negset_"+re.sub(r"\.tif$", ".png", f) for f in neg_draw]

In [50]:
# Copy the drawn positive tifs into the testset.
# The loop variables have their own names: reusing pos_in / pos_out rebound
# both lists to single path strings, so later loops over pos_out iterated
# over the characters of the last path.
for src, dst in zip(pos_in, pos_out):
    shutil.copy(src, dst)

In [51]:
# Copy the drawn negative tifs into the testset.
# The loop variables have their own names: reusing neg_in / neg_out rebound
# both lists to single path strings, so later loops over neg_out iterated
# over the characters of the last path.
for src, dst in zip(neg_in, neg_out):
    shutil.copy(src, dst)

## Pad tifs

In [57]:
# Pad every positive test tif with a black border, split as evenly as possible
# between the two sides of each axis. Tifs whose sides are both already
# multiples of model_divisor are left untouched.
for im in pos_out:
    image = tifffile.imread(im)

    height, width = image.shape[0], image.shape[1]
    modulo_h = width % model_divisor
    modulo_v = height % model_divisor

    if modulo_h == 0 and modulo_v == 0:
        continue

    # pixels to add per axis (a full model_divisor when only that axis is aligned)
    padding_size_h = model_divisor - modulo_h
    padding_size_v = model_divisor - modulo_v

    # an odd amount puts the extra pixel on the right / bottom
    left = padding_size_h // 2
    right = padding_size_h - left
    top = padding_size_v // 2
    bottom = padding_size_v - top

    image_pad = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=BLACK)

    # overwrite the tif in place
    tifffile.imsave(im, image_pad)

In [58]:
# Pad every negative test tif with a black border, split as evenly as possible
# between the two sides of each axis. Tifs whose sides are both already
# multiples of model_divisor are left untouched.
for im in neg_out:
    image = tifffile.imread(im)

    height, width = image.shape[0], image.shape[1]
    modulo_h = width % model_divisor
    modulo_v = height % model_divisor

    if modulo_h == 0 and modulo_v == 0:
        continue

    # pixels to add per axis (a full model_divisor when only that axis is aligned)
    padding_size_h = model_divisor - modulo_h
    padding_size_v = model_divisor - modulo_v

    # an odd amount puts the extra pixel on the right / bottom
    left = padding_size_h // 2
    right = padding_size_h - left
    top = padding_size_v // 2
    bottom = padding_size_v - top

    image_pad = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=BLACK)

    # overwrite the tif in place
    tifffile.imsave(im, image_pad)

## Convert to PNG

In [9]:
# PNG output path for every drawn test tif (same file name, .png extension).
# Negatives get the "negset_" prefix so the image names match the tif copies
# in neg_out; the 0-set annotation paths are derived from these paths and
# therefore carry the prefix too.
# Raw strings: "\." in a normal string literal is an invalid escape sequence.
pos_im_out = [pos_test_im+re.sub(r"\.tif$", ".png", f) for f in pos_draw]
neg_im_out = [neg_test_im+"negset_"+re.sub(r"\.tif$", ".png", f) for f in neg_draw]

NameError: name 'pos_draw' is not defined

In [62]:
# Convert the positive test tifs to PNG with GDAL
driver = gdal.GetDriverByName('PNG')
for im_in, im_out in zip(pos_out, pos_im_out):
    ds = gdal.Open(im_in)
    driver.CreateCopy(im_out, ds)

In [65]:
# Delete the aux files (names ending in "xml") from the positive test images folder
remove_xml = list(filter(lambda name: name.endswith('xml'), os.listdir("data/testset/1_set/images/")))
for x in remove_xml:
    os.remove("data/testset/1_set/images/"+x)

In [66]:
# Convert the negative test tifs to PNG with GDAL
driver = gdal.GetDriverByName('PNG')
for im_in, im_out in zip(neg_out, neg_im_out):
    ds = gdal.Open(im_in)
    driver.CreateCopy(im_out, ds)

In [67]:
# Delete the aux files (names ending in "xml") from the negative test images folder
remove_xml = list(filter(lambda name: name.endswith('xml'), os.listdir("data/testset/0_set/images/")))
for x in remove_xml:
    os.remove("data/testset/0_set/images/"+x)

### Create 0-set annotations

In [68]:
# Load the first negative test PNG (the next cell loads it again as the mask template)
neg_array = cv2.imread(neg_im_out[0])

In [72]:
# Write an all-black mask for every 0-set test image.

# template: the first negative PNG, blanked; every mask is written from it
neg_array = cv2.imread(neg_im_out[0])
neg_array.fill(0)

# mask path = image path with "images" replaced by "annotations"
neg_annotation_paths = [re.sub("images","annotations", png_path) for png_path in neg_im_out]

for mask_path in neg_annotation_paths:
    cv2.imwrite(mask_path, neg_array)

In [None]:
# errorfix: negset in filename
# Renames annotation files so that their names start with "negset_".
# NOTE(review): `neg` is first assigned in a later cell, so on a fresh kernel
# the next line raises NameError; its result is also not used in this cell.
neg = ["data/testset/annotations/"+f for f in neg]
# NOTE(review): names are listed from 0_set/annotations/ but the paths built
# below point at data/testset/annotations/ — presumably this relies on the
# state around the merge step; verify before re-running.
neglist = os.listdir("data/testset/0_set/annotations/")
neglist = ["data/testset/annotations/"+f for f in neglist]

for f in neglist:
    # insert "negset_" after any "/" that is directly followed by a digit,
    # i.e. in front of a file name that starts with a digit
    newname = re.sub("\/(?=\d)","/negset_", f)
    os.rename(f, newname)

### Get annotations from zip files

In [5]:
# Folder holding the annotation archives for the test set
ann_raw_folder_test = "data/1_set_test_annotated/"
# Output folders: SegmentationClass entries go to the first, SegmentationObject entries to the second
ann_out_path_test = "data/testset/1_set/annotations/"
ann_out_path_test_g = "data/testset/annotation_groups/"

In [8]:
# Collect the annotation archives (.zip) found in the test annotation folder
ann_raw_paths_test = [
    join(ann_raw_folder_test, name)
    for name in os.listdir(ann_raw_folder_test)
    if name.endswith('.zip')
]

In [15]:
# Create the output folders for the test annotations.
# os.makedirs takes the path as its first argument; the previous keyword form
# (ann_out_path_test=...) is not a parameter of makedirs and raised TypeError.
# exist_ok=True keeps the cell re-runnable.
os.makedirs(ann_out_path_test, exist_ok=True)
os.makedirs(ann_out_path_test_g, exist_ok=True)

['data/1_set_test_annotated/task_test1-2021_07_21_15_28_17-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test2-2021_07_21_09_11_05-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test3-2021_07_21_09_39_06-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test4-2021_07_21_12_18_12-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test5-2021_07_21_13_05_42-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test6-2021_07_21_14_37_52-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test7-2021_07_21_14_43_54-segmentation mask 1.1.zip',
 'data/1_set_test_annotated/task_test8-2021_07_21_15_24_32-segmentation mask 1.1.zip']

In [16]:
# Extract every archive member under "SegmentationClass" into the test
# annotation folder. The context manager closes each archive when done;
# previously the ZipFile handles were left open.
for filepath in ann_raw_paths_test:
    with ZipFile(filepath, 'r') as archive:
        for member in archive.namelist():
            if member.startswith("SegmentationClass"):
                archive.extract(member, path=ann_out_path_test)

In [17]:
# Move every extracted mask one level up, out of the SegmentationClass folder
segmentation_path = os.listdir(ann_out_path_test+"SegmentationClass")

for mask in segmentation_path:
    shutil.move(ann_out_path_test+"SegmentationClass/"+mask, ann_out_path_test+mask)

# Remove only the now-empty SegmentationClass folder. os.removedirs would also
# prune any empty parent folders, so os.rmdir is used instead.
os.rmdir(ann_out_path_test+"SegmentationClass")

In [9]:
# Same extraction for the grouped annotations: every archive member under
# "SegmentationObject" goes into the annotation_groups folder. The context
# manager closes each archive; previously the ZipFile handles were left open.
for filepath in ann_raw_paths_test:
    with ZipFile(filepath, 'r') as archive:
        for member in archive.namelist():
            if member.startswith("SegmentationObject"):
                archive.extract(member, path=ann_out_path_test_g)

In [10]:
# Move every extracted mask one level up, out of the SegmentationObject folder
segmentation_path = os.listdir(ann_out_path_test_g+"SegmentationObject")

for mask in segmentation_path:
    shutil.move(ann_out_path_test_g+"SegmentationObject/"+mask, ann_out_path_test_g+mask)

# Remove only the now-empty SegmentationObject folder. os.removedirs would also
# prune any empty parent folders, so os.rmdir is used instead.
os.rmdir(ann_out_path_test_g+"SegmentationObject")

In [16]:
# Names of the negset masks (file names containing "negset") in the testset annotations folder
negset = [name for name in os.listdir("data/testset/annotations") if "negset" in name]

In [18]:
# Duplicate each negset mask into the annotation_groups folder
for name in negset:
    shutil.copy("data/testset/annotations/"+name, "data/testset/annotation_groups/"+name)

### Merge 0-set and 1-set

In [28]:
# Shared testset-level folders that receive the merged 0-set and 1-set files.
# exist_ok=True (as in the other folder-creation cells) keeps the cell
# re-runnable; without it a second run raised FileExistsError.
os.makedirs(testset_path+"images/", exist_ok=True)
os.makedirs(testset_path+"annotations/", exist_ok=True)

In [34]:
# Merge: move every image and annotation out of 0_set/ and 1_set/ into the
# shared testset-level images/ and annotations/ folders
for lev1 in ("0_set/", "1_set/"):
    for lev2 in ("images/", "annotations/"):
        src_dir = testset_path + lev1 + lev2
        dst_dir = testset_path + lev2
        for f in os.listdir(src_dir):
            shutil.move(src_dir + f, dst_dir + f)

In [26]:
# Full paths of all masks in the testset annotations folder
neg = ["data/testset/annotations/"+name for name in os.listdir("data/testset/annotations/")]

In [29]:
# Keep the paths of the masks whose pixel values sum to 0 (nothing annotated)
neglist = [f for f in neg if np.sum(cv2.imread(f)) == 0]

In [5]:
# Folder for model predictions on the testset.
# exist_ok=True keeps the cell re-runnable; without it a second run raised
# FileExistsError.
os.makedirs(testset_path+"predictions", exist_ok=True)

### Make alternative annotations with a single pixel value (1)

In [9]:
# File names of all masks in the testset annotations folder
ann_path_in = os.listdir("data/testset/annotations/")

In [10]:
# Write a copy of every test mask with all non-zero pixels set to 1 into
# alt_annotations/; the originals in annotations/ are left as they are.

# No other cell creates this folder, and cv2.imwrite cannot write into a
# folder that does not exist, so make sure it is there first.
os.makedirs("data/testset/alt_annotations/", exist_ok=True)

for ann in ann_path_in:
    original = cv2.imread("data/testset/annotations/"+ann)
    out = original.copy()
    out[out > 0] = 1
    cv2.imwrite("data/testset/alt_annotations/"+ann, out)

### negative set naming

In [None]:
# set data path for images
#image_path = "data/testset/images/"

In [None]:
# get list of image files and create input path
#im_files = os.listdir(image_path)
#input_path = [image_path+f for f in os.listdir(image_path)]

In [None]:
# get list of annotations, for negset list
#ann_files = [re.sub("negset_","", f) for f in os.listdir("../data/testset/annotations/") if "negset" in f]

In [None]:
#for f in im_files:
#    if f in ann_files:
#        os.rename(image_path+f, image_path+"negset_"+f)