In [13]:
import time
import numpy as np
import os
import h5py
import sys
from skimage import segmentation, measure
from scipy.ndimage import zoom

ORIGINAL_DATASET_PATH = 'data/cells/raw/'
NEW_DATASET_PATH = 'data/cells/lq_nobatch2/'

def multi_instance_semseg(target):
    """Turn a labelled volume into a semantic mask with instance boundaries removed.

    Voxels flagged by ``segmentation.find_boundaries`` (``connectivity=3``,
    ``mode='outer'``) are set to 0; every other voxel is mapped through
    ``ceil(label / 1000)`` and stored as int8.
    NOTE(review): presumably label ids are in 1..1000 so the mask is 0/1 -- confirm.

    The input array is left untouched: masking is done on a copy rather than
    with an in-place multiply, so callers can keep using ``target`` afterwards
    and unsigned label dtypes no longer trip an in-place casting error.

    Args:
        target: array of instance labels accepted by ``find_boundaries``.

    Returns:
        A tuple ``(semseg, num_instances, boundaries)`` where ``semseg`` is the
        int8 mask described above, ``num_instances`` is the component count
        reported by ``measure.label(semseg, return_num=True)`` and
        ``boundaries`` is an integer 0/1 array marking the boundary voxels.
    """
    boundary_mask = segmentation.find_boundaries(target, connectivity=3, mode='outer')
    # Zero out boundary voxels on a fresh array instead of `target *= ...`.
    interior = np.where(boundary_mask, 0, target)
    semseg = np.ceil(interior / 1000).astype('i1')
    num_instances = measure.label(semseg, return_num=True)[1]
    # `* 1` converts the boolean mask to the integer 0/1 array callers store.
    return semseg, num_instances, boundary_mask * 1


def get_highest_xyz_after_reduction(filenames, dims_reduction=(1,1,1)):
    """Return the largest scaled extent, per axis, over a set of raw files.

    For every file the shape of its ``imageSequenceInterpolated`` dataset is
    read as ``(z, x, y)``; each extent is multiplied by its reduction factor
    and increased by 2 before being compared against the running maximum.

    NOTE(review): x, y, z are scaled by ``dims_reduction[0]``, ``[1]``, ``[2]``
    respectively, whereas ``generate_preprocessed_files`` passes the same tuple
    to ``zoom`` on the ``(z, x, y)`` array (factor 0 -> z). The two only agree
    for uniform factors -- confirm which order is intended.

    Args:
        filenames: names of files located under ``ORIGINAL_DATASET_PATH``.
        dims_reduction: three scale factors, indexed as described above.

    Returns:
        ``[max_x, max_y, max_z]`` as ints (truncated); ``[-1, -1, -1]`` when
        ``filenames`` is empty.
    """
    highest_xyz = [-1, -1, -1]
    for filename in filenames:
        with h5py.File(ORIGINAL_DATASET_PATH + filename, 'r') as fr:
            z, x, y = fr.get('imageSequenceInterpolated').shape
        scaled_xyz = (
            (x * dims_reduction[0]) + 2,
            (y * dims_reduction[1]) + 2,
            (z * dims_reduction[2]) + 2,
        )
        # int() truncates like the original per-axis update did.
        highest_xyz = [max(best, int(extent)) for best, extent in zip(highest_xyz, scaled_xyz)]
    return highest_xyz


def print_sizes(filenames):
    """Print the shape of the ``imageSequenceInterpolated`` dataset of each file.

    Args:
        filenames: names of files located under ``ORIGINAL_DATASET_PATH``.
    """
    for filename in filenames:
        with h5py.File(ORIGINAL_DATASET_PATH + filename, 'r') as fr:
            print(fr.get('imageSequenceInterpolated').shape)

def store_correct_cell_count(filenames):
    """Count the instances in each file's label volume and store it as an attribute.

    For every file under ``NEW_DATASET_PATH`` the ``labelledImage3D`` dataset is
    loaded, its first axis is moved to the end, and the instance count returned
    by ``multi_instance_semseg`` is written to the file attribute
    ``correct_cell_count`` (as a one-element list) and printed.

    Args:
        filenames: names of files located under ``NEW_DATASET_PATH``; each is
            opened in append mode.
    """
    for filename in filenames:
        with h5py.File(NEW_DATASET_PATH + filename, 'a') as f:
            # int16, as in generate_preprocessed_files: int8 would wrap label
            # ids above 127 and silently drop those cells from the count.
            labels = np.array(f.get('labelledImage3D'), dtype='i2')
            labels = np.moveaxis(labels, 0, -1)
            correct_num_instances = multi_instance_semseg(labels)[1]
            f.attrs.create('correct_cell_count', [correct_num_instances])
            print("Células original: {}.".format(correct_num_instances))

def _pad_to_shape(volume, padded_shape, dtype):
    """Copy a 3-D ``volume`` into the low corner of a zero array of ``padded_shape``."""
    padded = np.zeros(padded_shape, dtype=dtype)
    z, x, y = volume.shape
    padded[:z, :x, :y] = volume
    return padded


def generate_preprocessed_files(filenames, compression_level, dims_reduction=(1,1,1), delete=False, batch=False):
    """Rescale raw volumes and write them, with derived targets, to new HDF5 files.

    For each raw file under ``ORIGINAL_DATASET_PATH`` an output file named
    ``c<compression_level><dims_reduction>-<filename>`` is opened in append mode
    under ``NEW_DATASET_PATH``. Work already present in the output is skipped:

    * if dataset ``image`` is missing: ``imageSequenceInterpolated`` is read as
      float32, rescaled with ``zoom`` and stored as ``image``;
    * if dataset ``target`` is missing: ``labelledImage3D`` is read as int16,
      rescaled with ``zoom`` and stored as ``multi_instance_target`` and
      ``not_segmented_target`` (``ceil(label / 1000)``); the output of
      ``multi_instance_semseg`` is stored as ``target`` and ``boundaries``, and
      the attributes ``correct_cell_count`` (copied from the raw file, which
      must already carry it) and ``cell_count`` are written.

    NOTE(review): the label volume is rescaled with ``zoom``'s default
    interpolation; a spline interpolation can blend neighbouring label ids.
    Consider ``order=0`` for labels -- left unchanged here because it would
    alter the generated datasets.

    Args:
        filenames: names of raw files located under ``ORIGINAL_DATASET_PATH``.
        compression_level: gzip level passed as ``compression_opts``.
        dims_reduction: zoom factor(s) handed to ``zoom``.
        delete: when true, remove each raw file after it has been processed.
        batch: when true, zero-pad ``image``, ``target`` and ``boundaries`` to a
            common shape taken from ``get_highest_xyz_after_reduction``. The
            ``multi_instance_target`` / ``not_segmented_target`` datasets are
            stored unpadded, as before.
    """
    filenames = list(filenames)  # iterated twice when batch=True
    padded_shape = None
    if batch:
        # The helper returns [x, y, z]; the volumes are laid out (z, x, y).
        # Use the `dims_reduction` argument, not a notebook-level global.
        max_x, max_y, max_z = get_highest_xyz_after_reduction(filenames, dims_reduction)
        padded_shape = (max_z, max_x, max_y)

    for filename in filenames:
        with h5py.File(ORIGINAL_DATASET_PATH + filename, 'r') as fr:
            newfile_name = "c{}{}-{}".format(compression_level, dims_reduction, filename)
            os.makedirs(NEW_DATASET_PATH, exist_ok=True)
            with h5py.File("{}{}".format(NEW_DATASET_PATH, newfile_name), 'a') as fw:
                if 'image' not in fw.keys():
                    start = time.time()
                    image = np.array(fr.get('imageSequenceInterpolated'), dtype='f4')
                    image = zoom(image, dims_reduction)
                    if batch:
                        image = _pad_to_shape(image, padded_shape, 'f4')
                    print("Dimensiones image: {}".format(image.shape))
                    fw.create_dataset('image', shape=image.shape, dtype='f4', data=image, compression="gzip", compression_opts=compression_level)
                    print("Image {} {:.0f}s".format(newfile_name, time.time() - start))
                if 'target' not in fw.keys():
                    start = time.time()
                    correct_num_instances = fr.attrs['correct_cell_count']
                    target = np.array(fr.get('labelledImage3D'), dtype='i2')
                    target = zoom(target, dims_reduction)
                    # These two datasets keep the unpadded, rescaled labels.
                    fw.create_dataset('multi_instance_target', shape=target.shape, dtype='i2', data=target, compression="gzip", compression_opts=compression_level)
                    fw.create_dataset('not_segmented_target', shape=target.shape, dtype='i2', data=np.ceil(target/1000).astype('i2'), compression="gzip", compression_opts=compression_level)
                    if batch:
                        target = _pad_to_shape(target, padded_shape, 'i2')
                    target, num_instances, boundaries = multi_instance_semseg(target)
                    print("Dimensiones target: {}".format(target.shape))
                    print("Células original: {}. Células ahora: {}".format(correct_num_instances, num_instances))
                    fw.create_dataset('boundaries', shape=boundaries.shape, dtype='i2', data=boundaries, compression="gzip", compression_opts=compression_level)
                    fw.create_dataset('target', shape=target.shape, dtype='i2', data=target, compression="gzip", compression_opts=compression_level)
                    fw.attrs.create('correct_cell_count', correct_num_instances)
                    fw.attrs.create('cell_count', [num_instances])
                    print("Target {} {:.0f}s".format(newfile_name, time.time() - start))
        print()
        if delete:
            os.remove(ORIGINAL_DATASET_PATH + filename)

In [15]:
# Files directly inside ORIGINAL_DATASET_PATH; names starting with 'p' are skipped.
# next() takes only the top-level os.walk() entry instead of walking the whole tree.
raw_files = next(os.walk(ORIGINAL_DATASET_PATH))[2]
filenames = [name for name in raw_files if not name.startswith('p')]
dim_reduction = (0.5,0.5,0.5)
generate_preprocessed_files(filenames, 4, dims_reduction=dim_reduction, batch=False)

Dimensiones image: (116, 512, 512)
Image c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_1a.mat 117s
Dimensiones target: (116, 512, 512)
Células original: [72]. Células ahora: 72
Target c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_1a.mat 55s

Dimensiones image: (132, 512, 512)
Image c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_3a.mat 89s
Dimensiones target: (132, 514, 512)
Células original: [83]. Células ahora: 84
Target c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_3a.mat 53s

Dimensiones image: (122, 512, 512)
Image c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_4a.mat 66s
Dimensiones target: (122, 512, 512)
Células original: [74]. Células ahora: 74
Target c4(0.5, 0.5, 0.5)-Wildtype_2017-12-04_4a.mat 60s

Dimensiones image: (126, 512, 512)
Image c4(0.5, 0.5, 0.5)-Wildtype_2018-04-10_1a.mat 87s
Dimensiones target: (126, 514, 512)
Células original: [67]. Células ahora: 67
Target c4(0.5, 0.5, 0.5)-Wildtype_2018-04-10_1a.mat 51s

Dimensiones image: (108, 512, 512)
Image c4(0.5, 0.5, 0.5)-Wildtype_2018-04-10_2a.mat 57s
Dimension