In [1]:
import csv
import random
import os
import glob

import pandas as pd
import numpy as np
import math
import cv2
import skimage.external.tifffile as tiff

from common import Statistics, dataset_source
from pathlib import Path

# Dataset generation

Below are two functions which: 

a) select from the original csv a certain percentage of images at random and save them in a second csv<br/>
b) based on an input csv, select image files, compose stacks of images according to selected channels and save them in a separate folder as tiff files.

## Generate train subset

In [2]:
# path = "datasets/Kaggle_HPA_2018/"
path = "G:/Downloads/Data/"

In [17]:
def generate_random_subset_csv(path, infile, outfile, percentage = 10, seed=None):

    perc = math.ceil((int(100 / percentage)))
    if seed: random.seed = seed # sharing outfiles directly might be easier
    rand_subset = []
    print(f"Generating random {perc}% subset of {infile}...")
    
    with open(path + infile, 'r') as d:
        reader = csv.reader(d)
        train_data = np.vstack(list(reader))

    for idx, file in enumerate(train_data[:,0]):
        number = random.randint(1,perc)
        if number == 1:
            rand_subset.append(train_data[idx])
    
    random_subset = np.vstack(rand_subset)
    
    # write to csv in the same directory:
    # some error-catching might be nice if outfile already exists
    df = pd.DataFrame(random_subset)    
    df.to_csv(path + outfile, header = False, index = False)
    print(f"Success: saved random {perc}% subset to {outfile}")
    
    return random_subset

In [18]:
# generate random subset csv file:

random_ten_percent = generate_random_subset_csv(path, 'train.csv', 'test.csv', seed = 1)
# print(random_ten_percent[:10])

Generating random 10% subset of train.csv...
Success: saved random 10% subset to test.csv


In [13]:
# Check for consistency between subset and source

a = list(train_data[:,0])
b = list(random_ten_percent[:,0])

c = set(a) & set(b)
assert len(b) == len(c), "Selected images not subset of original data"
print('Check successful')

NameError: name 'train_data' is not defined

## Generate dataset 

In [5]:
from tqdm import tqdm_notebook as tqdm

In [20]:
def dataset_gen(csv_path, source_path, output_path, channels = ['green']):
    
    with open(csv_path, 'r') as d:
        reader = csv.reader(d)
        fnames = np.vstack(list(reader))[:,0]
    
    assert not os.path.isdir(output_path), 'Chosen output directory already exists!'
    print(f"creating output directory: {os.path.basename(output_path)}")
    os.mkdir(output_path)
    
    for f in tqdm(fnames[:10], total = len(fnames[:10]), unit="files"):
        im_ = [cv2.imread(str(source_path + f + '_' + c + '.png'), cv2.IMREAD_GRAYSCALE) for c in channels]
        im = np.stack(im_)
        # im = np.rollaxis(im,0,3)

        assert im.dtype == 'uint8' # making sure files are expected dtype; may change for different datasets...
        tiff.imsave(output_path + f + '.tiff', im)

In [6]:
### - use generated subset csv to generate folder with tiff files composed of select channels

csv_path = path + 'train_10perc_v1.csv'
source_path = path + 'train_original/'
output_path = path + 'train_4chan_all/'
channels = ['blue','green','yellow','red'] # can also leave this out since default is 'green'



In [22]:
### - use generated subset csv to generate folder with tiff files composed of select channels

csv_path = path + 'train_10perc_v1.csv'
source_path = path + 'train_original/'
output_path = path + 'train_10perc_v1_green_v2/'
channels = ['green'] # can also leave this out since default is 'green'

# run
dataset_gen(csv_path, source_path, output_path, channels = channels)

creating output directory: 


HBox(children=(IntProgress(value=0, max=3127), HTML(value='')))

(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)
(1, 512, 512)



In [7]:
stats_name = "Kaggle2018_train_all.dict"
main_stats = Statistics.one_dataset(output_path, save_name = stats_name)

G:\Downloads\Data\train_10perc_v1_green
working on a dataset with length: 3127


In [8]:
print(len(main_stats[0]))
print(len(main_stats[1]))
print(main_stats)

1
1
(array([0.00021]), array([0.00044]))


In [9]:
# """ImageJ=1.52b
# images=2
# slices=2
# unit=micron
# spacing=0.3
# loop=false
# min=0.0
# max=65535.0"""

# path_yeast = "G:/Downloads/Data/"
# file = path_yeast + "num1_WP_E1_S0_F1_I5_C3_A0.tifstack.tif"
# image = tiff.imread(str(file))
# print(image.shape)

In [8]:
images = []
i = 0

for file in Path(output_path).iterdir():
    while i < 10:
        if ".tif" in str(file):
            image = tiff.imread(str(file))
            #print(image.shape)
            images.append(image)
        i+=1
print(f"working on a dataset with length: {len(images)}")

working on a dataset with length: 10


In [18]:
print(len(images))
print(images[0])

data = []
for im in images:
    for j in im:
        for k in j:
            for l in k:
                data.append(l)
print(len(data))

10
[[[  0   0   0 ...   0   2   1]
  [  0   0   0 ...   0   1   0]
  [  0   0   0 ...   0   3   0]
  ...
  [138 101  16 ...   4  15  25]
  [188 147  59 ...  17  40  18]
  [167 116 218 ...  34  13  14]]]
2621440


In [41]:
print(np.mean(images)/65536)
print(np.std(images)/65536)

0.0004309480427764356
0.0005540388663654671


In [22]:
def mean(numbers):
    return float(sum(numbers)) / max(len(numbers), 1)

def _ss(data):
    """Return sum of square deviations of sequence data."""
    c = mean(data)
    ss = sum((x-c)**2 for x in data)
    return ss

def stddev(data, ddof=0):
    """Calculates the population standard deviation
    by default; specify ddof=1 to compute the sample
    standard deviation."""
    n = len(data)
    if n < 2:
        raise ValueError('variance requires at least two data points')
    ss = _ss(data)
    pvar = ss/(n-ddof)
    return pvar**0.5

In [23]:
print(mean(data)/65536)
print(stddev(data)/65536)

0.0004309480427764356
0.0005540388663842037


# Original code:

In [9]:
yeast = tiff.imread(str('datasets/yeast_v11.1/train/02_WT/WT_WP_E2_Mito0_S2_F1_I1_C1_A0.tifstack.tif'))
yeast.shape

(2, 200, 200)