In [1]:
import csv
import random
import os
import glob

import pandas as pd
import numpy as np
import math
import cv2
import skimage.external.tifffile as tiff

# Dataset generation

Below are two functions that:

a) randomly select a given percentage of the rows (images) of the original csv and save them to a second csv<br/>
b) read an input csv, load the image files it lists, stack the selected channels of each image and save the stacks as tiff files in a separate folder.

## Generate train subset

In [2]:
# Root folder of the dataset. Other cells build file names with `path + name`
# (plain string concatenation), so the value has to end with "/".
path = "datasets/Kaggle_HPA_2018/"
# NOTE(review): `path_up` is used by `os.chdir(path_up)` in the "Original code"
# section; while this assignment stays commented out that call raises NameError.
# path_up = "G:/Downloads/Data"

In [3]:
def generate_random_subset_csv(path, infile, outfile, percentage = 10, seed=None):
    """Write a random subset of the rows of a csv file to a second csv file.

    Every row of ``path + infile`` is kept independently with probability
    ``percentage / 100``, so the subset holds *approximately* ``percentage``
    percent of the input rows, in their original order.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to ``infile`` and ``outfile`` (plain string
        concatenation, so a directory must end with a path separator).
    infile : str
        Name of the source csv. Every row is treated as data; a header row,
        if the file has one, can be drawn like any other row.
    outfile : str
        Name of the csv to write (no header, no index column).
    percentage : float, optional
        Approximate share of rows to keep, ``0 < percentage <= 100``.
    seed : int, optional
        Seed of the random draw. The same seed and input give the same
        subset; ``None`` seeds from system entropy.

    Returns
    -------
    numpy.ndarray
        2-D array with the selected rows (zero rows if nothing was drawn).

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``(0, 100]`` or ``infile`` has no rows.
    """
    if not 0 < percentage <= 100:
        raise ValueError(f"percentage must be in (0, 100], got {percentage}")

    # A private generator makes the draw reproducible without touching (or
    # relying on) the state of the global `random` module. `seed=0` is a
    # valid seed, hence no truthiness test on `seed`.
    rng = random.Random(seed)
    keep_probability = percentage / 100
    print(f"Generating random {percentage}% subset of {infile}...")

    with open(path + infile, 'r', newline='') as d:
        rows = list(csv.reader(d))
    if not rows:
        raise ValueError(f"{infile} contains no rows")
    train_data = np.vstack(rows)

    # One draw per row. Boolean-mask indexing keeps the array 2-D even when
    # no row is selected, where stacking an empty list would raise.
    keep = np.array(
        [rng.random() < keep_probability for _ in range(len(train_data))],
        dtype=bool,
    )
    random_subset = train_data[keep]

    # write to csv in the same directory:
    # some error-catching might be nice if outfile already exists
    df = pd.DataFrame(random_subset)
    df.to_csv(path + outfile, header = False, index = False)
    print(f"Success: saved random {percentage}% subset to {outfile}")

    return random_subset

In [None]:
# Draw the random subset and write it as a csv next to the source file.

random_ten_percent = generate_random_subset_csv(
    path,
    infile='train_all.csv',
    outfile='test.csv',
    seed=1,
)
# print(random_ten_percent[:10])

In [None]:
# Check for consistency between subset and source

# The array that generate_random_subset_csv builds from the source csv is
# local to that function, so the source ids are read from the file again here
# instead of relying on a leftover global.
with open(path + 'train_all.csv', 'r', newline='') as d:
    source_ids = [row[0] for row in csv.reader(d) if row]
subset_ids = list(random_ten_percent[:,0])

# Every selected id has to occur in the source; comparing against the length
# of the subset list also fails when the subset contains duplicate ids.
common_ids = set(source_ids) & set(subset_ids)
assert len(subset_ids) == len(common_ids), "Selected images not subset of original data"
print('Check successful')

## Generate dataset 

In [11]:
from tqdm import tqdm_notebook as tqdm

In [15]:
def dataset_gen(csv_path, source_path, output_path, channels = ['green']):
    """Stack the selected channel images of every listed sample into one TIFF.

    For each identifier in the first column of ``csv_path`` the files
    ``source_path + <id> + '_' + <channel> + '.png'`` are read as grayscale
    images, stacked along a new last axis in the order given by ``channels``
    and saved as ``output_path + <id> + '.tiff'``.

    Parameters
    ----------
    csv_path : str
        csv file whose first column holds the sample identifiers. Every row
        is used, so the file is expected to have no header row.
    source_path : str
        Prefix of the channel images (string concatenation, so a directory
        must end with a path separator).
    output_path : str
        Directory to create for the TIFF files; must not exist yet and, as it
        is used as a prefix, must end with a path separator.
    channels : list of str, optional
        Channel names that form the ``_<channel>.png`` file suffixes.

    Raises
    ------
    AssertionError
        If ``output_path`` already exists or a stacked image is not uint8.
    FileNotFoundError
        If a channel image is missing or cannot be decoded.
    """
    with open(csv_path, 'r', newline='') as d:
        reader = csv.reader(d)
        fnames = np.vstack(list(reader))[:,0]

    assert not os.path.isdir(output_path), 'Chosen output directory already exists!'
    # normpath drops a trailing separator; basename of "a/b/" would otherwise
    # be the empty string and the message would not name the directory.
    print(f"creating output directory: {os.path.basename(os.path.normpath(output_path))}")
    os.mkdir(output_path)

    for f in tqdm(fnames, total = len(fnames), unit="files"):
        layers = []
        for c in channels:
            png_path = str(source_path + f + '_' + c + '.png')
            layer = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
            if layer is None:
                # cv2.imread reports an unreadable file by returning None
                # rather than raising, which would only surface later as an
                # obscure stacking error.
                raise FileNotFoundError(f"could not read image: {png_path}")
            layers.append(layer)

        # channels-last layout, same result as stacking first and rolling axis 0 to the end
        im = np.stack(layers, axis=-1)

        assert im.dtype == 'uint8' # making sure files are expected dtype; may change for different datasets...
        tiff.imsave(output_path + f + '.tiff', im)

In [16]:
### - build a folder of tiff files (selected channels only) from the generated subset csv

channels = ['green'] # optional here: 'green' is already the default of dataset_gen
output_path = path + 'train_10perc_v1_green/'
source_path = path + 'train_original/'
csv_path = path + 'train_10perc_v1.csv'

# run
dataset_gen(
    csv_path=csv_path,
    source_path=source_path,
    output_path=output_path,
    channels=channels,
)

creating output directory: 


HBox(children=(IntProgress(value=0, max=3042), HTML(value='')))




# Original code:

In [2]:
# Same dataset root as in the configuration cell near the top of the notebook;
# the cells below prepend it verbatim to folder and file names.
path = "datasets/Kaggle_HPA_2018/"
# NOTE(review): a later cell calls os.chdir(path_up); with this assignment
# commented out that name is undefined in the visible cells.
# path_up = "G:/Downloads/Data"

In [None]:
chans = ['blue']

# Shared prefix of every file that belongs to the sample image used here.
sample_stem = path +'testing/' + 'fffdf7e0-bbc4-11e8-b2bc-ac1f6b6435d0'

# One grayscale array per requested channel.
im_ = [
    cv2.imread(sample_stem + '_' + c + '.png', cv2.IMREAD_GRAYSCALE)
    for c in chans
]

# Stack the channels, then move the stacking axis to the end.
im = np.stack(im_)
print(im.shape)
im_r = np.rollaxis(im,0,3)
print(im_r.shape)
print(im_r.dtype)
assert im_r.dtype == 'uint8'

# Write the stack as a tiff next to the source image and read it back
# to compare the shape that comes out with the one that went in.
tiff.imsave(sample_stem + '.tiff', im_r)
im_tf = tiff.imread(sample_stem + '.tiff')
print(im_tf.shape)


In [None]:
# Shape of the per-channel arrays in `im_` after stacking; np.stack inserts
# the new axis in front by default.
np.stack(im_).shape

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the seeded
# draw below select the same files on every run and machine.
x = sorted(os.listdir(path))

y = []
random_ten_percent = []

i=0

# keep only the files whose name contains "green"
for file in x:
    if "green" in file:
        y.append(file)


seed = 1
# `random.seed = seed` would replace the module's seeding function with an
# int and leave the generator unseeded. A private seeded generator gives a
# reproducible draw and still works if `random.seed` was overwritten before.
rng = random.Random(seed)

# keep each file with a chance of 1 in 10
for file in y:
    number = rng.randint(1,10)
    if number == 1:
        random_ten_percent.append(file)

# print(os.getcwd())

# NOTE(review): `path_up` only appears as a commented-out assignment in the
# visible cells, so this raises NameError unless it is defined elsewhere.
# Changing the working directory also changes what the relative `path`
# resolves to if the cell is run again - confirm this is intended.
os.chdir(path_up)

with open('train.csv', 'r') as d:
    reader = csv.reader(d)
    train_data = list(reader)
     



In [None]:
# Peek at the first column of the second csv row.
test = train_data[1][0]
print(test)

j = 0

# For every selected file name, collect [file name, second csv column] for
# each csv row whose first column occurs inside that file name.
output = []
for file in y:
    for point in train_data:
        if point[0] not in file:
            continue
        output.append([file, point[1]])
            


In [None]:
# Write the pairs collected in `output` as comma-separated lines, one pair per line.
with open('dataset.csv', 'w') as f:
    for line in output:
        f.writelines((line[0], ",", line[1], '\n'))