In [8]:
import csv
import random
import os
import glob

import pandas as pd
import numpy as np
import math
import cv2
import skimage.external.tifffile as tiff

from common import Statistics, dataset_source
from pathlib import Path

from resources.conv_learner import *

from tqdm import tqdm_notebook as tqdm

In [14]:
# Root folder of the dataset. Later cells append file and folder names to it
# by plain string concatenation, so the trailing slash is required.
path = "datasets/HPA_challenge_2018/"
# path = "datasets/Kagg"

# Dataset generation

The two functions below:

a) select a given percentage of the rows (images) of the original csv at random and save them to a second csv<br/>
b) read an input csv, load the image files it lists, stack the selected channels of each image and save the stacks as tiff files in a separate folder.

## Generate train subset

In [5]:
def generate_random_subset_csv(path, infile, outfile, percentage = 10, seed=None):
    """Write a random subset of the rows of a csv file to a second csv file.

    Every row of `infile` is kept independently with probability
    `percentage / 100`, so the subset holds only approximately `percentage`
    percent of the input rows.

    NOTE: all rows take part in the draw, including a header row if the
    input file has one.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated directly with `infile` and
        `outfile`, so it must end with a path separator.
    infile : str
        Name of the source csv below `path`.
    outfile : str
        Name of the csv to write below `path` (written without header or index).
    percentage : int or float
        Approximate share of rows to keep; must satisfy 0 < percentage <= 100.
    seed : int or None
        Seed for the row selection. The same seed and input give the same
        subset; None draws a different subset on every call.

    Returns
    -------
    numpy.ndarray
        2-D array holding the selected rows; shape (0, n_columns) when no row
        was drawn.

    Raises
    ------
    ValueError
        If `percentage` is outside (0, 100].
    """
    if not 0 < percentage <= 100:
        raise ValueError(f"percentage must be in (0, 100], got {percentage}")

    # A private generator instead of `random.seed = seed` (which replaced the
    # function rather than seeding): works for seed=0 too and leaves the
    # global `random` state untouched.
    rng = random.Random(seed)
    print(f"Generating random {percentage}% subset of {infile}...")

    # newline='' is what the csv module expects for files it parses itself
    with open(path + infile, 'r', newline='') as d:
        train_data = np.vstack(list(csv.reader(d)))

    keep_probability = percentage / 100
    rand_subset = [row for row in train_data if rng.random() < keep_probability]

    if rand_subset:
        random_subset = np.vstack(rand_subset)
    else:
        # np.vstack([]) raises; return an empty table with the right width instead
        random_subset = np.empty((0, train_data.shape[1]), dtype=train_data.dtype)

    # write to csv in the same directory:
    # some error-catching might be nice if outfile already exists
    df = pd.DataFrame(random_subset)
    df.to_csv(path + outfile, header = False, index = False)
    print(f"Success: saved random {percentage}% subset to {outfile}")

    return random_subset

In [6]:
# generate random subset csv file:
# draws from train.csv with the default percentage (10) and writes train_10perc_v2.csv into `path`

random_ten_percent = generate_random_subset_csv(path, 'train.csv', 'train_10perc_v2.csv', seed = 1)
# print(random_ten_percent[:10])

Generating random 10% subset of train.csv...
Success: saved random 10% subset to train_10perc_v2.csv


In [None]:
# Check for consistency between subset and source.
# `train_data` only exists inside generate_random_subset_csv, so the source
# ids (first csv column) are re-read from train.csv here.

with open(path + 'train.csv', 'r', newline='') as d:
    a = [row[0] for row in csv.reader(d) if row]
b = list(random_ten_percent[:,0])

c = set(a) & set(b)
# compare as sets so that repeated ids in the subset cannot fail the check
assert set(b) == c, "Selected images not subset of original data"
print('Check successful')

## Generate dataset 

dataset_gen():

1) Generates composite images (if multiple channels have been selected) from the individual blue, green, yellow and red channels of the original images.<br/>
2) The input csv defines the exact images to be selected.<br/>
3) Images are stored in a newly generated folder. 

In [16]:
def dataset_gen(csv_path, source_path, output_path, channels = ['green'], has_header = True):
    """Stack per-channel png images into one multi-channel tiff per image id.

    The image ids are taken from the first column of the csv. For every id
    and channel the file `source_path + id + '_' + channel + '.png'` is read
    as a grayscale image; the channel images of one id are stacked along a
    new first axis (in the order given by `channels`) and saved as
    `output_path + id + '.tiff'`.

    Parameters
    ----------
    csv_path : str
        csv file whose first column holds the image ids.
    source_path : str
        Folder prefix of the png files; concatenated directly, so it must end
        with a path separator.
    output_path : str
        Folder prefix for the tiff files; must not exist yet and must end
        with a path separator. It is created by this function.
    channels : sequence of str
        Channel names used in the png file names. Never modified here.
    has_header : bool
        Skip the first csv row (the previous hard-coded behaviour) when True.

    Raises
    ------
    AssertionError
        If `output_path` already exists or an image stack is not uint8.
    ValueError
        If `channels` is empty.
    FileNotFoundError
        If a channel image is missing or cannot be decoded.
    """
    if len(channels) == 0:
        raise ValueError("channels must name at least one channel")

    with open(csv_path, 'r', newline='') as d:
        rows = list(csv.reader(d))
    if has_header:
        rows = rows[1:]
    fnames = [row[0] for row in rows if row]

    assert not os.path.isdir(output_path), 'Chosen output directory already exists!'
    # normpath drops a trailing separator; basename of 'dir/' would be ''
    print(f"creating output directory: {os.path.basename(os.path.normpath(output_path))}")
    os.mkdir(output_path)

    for f in tqdm(fnames, total = len(fnames), unit="files"):
        im_ = []
        for c in channels:
            png_path = str(source_path + f + '_' + c + '.png') # check '_' or '-'
            plane = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
            if plane is None:
                # cv2.imread reports a missing/unreadable file by returning None
                raise FileNotFoundError(f"Could not read image: {png_path}")
            im_.append(plane)
        im = np.stack(im_)

        assert im.dtype == 'uint8' # making sure files are expected dtype; may change for different datasets...
        tiff.imsave(output_path + f + '.tiff', im, imagej = True) # imageJ = True generates ImageJ compatible Hyperstack

In [17]:
### - use a label csv to generate folder with tiff files composed of select channels
# NOTE: this run reads HPA_labels.csv, not the train_10perc_v2.csv subset generated earlier.

path = "datasets/HPA_challenge_2018/"

csv_path = path + 'HPA_labels.csv'
source_path = path + 'train_raw/'
output_path = path + 'Kaggle_train_GBRY/'
channels = ['green','blue','red','yellow'] # stacking order of the tiff planes; can also leave this out since default is 'green'

dataset_gen(csv_path, source_path, output_path, channels = channels)

creating output directory: 


HBox(children=(IntProgress(value=0, max=31071), HTML(value='')))




In [2]:
# count the entries in the dataset folder
!ls datasets/HPA_challenge_2018/ -1 | wc -l

28


In [14]:
# WARNING: permanently deletes the generated Kaggle_train_GBRY folder.
# dataset_gen asserts that its output directory does not exist, so a previous
# output has to be removed before it can be regenerated.
!rm -rf datasets/HPA_challenge_2018/Kaggle_train_GBRY

In [15]:
# list the contents of the dataset folder
!ls datasets/HPA_challenge_2018/

HPA_labels.csv			       haystack_no_si.csv
HPAv18RBGY_wodpl.csv		       models
HPAv18_60x_def_dupes_removed.csv       needles.csv
HPAv18_BGRY_source		       sample_submission.csv
HPAv18_BGR_all			       test-raw
HPAv18_BGR_source		       test.zip
HPAv18_GBRY_60x_def_dupes_removed_all  test_BGR_all
HPAv18_def_dupes_removed.csv	       test_BGYR_all
HPAv18_dupes_400.csv		       tmp
HPAv18_dupes_5k.csv		       train.csv
HPAv18_dupes_no_si_dist5.csv	       train.zip
HPAv18_dupes_no_si_dist5.pkl	       train_BGR_all
HPAv18_dupes_no_si_phash_10_BGR.pkl    train_BGYR_all
HPAv18_wodpl_60x.csv		       train_raw


In [31]:
# show the array shape of the first listed tiff stack in HPAv18_BGR_all
d = os.listdir('datasets/HPA_challenge_2018/HPAv18_BGR_all')
tiff.imread('datasets/HPA_challenge_2018/HPAv18_BGR_all/' + d[0]).shape

(3, 512, 512)

In [19]:
# WARNING: permanently deletes the HPAv18_BGR_all folder
!rm -rf datasets/HPA_challenge_2018/HPAv18_BGR_all

In [4]:
# create non-header version of input .csv

# csv_path = path + 'HPA_labels.csv'


# with open(csv_path, 'r') as d:
#     reader = csv.reader(d)
#     fnames = np.vstack(list(reader))
    
# no_header = fnames[1:] # remove header line

# no_header_df = pd.DataFrame(no_header)
# no_header_df

# no_header_df.to_csv(path + 'HPA_labels.csv', header = False, index = False)

## Compiling test-set images

In [19]:
def testset_gen(path, test_folder, output_path, channels = ['green']):
    """Stack per-channel png images of a test folder into one tiff per image id.

    The image ids are derived from the file names found in the test folder:
    the directory part and the trailing '_<channel>.<ext>' part are removed,
    duplicates are dropped and the order of first appearance is kept. For
    every id and channel the file
    `path + test_folder + id + '_' + channel + '.png'` is read as a grayscale
    image; the channel images are stacked along a new first axis and saved as
    `output_path + id + '.tiff'`.

    Parameters
    ----------
    path : str
        Dataset root prefix; concatenated directly, so it must end with a
        path separator.
    test_folder : str
        Folder below `path` holding the png files, with trailing separator.
    output_path : str
        Folder prefix for the tiff files; must not exist yet and must end
        with a path separator. It is created by this function.
    channels : sequence of str
        Channel names used in the png file names. Never modified here.

    Raises
    ------
    AssertionError
        If `output_path` already exists or an image stack is not uint8.
    ValueError
        If `channels` is empty.
    FileNotFoundError
        If a channel image is missing or cannot be decoded.
    """
    if len(channels) == 0:
        raise ValueError("channels must name at least one channel")

    # read_dir is not defined in this notebook; presumably it comes from the
    # star import of resources.conv_learner and returns file paths -- confirm.
    listed = read_dir(path, test_folder)
    # Take the basename BEFORE splitting, and split off only the last
    # '_<channel>' part: underscores in folder names or ids must not cut the id.
    fnames = list(dict.fromkeys(os.path.basename(p).rsplit('_', 1)[0] for p in listed))

    assert not os.path.isdir(output_path), 'Chosen output directory already exists!'
    # normpath drops a trailing separator; basename of 'dir/' would be ''
    print(f"creating output directory: {os.path.basename(os.path.normpath(output_path))}")
    os.mkdir(output_path)

    for f in tqdm(fnames, total = len(fnames), unit="files"):
        im_ = []
        for c in channels:
            png_path = str(path + test_folder + f + '_' + c + '.png')
            plane = cv2.imread(png_path, cv2.IMREAD_GRAYSCALE)
            if plane is None:
                # cv2.imread reports a missing/unreadable file by returning None
                raise FileNotFoundError(f"Could not read image: {png_path}")
            im_.append(plane)
        im = np.stack(im_)
        # im = np.rollaxis(im,0,3)

        assert im.dtype == 'uint8' # making sure files are expected dtype; may change for different datasets...
        tiff.imsave(output_path + f + '.tiff', im, imagej = True) # imageJ = True generates ImageJ compatible Hyperstack

In [20]:
### - compose tiff stacks for the images found in the raw test folder (no csv is involved here)

path = path  # NOTE(review): no-op self-assignment; relies on `path` set by an earlier cell
test_folder = 'test-raw/'
output_path = path + 'Kaggle_test_GBRY/'
channels = ['green','blue','red','yellow'] # can also leave this out since default is 'green'

testset_gen(path, test_folder, output_path, channels = channels)

creating output directory: 


HBox(children=(IntProgress(value=0, max=11702), HTML(value='')))




### Generate multi-folder csv:

In [21]:
# list the contents of the dataset folder
!ls datasets/HPA_challenge_2018/

HPA_labels.csv			       Kaggle_train_GBRY
HPAv18RBGY_wodpl.csv		       haystack_no_si.csv
HPAv18_60x_def_dupes_removed.csv       models
HPAv18_BGRY_source		       needles.csv
HPAv18_BGR_all			       sample_submission.csv
HPAv18_BGR_source		       test-raw
HPAv18_GBRY_60x_def_dupes_removed_all  test.zip
HPAv18_def_dupes_removed.csv	       test_BGR_all
HPAv18_dupes_400.csv		       test_BGYR_all
HPAv18_dupes_5k.csv		       tmp
HPAv18_dupes_no_si_dist5.csv	       train.csv
HPAv18_dupes_no_si_dist5.pkl	       train.zip
HPAv18_dupes_no_si_phash_10_BGR.pkl    train_BGR_all
HPAv18_wodpl_60x.csv		       train_BGYR_all
Kaggle_test_GBRY		       train_raw


In [6]:
# disk usage of the three tiff folders
!du -h datasets/HPA_challenge_2018/Kaggle_train_GBRY
!du -h datasets/HPA_challenge_2018/Kaggle_test_GBRY
!du -h datasets/HPA_challenge_2018/HPAv18_GBRY_60x_def_dupes_removed_all

31G	datasets/HPA_challenge_2018/Kaggle_train_GBRY
12G	datasets/HPA_challenge_2018/Kaggle_test_GBRY
66G	datasets/HPA_challenge_2018/HPAv18_GBRY_60x_def_dupes_removed_all


In [19]:
# Report the number of entries in each tiff folder first, then the array
# shape of the first listed file of each folder.
report_folders = ('Kaggle_train_GBRY', 'Kaggle_test_GBRY', 'HPAv18_GBRY_60x_def_dupes_removed_all')

for folder_name in report_folders:
    n_entries = len(os.listdir('datasets/HPA_challenge_2018/' + folder_name))
    print(f"Files in {folder_name}: {n_entries}")

for folder_name in report_folders:
    first_entry = os.listdir('datasets/HPA_challenge_2018/' + folder_name)[0]
    stack_shape = tiff.imread(path + folder_name + '/' + first_entry).shape
    print(f"Image-dims in {folder_name}: {stack_shape}")

Files in Kaggle_train_GBRY: 31071
Files in Kaggle_test_GBRY: 11702
Files in HPAv18_GBRY_60x_def_dupes_removed_all: 66921
Image-dims in Kaggle_train_GBRY: (4, 512, 512)
Image-dims in Kaggle_test_GBRY: (4, 512, 512)
Image-dims in HPAv18_GBRY_60x_def_dupes_removed_all: (4, 512, 512)


In [31]:
# add partial path to Id's in csv's

def multi_folder_csv(df_folderpath_dict):
    """
    Prefix the 'Id' column of several label tables with their folder path and
    merge the tables into one.

    df_folderpath_dict: dictionary of {Partial_folder_path : label_dataframe},
    e.g. with label_dataframes containing columns 'Id' and 'Target'. Every
    dataframe must have an 'Id' column. The dataframes passed in are NOT
    modified.

    res: one unified pandas DataFrame with Partial_folder_path added to 'Id's.
    Rows follow the order of the dictionary and keep their original index
    labels (so index values can repeat across the merged tables).

    Raises ValueError if the dictionary is empty.
    """
    if not df_folderpath_dict:
        raise ValueError("df_folderpath_dict must contain at least one {folder: DataFrame} pair")

    # One vectorised concatenation per table instead of one full-column
    # replace() per id; assign() returns a new frame, so the caller's
    # dataframes stay untouched and re-running cannot prefix ids twice.
    prefixed = [
        df.assign(Id=partial_path + df['Id'].astype(str))
        for partial_path, df in df_folderpath_dict.items()
    ]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    res = pd.concat(prefixed)

    return res

In [50]:
# list the contents of the dataset folder
!ls datasets/HPA_challenge_2018/

HPA_labels.csv			       Kaggle_train_GBRY
HPAv18RBGY_wodpl.csv		       haystack_no_si.csv
HPAv18_60x_def_dupes_removed.csv       models
HPAv18_BGRY_source		       needles.csv
HPAv18_BGR_all			       sample_submission.csv
HPAv18_BGR_source		       test-raw
HPAv18_GBRY_60x_def_dupes_removed_all  test.zip
HPAv18_def_dupes_removed.csv	       test_BGR_all
HPAv18_dupes_400.csv		       test_BGYR_all
HPAv18_dupes_5k.csv		       tmp
HPAv18_dupes_no_si_dist5.csv	       train.csv
HPAv18_dupes_no_si_dist5.pkl	       train.zip
HPAv18_dupes_no_si_phash_10_BGR.pkl    train_BGR_all
HPAv18_wodpl_60x.csv		       train_BGYR_all
Kaggle_test_GBRY		       train_raw


In [49]:
# delete a previously written Kaggle_HPA_labels.csv
!rm datasets/HPA_challenge_2018/Kaggle_HPA_labels.csv

In [52]:
# Re-save the contents of HPA_labels.csv as Kaggle_HPA_labels.csv with the
# explicit column names 'Id' and 'Target'.
# NOTE(review): despite its name, HPAv18_labels_df holds HPA_labels.csv here;
# the same name is bound to a different file in a later cell.
HPAv18_labels_df = pd.read_csv(path + 'HPA_labels.csv')
HPAv18_labels_df = pd.DataFrame(HPAv18_labels_df.values, columns=['Id', 'Target'])
HPAv18_labels_df.to_csv(path + 'Kaggle_HPA_labels.csv', index=False)

In [56]:
path = "datasets/HPA_challenge_2018/"
HPA_labels_df = pd.read_csv(path + 'Kaggle_HPA_labels.csv')
# NOTE(review): the directory listings in this notebook show
# HPAv18_60x_def_dupes_removed.csv but no *_labels.csv -- confirm the file name.
HPAv18_labels_df = pd.read_csv(path + 'HPAv18_60x_def_dupes_removed_labels.csv')

# keys: image folder relative to `path` (with trailing slash), values: the label table to prefix with that folder
df_folderpath_dict = {'Kaggle_train_GBRY/':HPA_labels_df,
                     'HPAv18_GBRY_60x_def_dupes_removed_all/': HPAv18_labels_df}

In [57]:
# merge both label tables into one frame whose Ids carry their folder prefix
mod_df = multi_folder_csv(df_folderpath_dict)

HBox(children=(IntProgress(value=0, max=31071), HTML(value='')))




HBox(children=(IntProgress(value=0, max=66921), HTML(value='')))




In [61]:
# preview the first ten merged rows
mod_df.head(10)

Unnamed: 0,Id,Target
0,Kaggle_train_GBRY/000a6c98-bb9b-11e8-b2b9-ac1f...,7 1 2 0
1,Kaggle_train_GBRY/000a9596-bbc4-11e8-b2bc-ac1f...,5
2,Kaggle_train_GBRY/000c99ba-bba4-11e8-b2b9-ac1f...,1
3,Kaggle_train_GBRY/001838f8-bbca-11e8-b2bc-ac1f...,18
4,Kaggle_train_GBRY/001bcdd2-bbb2-11e8-b2ba-ac1f...,0
5,Kaggle_train_GBRY/0020af02-bbba-11e8-b2ba-ac1f...,25 2
6,Kaggle_train_GBRY/002679c2-bbb6-11e8-b2ba-ac1f...,0
7,Kaggle_train_GBRY/00285ce4-bba0-11e8-b2b9-ac1f...,2 0
8,Kaggle_train_GBRY/002daad6-bbc9-11e8-b2bc-ac1f...,7
9,Kaggle_train_GBRY/002ff91e-bbb8-11e8-b2ba-ac1f...,23


In [60]:
# write the merged label table into the dataset folder (row index omitted)
mod_df.to_csv(path + 'Kaggle_AND_HPAv18_60x_NoDefDupes_labels.csv', index=False)

# Resources WIP/Code storage:

In [9]:
# load one tiff stack from the yeast dataset and show its array shape
yeast = tiff.imread(str('datasets/yeast_v11.1/train/02_WT/WT_WP_E2_Mito0_S2_F1_I1_C1_A0.tifstack.tif'))
yeast.shape

(2, 200, 200)

In [9]:
# """ImageJ=1.52b
# images=2
# slices=2
# unit=micron
# spacing=0.3
# loop=false
# min=0.0
# max=65535.0"""