# Create dataset from .zip files and calculate normalization statistics

Step_1: create a standard dataset that can be re-used, stored either as train/test folders or as an HDF5 object.

Step_2: calculate the mean and standard deviation (std) of the train and test images.

## Creating shuffled list of train, test and potentially validation images

In [4]:
def ShuffleZip(Dataset_name, input_path, output_path, val = False):

    """
    Unzip a set of images into a temp folder located next to the input .zip file.

    Arguments:

    Dataset_name: name of the dataset, e.g. WT_175 - should be descriptive
                  (without '.zip'); used to name the temp folder.
    input_path: path to input .zip file
    output_path: path for shuffled .zip-file to be stored
                 (currently unused - re-zipping is not done in this function, TODO)
    val: whether a validation dataset should be created
         (currently unused, TODO)

    Returns:

    Path of the temp folder ('TEMP-' + Dataset_name, in the same directory as
    input_path) that holds the extracted files.

    Raises:

    ValueError: if the temp folder already exists.

    -> shuffling/splitting between train, test and optionally val datasets,
       re-zipping or storage in an hdf5 object, and deletion of the temp folder
       are NOT done by this function (TODO).

    """

    import os
    import zipfile

    # os.path.join (instead of string concatenation) keeps the temp folder relative
    # when input_path has no directory part; '' + '/TEMP-...' would point at the
    # file system root.
    temp_path = os.path.join(os.path.dirname(input_path), 'TEMP-' + Dataset_name)

    if os.path.exists(temp_path):
        raise ValueError('temp folder already exists in directory; consider deleting and re-run')

    # Open the archive before creating the folder, so that a wrong input_path does
    # not leave an empty temp folder behind that blocks the next run.
    with zipfile.ZipFile(input_path, 'r') as zip_ref:
        os.makedirs(temp_path)
        zip_ref.extractall(temp_path)

    return temp_path

In [5]:
import os
import random
import glob

random.seed(1) #reproducible randomness

### OPTIONS ###
shuffle_data = True  # shuffle the addresses before saving
val = False # TODO: optional creation of validation dataset
n_test = 35 # absolute number of images put into the test dataset




### Execution --------------------------------------------------------------------------------###

# NOTE(review): TempPath (folder holding the unzipped images) is not defined in this
# cell - it has to exist in the notebook namespace before running.

# get list of files in TempPath
# os.listdir returns the entries in arbitrary order; sorting first makes the seeded
# shuffle below give the same split on every machine / file system.
addrs = sorted(os.listdir(TempPath))

# create shuffled list
if shuffle_data:
    addrs = random.sample(addrs, k = len(addrs)) #creates shuffled list by random sampling from original list.


# Open question:
# Generating train, test and optionally val datasets - should there be the same
# absolute number of test/val images for each class, or should the number vary
# depending on the total number of images per class, e.g. 20 test images for a
# total of 100 class A images, but 40 for a total of 200 class B images?

# Alternative (disabled): proportional split - 80% train / 20% test, or
# 60% train / 20% val / 20% test when a validation dataset is wanted.
# train_addrs = addrs[0:int(0.8*len(addrs))]
# test_addrs = addrs[int(0.8*len(addrs)):]
# # val_addrs = addrs[int(0.6*len(addrs)):int(0.8*len(addrs))]

# Without this guard a too-small dataset would silently give an empty train list.
if len(addrs) <= n_test:
    raise ValueError('not enough images to split: found ' + str(len(addrs))
                     + ', need more than ' + str(n_test))

# Select n_test images for the test dataset; put the rest into train
test_addrs = addrs[:n_test]
train_addrs = addrs[n_test:]

print(len(train_addrs))
print(len(test_addrs))

195
35


## Creating .zip files from shuffled data

In [7]:
# Creating .zip file of train, test and potentially validation images.
#
# NOTE(review): TempPath, Dataset_name, train_addrs and test_addrs are not defined in
# this cell - they have to exist in the notebook namespace before running.

import os
from zipfile import ZipFile
from os.path import basename #required to use in zipfile.Zipfile.write(file, basename(file)) to avoid completed path to be archived
import shutil 

verbose = 0

save_path = 'datasets/Exp1_data_storage/shuffled_zips/'

if not os.path.exists(save_path):
    os.makedirs(save_path)

# One entry per archive: (suffix used in the .zip name, label used in the message, file names)
splits = [
    ('train', 'training', train_addrs),
    ('test', 'test', test_addrs),
]

for split_name, label, split_addrs in splits:

    if verbose:
        print('Following files will be zipped:')
        for fname in split_addrs:
            print(fname)

    # writing files to a zipfile; the names deliberately avoid re-using 'addrs'
    # (the shuffled file list) and 'zip' (the builtin)
    with ZipFile(save_path + Dataset_name + '_' + split_name + '_data.zip', 'w') as archive:
        # writing each file one by one, stored under its bare file name
        for fname in split_addrs:
            archive.write(os.path.join(TempPath, fname), basename(fname))

        print('All ' + label + ' images zipped successfully!')

# No zip_ref.close() here: the archives above are closed by their 'with' blocks and
# 'zip_ref' is not a name created by this cell.

print('Files moved to:' + save_path)

All training images zipped successfully!
All test images zipped successfully!
Files moved to:datasets/Exp1_data_storage/shuffled_zips/


In [9]:
# To be used with shuffled data in zip files. 
# Extracts these to specified dataset folder in train/test subfolders

import glob
import os
import zipfile

### OPTIONS ###
# define path for files to be unzipped and stored in train and test directories
dataset_path = 'datasets/yeast_v3/'

# select cell designation, e.g. WT or mfb1KO - important for, well, naming... 
Dataset_name = 'mmm1KO_230' #don't add .zip here

# choose path where target zip-files are stored
ZPath = 'datasets/Exp1_data_storage/shuffled_zips'

# optionally add 'val' keyword if datasets (zip files) have been created accordingly
data_struct = ['train', 'test']



### Execution --------------------------------------------------------------------------------###

# unzips files into the correct folders, creating them where necessary

for split in data_struct:

    split_dir = os.path.join(dataset_path, split)
    target_dir = os.path.join(split_dir, Dataset_name)

    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
        print(split + ' created')

    # Checked independently of the split folder, so the warning is also shown when
    # the split folder already existed and images are added to an existing dataset folder.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print(split + '/' + Dataset_name + ' created')
    else:
        print('WARNING: *added* images to existing folder:' + split + '/' + Dataset_name)

    archive_path = os.path.join(ZPath, Dataset_name + '_' + split + '_data.zip')
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

train created
train/mmm1KO_230 created
test created
test/mmm1KO_230 created


## Creating HDF5 files from shuffled addrs lists

In [None]:
# __future__ imports have to be the first statement, otherwise Python raises a SyntaxError
from __future__ import division

import time
import numpy as np
# import h5py
import matplotlib.pyplot as plt
import scipy
from PIL import Image
from scipy import ndimage
#from YNet_scripts import * ## importing scripts - currently backprop is defined outside this notebook.
import tables

# %matplotlib inline
# plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
# plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'

# %load_ext autoreload
# %autoreload 2

np.random.seed(1) #reproducible randomness

In [None]:
# Creating HDF5 file - can create formats for either tf or th frameworks. 
#
# NOTE(review): hdf5_path, train_labels, val_labels and test_labels are not defined in
# this cell - they have to exist in the notebook namespace before running.

data_order = 'tf'  # 'th' for Theano, 'tf' for Tensorflow
img_dtype = tables.UInt16Atom()  # dtype in which the images will be saved
# The mean image is accumulated as float32; storing it with the integer image dtype
# would drop the fractional part.
mean_dtype = tables.Float32Atom()

# check the order of data and chose proper data shape to save images
# (first dimension 0 = extendable dimension of the earrays)
if data_order == 'th':
    data_shape = (0, 2, 200, 200)
elif data_order == 'tf':
    data_shape = (0, 200, 200, 2)
else:
    # fail here with a clear message instead of a NameError on data_shape further down
    raise ValueError("data_order must be 'th' or 'tf', got " + repr(data_order))
    
# open a hdf5 file and create earrays
hdf5_file = tables.open_file(hdf5_path, mode='w')
train_storage = hdf5_file.create_earray(hdf5_file.root, 'train_img', img_dtype, shape=data_shape)
val_storage = hdf5_file.create_earray(hdf5_file.root, 'val_img', img_dtype, shape=data_shape)
test_storage = hdf5_file.create_earray(hdf5_file.root, 'test_img', img_dtype, shape=data_shape)
mean_storage = hdf5_file.create_earray(hdf5_file.root, 'train_mean', mean_dtype, shape=data_shape)

# create the label arrays and copy the labels data in them
hdf5_file.create_array(hdf5_file.root, 'train_labels', train_labels)
hdf5_file.create_array(hdf5_file.root, 'val_labels', val_labels)
hdf5_file.create_array(hdf5_file.root, 'test_labels', test_labels)

In [None]:
#closing the dataset.hdf5 file in case you want to recreate
#hdf5_file.close()
#print (data_order)


In [None]:
from skimage import io
from skimage import transform

# NOTE(review): train_addrs, val_addrs and test_addrs must hold paths that io.imread can
# open, and data_order, data_shape, hdf5_file and the *_storage earrays must already
# exist in the notebook namespace - confirm before running.


def _iter_images(addrs, label):
    """Yield the image of every address in addrs, printing progress every 100 images.

    addrs: list of image addresses passed to io.imread
    label: prefix for the progress message, e.g. 'Train'
    """
    for i, addr in enumerate(addrs):

        # print how many images are saved every 100 images
        if i % 100 == 0 and i > 1:
            print('{} data: {}/{}'.format(label, i, len(addrs)))

        img = io.imread(addr)
        #img = transform.resize(img, (2,64,64)) #NOTE: currently resizing images is done in Section_2

        # Any first-line image pre-processing could be done here

        # for 'tf' the first axis is moved to the last position
        # NOTE(review): check the reorder condition - it depends on how the image
        # files are laid out, not on tensorflow or theano as such.
        if data_order == 'tf':
            img = np.moveaxis(img, 0, -1)

        yield img


# a numpy array to save the mean of the images
mean = np.zeros(data_shape[1:], np.float32)

# Divide by the number of images that are actually summed up (the train addresses),
# so the mean stays correct even if the label list has a different length.
n_train = float(len(train_addrs))

try:
    # train images: save each image and add its share to the mean
    for img in _iter_images(train_addrs, 'Train'):
        train_storage.append(img[None])
        mean += img / n_train

    # validation images
    for img in _iter_images(val_addrs, 'Validation'):
        val_storage.append(img[None])

    # test images
    for img in _iter_images(test_addrs, 'Test'):
        test_storage.append(img[None])

    # save the mean
    mean_storage.append(mean[None])
finally:
    # close the hdf5 file even if reading an image failed, so it is not left open
    hdf5_file.close()

## Loading and processing images from HDF5 files

In [None]:
def load_data_yeast(hdf5_path='Yeast_ML_EXP1/dataset.hdf5'):
    """
    Load the train and test images and labels from the yeast HDF5 dataset.

    Arguments:

    hdf5_path: path to the HDF5 file; it must contain the nodes train_img,
               train_labels, test_img and test_labels.

    Returns:

    train_set_x_orig: array of the train images
    train_set_y_orig: train labels, reshaped to (1, number of train labels)
    test_set_x_orig: array of the test images
    test_set_y_orig: test labels, reshaped to (1, number of test labels)
    classes: array of class names (hardcoded for now)
    """

    # 'with' closes the file again once everything is copied into memory;
    # np.array() makes in-memory copies, so the arrays stay valid afterwards.
    with tables.open_file(hdf5_path, "r") as yeast_dataset:
        root = yeast_dataset.root

        train_set_x_orig = np.array(root.train_img) # our train set features
        train_set_y_orig = np.array(root.train_labels) # our train set labels

        test_set_x_orig = np.array(root.test_img) # our test set features
        test_set_y_orig = np.array(root.test_labels) # our test set labels

    #classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    classes = np.array(['WT','Mfb1KO']) #hardcoded for now

    # labels as row vectors: (1, number of labels)
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [None]:
# Load the train/test image arrays, the label row vectors (shape (1, n)) and the
# class names returned by load_data_yeast()
train_x_orig, train_y, test_x_orig, test_y, classes = load_data_yeast()

In [None]:
# Sanity check: show the shapes of the loaded train and test image arrays
# ('test' alone is not a defined name; the test array is test_x_orig)
print(train_x_orig.shape)
print(test_x_orig.shape)