## Data prep: shuffle the images and generate the train, validation and test datasets.



In [1]:
#Imports

import time
import numpy as np
import h5py
import matplotlib.pyplot as plt
import scipy
from PIL import Image
from scipy import ndimage
from __future__ import division


# Seed NumPy's global RNG. This does not seed the standard-library `random`
# module, so anything shuffled with `random.shuffle` is unaffected by this call.
np.random.seed(1) #reproducible randomness 

In [2]:
# Run this to unzip more data into the training folder.

import zipfile

# The context manager closes the archive even if extraction fails part-way,
# so the file handle is never leaked.
with zipfile.ZipFile('Yeast_ML_EXP1/Zip_files/Mfb1KO_175.zip', 'r') as zip_ref:
    zip_ref.extractall('Yeast_ML_EXP1/train/')

In [2]:
import glob
import os
import random
from random import shuffle

shuffle_data = True  # shuffle the addresses before saving
shuffle_seed = 1  # seed for the stdlib `random` module, which `shuffle` draws from
hdf5_path = 'Yeast_ML_EXP1/dataset.hdf5'  # address to where you want to save the hdf5 file
WT_v_Mfb1KO_train_path = 'Yeast_ML_EXP1/train/*.tif'

# read addresses and labels from the 'train' folder
# glob returns files in an arbitrary (filesystem-dependent) order, so sort
# first: otherwise a seeded shuffle still gives a different split per machine.
addrs = sorted(glob.glob(WT_v_Mfb1KO_train_path))
# Label from the file name only, so a 'WT' appearing in a directory name
# cannot mislabel every image.
labels = [0 if 'WT' in os.path.basename(addr) else 1 for addr in addrs]  # 0 = WT, 1 = Mfb1KO

# to shuffle data (skipped for an empty folder, where zip(*c) cannot unpack)
if shuffle_data and addrs:
    random.seed(shuffle_seed)  # makes re-running this cell give the same split
    c = list(zip(addrs, labels))
    shuffle(c)
    addrs, labels = zip(*c)

# Divide the data into 60% train, 20% validation, and 20% test.
# The boundaries are computed once so addresses and labels always stay aligned.
n_total = len(addrs)
train_end = int(0.6 * n_total)
val_end = int(0.8 * n_total)

train_addrs = addrs[:train_end]
train_labels = labels[:train_end]
val_addrs = addrs[train_end:val_end]
val_labels = labels[train_end:val_end]
test_addrs = addrs[val_end:]
test_labels = labels[val_end:]

labels_check = np.array(labels)
print (labels_check.shape) #correct size?
print (np.sum(labels_check)) #correct number of 1 and 0?

(350,)
175


In [3]:
# Creating HDF5 file

import tables

data_order = 'tf'  # 'th' for Theano, 'tf' for Tensorflow
img_dtype = tables.UInt16Atom()  # dtype in which the images will be saved
# The training mean is accumulated as float32; storing it with the image atom
# (uint16) would drop its fractional part, so it gets its own float atom.
mean_dtype = tables.Float32Atom()

# check the order of data and choose the proper data shape to save images
# (the leading 0 is the extendable dimension of the earrays)
if data_order == 'th':
    data_shape = (0, 2, 200, 200)
elif data_order == 'tf':
    data_shape = (0, 200, 200, 2)
else:
    # fail here rather than with a NameError on `data_shape` further down
    raise ValueError("data_order must be 'th' or 'tf', got {!r}".format(data_order))

# open a hdf5 file and create earrays
hdf5_file = tables.open_file(hdf5_path, mode='w')
train_storage = hdf5_file.create_earray(hdf5_file.root, 'train_img', img_dtype, shape=data_shape)
val_storage = hdf5_file.create_earray(hdf5_file.root, 'val_img', img_dtype, shape=data_shape)
test_storage = hdf5_file.create_earray(hdf5_file.root, 'test_img', img_dtype, shape=data_shape)
mean_storage = hdf5_file.create_earray(hdf5_file.root, 'train_mean', mean_dtype, shape=data_shape)

# create the label arrays and copy the labels data in them
hdf5_file.create_array(hdf5_file.root, 'train_labels', train_labels)
hdf5_file.create_array(hdf5_file.root, 'val_labels', val_labels)
hdf5_file.create_array(hdf5_file.root, 'test_labels', test_labels)

/test_labels (Array(70,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'python'
  byteorder := 'little'
  chunkshape := None

In [7]:
#closing the dataset.hdf5 file in case you want to recreate
#hdf5_file.close()
#print (data_order)

th


In [4]:
from skimage import io
from skimage import transform

def _store_images(addrs, storage, label, running_mean=None):
    """Read each image in `addrs` and append it to the earray `storage`.

    addrs        -- sequence of image file paths to read with skimage's `io.imread`
    storage      -- extendable array the images are appended to
    label        -- name of the split, used only in the progress message
    running_mean -- optional float array updated in place with
                    image / len(addrs) for every image, so that it holds the
                    mean image of the split once the loop finishes
    """
    total = len(addrs)
    for i, addr in enumerate(addrs):
        # print how many images are saved every 100 images
        if i % 100 == 0 and i > 1:
            print('{} data: {}/{}'.format(label, i, total))

        img = io.imread(addr)
        # add any image pre-processing (e.g. transform.resize) here

        # For the Tensorflow layout the first axis is moved to the end.
        # NOTE(review): assumes the TIFFs load channel-first, i.e. (2, 200, 200)
        # -- confirm against the files.
        if data_order == 'tf':
            img = np.moveaxis(img, 0, -1)

        # img[None] adds the leading axis along which the earray grows
        storage.append(img[None])

        if running_mean is not None:
            # in-place add, so the caller's array is the one being updated
            running_mean += img / float(total)


# a numpy array to save the mean of the training images
mean = np.zeros(data_shape[1:], np.float32)

try:
    _store_images(train_addrs, train_storage, 'Train', running_mean=mean)
    _store_images(val_addrs, val_storage, 'Validation')
    _store_images(test_addrs, test_storage, 'Test')

    # save the mean
    mean_storage.append(mean[None])
finally:
    # always close the hdf5 file, even when reading or appending an image
    # fails, so the handle is not left open on a half-written file
    hdf5_file.close()

Train data: 100/210
Train data: 200/210


In [75]:
#from skimage import io
#from skimage import transform
#im = io.imread('WT_WP_E1_S0_F1_I2_C1_A0.tif')
#imresize = transform.resize(im,(2,64,64))
#print (im.shape)

#img = np.moveaxis(imresize, 0,-1)
#print (img.shape)
#plt.imshow(img[:,:,0])

#addr = train_addrs[0]
#imga = io.imread(addr)
#imgb = transform.resize(imga, (2,64, 64))

#imgc = np.moveaxis(imga, 0,-1)

#print (imga.shape)
#print (imgc.shape)

#print (imga[0,0,:])
#print (sum(imgc[0,:,1]))

#plt.imshow(imga[1,:,:])
#plt.imshow(imgc[:,:,1])





#train_storage_test = [0,0,0]
#train_storage_test.append(imgc[None])
#print (train_storage_test)

#hdf5_path = 'Yeast_ML_EXP1/dataset.hdf5'
#hdf5_file = tables.open_file(hdf5_path, mode='r')

#data_num = np.array(hdf5_file.root.train_img)
#print (data_num.shape)
#print (data_num[0,0,0,:])
#plt.imshow(data_num[0,:,:,1])



#hdf5_file.close()