# Create Dataset from .zip files and calc normalization statistics

Step_1: create standard dataset to be re-used, either by creating folders or as hdf5 object. 

Step_2: calculate mean + std for train and test images

In [8]:
import os

print(os.getcwd())
# trying to unzip the 
# Unzips, shuffles, re-zips and stores file at specified output_path

Dataset_name = 'mmr1KO_240'
input_path = f'../datasets/original_zips/{Dataset_name}.zip'
output_path = '../datasets/shuffled_zips/'

# Function call:

ShuffleZip(Dataset_name, input_path, output_path)

/Users/cerber/repos/WPColumbia/YNet_dev
205 images assigned to train
35 images assigned to test


NameError: name 'TempPath' is not defined

## 1. Create shuffled dataset in zip-files

In [30]:
# -> ShuffleZip and sub-functions

# Unzips, shuffles, re-zips and stores file at specified output_path

Dataset_name = 'mmm1KO_230'
input_path = 'datasets/Exp1_data_storage/original_zips/mmm1KO_230.zip'
output_path = 'datasets/Exp1_data_storage/shuffled_zips/'

# Function call:

ShuffleZip(Dataset_name, input_path, output_path)




195 images assigned to train
35 images assigned to test
All training images zipped successfully!
All test images zipped successfully!
Files moved to:datasets/Exp1_data_storage/shuffled_zips/


()

## 2. Unzip pairs of train/test images from zip files to correct folder structure

In [None]:
# -> ready_data function:

# Unzips a pair (or trio if val is used) of train & test zip files according to dataset_name into a target folder;
# creates, or respects folder structure according to:
#
# -folder
#   |-train
#      |-class_1
#      |-class_2
#   |-test
#      |-class_1
#      |-class_2

Dataset_name = 'mmm1KO_230'
input_path = 'datasets/Exp1_data_storage/shuffled_zips'
output_path = 'datasets/yeast_v3/'

ready_data(Dataset_name, input_path, output_path, data_struct = ['train', 'test'])



## Functions 

### 1.1. Creating shuffled list of train, test and potentially validation images

In [2]:
def ShuffleZip(Dataset_name, input_path, output_path, val = False):
    
    """
    Function to unzip, shuffle, re-zip and store a set of images at a specified location.
    
    Arguments:
    
    Dataset_name: name of the dataset, e.g. WT_175 - should be descriptive 
    input_path: path to input .zip file
    output_path: path for shuffled .zip-file to be stored

    -> creates temp folder in same directory as .zip file to store unzipped files in, but deletes it once done. 
    -> shuffles and splits unzipped files between train, test and optionally val datasets.
    -> optionally re-zip or storage in hdf5 object (TODO)

    """

    import glob
    import os
    import zipfile



    ### Arguments ###
    # select cell designation, e.g. WT or mfb1KO - important for, well, naming... 
    Dataset_name = Dataset_name #don't add .zip here

    # choose target .zip file
    ZPath = input_path
    
    # chose path to save shuffled .zip at
    save_path = output_path


    ### Execution --------------------------------------------------------------------------------###
    
    TempPath = os.path.dirname(ZPath) + '/TEMP-' + Dataset_name # Path definition, also for later use
    
    # unzips files into temp folder
    if os.path.exists(TempPath):
        raise ValueError('temp folder already exists in directory; consider deleting and re-run')
    else:
        os.makedirs(TempPath)

    zip_ref = zipfile.ZipFile(ZPath, 'r') 
    zip_ref.extractall(TempPath)
    zip_ref.close()
    
    
    ### shuffel images and zip
    test_addrs, train_addrs = shuffle_images(TempPath)
    
    # zip shuffled images and store at output_path
    zipup(save_path, test_addrs, train_addrs, Dataset_name, val_addrs = None)
    
#     return ()

### 1.2 Shuffling images using random.sample

In [3]:
def shuffle_images(TempPath, val = False):

    import random
    import os

    random.seed(1) #reproducible randomness

    ### OPTIONS ###
    shuffle_data = True  # shuffle the addresses before saving




    ### Execution --------------------------------------------------------------------------------###

    # get list of files in TempPath
    addrs = os.listdir(TempPath)

    # create shuffled list
    if shuffle_data:
        addrs = random.sample(addrs, k = len(addrs)) #creates shuffled list by random sampling from original list.


    """
    Question: 
    Generating train, test and optionally val datasets - Question: should there be the same absolute number of test/val 
    images for each class or should the number vary depending on total number of images per class e.g. 
    20 test images for a total of 100 class A images, but 40 for a total of 200 class B images?   

    """

    # # Divide the hata into 60% train, 20% test, and optionally 20% val
    # train_addrs = addrs[0:int(0.8*len(addrs))]
    # test_addrs = addrs[int(0.8*len(addrs)):]
    # # val_addrs = addrs[int(0.6*len(addrs)):int(0.8*len(addrs))]

    # Select == 35 images for test and optionally val datasets; put the rest into train
    test_addrs = addrs[0:35]
    train_addrs = addrs[35:]

    print(str(len(train_addrs)) + ' images assigned to train')
    print(str(len(test_addrs)) + ' images assigned to test')
    
    return test_addrs, train_addrs

### 1.3 Creating .zip files from shuffled data

In [4]:
def zipup(save_path, test_addrs, train_addrs, Dataset_name, val_addrs = None):

    # Creating .zip file of train, test and potentially validation images. 

    from zipfile import ZipFile
    from os.path import basename #required to use in zipfile.Zipfile.write(file, basename(file)) to avoid completed path to be archived
    import os
    import shutil 

    # use for debugging
    verbose = 0

    save_path = 'datasets/Exp1_data_storage/shuffled_zips/'

    # make subdirectory to store suffled zip files
    if not os.path.exists(save_path):
        os.makedirs(save_path)




    ### processing train images: ###
    if verbose:
        print('Following files will be zipped:')
        for addrs in train_addrs:
            print(addrs)

    # writing files to a zipfile
    with ZipFile(save_path + Dataset_name + '_train_data.zip','w') as zip:
        # writing each file one by one
        for addrs in train_addrs:
            zip.write(TempPath + '/' + addrs, basename(addrs))

        print('All training images zipped successfully!')


    ### processing Test images: ###
    if verbose:
        print('Following files will be zipped:')
        for addrs in test_addrs:
            print(addrs)

    # writing files to a zipfile
    with ZipFile(save_path + Dataset_name + '_test_data.zip','w') as zip:
        # writing each file one by one
        for addrs in test_addrs:
            zip.write(TempPath + '/' + addrs, basename(addrs))

        print('All test images zipped successfully!')

#     zip_ref.close()

    print('Files moved to:' + save_path)
    
#     return ()

### 2.1 Unzips pairs (or trio's if val included) of shuffled images in zip files according to dataset_name and creates/stores images in train/test subfolder structure

In [5]:
def ready_data(Dataset_name, input_path, output_path, data_struct = ['train', 'test']):

    # To be used with shuffled data in zip files. 
    # Extracts these to specified dataset folder in train/test subfolders

    import glob
    import os
    import zipfile

    ### OPTIONS ###
    
    # select cell designation, e.g. WT or mfb1KO - important for, well, naming... 
    Dataset_name = Dataset_name
    
    # choose path where target zip-files are stored
    ZPath = input_path
    
    # define path for files to be unzipped and stored in train and test directories
    output_path = output_path

    # optionally add 'val' keyword if datasets (zip files) have been created accordingly
    data_struct = data_struct



    ### Execution --------------------------------------------------------------------------------###

    # unzips files correct folders or creates them

    for i in data_struct:

        if not os.path.exists(output_path + '/' + i):
            os.makedirs(output_path + '/' + i)
            print(i + ' created')
            if not os.path.exists(output_path + '/' + i + Dataset_name):
                os.makedirs(output_path + '/' + i + '/' + Dataset_name)
                print(i + '/' + Dataset_name + ' created')
            else:
                raise ValueError('WARNING:' + i + '/' + Dataset_name + ' exists already - process cancelled to avoid overwriting')

        zip_ref = zipfile.ZipFile(ZPath + '/' + Dataset_name + '_' + i + '_data.zip', 'r') 
        zip_ref.extractall(output_path + '/' + i + '/' + Dataset_name)
        zip_ref.close()    
        
#         return