## Filter the input images, copy them to a common location, and then split them into train, validation and test sets.

#### Imports

In [1]:
import os
import shutil
import random
from glob import glob
from tqdm import tqdm
from math import floor

from sklearn.datasets import load_files
from keras.preprocessing.image import load_img

Using TensorFlow backend.


#### Helper functions

In [2]:
def check_image_size_compliance(file_name, min_side=384):
    """Tell whether an image can be loaded and is large enough on both sides.

    Args:
        file_name: Path of the image file to inspect.
        min_side: Minimum number of pixels required for both the width and
            the height.

    Returns:
        True when the file loads and both dimensions are at least
        ``min_side``; False when loading raises ``IOError`` or when either
        dimension is smaller than ``min_side``.
    """
    try:
        image = load_img(file_name)
    except IOError:
        # A file that cannot be read is reported as non-compliant, not fatal.
        return False

    width, height = image.size
    return width >= min_side and height >= min_side

In [3]:
def get_size_complian_images(folder_path, min_side=384):
    """List the entries directly inside a folder that pass the size check.

    Args:
        folder_path: Folder whose direct children are inspected.
        min_side: Minimum side length forwarded to
            ``check_image_size_compliance``.

    Returns:
        A list with the paths, in glob order, for which
        ``check_image_size_compliance`` returned a truthy value.
    """
    candidates = glob("{0}/*".format(folder_path))
    return [
        candidate
        for candidate in tqdm(candidates)
        if check_image_size_compliance(candidate, min_side)
    ]

In [4]:
def copy_images_to_path(images_list, destination_path):
    """Copy every file in ``images_list`` into the ``destination_path`` folder.

    The destination folder is created first when it is missing. Without that,
    ``shutil.copy2`` would interpret a non-existent ``destination_path`` as a
    target *file name*, and every image would overwrite the same file.

    Args:
        images_list: Iterable of source file paths.
        destination_path: Folder that receives the copies; each file keeps
            its base name, so files sharing a base name overwrite each other.
    """
    # Make sure copy2 sees a directory, not a file name to (over)write.
    os.makedirs(destination_path, exist_ok=True)

    for file_path in tqdm(images_list):
        shutil.copy2(file_path, destination_path)

In [5]:
def filter_and_copy_images_to_folder(input_folder, destination_folder, min_side=384):
    """Copy the size-compliant images of one folder into another folder.

    Args:
        input_folder: Folder scanned by ``get_size_complian_images``.
        destination_folder: Folder handed to ``copy_images_to_path`` as the
            copy target.
        min_side: Minimum side length an image needs in order to be kept.
    """
    print("Get valid images...")
    images_to_copy = get_size_complian_images(input_folder, min_side)

    print("Copy the valid images to destionation...")
    copy_images_to_path(images_to_copy, destination_folder)

In [6]:
def get_input_dataset_files(input_dataset):
    """Return the paths of all entries directly inside ``input_dataset``.

    Args:
        input_dataset: Folder to list.

    Returns:
        The list produced by ``glob`` for the pattern ``<input_dataset>/*``.
    """
    pattern = "{0}/*".format(input_dataset)
    return glob(pattern)

In [7]:
def split_dataset(input_dataset_files, train_path, validation_path, test_path, train_ratio=0.75, validation_ratio=0.2, test_ratio=0.5):
    """Randomly split a list of files into train/validation/test folders.

    The files are shuffled with the global ``random`` state (seed it
    beforehand for a reproducible split), cut into three consecutive slices
    and copied with ``shutil.copy2``. The caller's list is not modified, and
    each destination folder is created when missing. Without that,
    ``shutil.copy2`` would treat a missing folder path as a file name and
    every copy would overwrite the same file.

    Args:
        input_dataset_files: Paths of the files to distribute.
        train_path: Folder receiving the train subset.
        validation_path: Folder receiving the validation subset.
        test_path: Folder receiving the test subset.
        train_ratio: Fraction of the files assigned to the train subset.
        validation_ratio: Fraction of the files assigned to the validation
            subset.
        test_ratio: Not used. The test subset is whatever remains after the
            train and validation slices. NOTE(review): the default of 0.5
            looks like a typo for 0.05; it is kept so the signature stays
            unchanged.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    shuffled_files = list(input_dataset_files)
    random.shuffle(shuffled_files)

    images_count = len(shuffled_files)
    validation_start = int(images_count * train_ratio)
    test_start = int(images_count * (train_ratio + validation_ratio))

    train_images = shuffled_files[:validation_start]
    validation_images = shuffled_files[validation_start:test_start]
    test_images = shuffled_files[test_start:]

    print('train_images: ', len(train_images))
    print('validation_images: ', len(validation_images))
    print('test_images: ', len(test_images))

    copy_jobs = (
        ('Copying train images...', train_images, train_path),
        ('Copying validation images...', validation_images, validation_path),
        ('Copying test images...', test_images, test_path),
    )
    for message, subset, destination in copy_jobs:
        print(message)
        # copy2 needs an existing directory to keep each file's base name.
        os.makedirs(destination, exist_ok=True)
        for file_path in tqdm(subset):
            shutil.copy2(file_path, destination)

### Filter and merge the input images for MS COCO

In [8]:
# MS COCO source folders whose images are filtered and merged.
input_dataset_paths = [
    'D:/datasets/images/mscoco/train2014',
    'D:/datasets/images/mscoco/train2017',
    'D:/datasets/images/mscoco/val2014',
    'D:/datasets/images/mscoco/val2017',
    'D:/datasets/images/mscoco/test2014',
    'D:/datasets/images/mscoco/test2015',
    'D:/datasets/images/mscoco/test2017',
]

# Folder that collects the size-compliant images of all source folders.
merged_dataset_path = 'D:/datasets/images/mscoco/merged'

# Destination folders of the train / validation / test split.
train_dataset_path = 'D:/git/image_super_resolution/data/MSCOCO/train'
validation_dataset_path = 'D:/git/image_super_resolution/data/MSCOCO/val'
test_dataset_path = 'D:/git/image_super_resolution/data/MSCOCO/test'

In [9]:
# Filter each MS COCO source folder and gather its compliant images in the merged folder.
for folder in input_dataset_paths:
    filter_and_copy_images_to_folder(
        input_folder=folder,
        destination_folder=merged_dataset_path,
    )

Get valid images...


100%|███████████████████████████████████████████████████████████████████████████| 82783/82783 [03:11<00:00, 431.71it/s]


Copy the valid images to destionation...


100%|███████████████████████████████████████████████████████████████████████████| 68382/68382 [08:15<00:00, 137.93it/s]


Get valid images...


100%|█████████████████████████████████████████████████████████████████████████| 118287/118287 [04:53<00:00, 402.75it/s]


Copy the valid images to destionation...


100%|████████████████████████████████████████████████████████████████████████████| 97799/97799 [48:38<00:00, 33.52it/s]


Get valid images...


100%|███████████████████████████████████████████████████████████████████████████| 40504/40504 [01:58<00:00, 343.18it/s]


Copy the valid images to destionation...


100%|████████████████████████████████████████████████████████████████████████████| 33489/33489 [10:17<00:00, 54.23it/s]


Get valid images...


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:32<00:00, 154.31it/s]


Copy the valid images to destionation...


100%|██████████████████████████████████████████████████████████████████████████████| 4072/4072 [01:40<00:00, 40.65it/s]


Get valid images...


100%|███████████████████████████████████████████████████████████████████████████| 40775/40775 [02:07<00:00, 320.22it/s]


Copy the valid images to destionation...


100%|████████████████████████████████████████████████████████████████████████████| 33694/33694 [15:21<00:00, 36.57it/s]


Get valid images...


100%|███████████████████████████████████████████████████████████████████████████| 81434/81434 [03:24<00:00, 398.05it/s]


Copy the valid images to destionation...


100%|████████████████████████████████████████████████████████████████████████████| 67374/67374 [31:45<00:00, 35.36it/s]


Get valid images...


100%|███████████████████████████████████████████████████████████████████████████| 40670/40670 [01:49<00:00, 372.28it/s]


Copy the valid images to destionation...


100%|████████████████████████████████████████████████████████████████████████████| 33508/33508 [15:03<00:00, 37.10it/s]


### Copy the images to the train, validation and test folders
- 75% train
- 20% validation
- 5% test

I picked only 5% for the test set since we are going to visually inspect the images. If needed, I can change the ratio.
Even the validation set might be too big, since we don't need it for the GAN.

In [10]:
# Distribute the merged MS COCO images over the train / validation / test folders.
split_dataset(
    input_dataset_files=get_input_dataset_files(merged_dataset_path),
    train_path=train_dataset_path,
    validation_path=validation_dataset_path,
    test_path=test_dataset_path,
)

train_images:  253738
validation_images:  67664
test_images:  16916
Copying train images...


100%|████████████████████████████████████████████████████████████████████████| 253738/253738 [5:30:10<00:00, 12.81it/s]


Copying validation images...


100%|██████████████████████████████████████████████████████████████████████████| 67664/67664 [1:03:08<00:00, 17.18it/s]


Copying test images...


100%|████████████████████████████████████████████████████████████████████████████| 16916/16916 [15:30<00:00, 28.58it/s]


## Filter and split the images for CelebA

In [8]:
# CelebA configuration. These assignments reuse (and therefore overwrite) the
# variable names used for the MS COCO paths earlier in the notebook.

# Folder holding the CelebA images that are filtered by size below.
merged_dataset_path = 'C:/datasets/img_celeba'

# Destination folders of the CelebA train / validation / test split.
train_dataset_path = 'C:/git/image_super_resolution/data/celeba/train'
validation_dataset_path = 'C:/git/image_super_resolution/data/celeba/val'
test_dataset_path = 'C:/git/image_super_resolution/data/celeba/test'

In [10]:
# Report how many files the CelebA folder holds before filtering.
print("Total number of images: ", len(get_input_dataset_files(merged_dataset_path)))

# Keep only the images that satisfy the default minimum side length.
print("Getting the list of compliant images...")
compliant_images = get_size_complian_images(folder_path=merged_dataset_path)
print("Number of compliant images: ", len(compliant_images))

# Split straight from the source folder; no intermediate merged copy is made.
split_dataset(
    input_dataset_files=compliant_images,
    train_path=train_dataset_path,
    validation_path=validation_dataset_path,
    test_path=test_dataset_path,
)

Total number of images:  202599


100%|████████████████████████████████████████████████████████████████████████| 202599/202599 [00:31<00:00, 6529.10it/s]


Number of compliant images:  98102
train_images:  73576
validation_images:  19620
test_images:  4906
Copying train images...


100%|███████████████████████████████████████████████████████████████████████████| 73576/73576 [02:58<00:00, 412.95it/s]


Copying validation images...


100%|███████████████████████████████████████████████████████████████████████████| 19620/19620 [01:12<00:00, 271.44it/s]


Copying test images...


100%|█████████████████████████████████████████████████████████████████████████████| 4906/4906 [00:14<00:00, 346.05it/s]
