In [1]:
import os
import shutil
import random
from glob import glob
from tqdm import tqdm
from math import floor

from sklearn.datasets import load_files
from keras.preprocessing.image import load_img

Using TensorFlow backend.


In [2]:
# Dataset locations used by the cells below.
#
# Alternative dataset configuration, currently disabled (kept for reference):
#input_dataset_path = './data/train2017'
#train_dataset_path = './data/train'
#validation_dataset_path = './data/valid'
#test_dataset_path = './data/test'

# Active configuration: images are read from `input_dataset_path` and copied
# into the train / validation / test folders.
input_dataset_path = './data/celeba/img_celeba'
train_dataset_path = './data/celeba/train'
validation_dataset_path = './data/celeba/valid'
test_dataset_path = './data/celeba/test'

In [3]:
def check_image_size_compliance(file_name, min_side=384):
    """Tell whether an image file is at least `min_side` pixels on both sides.

    Args:
        file_name: Path of the image file to inspect.
        min_side: Minimum accepted width and height, in pixels.

    Returns:
        True when both the width and the height are at least `min_side`.
        False when either side is smaller, or when the file cannot be
        loaded (an IOError raised by `load_img` is treated as "not compliant").
    """
    try:
        image = load_img(file_name)
    except IOError:
        # Unreadable entries are rejected rather than aborting the scan.
        return False

    width, height = image.size
    # The image passes only if its shortest side is not below the threshold.
    return not (min(width, height) < min_side)

def get_size_complian_images(folder_path, min_side=384):
    """Collect the paths matching a glob pattern whose images are big enough.

    Args:
        folder_path: Glob pattern (e.g. ``'some/folder/*'``) selecting the
            candidate files; it is passed to `glob` as-is.
        min_side: Minimum accepted width and height, in pixels, forwarded to
            `check_image_size_compliance`.

    Returns:
        A list of the matching paths that pass the size check, in the order
        `glob` produced them. A progress bar is shown while scanning.
    """
    return [
        candidate
        for candidate in tqdm(glob(folder_path))
        if check_image_size_compliance(candidate, min_side)
    ]


In [4]:
# Scan every entry directly inside the input folder and keep only the images
# that pass the size check (default minimum side of 384 px).
image_glob_pattern = '{}/*'.format(input_dataset_path)
all_valid_images = get_size_complian_images(image_glob_pattern)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 202599/202599 [08:19<00:00, 405.79it/s]


In [5]:
# Make the train/validation/test split reproducible:
#  - sort first, because the incoming order comes from glob and is arbitrary;
#  - shuffle with a dedicated, fixed-seed RNG so the result is the same on
#    every run and re-running this cell does not reshuffle the list again.
SHUFFLE_SEED = 42
all_valid_images.sort()
random.Random(SHUFFLE_SEED).shuffle(all_valid_images)

### Copy the images to the folders
- 75% train
- 20% validation
- 5% test

I picked only 5% for the test set since we are only going to visually inspect those images. The ratio can be changed if needed.

In [6]:
# Split boundaries as indices into the shuffled list:
#   [0%, 75%)   -> train
#   [75%, 95%)  -> validation
#   [95%, 100%] -> test
images_count = len(all_valid_images)

train_start = 0
validation_start = int(0.75 * images_count)
test_start = int(0.95 * images_count)

train_images = all_valid_images[train_start:validation_start]
validation_images = all_valid_images[validation_start:test_start]
test_images = all_valid_images[test_start:]

# Report the size of each split.
for _label, _subset in (
    ('train_images: ', train_images),
    ('validation_images: ', validation_images),
    ('test_images: ', test_images),
):
    print(_label, len(_subset))

train_images:  73576
validation_images:  19620
test_images:  4906


In [7]:
# Make sure the destination folders exist before copying. If a destination
# is missing, shutil.copy2 treats it as a *file name*, so every image would
# silently overwrite the previous one instead of landing inside a folder.
os.makedirs(train_dataset_path, exist_ok=True)
os.makedirs(validation_dataset_path, exist_ok=True)

# Copy each split into its own folder; the source files are left untouched.
print('Copying train images...')
for file_path in tqdm(train_images):
    shutil.copy2(file_path, train_dataset_path)

print('Copying validation images...')
for file_path in tqdm(validation_images):
    shutil.copy2(file_path, validation_dataset_path)

Copying train images...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73576/73576 [04:15<00:00, 287.76it/s]


Copying validation images...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19620/19620 [01:13<00:00, 267.86it/s]


In [8]:

# Make sure the destination folder exists before copying. If it is missing,
# shutil.copy2 treats the destination as a *file name*, so every image would
# silently overwrite the previous one instead of landing inside a folder.
os.makedirs(test_dataset_path, exist_ok=True)

print('Copying test images...')
for file_path in tqdm(test_images):
    shutil.copy2(file_path, test_dataset_path)

Copying test images...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4906/4906 [00:18<00:00, 263.85it/s]
