In [None]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path
import os

In [None]:
# Create the working directory for intermediate files and switch into it.
# The guard keeps this cell idempotent: re-running it while already inside
# `datasets_preprocess` would otherwise create and enter a nested
# `datasets_preprocess/datasets_preprocess`, silently shifting every relative
# path used further down.
if Path.cwd().name != 'datasets_preprocess':
    Path('datasets_preprocess').mkdir(exist_ok=True)
    os.chdir('datasets_preprocess')

## 1 Flowers

This dataset must be downloaded separately; you can find it at this link:
https://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html

After downloading, make sure the folder structure is as follows:
```
📂<PARENT> 
 ┗ 📂flowers 
    ┗ 📂imgs 
       ┣ 📄image_00001.jpg
       ┣ 📄image_00002.jpg
       ┗ 📄 ...
```

In [None]:
# Placeholder: replace with the path of the folder that contains the
# `flowers` directory laid out as described above.
flowers_parent_path = '<PATH TO FOLDER CONTAINING THE DATASET>'
# Directory holding the individual flower image files.
flowers_dir = os.path.join(flowers_parent_path, 'flowers', 'imgs')

In [None]:
# Build a (N, 48, 48, 3) uint8 array from the flower images: each image is
# centre-cropped to a 500x500 square, downscaled to 48x48 and stored in `data`,
# which is then written to `flowers.npy` in the current directory.
FLOWERS_CROP_SIZE = 500   # side of the centred square cut out of every image
FLOWERS_IMAGE_SIZE = 48   # side of the final, downscaled square image

# Sort for a reproducible row order (os.listdir returns entries in arbitrary
# order) and keep only image files so that stray entries such as `.DS_Store`
# or `Thumbs.db` do not make load_img fail.
files = sorted(
    name for name in os.listdir(flowers_dir)
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
)
data = np.empty(
    shape=(len(files), FLOWERS_IMAGE_SIZE, FLOWERS_IMAGE_SIZE, 3),
    dtype='uint8',
)
for i, f in enumerate(tqdm(files)):
    img = tf.keras.preprocessing.image.load_img(os.path.join(flowers_dir, f))
    width, height = img.size
    # `left`/`top` rather than `x0`/`y0`: those names are notebook-level
    # globals read by the CelebA `process_images` function, so this loop must
    # not overwrite them.
    # NOTE(review): assumes every image is at least 500 px on both sides -- confirm.
    left = (width - FLOWERS_CROP_SIZE) // 2
    top = (height - FLOWERS_CROP_SIZE) // 2
    img = img.crop((left, top, left + FLOWERS_CROP_SIZE, top + FLOWERS_CROP_SIZE))
    img = img.resize((FLOWERS_IMAGE_SIZE, FLOWERS_IMAGE_SIZE))
    data[i] = tf.keras.preprocessing.image.img_to_array(img, dtype='uint8')
np.save('flowers.npy', data)

In [None]:
# Write the same array again, two directory levels above the working directory.
flowers_copy_path = os.path.join('..', '..', 'flowers2.npy')
np.save(flowers_copy_path, data)

## 2 CelebA

This dataset must be downloaded separately; you can find it at this link:
http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html

After downloading, make sure the folder structure is as follows:
```
📂<PARENT> 
 ┗ 📂celeba 
    ┗ 📂imgs 
       ┣ 📄000001.jpg 
       ┣ 📄000002.jpg 
       ┗ 📄 ...
```

In [None]:
# Placeholder: replace with the path of the folder that contains the
# `celeba` directory laid out as described above.
celeba_parent_path = '<PATH TO FOLDER CONTAINING THE DATASET>'
# Directory holding the individual CelebA image files.
celeba_dir = os.path.join(celeba_parent_path, 'celeba', 'imgs')

In [None]:
# Geometry of the source images, of the crop window and of the final images.
IMAGE_HEIGHT, IMAGE_WIDTH = 218, 178
HEIGHT_CROP, WIDTH_CROP = 96, 96
HEIGHT_RESIZE, WIDTH_RESIZE = 48, 48

# Number of entries in the image directory.
IMAGE_COUNT = len(os.listdir(celeba_dir))

# Top-left corner of the crop window: horizontally centred; vertically, 70% of
# the spare height is left above the window, so it sits below the centre.
x0 = int(0.5 * (IMAGE_WIDTH - WIDTH_CROP))
y0 = int(0.7 * (IMAGE_HEIGHT - HEIGHT_CROP))
# Bottom-right corner of the crop window.
x1, y1 = x0 + WIDTH_CROP, y0 + HEIGHT_CROP

print('Bounding box:', (x0, y0, x1, y1))

In [None]:
def process_images(images):
    """Crop images to the configured bounding box and downscale them.

    The crop window starts at the notebook-level offsets (``y0``, ``x0``) and
    has size ``HEIGHT_CROP`` x ``WIDTH_CROP``. The cropped images are resized
    to ``HEIGHT_RESIZE`` x ``WIDTH_RESIZE`` with the ``'area'`` method and
    cast to ``uint8``; no explicit rounding is applied before the cast.

    Args:
        images: image tensor accepted by ``tf.image.crop_to_bounding_box``
            (presumably a batch yielded by the dataset -- confirm with caller).

    Returns:
        The cropped and resized images as a ``uint8`` tensor.
    """
    cropped = tf.image.crop_to_bounding_box(
        images,
        offset_height=y0,
        offset_width=x0,
        target_height=HEIGHT_CROP,
        target_width=WIDTH_CROP,
    )
    resized = tf.image.resize(
        cropped,
        size=(HEIGHT_RESIZE, WIDTH_RESIZE),
        method='area',
    )
    return tf.cast(resized, 'uint8')

In [None]:
# Stream the CelebA images from disk in batches of 32 and apply the
# crop/resize step to every batch. The directory passed in is the parent of
# `celeba_dir`, so the image folder is picked up as a sub-directory.
# The input size comes from IMAGE_HEIGHT / IMAGE_WIDTH instead of repeating the
# literals, so it cannot drift from the crop geometry computed from them.
# NOTE: the batch size (32) is also assumed by the batch-count computation in
# the export cell below; keep the two in sync.
# NOTE(review): `shuffle` is left at its default, so the order of images is
# presumably not the file order -- confirm against the Keras documentation.
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(celeba_dir, '..'),
    label_mode=None,
    batch_size=32,
    image_size=(IMAGE_HEIGHT, IMAGE_WIDTH)
)
dataset = dataset.map(process_images)

In [None]:
# Building the dataset from batches because of Out of Memory errors:
# the processed batches are written to disk as `celeba_<k>.npy` chunk files,
# each holding at most 500 batches.
num_batches = 1 + (IMAGE_COUNT - 1) // 32  # ceil(IMAGE_COUNT / 32); 32 = dataset batch size
iterator = iter(dataset)
count = 0  # index of the next chunk file; equals the number of files written at the end
# `> 0` (not `>= 0`): when the batch count is an exact multiple of 500 the
# counter reaches exactly 0, and one more pass would call get_next() on an
# exhausted iterator.
while num_batches > 0:
    rest = min(num_batches, 500)  # batches that go into this chunk
    num_batches -= rest
    # Collect the batches and concatenate once; concatenating inside the loop
    # re-copies the growing array for every batch.
    batches = [iterator.get_next().numpy() for _ in tqdm(range(rest))]
    data = np.concatenate(batches, axis=0)
    np.save('celeba_{}.npy'.format(count), data)
    count += 1

In [None]:
# Concatenating all batches into a single dataset file.
# The chunk files `celeba_0.npy`, `celeba_1.npy`, ... are discovered by index
# instead of assuming a fixed number of them, so this also works when the
# number of images (and therefore of chunks) differs.
# NOTE: chunk files left over from an earlier, larger run would be picked up
# too; delete stale `celeba_<k>.npy` files before re-exporting.
chunks = []
while os.path.exists('celeba_{}.npy'.format(len(chunks))):
    chunks.append(np.load('celeba_{}.npy'.format(len(chunks))))
if not chunks:
    # Same failure mode as loading a missing first chunk directly.
    raise FileNotFoundError('celeba_0.npy')
# Single concatenation over all chunks, in index order.
data = np.concatenate(chunks, axis=0)
np.save(os.path.join('..', '..', 'celeba.npy'), data)