# Create minority dataset

In [1]:
import os
import numpy as np
import pandas as pd
from models.vae import VAE
from utils.minority_dataset import load_celeba, MinorityDataset

# Parameters
input_shape = (128, 128, 3)
batch_size = 256
latent_dim = 200

# Specify paths for data, pre-trained weights and results.
data_folder = os.path.join('..', 'datasets', 'celeba')
image_folder = os.path.join(data_folder, 'img_align_celeba')
checkpoint_path = os.path.join('..', 'results', 'vae', 'celeba_v1', 'cp.ckpt')

mean_array_path = os.path.join(os.path.dirname(checkpoint_path), 'mean_array.npy')

# Load the model.
vae = VAE()
vae.load_weights(checkpoint_path)

# Load the data.
df = pd.read_csv(os.path.join(data_folder, 'list_attr_celeba.csv'))
data_flow = load_celeba(df, image_folder, input_shape[:2], batch_size)

# Create the minority dataset.
new_data = MinorityDataset(data_flow, vae, latent_dim)

if not os.path.exists(mean_array_path):
    mean = new_data.get_latent_mean()
    np.save(mean_array_path, mean)
else:
    mean = np.load(mean_array_path)

minority_list = new_data.get_sub_dataset(mean, bins=100, extremes=18)
percentage = minority_list.sum()/len(minority_list) * 100

output_folder = os.path.join(os.path.dirname(checkpoint_path), f'minority_{percentage:02.0f}')
os.mkdir(output_folder)
output_file = os.path.join(output_folder, 'minority_dataset.csv')

new_data.create_new_df(minority_list, df, output_file)

2022-02-16 15:55:35.991442: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-16 15:55:36.019209: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-16 15:55:36.019360: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-16 15:55:36.019833: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Found 202599 validated image filenames.
The minority dataset has 42143 images (20.80% of total).


In [2]:
df = pd.read_csv(output_file)
df.head()

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000020.jpg,-1,-1,-1,-1,-1,-1,-1,-1,1,...,1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,000024.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,1,-1,1,1,-1,1
4,000030.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,-1
