# Loading data from IMDB-WIKI dataset

## config

In [8]:
IMAGE_SIZE = 56
min_score = 1.0

## utils

In [4]:
from scipy.io import loadmat
from datetime import datetime
import os
import numpy as np
import cv2
from tqdm import tqdm
from pathlib import Path


def calc_age(taken, dob):
    birth = datetime.fromordinal(max(int(dob) - 366, 1))

    # assume the photo was taken in the middle of the year
    if birth.month < 7:
        return taken - birth.year
    else:
        return taken - birth.year - 1


def get_meta(mat_path, db):
    meta = loadmat(mat_path)
    full_path = meta[db][0, 0]["full_path"][0]
    dob = meta[db][0, 0]["dob"][0]  # Matlab serial date number
    gender = meta[db][0, 0]["gender"][0]
    photo_taken = meta[db][0, 0]["photo_taken"][0]  # year
    face_score = meta[db][0, 0]["face_score"][0]
    second_face_score = meta[db][0, 0]["second_face_score"][0]
    age = [calc_age(photo_taken[i], dob[i]) for i in range(len(dob))]
    face_location = meta[db][0, 0]["face_location"]
    return full_path, dob, gender, photo_taken, face_score, second_face_score, age, np.squeeze(face_location)


def load_data(mat_path):
    d = loadmat(mat_path)

    return d["image"], d["gender"][0], d["age"][0], d["db"][0], d["img_size"][0, 0], d["min_score"][0, 0]


## read data and create database

In [9]:
db = 'imdb'
root_path = "./{}_crop/".format(db)
mat_path = root_path + "{}.mat".format(db)
full_path, dob, gender, photo_taken, face_score, second_face_score, age, face_location = get_meta(mat_path, db)

In [15]:
sample_num = len(face_score)
valid_sample_num = 0

In [26]:
# create database from WIKI Dataset
for i in tqdm(range(sample_num)):
    if face_score[i] < min_score:
        continue

    if (~np.isnan(second_face_score[i])) and second_face_score[i] > 0.0:
        continue

    if ~(0 <= age[i] <= 100):
        continue

    if np.isnan(gender[i]):
        continue

    if age[i] >= 86:
        continue
    if age[i] <= 0:
        continue

    img = cv2.imread(root_path + str(full_path[i][0]))

    if int(gender[i]) == 0:
        label_gender = 1
    else:
        label_gender = 0
    cv2.imwrite('./crop/' + str(age[i]) + '_' + str(label_gender) + '_' + str(i) + '.jpg', img)

    valid_sample_num += 1
print(valid_sample_num)

100%|██████████| 460723/460723 [12:02<00:00, 637.69it/s] 

343348





## Create data 3 age classes

In [68]:
def create_data_3():
    img_dir = Path('./crop/')
    img_size = IMAGE_SIZE
    num1_26 = 0
    num27_52 = 0
    num53_80 = 0
    male = 0
    female = 0
    data = []
    for img_path in tqdm(img_dir.glob('*.jpg')):
        name = img_path.name  # [age]_[gender]_[race]_[date&time].jpg
        age, gender = name.split('_')[:2]
        img = cv2.imread(str(img_path))

        age = int(age)
        if age >= 81:
            continue

        if 1 <= age <= 26:
            num1_26 += 1
            label_age = 0
        elif 27 <= age <= 52:
            num27_52 += 1
            label_age = 1
        else:
            num53_80 += 1
            label_age = 2

        label_gender = int(gender)

        if label_gender == 0:
            male += 1
        else:
            female += 1

        img = cv2.resize(img, (img_size, img_size), cv2.INTER_AREA)
        data.append((img, label_age, label_gender))

    print('Number of data')
    print('1-26: ', num1_26)
    print('27-52: ', num27_52)
    print('53-80: ', num53_80)
    print('male: ', male)
    print('female: ', female)
    with open('data_3.npy','wb') as f:
        np.save(f, data)

In [44]:
create_data_3()

171674it [07:03, 405.12it/s]


Number of data
1-26:  38514
27-52:  113914
53-80:  18890
male:  93379
female:  77939


## Create data 5 age classes

In [92]:
def create_data_5():
    img_dir = Path('./crop/')
    img_size = IMAGE_SIZE
    num1_13 = 0
    num14_23 = 0
    num24_39 = 0
    num40_55 = 0
    num56_80 = 0
    male = 0
    female = 0
    data = []
    for img_path in tqdm(img_dir.glob('*.jpg')):
        name = img_path.name    # [age]_[gender]_[race]_[date&time].jpg
        age, gender = name.split('_')[:2]

        img = cv2.imread(str(img_path))
        age = int(age)

        if 1 <= age <= 11:
            num1_13 += 1
            label_age = 0
        elif 12 <= age <= 23:
            num14_23 += 1
            label_age = 1
        elif 24 <= age <= 39:
            num24_39 += 1
            label_age = 2
        elif 40 <= age <= 55:
            num40_55 += 1
            label_age = 3
        else:
            num56_80 += 1
            label_age = 4

        label_gender = int(gender)
        if label_gender == 3:
            label_gender = 1
        if label_gender == 0:
            male += 1
        else:
            female += 1

        label_gender = int(gender)
        if label_gender == 3:
            label_gender = 1
        img = cv2.resize(img, (img_size, img_size), cv2.INTER_AREA)
        data.append((img, label_age, label_gender))

    print('Number of data')
    print('1-13: ', num1_13)
    print('14-23: ', num14_23)
    print('24-39: ', num24_39)
    print('40-55: ', num40_55)
    print('56-80: ', num56_80)
    print('male: ', male)
    print('female: ', female)

    with open('data_5.npy','wb') as f:
        np.save(f, data)

In [93]:
create_data_5()

171674it [07:20, 389.47it/s]


Number of data
1-13:  1849
14-23:  22454
24-39:  87446
40-55:  45170
56-80:  14755
male:  93623
female:  78051


## Create data 10 age classes

In [71]:
def create_data_10():
    img_dir = Path('./crop/')
    img_size = IMAGE_SIZE
    num1_8 = 0
    num9_16 = 0
    num17_24 = 0
    num25_32 = 0
    num33_40 = 0
    num41_48 = 0
    num49_56 = 0
    num57_64 = 0
    num65_72 = 0
    num73_80 = 0
    
    male = 0
    female = 0
    data = []
    for img_path in tqdm(img_dir.glob('*.jpg')):
        name = img_path.name  # [age]_[gender]_[race]_[date&time].jpg
        age, gender = name.split('_')[:2]
        img = cv2.imread(str(img_path))

        age = int(age)
        if age >= 81:
            continue
            

        if 1 <= age <= 8:
            num1_8 += 1
            label_age = 0
        elif 9 <= age <= 16:
            num9_16 += 1
            label_age = 1
        elif 17 <= age <= 24:
            num17_24 += 1
            label_age = 2
        elif 25 <= age <= 32:
            num25_32 += 1
            label_age = 3
        elif 33 <= age <= 40:
            num33_40 += 1
            label_age = 4
        elif 41 <= age <= 48:
            num41_48 += 1
            label_age = 5
        elif 49 <= age <= 56:
            num49_56 += 1
            label_age = 6  
        elif 57 <= age <= 64:
            num57_64 += 1
            label_age = 7   
        elif 65 <= age <= 72:
            num65_72 += 1
            label_age = 8      
        else:
            num73_80 += 1
            label_age = 9

        label_gender = int(gender)
        if label_gender == 3:
            label_gender = 1

        if label_gender == 0:
            male += 1
        else:
            female += 1

        img = cv2.resize(img, (img_size, img_size), cv2.INTER_AREA)
        data.append((img, label_age, label_gender))

    print('Number of training data')
    print('1_8: ', num1_8)
    print('9_16: ', num9_16)
    print('17_24: ', num17_24)
    print('25_32: ', num25_32)
    print('33_40: ', num33_40)
    print('41_48: ', num41_48)
    print('49_56: ', num49_56)
    print('57_64: ', num57_64)
    print('65_72: ', num65_72)
    print('73_80: ', num73_80)
    
    print('male: ', male)
    print('female: ', female)
    with open('data_10.npy','wb') as f:
        np.save(f, data)

In [72]:
create_data_10()

171674it [07:11, 397.57it/s]


Number of training data
1_8:  639
9_16:  5376
17_24:  22922
25_32:  44130
33_40:  43005
41_48:  27983
49_56:  14050
57_64:  7995
65_72:  3694
73_80:  1524
male:  93379
female:  77939


## Create data age class = 20

In [78]:
def create_data_20():
    img_dir = Path('./crop/')
    img_size = IMAGE_SIZE
    num1_4 = 0
    num5_8 = 0
    num9_12 = 0
    num13_16 = 0
    num17_20 = 0
    num21_24 = 0
    num25_28 = 0
    num29_32 = 0
    num33_36 = 0
    num37_40 = 0
    num41_44 = 0
    num45_48 = 0
    num49_52 = 0
    num53_56 = 0
    num57_60 = 0
    num61_64 = 0
    num65_68 = 0
    num69_72 = 0
    num73_76 = 0
    num77_80 = 0
    
    male = 0
    female = 0
    data = []
    for img_path in tqdm(img_dir.glob('*.jpg')):
        name = img_path.name  # [age]_[gender]_[race]_[date&time].jpg
        age, gender = name.split('_')[:2]
        img = cv2.imread(str(img_path))

        age = int(age)
        if age >= 81:
            continue

        if 1 <= age <= 4:
            num1_4 += 1
            label_age = 0
        elif 5 <= age <= 8:
            num5_8 += 1
            label_age = 1
        elif 9 <= age <= 12:
            num9_12 += 1
            label_age = 2
        elif 13 <= age <= 16:
            num13_16 += 1
            label_age = 3
        elif 17 <= age <= 20:
            num17_20 += 1
            label_age = 4
        elif 21 <= age <= 24:
            num21_24 += 1
            label_age = 5
        elif 25 <= age <= 28:
            num25_28 += 1
            label_age = 6  
        elif 29 <= age <= 32:
            num29_32 += 1
            label_age = 7   
        elif 33 <= age <= 36:
            num33_36 += 1
            label_age = 8  
            
        elif 37 <= age <= 40:
            num37_40 += 1
            label_age = 9
        elif 41 <= age <= 44:
            num41_44 += 1
            label_age = 10
        elif 45 <= age <= 48:
            num45_48 += 1
            label_age = 11  
        elif 49 <= age <= 52:
            num49_52 += 1
            label_age = 12   
        elif 53 <= age <= 56:
            num53_56 += 1
            label_age = 13   
        elif 57 <= age <= 60:
            num57_60 += 1
            label_age = 14      
        elif 61 <= age <= 64:
            num61_64 += 1
            label_age = 15
        elif 65 <= age <= 68:
            num65_68 += 1
            label_age = 16
        elif 69 <= age <= 72:
            num69_72 += 1
            label_age = 17
        elif 73 <= age <= 76:
            num73_76 += 1
            label_age = 18    
        else:
            num77_80 += 1
            label_age = 19

        label_gender = int(gender)


        if label_gender == 0:
            male += 1
        else:
            female += 1

        img = cv2.resize(img, (img_size, img_size), cv2.INTER_AREA)
        data.append((img, label_age, label_gender))

    print('Number of training data')
    print('1_4: ', num1_4)
    print('5_8: ', num5_8)
    print('9_12: ', num9_12)
    print('13_16: ', num13_16)
    print('17_20: ', num17_20)
    print('21_24: ', num21_24)
    print('25_28: ', num25_28)
    print('29_32: ', num29_32)
    print('33_36: ', num33_36)
    print('37_40: ', num37_40)
    print('41_44: ', num41_44)
    print('45_48: ', num45_48)
    print('49_52: ', num49_52)
    print('53_56: ', num53_56)
    print('57_60: ', num57_60)
    print('61_64: ', num61_64)
    print('65_68: ', num65_68)
    print('69_72: ', num69_72)
    print('73_76: ', num73_76)
    print('77_80: ', num77_80)
 
    print('male: ', male)
    print('female: ', female)
    np.save('data_20.npy', data)

In [79]:
create_data_20()

171674it [06:46, 422.69it/s]


Number of training data
1_4:  117
5_8:  522
9_12:  1806
13_16:  3570
21_24:  14947
29_32:  24242
33_36:  22742
37_40:  20263
41_44:  15821
45_48:  12162
49_52:  8373
53_56:  5677
57_60:  4697
61_64:  3298
65_68:  2185
69_72:  1509
77_80:  602
male:  93379
female:  77939
