Gender identification with celebA and VGG16 pretrained model


Dataset
The CelebA dataset contains over 200K images of celebrities labeled with 20 attributes including gender. The images are from the shoulders up, so most of the information is in the facial features and hair style.


For our experiment, we will be using 60k images with 20 selected attributes.




#### Feature and data extraction/preparation

    We’re going to use the VGG16 pretrained model and fine tune it to best identify gender from the celebrity images

In [1]:
# librairies
import pandas as pd
import numpy as np
import seaborn as sns
import os

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split


In [2]:
# import data
df=pd.read_csv('data/list_attr_celeba.csv')

df.head()

print(df.columns.values)
print(df.shape)

['image_id' '5_o_Clock_Shadow' 'Arched_Eyebrows' 'Attractive'
 'Bags_Under_Eyes' 'Bald' 'Bangs' 'Big_Lips' 'Big_Nose' 'Black_Hair'
 'Blond_Hair' 'Blurry' 'Brown_Hair' 'Bushy_Eyebrows' 'Chubby'
 'Double_Chin' 'Eyeglasses' 'Goatee' 'Gray_Hair' 'Heavy_Makeup'
 'High_Cheekbones' 'Male' 'Mouth_Slightly_Open' 'Mustache' 'Narrow_Eyes'
 'No_Beard' 'Oval_Face' 'Pale_Skin' 'Pointy_Nose' 'Receding_Hairline'
 'Rosy_Cheeks' 'Sideburns' 'Smiling' 'Straight_Hair' 'Wavy_Hair'
 'Wearing_Earrings' 'Wearing_Hat' 'Wearing_Lipstick' 'Wearing_Necklace'
 'Wearing_Necktie' 'Young']
(202599, 41)


In [3]:
# get labels for either gender
male=df[df['Male']==1][0:20000][['image_id', 'Male']]

female=df[df['Male']==-1][0:20000][['image_id','Male']]

male.head(3)

Unnamed: 0,image_id,Male
2,000003.jpg,1
6,000007.jpg,1
7,000008.jpg,1


In [5]:
# splitting train, test sets for either gender
m_train_X, m_test_X, train_y, test_y = train_test_split(male['image_id'],male['Male'], random_state = 0, test_size=.2)
f_train_X, f_test_X, train_y, test_y = train_test_split(female['image_id'],female['Male'], random_state = 0, test_size=.2)

m_test_X.head(3)


45740    045741.jpg
11812    011813.jpg
39695    039696.jpg
Name: image_id, dtype: object

In [6]:
import shutil
# creating folder to structure the data
origin_path= './data/img_align_celeba/'
train_path=  './data/Celeb_sets/train/'
valid_path=  './data/Celeb_sets/valid/'
test_path=   './data/Celeb_sets/test/'
fm='female/'
ml='male/'

# creating the directories
os.makedirs(train_path+ml)
os.makedirs(valid_path+ml)
os.makedirs(train_path+fm)
os.makedirs(valid_path+fm)


for file in m_train_X:
    #os.makedirs(origin_path+train_path+ml+file)
    shutil.copy(origin_path+file, train_path+ml+file)


In [7]:
m_test_X = m_test_X.iloc[2:]

for file in m_test_X:
    #os.makedirs(origin_path+valid_path+ml+file)
    shutil.copy(origin_path+file, valid_path+ml+file)


In [8]:

for file in f_train_X:
    #os.makedirs(origin_path+train_path+fm+file)
    shutil.copy(origin_path+file, train_path+fm+file)


In [9]:

for file in f_test_X:
    #os.makedirs(origin_path+valid_path+fm+file)
    shutil.copy(origin_path+file, valid_path+fm+file)

In [11]:
# construct seperate test set
test_m=df[df['Male']==1][-500:]
test_m=test_m.loc[:,'image_id']
test_f=df[df['Male']==-1][-500:]
test_f=test_f.loc[:,'image_id']

test_path='./data/Celeb_sets/test/'
os.makedirs(test_path+ml)
os.makedirs(test_path+fm)

for file in test_m:
    shutil.copy(origin_path+file, test_path+ml+file)

for file in test_f:
    shutil.copy(origin_path+file, test_path+fm+file)

#### Modeling

In [12]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, BatchNormalization


from tensorflow.keras import models
from tensorflow.keras import layers

In [13]:
num_classes=2

vgg=VGG16(include_top=False, pooling='avg', weights='imagenet',input_shape=(178, 218, 3))
vgg.summary()

# Freeze the layers except the last 2 layers
for layer in vgg.layers[:-5]:
    layer.trainable = False

# Check the trainable status of the individual layers
for layer in vgg.layers:
    print(layer, layer.trainable)
    

# Create the model
model = models.Sequential()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 178, 218, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 178, 218, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 178, 218, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 89, 109, 64)       0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 89, 109, 128)      73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 89, 109, 128)      147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 44, 54, 128)       0     

In [14]:
# Add the vgg convolutional base model
model.add(vgg)
 
# Add new layers
model.add(layers.Dense(128, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(num_classes, activation='sigmoid'))

model.summary()

# compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 512)               14714688  
_________________________________________________________________
dense (Dense)                (None, 128)               65664     
_________________________________________________________________
batch_normalization (BatchNo (None, 128)               512       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 14,781,122
Trainable params: 7,145,602
Non-trainable params: 7,635,520
_________________________________________________________________


In [15]:
import h5py
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

# use early stopping to optimally terminate training through callbacks
es=EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# save best model automatically
mc= ModelCheckpoint('./CNN/Gender ID/best_model_2.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
cb_list=[es,mc]

In [19]:
from tensorflow.python.keras.applications.vgg16 import preprocess_input
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator



data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)


train_generator = data_generator.flow_from_directory(
        './data/Celeb_sets/train/',
        target_size=(178, 218),
        batch_size=12,
        class_mode='categorical')


validation_generator = data_generator.flow_from_directory(
        './data/Celeb_sets/valid/',
        target_size=(178, 218),
        batch_size=12,
        class_mode='categorical')


model.fit_generator(
        train_generator,
        epochs=5,
        steps_per_epoch=2667,
        validation_data=validation_generator,
        validation_steps=667, callbacks=cb_list)

Found 32000 images belonging to 2 classes.
Found 7998 images belonging to 2 classes.
Epoch 1/5

Epoch 00001: val_loss improved from 0.08378 to 0.06112, saving model to ./CNN/Gender ID/best_model_2.h5
Epoch 2/5

Epoch 00002: val_loss did not improve from 0.06112
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.06112
Epoch 00003: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f88b2cf7b90>

#### Testing the model


In [31]:
os.getcwd()
#root

'/Users/vickyyounang/Documents/PHD/winter2021/deep_learning/project_&_topic/Project/code/CNN/Gender ID'

In [36]:

# load a saved model
from tensorflow.keras.models import load_model

# changing directory to the best model saved
os.chdir('./CNN/Gender ID/')
saved_model = load_model('best_model_2.h5')

print(saved_model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f88b3103610>


In [49]:
root ='/Users/vickyyounang/Documents/PHD/winter2021/deep_learning/project_&_topic/Project/code/'

In [119]:

#os.chdir(root)

# generate data for test set of images
test_generator = data_generator.flow_from_directory(
        test_path,
        target_size=(178, 218),
        batch_size=1,
        class_mode='categorical',
        shuffle=False)

# obtain predicted activation values for the last dense layer
test_generator.reset()
pred=saved_model.predict_generator(test_generator, verbose=1, steps=1000)
# determine the maximum activation value for each sample
predicted_class_indices=np.argmax(pred,axis=1)

Found 1000 images belonging to 2 classes.


In [120]:
# label each predicted value to correct gender
labels = (test_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]


In [121]:
print(predictions[:10])

['female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female']


In [122]:
# format file names to simply male or female
filenames=test_generator.filenames

#print(filenames)

filenz=[0]
for i in range(0,len(filenames)):
    filenz.append(filenames[i].split('/')[0])
filenz=filenz[1:]


In [125]:
# determine the test set accuracy
match=[]
match_ml=[]
match_fm=[]

for i in range(0,len(filenames)):
    match.append(filenz[i]==predictions[i])
    if filenz[i]=='male':
        match_ml.append(filenz[i]==predictions[i])
    if filenz[i]=='female':
        match_fm.append(filenz[i]==predictions[i])
    
print('total accuracy = ', match.count(True)/1000)
print('male accuracy = ', match_ml.count(True)/500)
print('female accuracy = ', match_fm.count(True)/500)

total accuracy =  0.98
male accuracy =  0.982
female accuracy =  0.978


In [117]:
print(filenz[:10])
print(predictions[:10])

['female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'male']
['female', 'female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male']


In [126]:
results=pd.DataFrame({"Filename":filenz,"Predictions":predictions})

print(type(filenz), type(pd.Series(predictions)))

# let's see the false predictions

for i in range(1000):
    if filenz[i] != predictions[i]:
        print(i+1, False)


#pd.Series(filenz).str.match(pd.Series(predictions))



<class 'list'> <class 'pandas.core.series.Series'>
167 False
168 False
176 False
199 False
208 False
211 False
216 False
411 False
437 False
456 False
470 False
592 False
719 False
731 False
735 False
766 False
786 False
800 False
820 False
956 False


In [114]:

results.to_csv("GenderID_VGG16_test_results.csv",index=False)

In [127]:
# predict for pictures of children
test_generator = data_generator.flow_from_directory(
        root+'data/Celeb_sets/test-me',
        target_size=(178, 218),
        batch_size=1,
        class_mode='categorical',
        shuffle=False)


# obtain predicted activation values for the last dense layer
test_generator.reset()
#print(len(test_generator))
pred=saved_model.predict_generator(test_generator, verbose=1, steps=10)
# determine the maximum activation value for each sample
predicted_class_indices=np.argmax(pred,axis=1)



Found 10 images belonging to 2 classes.


In [128]:
# label each predicted value to correct gender
labels = (test_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]

#print(len(labels), len(predictions))

# format file names to simply male or female
filenames=test_generator.filenames

print(filenames)

filenz=[0]
for i in range(0,len(filenames)):
    filenz.append(filenames[i].split('/')[0])
filenz=filenz[1:]

print(predictions)
print(filenz)

# determine the test set accuracy
match=[]
for i in range(0,len(filenames)):
    match.append(filenz[i]==predictions[i])
    
print(match)
match.count(True)/len(filenames)

['female/img_1.jpg', 'female/img_2.jpg', 'female/img_3.jpg', 'female/img_4.jpg', 'female/img_5.jpg', 'male/img_1.jpg', 'male/img_2.jpg', 'male/img_3.jpg', 'male/img_4.jpg', 'male/img_5.jpg']
['female', 'female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male']
['female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'male']
[True, True, True, True, True, False, True, True, True, True]


0.9

['female/img_1.jpg', 'female/img_2.jpg', 'female/img_3.jpg', 'female/img_4.jpg', 'female/img_5.jpg', 'male/img_1.jpg', 'male/img_2.jpg', 'male/img_3.jpg', 'male/img_4.jpg', 'male/img_5.jpg']
