# Ethnicity classifier with a Conv Net
A classifier that identifies the ethnicity of a person from an image of their face. The core of the classifier is a convolutional neural network (CNN).

In [2]:
__author__ = "Joann H. Tang, PhD"
__copyright__ = "Copyright 2018"
__email__ = "eagtang2007@gmail.com"
__status__ = "Prototype"

In [3]:
import io,json
import os
import sqlite3

import h5py
import keras
import keras.utils
import numpy as np
import pandas as pd
from keras.models import Sequential
#Core layers
from keras.layers import Dense, Dropout, Activation, Flatten
#CNN layers
from keras.layers import SeparableConv2D, Conv2D, MaxPooling2D
from keras.models import model_from_json
from sklearn.utils import shuffle 


Using TensorFlow backend.


#### Set the directory path

In [4]:
#Directory holding the SQLite dataset. Override with the ETHFACE_DATA_DIR
#environment variable; the default is the original author's local checkout.
#os.path.join(..., "") guarantees a trailing separator, because later cells
#build file names by plain string concatenation (path + 'ethface.db').
path = os.path.join(
    os.environ.get(
        "ETHFACE_DATA_DIR",
        "/Users/huizhentang/Documents/Repos/Pet-projects/Ethnicity-Classifier/Datasets/"),
    "")
#Directory where the trained model is written. Override with ETHFACE_SAVE_DIR.
save_path = os.path.join(
    os.environ.get(
        "ETHFACE_SAVE_DIR",
        "/Users/huizhentang/Documents/Repos/Pet-projects/Ethnicity-Classifier/"),
    "")

#### Setting up parameters for the classifier

In [5]:
#Number of target classes (width of the one-hot labels and of the softmax output)
num_classes = 4
#Training schedule: samples per gradient step and number of passes over the training set
batch_size = 25
epochs = 100
#Rate passed to every Dropout layer in the network
dropout = 0.2

#### Load data

In [6]:
#Connect to the database.
#NOTE: sqlite3.connect creates an empty database file if none exists at this
#location, in which case the queries below fail with "no such table".
conn = sqlite3.connect(path + 'ethface.db')
try:
    #Load data from database using pandas.read_sql_query.
    #pandas drives the connection directly, so no explicit cursor is needed.
    x = pd.read_sql_query("SELECT * FROM ethface_asian_features;", conn)
    y = pd.read_sql_query("SELECT * FROM ethface_asian_labels;", conn)
finally:
    #Close connection to database, even when a query raises
    conn.close()

In [7]:
#Discard the stored 'index' column and convert both dataframes to arrays
x = x.drop('index', axis=1).values
y = y.drop('index', axis=1).values
#Number of images: the feature table holds 128*128 rows per image
n = int(x.shape[0]/(128*128))
#Fold the flat rows into (image, 128, 128, 3) in C (row-major) order
x = x.reshape((n,128,128,3), order='C')

#### Data formatting and normalization

In [8]:
#Ensuring data casting to the right data type
x = (x).astype('float32')
#Feature normalization
x /= 255
#Convert class vectors to binary class matrices¶
y = keras.utils.to_categorical(np.asarray(y), num_classes)

#### Splitting dataset into training, validation, and test sets

In [9]:
def split_dataset(X,Y,p_train,p_val,p_test):
    """
    Shuffle the dataset and split it into training, validation and test sets.

    The samples are shuffled with a fixed seed (random_state=42) so the split
    is reproducible, then cut into three consecutive, non-overlapping
    partitions along the first axis.

    Arguments:
    X -- numpy array of feature data, here, they are the pixel values of images;
         the first axis indexes samples
    Y -- numpy array of label data with the same number of samples as X
    p_train -- fraction (0 to 1) of the total dataset assigned to the training set
    p_val -- fraction (0 to 1) of the total dataset assigned to the validation set
    p_test -- fraction (0 to 1) of the total dataset assigned to the test set.
              When the three fractions sum to 1, the test set receives every
              remaining sample so that none is lost to rounding; otherwise it
              is limited to round(len(Y) * p_test) samples.

    Returns:
    x_train -- pixel values of images in the training set
    y_train -- labels of images in the training set
    x_val -- pixel values of images in the validation set
    y_val -- labels of images in the validation set
    x_test -- pixel values of images in the test set
    y_test -- labels of images in the test set

    Raises:
    ValueError -- if any fraction is negative, or if p_train + p_val exceeds 1

    """
    if p_train < 0 or p_val < 0 or p_test < 0:
        raise ValueError("split fractions must be non-negative")
    if p_train + p_val > 1 + 1e-9:
        raise ValueError("p_train + p_val must not exceed 1")

    X, Y = shuffle(X, Y, random_state=42)
    n_total = len(Y)
    # int(round(...)) also copes with numpy float fractions
    n_train = min(int(round(n_total*p_train)), n_total)
    n_val = int(round(n_total*p_val))
    n_test = int(round(n_total*p_test))

    # Clamp the boundaries: rounding two fractions up can overshoot n_total
    val_end = min(n_train + n_val, n_total)
    if abs(p_train + p_val + p_test - 1) < 1e-9:
        # Fractions cover the whole dataset: hand the remainder to the test set
        test_end = n_total
    else:
        test_end = min(val_end + n_test, n_total)

    # Plain slices avoid building index lists and extra copies of the
    # shuffled arrays
    x_train, y_train = X[:n_train], Y[:n_train]
    x_val, y_val = X[n_train:val_end], Y[n_train:val_end]
    x_test, y_test = X[val_end:test_end], Y[val_end:test_end]
    return x_train, y_train, x_val, y_val, x_test, y_test

In [10]:
#Split the data 60% / 30% / 10% into training / validation / test sets
x_train,y_train,x_val,y_val,x_test,y_test = split_dataset(x,y,0.6,0.3,0.1)
#Report the shape of every partition, features first and then labels
for part in (x_train,y_train,x_val,y_val,x_test,y_test):
    print(part.shape)

(2696, 128, 128, 3)
(2696, 4)
(1348, 128, 128, 3)
(1348, 4)
(449, 128, 128, 3)
(449, 4)


#### Build a convolutional neural network

In [11]:
#Declare a sequential model
model = Sequential()
#CNN input layer 
model.add(SeparableConv2D(32, kernel_size =(3,3), 
                 activation='relu', 
                 depth_multiplier = 3,
                 padding = 'same',
                 input_shape=x_train.shape[1:]))

#Add hidden layers to the model 
model.add(Conv2D(32,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))
model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))
model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))

#Fully connected Dense layers 
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(num_classes, activation='softmax'))

#### Generate model summary

In [12]:
#Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
separable_conv2d_1 (Separabl (None, 128, 128, 32)      401       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 126, 126, 32)      9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 63, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 63, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 61, 61, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 30, 64)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 30, 64)        0         
__________

#### Compile model

In [13]:
#Configure training: Adam optimizer on categorical cross-entropy
#(matching the one-hot labels), tracking accuracy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

#### Train the neural network model

In [14]:
#Train on the training set and evaluate on the validation set after every
#epoch; the returned History object is kept in `history`
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 2696 samples, validate on 1348 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


#### Save model and weights

In [15]:
#Serialize model architecture to JSON.
#The file is written into save_path, next to the weights, so that the message
#printed below is accurate (it previously landed in the current working directory).
model_json = model.to_json()
with io.open(save_path + 'model.json', 'w', encoding='utf-8') as json_file:
    json_file.write(model_json)

#Serialize weights to HDF5
model.save_weights(save_path + "model.h5")

print("Model saved to: " + save_path)

Model saved to: /Users/huizhentang/Documents/Repos/Pet-projects/Ethnicity-Classifier/


#### Run evaluation on the trained model with the test set

In [18]:
#Evaluate on the held-out test set; `score` holds the loss followed by the
#accuracy metric requested at compile time
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 2.72974288596
Test accuracy: 0.436525612472
