# Ethnicity classifier with a Conv Net

In [None]:
# Notebook authorship and status metadata
__author__ = "Dr. Joann H. Tang & Dr. Rahul Remanan"
__copyright__ = "Copyright 2018"
__email__ = "eagtang2007@gmail.com,rahul@remanan.net"
__status__ = "Prototype"

In [None]:
import keras
import keras.utils
from keras.models import Sequential
#Core layers
from keras.layers import Dense, Dropout, Activation, Flatten
#CNN layers
from keras.layers import SeparableConv2D, Conv2D, MaxPooling2D

from sklearn.utils import shuffle 
import numpy as np
import h5py

from PIL import Image
import glob
import gc

#### Set the directory path

In [None]:
# Directory prefix that extract_pixel_values() searches for the *.jpg images
path = "./Pooled/"
# File the trained model is written to, and that weights are loaded from
save_path = "./model_weights.h5"
# When True, weights are loaded from save_path before training starts
load_trained_model = True

#### Function for extracting pixel values from images

In [None]:
def extract_pixel_values(prefix,y_label,width,height):
    """
    Load every JPEG whose file name starts with ``prefix``, resize it to the
    given size and return the pixel values together with a constant label.

    The images are looked up under the module-level ``path`` directory.

    Arguments:
    prefix -- The images for each class were labeled with a specific prefix
    y_label -- The number assigned to represent a specific ethnicity group
    width, height -- the width and height (in pixels) of the target image size

    Returns:
    X -- pixel values of images (unscaled, float64),
         numpy array of shape (number_of_images, height, width, 3)
    Y -- the ethnicity group label of images,
         numpy array of shape (number_of_images, 1)
    """
    ims = glob.glob(path + prefix + "*.jpg")
    n_images = len(ims)

    Y = np.full((n_images, 1), int(y_label), dtype=np.float64)
    # A PIL image converts to an array laid out as (height, width, channels),
    # so the buffer must be allocated rows-first to support non-square sizes.
    X = np.zeros((n_images, height, width, 3))
    for i, im in enumerate(ims):
        # The context manager closes the underlying file handle promptly.
        with Image.open(im) as img:
            # Force three channels: grayscale, CMYK or RGBA inputs would
            # otherwise not fit into the (height, width, 3) slot.
            # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS,
            # which was removed in Pillow 10.
            img_resized = img.convert("RGB").resize((width, height), Image.LANCZOS)
        X[i] = np.asarray(img_resized)
    return X, Y

#### Function for splitting dataset into training, validation and test sets

In [None]:
def split_dataset(X,Y,percentage):
    """
    Randomly assign a defined percentage of the dataset to the training set
    and divide the rest evenly between the validation set and the test set.

    The three sets are disjoint: every sample ends up in at most one of them,
    so no training sample can leak into the validation or test data.

    Arguments:
    X -- numpy array of feature data, here, they are the pixel values of images
    Y -- numpy array of label data, one row per sample
    percentage -- the fraction (0..1) of the total dataset assigned to the training set

    Returns:
    x_train -- pixel values of images in the training set
    y_train -- labels of images in the training set
    x_val -- pixel values of images in the validation set
    y_val -- labels of images in the validation set
    x_test -- pixel values of images in the test set
    y_test -- labels of images in the test set

    """
    n_samples = len(Y)
    n_train = int(round(n_samples * percentage))
    n_holdout = int(round(n_samples * 0.5 * (1 - percentage)))

    # One permutation cut into consecutive slices guarantees that no index is
    # drawn twice. Independent random draws with replacement would produce
    # duplicates and overlapping sets.
    order = np.random.permutation(n_samples)
    idx_train = order[:n_train]
    idx_val = order[n_train:n_train + n_holdout]
    # If rounding makes the requested sizes exceed n_samples, slicing simply
    # yields a shorter (possibly empty) trailing set instead of failing.
    idx_test = order[n_train + n_holdout:n_train + 2 * n_holdout]

    x_train = X[idx_train]
    y_train = Y[idx_train]
    x_val = X[idx_val]
    y_val = Y[idx_val]
    x_test = X[idx_test]
    y_test = Y[idx_test]
    return x_train, y_train, x_val, y_val, x_test, y_test

#### Generating dataset
This project uses the United States Census Bureau classification of ethnicities in the US.

Currently the dataset is limited to four ethnic groups in the US:
1) Asian American

2) Black and African American

3) Caucasian/White and European American

4) Hispanic and Latino American

The dataset is missing information on:
1) Native American and Alaska Native

2) Native Hawaiian and other Pacific Islander

#### Ethnicity group -- Asian American

In [None]:
# Images with the "A" file-name prefix are loaded at 128x128 and labelled 0
X, Y = extract_pixel_values(prefix="A", y_label=0, width=128, height=128)
# 60% of the samples are requested for training; the rest is shared
# between validation and test
(x1_train, y1_train,
 x1_val, y1_val,
 x1_test, y1_test) = split_dataset(X, Y, percentage=0.6)

#### Ethnicity group -- Black and African American

In [None]:
# Images with the "B" file-name prefix are loaded at 128x128 and labelled 1
X, Y = extract_pixel_values(prefix="B", y_label=1, width=128, height=128)
# 60% of the samples are requested for training; the rest is shared
# between validation and test
(x2_train, y2_train,
 x2_val, y2_val,
 x2_test, y2_test) = split_dataset(X, Y, percentage=0.6)

#### Ethnicity group -- Caucasian/White and European American

In [None]:
# Images with the "C" file-name prefix are loaded at 128x128 and labelled 2
X, Y = extract_pixel_values(prefix="C", y_label=2, width=128, height=128)
# 60% of the samples are requested for training; the rest is shared
# between validation and test
(x3_train, y3_train,
 x3_val, y3_val,
 x3_test, y3_test) = split_dataset(X, Y, percentage=0.6)

#### Ethnicity group -- Hispanic and Latino Americans

In [None]:
# Images with the "H" file-name prefix are loaded at 128x128 and labelled 3
X, Y = extract_pixel_values(prefix="H", y_label=3, width=128, height=128)
# 60% of the samples are requested for training; the rest is shared
# between validation and test
(x4_train, y4_train,
 x4_val, y4_val,
 x4_test, y4_test) = split_dataset(X, Y, percentage=0.6)

#### Create training dataset 

In [None]:
# Take the same number of samples from every group (the size of the smallest
# one) so that all four classes contribute equally to the training set.
n = min(map(len, (y1_train, y2_train, y3_train, y4_train)))
x_train = np.concatenate(
    [group[:n] for group in (x1_train, x2_train, x3_train, x4_train)], axis=0)
y_train = np.concatenate(
    [group[:n] for group in (y1_train, y2_train, y3_train, y4_train)], axis=0)

#### Create validation dataset 

In [None]:
# Take the same number of samples from every group (the size of the smallest
# one), mirroring how the training set is assembled, so that validation
# metrics are not dominated by the largest class.
n = min(len(y1_val),len(y2_val),len(y3_val),len(y4_val))
x_val = np.concatenate((x1_val[0:n], x2_val[0:n], x3_val[0:n], x4_val[0:n]), axis=0)
y_val = np.concatenate((y1_val[0:n], y2_val[0:n], y3_val[0:n], y4_val[0:n]), axis=0)

#### Randomize the training and validation datasets

In [None]:
# Features and labels are passed to shuffle() together so that each image
# stays paired with its label; the fixed random_state makes the order repeatable.
x_train, y_train = shuffle(x_train, y_train, random_state=1024)
x_val, y_val = shuffle(x_val, y_val, random_state=1024)

#### Ensuring data casting to the right data type

In [None]:
# Store the pixel data as 32-bit floats before it is scaled and fed to the network
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)

#### Feature normalization

In [None]:
# Divide the pixel values by 255 in place (no extra copy of the arrays is made)
np.divide(x_train, 255, out=x_train)
np.divide(x_val, 255, out=x_val)

#### Setting up parameters for the classifier

In [None]:
# Number of samples per batch, passed to model.fit()
batch_size = 64
# Number of ethnicity groups; sets the one-hot width and the softmax output size
num_classes = 4
# Number of passes over the training data, passed to model.fit()
epochs = 5
# Rate handed to every Dropout layer of the network
dropout = 0.7

#### Convert class vectors to binary class matrices

In [None]:
# One-hot encode the integer class labels (one column per class)
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes=num_classes)

#### Build a convolutional neural network

In [None]:
#Declare a sequential model
model = Sequential()
#CNN input layer 
model.add(SeparableConv2D(32, kernel_size =(3,3), 
                 activation='relu', 
                 depth_multiplier = 3,
                 padding = 'same',
                 input_shape=x_train.shape[1:]))

#Add hidden layers to the model 
model.add(Conv2D(32,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))
model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))
model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(dropout))

#Fully connected Dense layers 
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(num_classes, activation='softmax'))

#### Generate model summary

In [None]:
# Print an overview of the layers that make up the network
model.summary()

#### Compile model

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss for the
# one-hot labels, and accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import io
import json

# Export the network architecture to 'model.config'.
# NOTE(review): model.to_json() appears to already return a JSON string, so
# the file holds that string JSON-encoded a second time; a reader would need
# to json.load() it before rebuilding the model -- confirm against the loader.
model_json = model.to_json()
with io.open('model.config', 'w', encoding='utf-8') as f:
    json.dump(model_json, f, ensure_ascii=False)

In [None]:
import os

# Resume from previously saved weights only when the file actually exists, so
# that a first run (before anything has been saved) can still go on to train.
if load_trained_model:
    if os.path.isfile(save_path):
        model.load_weights(save_path, by_name = False)
    else:
        print("No saved weights found at " + save_path + "; training from scratch.")

#### Train the neural network model

In [None]:
# Train on the training set, evaluating on the validation set after each
# epoch; the returned object is kept in `history`
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

#### Save the trained model (architecture and weights)

In [None]:
# Write the trained model to save_path, the same file the weights are loaded
# from when load_trained_model is set.
# NOTE(review): model.save() stores the whole model, not only the weights -- confirm this is intended.
model.save(save_path)

#### Run evaluation on the trained model

In [None]:
# Assemble the held-out test set from the per-group test splits and apply the
# same preprocessing as for the training data: float32 cast, scaling by 255
# and one-hot encoded labels.
x_test = np.concatenate((x1_test, x2_test, x3_test, x4_test), axis=0).astype('float32') / 255
y_test = keras.utils.to_categorical(
    np.concatenate((y1_test, y2_test, y3_test, y4_test), axis=0), num_classes)

# score[0] is the loss, score[1] the accuracy metric configured at compile time
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])