# Ethnicity classifier with a Conv Net

In [None]:
# Notebook metadata: authors, copyright year, contact addresses and
# maturity status of this code.
__author__ = "Joann H. Tang, Dr. Rahul Remanan"
__copyright__ = "Copyright 2018"
__email__ = "eagtang2007@gmail.com, rahul@remanan.net"
__status__ = "Prototype"

### Import dependent libraries

In [None]:
import keras
import keras.utils
from keras.models import Sequential
#Core layers
from keras.layers import Dense, Dropout, Activation, Flatten
#CNN layers
from keras.layers import SeparableConv2D, Conv2D, MaxPooling2D

from sklearn.utils import shuffle

import numpy as np

import h5py

from PIL import Image
import glob
import gc

### Set the directory path

In [None]:
# Directory that extract_pixel_values searches for "<prefix>*.jpg" images.
# The trailing slash matters: the glob pattern is built by plain string
# concatenation.
path = "./Pooled/"
# HDF5 file the model is saved to after training and from which weights
# are loaded before training.
save_path = "./model_weights.h5"
# When True, weights are loaded from save_path before training starts.
load_trained_model = True

In [None]:
def extract_pixel_values(prefix,y_label):
    """
    Extract pixel values from the images of one class.

    Every "<prefix>*.jpg" file found in the module-level ``path`` directory
    is loaded, in sorted filename order, and converted to 3-channel RGB.

    Arguments:
    prefix -- The images for each class were labeled with a specific prefix
    y_label -- The number assigned to represent a specific ethnicity group
    Returns:
    X -- pixel values of images,
         numpy array of shape(number_of_images,250,250,3)
    Y -- the ethnicity group label of images,
         numpy array of shape(number_of_images,1)
    Raises:
    ValueError -- if an image is not 250x250 pixels
    """
    # glob returns matches in arbitrary order; sorting makes the dataset
    # (and therefore any seeded split of it) reproducible across runs.
    ims = sorted(glob.glob(path + prefix + "*.jpg"))
    Y = np.full((len(ims), 1), int(y_label), dtype=np.float64)
    X = np.zeros((len(ims), 250, 250, 3))  # All images were resized to 250X250
    for i, im in enumerate(ims):
        # The context manager releases the file handle right away instead of
        # leaving one open per image until garbage collection.
        with Image.open(im) as img:
            # Force three channels so grayscale or CMYK JPEGs still fit the
            # (250, 250, 3) slot.
            pixels = np.asarray(img.convert("RGB"))
        if pixels.shape != X.shape[1:]:
            raise ValueError(
                "Image " + im + " has shape " + str(pixels.shape)
                + ", expected " + str(X.shape[1:])
            )
        X[i] = pixels
    return X, Y

In [None]:
def split_dataset(X,Y,percentage):
    """
    Shuffle and split the dataset into disjoint train and validation subsets

    Every sample ends up in exactly one of the two subsets: the indices are
    drawn as a single random permutation that is cut in two, so no sample is
    duplicated and none is shared between training and validation.

    Arguments:
    X -- numpy array of feature data, here, they are the pixel values of images
    Y -- numpy array of label data, one entry (or row) per sample in X
    percentage -- the fraction (between 0 and 1) of the total dataset that is
                  assigned to the training set

    Returns:
    x_train -- pixel values of images in the training set
    y_train -- labels of images in the training set
    x_val -- pixel values of images in the validation set
    y_val -- labels of images in the validation set

    Raises:
    ValueError -- if percentage is outside the range [0, 1]
    """
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1, got " + str(percentage))
    n_samples = len(Y)
    print ("Group sample size: " + str(n_samples))
    # Sampling indices with randint draws WITH replacement, which duplicates
    # samples and leaks training images into the validation set. A permutation
    # visits each index exactly once.
    order = np.random.permutation(n_samples)
    n_train = int(round(n_samples * percentage))
    idx_train = order[:n_train]
    idx_val = order[n_train:]
    x_train = X[idx_train]
    y_train = Y[idx_train]
    x_val = X[idx_val]
    y_val = Y[idx_val]
    return x_train, y_train, x_val, y_val

### Generating dataset

This project uses the [United States Census Bureau classification of ethnicities in the US](https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States).

#### Currently the dataset is limited to four ethnic groups in the US:

1) Asian American

2) Black and African American

3) Caucasian/White and European American

4) Hispanic and Latino American

#### The dataset is missing information on:

1) Native American and Alaska Native

2) Native Hawaiian and other Pacific Islander

### Ethnicity group -- Asian American

In [None]:
# Load every image whose filename starts with "A", label it as class 0,
# then divide the group using 0.8 as the training fraction.
X, Y = extract_pixel_values(prefix="A", y_label=0)
x1_train, y1_train, x1_val, y1_val = split_dataset(X, Y, percentage=0.8)

### Ethnicity group -- Black and African American

In [None]:
# Load every image whose filename starts with "B", label it as class 1,
# then divide the group using 0.8 as the training fraction.
X, Y = extract_pixel_values(prefix="B", y_label=1)
x2_train, y2_train, x2_val, y2_val = split_dataset(X, Y, percentage=0.8)

### Ethnicity group -- Caucasian/White and European American 

In [None]:
# Load every image whose filename starts with "C", label it as class 2,
# then divide the group using 0.8 as the training fraction.
X, Y = extract_pixel_values(prefix="C", y_label=2)
x3_train, y3_train, x3_val, y3_val = split_dataset(X, Y, percentage=0.8)

### Ethnicity group -- Hispanic and Latino Americans

In [None]:
# Load every image whose filename starts with "H", label it as class 3,
# then divide the group using 0.8 as the training fraction.
X, Y = extract_pixel_values(prefix="H", y_label=3)
x4_train, y4_train, x4_val, y4_val = split_dataset(X, Y, percentage=0.8)

### Create training data

In [None]:
# Stack the four per-group training subsets along the first (sample) axis;
# features and labels are stacked in the same group order so rows stay aligned.
x_train = np.concatenate([x1_train, x2_train, x3_train, x4_train])
y_train = np.concatenate([y1_train, y2_train, y3_train, y4_train])

### Create validation data

In [None]:
# Stack the four per-group validation subsets along the first (sample) axis;
# features and labels are stacked in the same group order so rows stay aligned.
x_val = np.concatenate([x1_val, x2_val, x3_val, x4_val])
y_val = np.concatenate([y1_val, y2_val, y3_val, y4_val])

### Randomize the training and validation datasets

In [None]:
# The concatenated arrays are still grouped by class, so mix them up.
# Features and labels are passed to the same shuffle call so each image
# keeps its label; the fixed random_state makes the order repeatable.
x_train, y_train = shuffle(x_train, y_train, random_state=1024)
x_val, y_val = shuffle(x_val, y_val, random_state=1024)

### Cast the data to the right data type

In [None]:
# Convert the pixel arrays to single precision before they are fed to the
# network.
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)

### Feature normalization 

In [None]:
# Scale the pixel values in place by 1/255.
# NOTE(review): this maps values to [0, 1] assuming the source images are
# 8-bit (0-255) -- confirm against the dataset.
x_train /= 255
x_val /= 255

### Setting up parameters for the classifier

In [None]:
# Number of samples per gradient update; passed to model.fit.
batch_size = 64
# Number of output classes: one per ethnicity group loaded above (labels 0-3).
num_classes = 4
# Number of passes over the training data; passed to model.fit.
epochs = 5
# Rate handed to every Dropout layer in the network. In Keras this is the
# fraction of inputs that is dropped, so 0.7 discards 70% during training.
dropout = 0.7

### Convert class vectors to binary class matrices

In [None]:
# One-hot encode the class labels: each label becomes a row of length
# num_classes, as required by the categorical cross-entropy loss.
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes=num_classes)

### Build a convolutional neural network

In [None]:
# Assemble the network as a single ordered list of layers.
model = Sequential([
    # Input stage: separable convolution over the raw image; 'same' padding
    # is requested and the input shape is taken from the training data
    # (everything but the sample axis).
    SeparableConv2D(32,
                    kernel_size=(3, 3),
                    activation='relu',
                    depth_multiplier=3,
                    padding='same',
                    input_shape=x_train.shape[1:]),

    # Feature extractor: three convolution -> 2x2 max-pool -> dropout stages.
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(dropout),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(dropout),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(dropout),

    # Classifier head: flatten the feature maps, one hidden dense layer, then
    # a softmax over the num_classes outputs.
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(dropout),
    Dense(num_classes, activation='softmax'),
])

### Generate model summary

In [None]:
# Print the model summary for a quick sanity check of the architecture
# before training.
model.summary()

### Compile model

In [None]:
# Configure training: categorical cross-entropy matches the one-hot labels
# and softmax output; Adam is used with its default settings; accuracy is
# tracked as the only extra metric.
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [None]:
# Write the model architecture to 'model.config' as UTF-8 text.
# NOTE(review): to_json() already returns a JSON string, so json.dumps()
# encodes it a second time and the file holds a quoted, escaped JSON string.
# Readers must json.loads() the file content before handing it to
# model_from_json -- confirm this is intended before simplifying.
import io
import json

model_json = model.to_json()
encoded_config = json.dumps(model_json, ensure_ascii=False)
with io.open('model.config', 'w', encoding='utf-8') as f:
    f.write(encoded_config)

In [None]:
# Optionally warm-start from previously saved weights.
# On a first run the file at save_path does not exist yet; in that case keep
# the freshly initialised weights and say so, instead of aborting the
# notebook with a file-not-found error.
import os

if load_trained_model:
    if os.path.isfile(save_path):
        model.load_weights(save_path, by_name = False)
    else:
        print("No saved weights found at " + save_path + "; training from scratch.")

### Train the neural network model

In [None]:
# Train the network; the validation split is evaluated at the end of every
# epoch and a progress bar is shown (verbose=1). The returned History object
# is kept in `history` for later inspection.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val))

### Save the trained model (architecture and weights)

In [None]:
# Persist the trained model to save_path. Note that model.save writes the
# whole model (architecture, weights and optimizer state), not only the
# weights; the weight-loading cell earlier in this notebook reads the
# weights back from this same file.
model.save(save_path)

### Run evaluation on the trained model

In [None]:
# Score the trained model on the validation split without progress output.
# With 'accuracy' as the only compiled metric, evaluate() returns
# [loss, accuracy].
score = model.evaluate(x_val, y_val, verbose=0)
val_loss, val_accuracy = score
print('Test loss:', val_loss)
print('Test accuracy:', val_accuracy)