# multi-ethnicity classifier with a Conv Net
There are four ethnicity groups -- Asian, Black, Caucasian, Hispanic 

In [143]:
__author__ = "Joann H. Tang"
__copyright__ = "Copyright 2018"
__email__ = "eagtang2007@gmail.com"
__status__ = "Prototype"

In [144]:
import keras
import keras.utils
from keras.models import Sequential
#Core layers
from keras.layers import Dense, Dropout, Activation, Flatten
#CNN layers
from keras.layers import SeparableConv2D, Conv2D, MaxPooling2D

import numpy as np

from PIL import Image
import glob

from sklearn.utils import shuffle

In [145]:
#Set up the directory where the images are stored
path = "/Users/huizhentang/Desktop/Insight demo/Datasets/Tarrlab/Pooled/"

In [146]:
def extract_pixel_values(prefix,y_label):
    """
    Extract pixel values from images 
    
    Arguments:
    prefix -- The images for each class were labeled with a specific prefix
    y_label -- The number assigned to represent a specific ethnicity group
    Returns:
    X -- pixel values of images, 
         numpy array of shape(number_of_images,250,250,3)
    Y -- the ethnicity group label of images, 
         numpy array of shape(number_of_images)
    """
    ims = glob.glob(path + prefix + "*.jpg")
    Y = np.zeros((np.size(ims),1))
    Y[:] = int(y_label)
    X = np.zeros((np.size(ims),250,250,3)) #All images were resized to 250X250
    for i, im in enumerate(ims):
        img = Image.open(im)
        X[i] = np.asarray(img)
    return X, Y

In [147]:
def split_dataset(X,Y,percentage):
    """
    Shuffle and split the dataset into train and test subsets
   
    Arguments:
    X -- numpy array of feature data, here, they are the pixel values of images
    Y -- numpy array of label data
    percentage -- the percentage of the total dataset that assigned as training set
    
    Returns:
    x_train -- pixel values of images in the training set
    y_train -- labels of images in the training set
    x_test -- pixel values of images in the test set
    y_test -- labels of images in the test set

    """
    idx_train = np.random.randint(np.size(Y), size=round(np.size(Y)*percentage))
    idx_test = np.random.randint(np.size(Y), size=round(np.size(Y)*(1-percentage)))
    x_train = X[idx_train,:]
    y_train = Y[idx_train]
    x_test = X[idx_test,:]
    y_test = Y[idx_test]
    return x_train, y_train, x_test, y_test

In [148]:
#All the images are saved in the same folder under the same directory defined by "path". The 
#file names of each ethnicity group start with the same letter. "A" for Asian, "B" for Black, 
#"C" for Caucasian, "H" for hispanic. 

#Ethnicity group -- Asian 
X, Y = extract_pixel_values("A",0)                          
x1_train,y1_train,x1_test,y1_test = split_dataset(X,Y,0.8)
#Ethnicity group -- Black 
X, Y = extract_pixel_values("B",1)                          
x2_train,y2_train,x2_test,y2_test = split_dataset(X,Y,0.8)
#Ethnicity group -- Caucasian  
X, Y = extract_pixel_values("C",2)                          
x3_train,y3_train,x3_test,y3_test = split_dataset(X,Y,0.8)
#Ethnicity group -- Hispanic
X, Y = extract_pixel_values("H",3)                          
x4_train,y4_train,x4_test,y4_test = split_dataset(X,Y,0.8)

x_train = np.concatenate((x1_train, x2_train, x3_train, x4_train), axis=0)
y_train = np.concatenate((y1_train, y2_train, y3_train, y4_train), axis=0)

x_test = np.concatenate((x1_test, x2_test, x3_test, x4_test), axis=0)
y_test = np.concatenate((y1_test, y2_test, y3_test, y4_test), axis=0)

In [149]:
#Shuffle the subsets
shuffle(x_train, y_train, random_state=42)
shuffle(x_test, y_test, random_state=42)

[array([[[[ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          ..., 
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.]],
 
         [[ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          ..., 
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.]],
 
         [[ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          ..., 
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.]],
 
         ..., 
         [[ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          ..., 
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.]],
 
         [[ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          [ 255.,  255.,  255.],
          ..., 
  

In [150]:
#Assure the data is casted to the right datatype. 
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

#Feature normalization 
x_train /= 255
x_test /= 255

In [151]:
#Setting up  parameters for the classifier
batch_size = 64
num_classes = 4
epochs = 30

In [152]:
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [153]:
#Declare a sequential model
model = Sequential()
#CNN input layer 
model.add(SeparableConv2D(32, kernel_size =(3,3), 
                 activation='relu', 
                 depth_multiplier = 3,
                 padding = 'same',
                 input_shape=x_train.shape[1:]))

#Add hidden layers to the model 
model.add(Conv2D(32,3,3, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Conv2D(64,3,3, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Conv2D(64,3,3, activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))

#Fully connected Dense layers 
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
separable_conv2d_2 (Separabl (None, 250, 250, 32)      401       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 248, 248, 32)      9248      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 124, 124, 32)      0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 124, 124, 32)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 122, 122, 64)      18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 61, 61, 64)        0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 61, 61, 64)        0         
__________

  # This is added back by InteractiveShellApp.init_path()
  


In [154]:
#Compile model
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [142]:
#Training
from keras.preprocessing.image import ImageDataGenerator

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                   validation_data=(x_test, y_test))


score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

ValueError: Error when checking target: expected dense_2 to have 2 dimensions, but got array with shape (4861, 4, 4)