# Imports & test data preprocessing

In [56]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import classification_report 

In [4]:
# Website for the dataset:
# https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data?select=icml_face_data.csv

CLASS_NAMES = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]
CLASS_NAMES_WITHOUT_DISGUST = ["Angry", "Fear", "Happy", "Sad", "Surprise", "Neutral"]
FILE_NAME = "train.csv" # Insert file name
WHITE_IMAGES = [6458,  7629, 10423, 11286, 13148, 13402, 13988, 15894, 22198, 22927, 28601, 59]

In [5]:
data_path = ["data"] # Insert data file path
file_path = os.sep.join(data_path + [FILE_NAME])
data = pd.read_csv(file_path)
data = data.drop(index=WHITE_IMAGES, axis=0)

In [6]:
data_exclude_disgust = data[data['emotion'] != 1] # Drops the emotion Disgust
data_exclude_disgust = data_exclude_disgust.replace({
    2 : 1,
    3 : 2,
    4 : 3,
    5 : 4,
    6 : 5
})

In [7]:
def data_X_y(data):
    # Split data into X & y
    X = data.drop('emotion', axis='columns')
    y = data['emotion']

    # Reshapes X into 3D array
    X = [pixels.split(" ") for pixels in data["pixels"]]
    X = np.array(X)
    X = X.astype("int32")
    X = np.array([image.reshape(48, 48) for image in X])
    X = X/255.0
    X = X.reshape(len(X), 48, 48, 1)
    
    return X,y

In [8]:
X,y = data_X_y(data_exclude_disgust)

In [9]:
file_path = os.sep.join(data_path + ['emotions.csv'])
data_test = pd.read_csv(file_path)
test = ["PrivateTest", "PublicTest"]
data_test = data_test[data_test['Usage'].isin(test) ]

# If icml_face_data.csv is used
data_test.drop('Usage', axis=1, inplace=True)

data_test.head()

Unnamed: 0,emotion,pixels
28709,0,254 254 254 254 254 249 255 160 2 58 53 70 77 ...
28710,1,156 184 198 202 204 207 210 212 213 214 215 21...
28711,4,69 118 61 60 96 121 103 87 103 88 70 90 115 12...
28712,6,205 203 236 157 83 158 120 116 94 86 155 180 2...
28713,3,87 79 74 66 74 96 77 80 80 84 83 89 102 91 84 ...


In [10]:
data_test.shape

(7178, 2)

In [11]:
data_test = data_test[data_test['emotion'] != 1] # Drops the emotion Disgust
data_test = data_test.replace({
    2 : 1,
    3 : 2,
    4 : 3,
    5 : 4,
    6 : 5
})
data_test.shape, data_test.emotion.unique()

((7067, 2), array([0, 3, 5, 2, 1, 4]))

In [12]:
def reshape_X(X):
    num_training = X.shape[0]
    mask = list(range(num_training))
    X_reshape = X[mask]

    # Reshape the image data into rows
    X_reshape = np.reshape(X, (X.shape[0], -1))
    
    return X_reshape

In [13]:
X_test, y_test = data_X_y(data_test)

In [14]:
X_test = reshape_X(X_test)

# CNN Model testing

##### Only the 3 best CNN models will be used for the testing. Otherwise there would be too much models to test if we thoroughly tested the worse models since these 3 models are the best combinations that came out of the previous models.

### More imports

In [15]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import os
from resizeimage import resizeimage
from PIL import Image, ImageOps
import numpy as np
import pandas as pd
from tqdm import tqdm

### Model loading

In [16]:
# CNNModelx1 = tf.keras.models.load_model("models/CNNmodelx-1")
# CNNModelx2 = tf.keras.models.load_model("models/CNNmodelx-2")
# CNNModelx3 = tf.keras.models.load_model("models/CNNmodelx-3")
# CNNModelx4 = tf.keras.models.load_model("models/CNNmodelx-4")
# CNNModelx5 = tf.keras.models.load_model("models/CNNmodelx-5")
# CNNModelx6 = tf.keras.models.load_model("models/CNNmodelx-6")
# CNNModelx7 = tf.keras.models.load_model("models/CNNmodelx-7")
# CNNModelx8 = tf.keras.models.load_model("models/CNNmodelx-8")
# CNNModelx9 = tf.keras.models.load_model("models/CNNmodelx-9")

CNNModel3 = tf.keras.models.load_model("models/CNNmodel3")
CNNModel4 = tf.keras.models.load_model("models/CNNmodel4")
CNNModel5 = tf.keras.models.load_model("models/CNNmodel5")
# CNNModel6 = tf.keras.models.load_model("models/CNNmodel6")

### Model predictions

In [17]:
%%time
# models = {0: {"model": CNNModelx1, "predictions": None}, 1: {"model": CNNModelx2, "predictions": None}, 
#           2: {"model": CNNModelx3, "predictions": None}, 3: {"model": CNNModelx4, "predictions": None}, 
#           4: {"model": CNNModelx5, "predictions": None}, 5: {"model": CNNModelx6, "predictions": None}, 
#           6: {"model": CNNModelx7, "predictions": None}, 7: {"model": CNNModelx8, "predictions": None}, 
#           8: {"model": CNNModelx9, "predictions": None}, 9: {"model": CNNModel3, "predictions": None}, 
#           10: {"model": CNNModel4, "predictions": None}, 11: {"model": CNNModel5, "predictions": None}
#          }

models = {0: {"model": CNNModel3, "predictions": None}, 
          1: {"model": CNNModel4, "predictions": None}, 
          2: {"model": CNNModel5, "predictions": None}
         }

X_test_resh = X_test.reshape(len(X_test), 48, 48, 1)

for key in tqdm(models.keys()):
    models[key]["predictions"] = models[key]["model"].predict(X_test_resh)

100%|██████████| 3/3 [04:26<00:00, 88.91s/it]

CPU times: user 50min 10s, sys: 1min 13s, total: 51min 23s
Wall time: 4min 26s





In [23]:
for key in tqdm(models.keys()):
    for x in range(len(models[key]["predictions"])):
        models[key]["predictions"][x] = np.argmax(models[key]["predictions"][x])
     

100%|██████████| 3/3 [00:00<00:00, 53.79it/s]


In [38]:
for key in tqdm(models.keys()):
     models[key]["predictions"] = models[key]["predictions"][:, :1]

100%|██████████| 3/3 [00:00<00:00, 44938.97it/s]


# Model results

### Model 1

##### This is one the three CNN models that peformed really great on the model evaluation. As you can see below it has a test accuracy of ~93%. This means from all the test samples it correctly classified ~93% of them. But to see whether it really performs as great as we want we also will have a look at the precision, recall and F1 score.

In [49]:
print(f"\nThe accuracy of the first model is: {CNNModel3.evaluate(X_test_resh, y_test)[1]}%")


The accuracy of the first model is: 0.9272676110267639%


##### The structure of the model can be seen below which has been trained on the random oversampled data. With Adam as the optimizer.

In [42]:
CNNModel3.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 48, 48, 64)        128       
_________________________________________________________________
batch_normalization (BatchNo (None, 48, 48, 64)        256       
_________________________________________________________________
dropout (Dropout)            (None, 48, 48, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 48, 48, 128)       73856     
_________________________________________________________________
batch_normalization_1 (Batch (None, 48, 48, 128)       512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 48, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 48, 48, 256)      

##### Here below we can see the precision, recall and F1 score of the model. The highest precision score is on the happy emotion which is 97%. So this tells us what proportion of happy classifications was actually correct. But the recall for the happy emotion is one of the lowest which means that of all the actual happy faces it only correctly identified 93% of them correctly. A reason why the recall is lower could be that since this model was trained on random oversampled data, the happy emotion had the most emotions meaning that the happy emotions didn't get oversampled. Which resulted in the model not being able to learn the invariants of the happy emotion.

##### The F1 score for each of the emotions are higher than 90% which is really high. This gives us a good comprehension that this model its performance is really high.

In [46]:
print(classification_report(y_test, models[0]["predictions"], target_names = CLASS_NAMES_WITHOUT_DISGUST))

              precision    recall  f1-score   support

       Angry       0.92      0.93      0.93       958
        Fear       0.91      0.91      0.91      1024
       Happy       0.97      0.93      0.95      1774
         Sad       0.90      0.92      0.91      1247
    Surprise       0.96      0.97      0.97       831
     Neutral       0.90      0.92      0.91      1233

    accuracy                           0.93      7067
   macro avg       0.93      0.93      0.93      7067
weighted avg       0.93      0.93      0.93      7067



## Model 2

##### This model was trained with the same model architecture as the first model. But instead of random oversampled we used smothe on this model to see whether this model would perform better or worse with a more complicated oversampling method. This has the same accuracy as the first model (off by 0.001%). But does this mean the precision, recall and F1 score are also the same? That is what we will be looking into.

In [51]:
print(f"\nThe accuracy of the second model is: {CNNModel4.evaluate(X_test_resh, y_test)[1]}%")


The accuracy of the second model is: 0.9258525371551514%


##### Like with the last model the surpise and happy emotion classifications are higher than the others. But in general compared with the previous model, this model has the same F1 score for happy. Even though the precision of the first model for happy was 0.97 and recall 0.93. And for this model the precision is 0.95 and the recall 0.95 which proves that the F1 score is basically the ratio between precision and recall.

In [52]:
print(classification_report(y_test, models[1]["predictions"], target_names = CLASS_NAMES_WITHOUT_DISGUST))

              precision    recall  f1-score   support

       Angry       0.90      0.93      0.92       958
        Fear       0.91      0.91      0.91      1024
       Happy       0.95      0.95      0.95      1774
         Sad       0.90      0.90      0.90      1247
    Surprise       0.95      0.97      0.96       831
     Neutral       0.94      0.90      0.92      1233

    accuracy                           0.93      7067
   macro avg       0.92      0.93      0.93      7067
weighted avg       0.93      0.93      0.93      7067



## Model 3

##### This model has been trained on the augmented data where the images have been horizontally flipped, rotated a bit and zoomed in and out. Using the augmented data the model will be able to better learn the invariants of the emotions. Looking at the test accuracy below the model has an accuracy of ~94% which is the highest scoring CNN model we have.

In [55]:
print(f"\nThe accuracy of the third model is: {CNNModel5.evaluate(X_test_resh, y_test)[1]}%")


The accuracy of the third model is: 0.9377387762069702%


##### The most noticable improvement of this model compared to the previous models is that this model has done better in classifying the fear emotion. This model its precision is 0.3% higher than the previous models. Which increased its F1 score to 0.93 which is the best one we got. Because fear has a resembles with surprise it was always lower than the others.

In [54]:
print(classification_report(y_test, models[2]["predictions"], target_names = CLASS_NAMES_WITHOUT_DISGUST))

              precision    recall  f1-score   support

       Angry       0.93      0.94      0.94       958
        Fear       0.94      0.91      0.93      1024
       Happy       0.97      0.94      0.95      1774
         Sad       0.90      0.94      0.92      1247
    Surprise       0.95      0.97      0.96       831
     Neutral       0.93      0.92      0.93      1233

    accuracy                           0.94      7067
   macro avg       0.94      0.94      0.94      7067
weighted avg       0.94      0.94      0.94      7067



In [70]:
from sklearn.metrics import confusion_matrix

In [77]:
print(confusion_matrix(models[0]["predictions"], y_test, normalize = "true", labels = []))

[[0.9160696  0.02763562 0.02149437 0.01535312 0.00307062 0.01637666]
 [0.00976562 0.90625    0.02636719 0.02539062 0.0078125  0.02441406]
 [0.00413956 0.00236546 0.9704317  0.01064459 0.00295683 0.00946186]
 [0.01877934 0.02425665 0.02973396 0.89514867 0.00391236 0.02816901]
 [0.0011919  0.02145411 0.00834327 0.00238379 0.96305125 0.00357569]
 [0.01669316 0.0127186  0.0317965  0.03338633 0.00158983 0.90381558]]
