In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

###Preprocessing
As said before, we intend to normalize and truncate our dataset to 547 for each emotion (which still comes out to be a hefty 3829 images). Normalizing will ensure optimal comparisons and truncating the dataset will make computations inherently faster.

We intend to DROP the emotion and usage categories in order to find our own classifications.

In [27]:
df = pd.read_csv('fer2013.csv')
print(df)

       emotion                                             pixels     Usage
0            0  70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...  Training
1            0  151 150 147 155 148 133 111 140 170 174 182 15...  Training
2            2  231 212 156 164 174 138 161 173 182 200 106 38...  Training
3            4  24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...  Training
4            6  4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...  Training
...        ...                                                ...       ...
15498        3  213 213 213 212 212 213 190 153 119 137 174 17...  Training
15499        4  196 198 200 199 198 196 197 198 204 202 159 11...  Training
15500        0  196 207 185 211 165 155 176 179 174 203 193 20...  Training
15501        5  2 6 10 12 12 16 37 120 166 182 196 203 207 213...  Training
15502        0  7 7 5 6 10 15 29 46 42 24 32 49 27 18 28 15 17...       NaN

[15503 rows x 3 columns]


In [28]:
LIMIT = 547

def isLimit(map):
    for key in map:
        if map[key] < LIMIT:
            return False
    return True

def saveData(data):
    with open('face-emo.csv', 'w') as file:
        file.writelines(data)
    print("data saved under face-emo.csv")
    return


"""
    { 0: "Angry", 1: "Disgust", 2: "Fear", 3: "Happy", 4: "Sad", 5: "Surprize", 6: "Neutral" }

    This function is used to balance the data between emotions so we have an
    even number of pixels for each categories. The new data is saved in a new
    file called face-emo.csv.

    TODO: We might need to remove the Usage Column
"""
def fetch_data():
    data = []
    classes = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

    with open('fer2013.csv', 'r') as file:
        data.append(file.readline())

        for line in file:
            emo = int(line.split(',')[0])

            if classes[emo] != LIMIT:
                data.append(line)
                classes[emo] += 1

            if isLimit(classes) == True:
                break

        print(classes)

        # print(f"length of data should equal 547 * 7. data = {len(data) - 1} == {547 * 7}")
        # Saving new data
        saveData(data)

    return


if __name__ == "__main__":
    fetch_data()

{0: 547, 1: 246, 2: 547, 3: 547, 4: 547, 5: 547, 6: 547}
data saved under face-emo.csv


In [29]:
modified_df = pd.read_csv('face-emo.csv')
print(modified_df)

      emotion                                             pixels     Usage
0           0  70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...  Training
1           0  151 150 147 155 148 133 111 140 170 174 182 15...  Training
2           2  231 212 156 164 174 138 161 173 182 200 106 38...  Training
3           4  24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...  Training
4           6  4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...  Training
...       ...                                                ...       ...
3523        1  247 247 247 247 247 245 253 167 188 215 202 19...  Training
3524        1  0 0 0 0 3 0 63 191 196 208 213 212 213 219 221...  Training
3525        1  223 225 226 228 230 233 231 234 232 240 214 11...  Training
3526        1  81 77 108 121 98 81 101 93 128 137 118 118 136...  Training
3527        1  214 211 209 205 205 160 102 93 109 121 110 133...  Training

[3528 rows x 3 columns]


In [30]:
#normalizing pixel data of our data frame

from sklearn.preprocessing import MinMaxScaler

#each pixel entry in the data frame is of type string, so first we must convert them into a list of arrays
#function to convert string into array of numbers
def string_to_array(numbers_string):
    number_strings = numbers_string.split()
    numbers_array = np.array([float(number) for number in number_strings])
    return numbers_array

# Apply the function to the DataFrame column
modified_df['pixels'] = modified_df['pixels'].apply(string_to_array)


#drop the unnecessary columns (label column and column with string data)
pixelnormdf = modified_df.drop(columns =['emotion', 'Usage'])

print(pixelnormdf)

scaler = MinMaxScaler()

#since minmax scalar only works on one value, we must iterate over the array in each row of number in the data frame
for i, row in pixelnormdf.iterrows():
    scaled_array = scaler.fit_transform(np.array(row['pixels']).reshape(-1, 1))
    pixelnormdf.loc[i, 'pixels'] = scaled_array.flatten()

print(pixelnormdf)

                                                 pixels
0     [70.0, 80.0, 82.0, 72.0, 58.0, 58.0, 60.0, 63....
1     [151.0, 150.0, 147.0, 155.0, 148.0, 133.0, 111...
2     [231.0, 212.0, 156.0, 164.0, 174.0, 138.0, 161...
3     [24.0, 32.0, 36.0, 30.0, 32.0, 23.0, 19.0, 20....
4     [4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
...                                                 ...
3523  [247.0, 247.0, 247.0, 247.0, 247.0, 245.0, 253...
3524  [0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 63.0, 191.0, 19...
3525  [223.0, 225.0, 226.0, 228.0, 230.0, 233.0, 231...
3526  [81.0, 77.0, 108.0, 121.0, 98.0, 81.0, 101.0, ...
3527  [214.0, 211.0, 209.0, 205.0, 205.0, 160.0, 102...

[3528 rows x 1 columns]
                                                 pixels
0     [0.2964824120603015, 0.34673366834170855, 0.35...
1     [0.6894977168949772, 0.684931506849315, 0.6712...
2     [0.9058823529411765, 0.8313725490196078, 0.611...
3     [0.053811659192825115, 0.08968609865470852, 0....
4     [0.01659751037344

However, we need to also convert these back into a string. Allows us to do things like .split and just preserves the original nature of the dataset

In [31]:
#https://sparkbyexamples.com/python/convert-list-to-string-in-python/#:~:text=The%20format()%20method%20in,string%20present%20in%20the%20list.

def convertEntriesToString():
  for i in range(len(pixelnormdf)):
    temp = ''
    for eachNumber in pixelnormdf['pixels'][i]:
      if i == len(pixelnormdf): #If the last number
        temp = temp + str(eachNumber) #To prevent leading space at the last number
      else:
        temp = temp + str(eachNumber) + " "

    pixelnormdf['pixels'][i] = temp

convertEntriesToString()


df = pixelnormdf

###Building the first model

As mentioned before, we intend to use kmeans clustering to find classifications for the images. This will make it so we could find our own classes/emotions and see if our model can possible identify any others.

In [42]:
print(df)
print(df["pixels"].apply(lambda n:(len(n.split()))))

                                                 pixels
0     0.2964824120603015 0.34673366834170855 0.35678...
1     0.6894977168949772 0.684931506849315 0.6712328...
2     0.9058823529411765 0.8313725490196078 0.611764...
3     0.053811659192825115 0.08968609865470852 0.107...
4     0.016597510373443983 0.0 0.0 0.0 0.0 0.0 0.0 0...
...                                                 ...
3523  0.9686098654708521 0.9686098654708521 0.968609...
3524  0.0 0.0 0.0 0.0 0.012145748987854251 0.0 0.255...
3525  0.8991935483870968 0.907258064516129 0.9112903...
3526  0.22325581395348837 0.20465116279069767 0.3488...
3527  0.8805309734513275 0.8672566371681416 0.858407...

[3528 rows x 1 columns]
0       2304
1       2304
2       2304
3       2304
4       2304
        ... 
3523    2304
3524    2304
3525    2304
3526    2304
3527    2304
Name: pixels, Length: 3528, dtype: int64


In [67]:
emotionLabels = {0:"Angry", 1:"Disgust", 2:"Fear", 3:"Happy", 4:"Sad", 5:"Surprise", 6:"Neutral"}

import re

def displayClasses():
  #The first and second instances of each emotion class.
  #Feel free to change the indices array to look at other images in the dataset
  #However, you may need to change the size of the plot if so. Supports 7 rows and columns now.
    indices = [0,299,2,7,3,15,4,1,388,5,8,6,26,12]
    plt.figure(figsize=(25,25))

    for i,locationIt in zip(indices,range(0,len(indices))):
        #Get the entry and convert it to an array

        #Can see it's fine
        print(df['pixels'][i])

        #Convert array to 48*48 matrix
        pixelMatrix = np.reshape(pixelArray,(48,48))

        #Subplot/plot settings
        plt.subplot(7,7,locationIt+1)
        plt.axis("off")
        plt.imshow(pixelMatrix,cmap='gray')
        plt.title(emotionLabels[df['emotion'][i]]+" from entry "+str(i))

displayClasses()


0.2964824120603015 0.34673366834170855 0.35678391959798994 0.3065326633165829 0.23618090452261303 0.23618090452261303 0.24623115577889448 0.26130653266331655 0.21608040201005024 0.23618090452261303 0.24623115577889448 0.185929648241206 0.39195979899497485 0.5226130653266332 0.5527638190954773 0.542713567839196 0.5226130653266332 0.4974874371859297 0.4371859296482412 0.40201005025125625 0.36683417085427134 0.36683417085427134 0.3969849246231156 0.4422110552763819 0.4974874371859297 0.5778894472361809 0.6633165829145728 0.7135678391959799 0.7386934673366834 0.8040201005025126 0.7939698492462312 0.8090452261306532 0.7939698492462312 0.7738693467336684 0.592964824120603 0.4974874371859297 0.5125628140703518 0.4824120603015075 0.4221105527638191 0.3417085427135678 0.27638190954773867 0.2562814070351759 0.22613065326633164 0.23115577889447236 0.25125628140703515 0.20603015075376885 0.1608040201005025 0.1507537688442211 0.271356783919598 0.25125628140703515 0.23618090452261303 0.2311557788944

NameError: ignored

<Figure size 2500x2500 with 0 Axes>