In [1]:
import pandas as pd
import tensorflow as tf 
import matplotlib.pyplot as plt 
import numpy as np 

2025-06-06 17:36:56.755133: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("../data/icml_face_data.csv")
data.head()

Unnamed: 0,emotion,Usage,pixels
0,0,Training,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,0,Training,151 150 147 155 148 133 111 140 170 174 182 15...
2,2,Training,231 212 156 164 174 138 161 173 182 200 106 38...
3,4,Training,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,6,Training,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...


In [3]:
# train_data = data[data["Usage"] == "Training"]
# test_data = data[data["Usage"] == "Testing"]

# train_data.head()

print(data.columns)

Index(['emotion', ' Usage', ' pixels'], dtype='object')


Given that the column names contain spaces at random intervals, we perform some data cleaning tasks. 

In [4]:
data_columns = data.columns

for string in data_columns: 
    data = data.rename(columns = {string: string.strip()})

print(data.columns)

Index(['emotion', 'Usage', 'pixels'], dtype='object')


Now lets seperate the training and testing data. Lets first explore what Values the Usage column takes. 

In [5]:
data["Usage"].value_counts()

Usage
Training       28709
PublicTest      3589
PrivateTest     3589
Name: count, dtype: int64

So we will use the Public Test and Private Test and group them together in the testing data. 

In [6]:
train_data = data[data["Usage"] == "Training"]
test_data = data[data["Usage"] != "Training"]

train_data.head()

Unnamed: 0,emotion,Usage,pixels
0,0,Training,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,0,Training,151 150 147 155 148 133 111 140 170 174 182 15...
2,2,Training,231 212 156 164 174 138 161 173 182 200 106 38...
3,4,Training,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,6,Training,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...


In [7]:
test_data.head()

Unnamed: 0,emotion,Usage,pixels
28709,0,PublicTest,254 254 254 254 254 249 255 160 2 58 53 70 77 ...
28710,1,PublicTest,156 184 198 202 204 207 210 212 213 214 215 21...
28711,4,PublicTest,69 118 61 60 96 121 103 87 103 88 70 90 115 12...
28712,6,PublicTest,205 203 236 157 83 158 120 116 94 86 155 180 2...
28713,3,PublicTest,87 79 74 66 74 96 77 80 80 84 83 89 102 91 84 ...


Given that we've now segregated our data, there is no reason to keep the usage column and so we simply drop it. 

In [8]:
train_data.drop(columns=["Usage"], inplace = True)
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(columns=["Usage"], inplace = True)


Unnamed: 0,emotion,pixels
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,0,151 150 147 155 148 133 111 140 170 174 182 15...
2,2,231 212 156 164 174 138 161 173 182 200 106 38...
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...


In [9]:
test_data.drop(columns=["Usage"], inplace = True)
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(columns=["Usage"], inplace = True)


Unnamed: 0,emotion,pixels
28709,0,254 254 254 254 254 249 255 160 2 58 53 70 77 ...
28710,1,156 184 198 202 204 207 210 212 213 214 215 21...
28711,4,69 118 61 60 96 121 103 87 103 88 70 90 115 12...
28712,6,205 203 236 157 83 158 120 116 94 86 155 180 2...
28713,3,87 79 74 66 74 96 77 80 80 84 83 89 102 91 84 ...


Let us now analyse the type of data present in our Dataset to identify the most optimal classification algorithm and model training strategy. 

In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28709 entries, 0 to 28708
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emotion  28709 non-null  int64 
 1   pixels   28709 non-null  object
dtypes: int64(1), object(1)
memory usage: 672.9+ KB


There are no null values in our dataset, and hence we don't need to drop any rows. 

In [11]:
pixels_array = np.array(train_data["pixels"])
pixels_array

array(['70 80 82 72 58 58 60 63 54 58 60 48 89 115 121 119 115 110 98 91 84 84 90 99 110 126 143 153 158 171 169 172 169 165 129 110 113 107 95 79 66 62 56 57 61 52 43 41 65 61 58 57 56 69 75 70 65 56 54 105 146 154 151 151 155 155 150 147 147 148 152 158 164 172 177 182 186 189 188 190 188 180 167 116 95 103 97 77 72 62 55 58 54 56 52 44 50 43 54 64 63 71 68 64 52 66 119 156 161 164 163 164 167 168 170 174 175 176 178 179 183 187 190 195 197 198 197 198 195 191 190 145 86 100 90 65 57 60 54 51 41 49 56 47 38 44 63 55 46 52 54 55 83 138 157 158 165 168 172 171 173 176 179 179 180 182 185 187 189 189 192 197 200 199 196 198 200 198 197 177 91 87 96 58 58 59 51 42 37 41 47 45 37 35 36 30 41 47 59 94 141 159 161 161 164 170 171 172 176 178 179 182 183 183 187 189 192 192 194 195 200 200 199 199 200 201 197 193 111 71 108 69 55 61 51 42 43 56 54 44 24 29 31 45 61 72 100 136 150 159 163 162 163 170 172 171 174 177 177 180 187 186 187 189 192 192 194 195 196 197 199 200 201 200 197 201 137 5

Let us know convert the training and testing dataframe, so that each column represents a specific pixel. 

In [12]:
new_cols = train_data["pixels"].str.split(" ", expand=True)


emotion_col = train_data["emotion"]
train_data.drop(columns=["pixels", "emotion"],inplace = True)
train_data[list(np.arange(0,2304))] = new_cols
train_data["emotion"] = emotion_col

train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(columns=["pixels", "emotion"],inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[list(np.arange(0,2304))] = new_cols
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[list(np.arange(0,2304))] = new_cols
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2295,2296,2297,2298,2299,2300,2301,2302,2303,emotion
0,70,80,82,72,58,58,60,63,54,58,...,182,183,136,106,116,95,106,109,82,0
1,151,150,147,155,148,133,111,140,170,174,...,108,95,108,102,67,171,193,183,184,0
2,231,212,156,164,174,138,161,173,182,200,...,138,152,122,114,101,97,88,110,152,2
3,24,32,36,30,32,23,19,20,30,41,...,126,132,132,133,136,139,142,143,142,4
4,4,0,0,0,0,0,0,0,0,0,...,34,31,31,31,27,31,30,29,30,6


In [13]:
new_cols = test_data["pixels"].str.split(" ", expand=True)


emotion_col = test_data["emotion"]
test_data.drop(columns=["pixels", "emotion"],inplace = True)
test_data[list(np.arange(0,2304))] = new_cols
test_data["emotion"] = emotion_col

test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(columns=["pixels", "emotion"],inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[list(np.arange(0,2304))] = new_cols
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[list(np.arange(0,2304))] = new_cols
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2295,2296,2297,2298,2299,2300,2301,2302,2303,emotion
28709,254,254,254,254,254,249,255,160,2,58,...,92,99,84,22,0,0,42,129,180,0
28710,156,184,198,202,204,207,210,212,213,214,...,199,197,193,188,184,180,172,167,161,1
28711,69,118,61,60,96,121,103,87,103,88,...,93,92,90,92,93,92,88,87,90,4
28712,205,203,236,157,83,158,120,116,94,86,...,211,213,206,196,181,112,43,82,86,6
28713,87,79,74,66,74,96,77,80,80,84,...,32,33,35,38,40,45,41,34,32,3


In [14]:
train_data.columns

Index([        0,         1,         2,         3,         4,         5,
               6,         7,         8,         9,
       ...
            2295,      2296,      2297,      2298,      2299,      2300,
            2301,      2302,      2303, 'emotion'],
      dtype='object', length=2305)

In [15]:
test_data.columns

Index([        0,         1,         2,         3,         4,         5,
               6,         7,         8,         9,
       ...
            2295,      2296,      2297,      2298,      2299,      2300,
            2301,      2302,      2303, 'emotion'],
      dtype='object', length=2305)