In [1]:
import os

import numpy as np
import pandas as pd

## Data Processing

In [2]:
# Load the training set (only training.csv is read in this cell) and
# show its first row together with that row's shape.
df = pd.read_csv('./data/training.csv', sep=',')
print(df.head(1))
print(df.head(1).shape)
print('\n\n')

   left_eye_center_x  left_eye_center_y  right_eye_center_x  \
0          66.033564          39.002274           30.227008   

   right_eye_center_y  left_eye_inner_corner_x  left_eye_inner_corner_y  \
0           36.421678                59.582075                39.647423   

   left_eye_outer_corner_x  left_eye_outer_corner_y  right_eye_inner_corner_x  \
0                73.130346                39.969997                 36.356571   

   right_eye_inner_corner_y  \
0                 37.389402   

                         ...                          nose_tip_y  \
0                        ...                           57.066803   

   mouth_left_corner_x  mouth_left_corner_y  mouth_right_corner_x  \
0            61.195308            79.970165             28.614496   

   mouth_right_corner_y  mouth_center_top_lip_x  mouth_center_top_lip_y  \
0             77.388992               43.312602               72.935459   

   mouth_center_bottom_lip_x  mouth_center_bottom_lip_y  \
0         

In [3]:
# Rich display of the first five rows of the training dataframe.
df.head(n=5)

Unnamed: 0,left_eye_center_x,left_eye_center_y,right_eye_center_x,right_eye_center_y,left_eye_inner_corner_x,left_eye_inner_corner_y,left_eye_outer_corner_x,left_eye_outer_corner_y,right_eye_inner_corner_x,right_eye_inner_corner_y,...,nose_tip_y,mouth_left_corner_x,mouth_left_corner_y,mouth_right_corner_x,mouth_right_corner_y,mouth_center_top_lip_x,mouth_center_top_lip_y,mouth_center_bottom_lip_x,mouth_center_bottom_lip_y,Image
0,66.033564,39.002274,30.227008,36.421678,59.582075,39.647423,73.130346,39.969997,36.356571,37.389402,...,57.066803,61.195308,79.970165,28.614496,77.388992,43.312602,72.935459,43.130707,84.485774,238 236 237 238 240 240 239 241 241 243 240 23...
1,64.332936,34.970077,29.949277,33.448715,58.85617,35.274349,70.722723,36.187166,36.034723,34.361532,...,55.660936,56.421447,76.352,35.122383,76.04766,46.684596,70.266553,45.467915,85.48017,219 215 204 196 204 211 212 200 180 168 178 19...
2,65.057053,34.909642,30.903789,34.909642,59.412,36.320968,70.984421,36.320968,37.678105,36.320968,...,53.538947,60.822947,73.014316,33.726316,72.732,47.274947,70.191789,47.274947,78.659368,144 142 159 180 188 188 184 180 167 132 84 59 ...
3,65.225739,37.261774,32.023096,37.261774,60.003339,39.127179,72.314713,38.380967,37.618643,38.754115,...,54.166539,65.598887,72.703722,37.245496,74.195478,50.303165,70.091687,51.561183,78.268383,193 192 193 194 194 194 193 192 168 111 50 12 ...
4,66.725301,39.621261,32.24481,38.042032,58.56589,39.621261,72.515926,39.884466,36.98238,39.094852,...,64.889521,60.671411,77.523239,31.191755,76.997301,44.962748,73.707387,44.227141,86.871166,147 148 160 196 215 214 216 217 219 220 206 18...


In [4]:
# Check whether any facial keypoints are missing and, if so, fill them in.
# The 'Image' column is dropped here so that only keypoint columns are inspected.
temp_df = df.drop('Image', axis=1)
# check for null values
print(temp_df.isnull().values.any())
# print the count of null/NaNs in each column of dataframe
print("\n\nMISSING VALUES FOR KEYPOINTS BY NAME:\n")
print(temp_df.isnull().sum())
print("\n\nTOTAL MISSING VALUES IN THE DATASET:\n")
print(temp_df.isnull().sum().sum())
# Forward-fill instead of using 0 or another constant: each gap takes the last
# valid value above it in the same column (the same keypoint from an earlier row).
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated in pandas.
df = df.ffill()
print("Missing values after filling:", df.isnull().sum().sum())

True


MISSING VALUES FOR KEYPOINTS BY NAME:

left_eye_center_x              10
left_eye_center_y              10
right_eye_center_x             13
right_eye_center_y             13
left_eye_inner_corner_x      4778
left_eye_inner_corner_y      4778
left_eye_outer_corner_x      4782
left_eye_outer_corner_y      4782
right_eye_inner_corner_x     4781
right_eye_inner_corner_y     4781
right_eye_outer_corner_x     4781
right_eye_outer_corner_y     4781
left_eyebrow_inner_end_x     4779
left_eyebrow_inner_end_y     4779
left_eyebrow_outer_end_x     4824
left_eyebrow_outer_end_y     4824
right_eyebrow_inner_end_x    4779
right_eyebrow_inner_end_y    4779
right_eyebrow_outer_end_x    4813
right_eyebrow_outer_end_y    4813
nose_tip_x                      0
nose_tip_y                      0
mouth_left_corner_x          4780
mouth_left_corner_y          4780
mouth_right_corner_x         4779
mouth_right_corner_y         4779
mouth_center_top_lip_x       4774
mouth_center_top_lip_y       4774
mo

In [5]:
"""Splitting data into train and validation set 80/20"""
# Report the overall shape, then cut the frame positionally:
# the first 80% of rows go to training, the remainder to validation.
df_shape = df.shape
print(df_shape)
train_split = int(0.8 * len(df))
print(train_split)
train_df, val_df = df.iloc[:train_split], df.iloc[train_split:]
for part in (train_df, val_df):
    print(part.shape)

(7049, 31)
5639
(5639, 31)
(1410, 31)


### Prepare training data

In [6]:
"""Prepare X_train"""
# Convert each image (a whitespace-separated string of pixel values) into one
# float32 row of X_train.
train_images = train_df['Image']
# Positional access (.iloc) so this does not depend on the dataframe's index labels.
flat_image_size = len(train_images.iloc[0].split())
X_train = np.zeros((len(train_images), flat_image_size), dtype=np.float32)
for row, image_str in enumerate(train_images):
    # One array conversion per image; the assignment raises if an image does
    # not contain exactly flat_image_size values.
    X_train[row] = np.array(image_str.split(), dtype=np.float32)
print(X_train.shape)

(5639, 9216)


In [7]:
# Display the flattened pixel vector of the first training image.
X_train[0, :]

array([238., 236., 237., ...,  70.,  75.,  90.], dtype=float32)

In [8]:
# save X_train as .npy file for fast access
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('./processed_data', exist_ok=True)
np.save('./processed_data/x_train.npy', X_train)

In [9]:
"""Prepare Y_train"""
# Build Y_train: one float32 row of keypoint coordinates per training sample.
# Every column except 'Image' is treated as a keypoint column; 'Image' is
# excluded by name rather than by assuming it is the last column.
column_names = [name for name in train_df.columns if name != 'Image']

# Vectorised conversion: rows are taken in positional order, so the result does
# not depend on the dataframe's index labels matching array positions.
Y_train = train_df[column_names].to_numpy(dtype=np.float32)
print(Y_train.shape)

(5639, 30)


In [10]:
# save Y_train as .npy file for fast access
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('./processed_data', exist_ok=True)
np.save('./processed_data/y_train.npy', Y_train)

In [11]:
# rough: scratch check of the type of object returned by train_df.iterrows()
lis = train_df.iterrows()
print(type(lis))

<class 'generator'>


### Prepare validation data

In [14]:
# Display the 'Image' entries of the first five validation rows.
val_df['Image'].head()

5639    138 138 138 137 135 132 128 123 117 112 107 10...
5640    47 39 32 29 33 31 24 20 17 17 20 22 23 23 24 2...
5641    112 117 123 133 132 126 113 109 111 114 113 10...
5642    44 45 46 46 46 46 46 46 44 43 54 64 73 76 75 8...
5643    42 41 42 44 47 50 54 59 64 69 68 64 55 48 43 4...
Name: Image, dtype: object

In [17]:
"""Prepare X_val"""
# Convert each image (a whitespace-separated string of pixel values) into one
# float32 row of X_val.
val_images = val_df['Image']
# Positional access (.iloc) instead of looking up the label `train_split`, so
# this cell does not depend on how the validation rows happen to be labelled.
flat_image_size = len(val_images.iloc[0].split())
X_val = np.zeros((len(val_images), flat_image_size), dtype=np.float32)
for row, image_str in enumerate(val_images):
    # One array conversion per image; the assignment raises if an image does
    # not contain exactly flat_image_size values.
    X_val[row] = np.array(image_str.split(), dtype=np.float32)
print(X_val.shape)

(1410, 9216)


In [18]:
# save X_val as .npy file for fast access
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('./processed_data', exist_ok=True)
np.save('./processed_data/x_val.npy', X_val)

In [30]:
"""Prepare Y_val"""
# Build Y_val: one float32 row of keypoint coordinates per validation sample.
# Every column except 'Image' is treated as a keypoint column; 'Image' is
# excluded by name rather than by assuming it is the last column.
column_names = [name for name in val_df.columns if name != 'Image']

# Vectorised conversion: rows are taken in positional order, which replaces the
# manual row counter that was kept alongside iterrows().
Y_val = val_df[column_names].to_numpy(dtype=np.float32)
print(Y_val.shape)

(1410, 30)
(1410, 30)


In [31]:
# save Y_val as .npy file for fast access
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('./processed_data', exist_ok=True)
np.save('./processed_data/y_val.npy', Y_val)