## Get Labels

Code to build dataframes that contain directory listings of the train, test and validation extracted JPEG frames, including labels  
Code to build dataframes that contain directory listings of the train, test and validation extracted avi videos, including labels  
Assumes the working directory is `src/cnn` (see the `os.getcwd()` output below); all data paths are relative to it (`../../data/DAiSEE/`)

In [1]:
import os
import shutil
import glob
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Show the current working directory; the relative data paths configured
# in the next cell are resolved against it.
os.getcwd()

'/Users/chris/MIDS/w251/w251_ChrisSexton/Project/src/cnn'

In [30]:
# Frame-rate tag; it selects which extracted-frame folder is read and written.
fps = '1FPS'
# Extracted JPEG frames (expects Train/Test/Validation sub-folders).
frame_dir = f'../../data/DAiSEE/{fps}/DataSet/'
# Folder holding the label CSV files.
label_path = '../../data/DAiSEE/Labels/'
# Destination for everything this notebook writes.
out_dir = f'../../data/DAiSEE/{fps}/data'

In [31]:
# Get Labels
def _read_labels(csv_name):
    """Read one label CSV from label_path and add a 'root' join key.

    'root' is ClipID with its last four characters removed
    (e.g. '1100042011.avi' -> '1100042011').
    """
    labels = pd.read_csv(label_path + csv_name)
    labels['root'] = labels['ClipID'].str[:-4]
    return labels


df_train_labels = _read_labels('TrainLabels.csv')
df_test_labels = _read_labels('TestLabels.csv')
df_val_labels = _read_labels('ValidationLabels.csv')

In [32]:
# Do some EDA on the labels. This is a multi label dataset - can it be simplified?
# (rows, columns) of the training label table, including the derived 'root' column.
df_train_labels.shape


(5358, 6)

In [33]:
# Get Data Files
def _list_frames(split):
    """List every .jpg found recursively under frame_dir + split.

    Returns a DataFrame with 'file' (the Path of each frame) and 'root'
    (the name of the folder that directly contains the frame).
    """
    frames = pd.DataFrame(list(Path(frame_dir + split).glob('**/*.jpg')), columns=['file'])
    frames["root"] = frames["file"].apply(lambda frame_path: frame_path.parent.name)
    return frames


# Train
df_train_jpg = _list_frames('Train')
# Test
df_test_jpg = _list_frames('Test')
# Validation
df_val_jpg = _list_frames('Validation')

In [34]:
# (rows, columns) of the training frame listing: one row per .jpg found.
df_train_jpg.shape

(109732, 2)

In [35]:
# Merge: attach the clip-level labels to every frame through the shared 'root' key.
# Left join, so a frame is kept even when no label row matches its 'root'
# (its label columns are then NaN).
_join_args = dict(on='root', how='left')
df_train = df_train_jpg.merge(df_train_labels, **_join_args)
df_test = df_test_jpg.merge(df_test_labels, **_join_args)
df_val = df_val_jpg.merge(df_val_labels, **_join_args)

In [36]:
# Get the Base File Name (final path component of each frame)
for _frame in (df_train, df_test, df_val):
    _frame['basefile'] = _frame['file'].apply(os.path.basename)

In [37]:
# Fix error in a column name: 'Frustration ' carries an extra trailing space
for _frame in (df_train, df_test, df_val):
    _frame.rename(columns={'Frustration ': 'Frustration'}, inplace=True)

In [38]:
# Sanity check after the merge and column fixes: print (rows, columns),
# then display the first rows of the training frame table.
print(df_train.shape)
df_train.head()

(109732, 8)


Unnamed: 0,file,root,ClipID,Boredom,Engagement,Confusion,Frustration,basefile
0,../../data/DAiSEE/2FPS/DataSet/Train/110004/11...,1100042011,1100042011.avi,2.0,2.0,1.0,1.0,110004201115.jpg
1,../../data/DAiSEE/2FPS/DataSet/Train/110004/11...,1100042011,1100042011.avi,2.0,2.0,1.0,1.0,110004201114.jpg
2,../../data/DAiSEE/2FPS/DataSet/Train/110004/11...,1100042011,1100042011.avi,2.0,2.0,1.0,1.0,110004201116.jpg
3,../../data/DAiSEE/2FPS/DataSet/Train/110004/11...,1100042011,1100042011.avi,2.0,2.0,1.0,1.0,110004201117.jpg
4,../../data/DAiSEE/2FPS/DataSet/Train/110004/11...,1100042011,1100042011.avi,2.0,2.0,1.0,1.0,11000420119.jpg


In [39]:
# write dataframe to pickle in case we need it later
# to_pickle does not create missing folders, so make sure the output
# directory exists before writing into it.
os.makedirs(out_dir, exist_ok=True)
df_train.to_pickle(out_dir + "/df_train.pkl")
df_test.to_pickle(out_dir + "/df_test.pkl")
df_val.to_pickle(out_dir + "/df_val.pkl")

In [40]:
# Plain-string copy of each frame path, one value per row.
# astype(str) converts element-wise; Series.to_string() would instead render the
# whole column (index included) as ONE block of text and store that same text
# in every row.
df_train['file_str'] = df_train['file'].astype(str)
df_test['file_str'] = df_test['file'].astype(str)
df_val['file_str'] = df_val['file'].astype(str)

In [41]:
def save_arrays(df, usage, dest_dir=None, seed=100):
    """Shuffle one split and save its file paths and labels as ``.npy`` arrays.

    Writes ``x_<usage>.npy`` (the values of ``df['file_str']``) and
    ``y_<usage>.npy`` (the Boredom, Engagement, Confusion and Frustration
    columns). Both arrays are reordered with the same seeded permutation, so
    row i of x still corresponds to row i of y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'file', 'file_str' and the four label columns.
    usage : str
        Split name; it is lower-cased and used in the output file names.
    dest_dir : str or path-like, optional
        Directory to write into. Defaults to the notebook-level ``out_dir``.
        It is created if it does not exist.
    seed : int, optional
        Seed for the shuffle. The default (100) reproduces the row order
        obtained with ``np.random.seed(100)`` + ``np.random.permutation``.
    """
    target_dir = out_dir if dest_dir is None else dest_dir
    os.makedirs(target_dir, exist_ok=True)

    # A private legacy generator yields the same permutation as seeding the
    # global NumPy RNG, but leaves the global random state untouched.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(df['file']))

    filepath = df['file_str'].to_numpy()[indices]
    label = np.array(df[['Boredom', 'Engagement', 'Confusion', 'Frustration']])[indices]

    # String paths usually come out as an object array, which np.save stores via
    # pickle; loading x_* then requires np.load(..., allow_pickle=True).
    np.save(f"{str(target_dir)}/x_{usage.lower()}", filepath, allow_pickle=True)
    np.save(f"{str(target_dir)}/y_{usage.lower()}", label)

In [42]:
# Save the shuffled path/label arrays for every split
for _frame, _usage in ((df_train, 'train'), (df_test, 'test'), (df_val, 'validation')):
    save_arrays(_frame, _usage)

## Move files to appropriate directories (train, test, val)

### Create Class Subdirectories for ease of use with TensorFlow datasets

Initially treat this as a binary problem (a frame cannot be a bit engaged and a bit bored).  
Therefore create a binary class column based on boredom.  
If there are duplicate file names we are going to keep going, because one less image file should not make a difference. 


In [27]:
# Check values for 'Boredom': number of training frames per Boredom level
df_train.groupby(['Boredom']).size()

Boredom
0.0    24330
1.0    16960
2.0    10730
3.0     1560
dtype: int64

In [28]:
# One Hot Encode 'Boredom': one indicator column per observed level, prefixed 'b'
y_train, y_test, y_val = (
    pd.get_dummies(split_frame['Boredom'], prefix='b')
    for split_frame in (df_train, df_test, df_val)
)

# Append the indicator columns to the right of each split's frame table
df_train = pd.concat([df_train, y_train], axis=1)
df_test = pd.concat([df_test, y_test], axis=1)
df_val = pd.concat([df_val, y_val], axis=1)

In [29]:
# Check counts of labels: training frames with (1) and without (0) the
# 'b_0.0' indicator, i.e. Boredom level 0.0 versus everything else
df_train.groupby(['b_0.0']).size()

b_0.0
0    30490
1    24330
dtype: int64

In [30]:
# Shorten the indicator column names so they can double as folder names
_short_names = {"b_0.0": "b0", "b_1.0": "b1", "b_2.0": "b2", "b_3.0": "b3"}
for _frame in (df_train, df_test, df_val):
    _frame.rename(columns=_short_names, inplace=True)


In [36]:
# Simple function to copy images to correct file structure
# As we have some duplicate file names, instead of fixing we will ignore as we have more than enough images
def copy_files(source, destination):
    """Copy every file in ``source`` into the directory ``destination``.

    Each file keeps its base name, so two sources with the same base name end
    up as a single file in ``destination`` (the later copy replaces the
    earlier one).

    Parameters
    ----------
    source : iterable of str or path-like
        Files to copy.
    destination : str or path-like
        Target directory; created if it does not exist yet.

    Copying is best-effort: a file that cannot be copied (``OSError``, e.g. a
    missing source) is skipped and counted, and a one-line summary is printed
    at the end. Anything else, including ``KeyboardInterrupt``, propagates so
    that a long copy can be interrupted and real bugs are not hidden.
    """
    os.makedirs(destination, exist_ok=True)

    skipped = 0
    for f in source:
        try:
            destination_file = os.path.join(destination, os.path.basename(f))
            shutil.copy(os.fspath(f), destination_file)
        except OSError:
            skipped += 1

    if skipped:
        print(f"copy_files: skipped {skipped} file(s) that could not be copied to {destination}")

In [37]:
# Create image file structure: one folder per split and per Boredom level
cols = ['b0', 'b1', 'b2', 'b3']
dirs = ['train', 'test', 'validation']

for d in dirs:
    for c in cols:
        data_dir = out_dir + '/' + d + '/' + c
        # exist_ok=True keeps the cell safe to re-run and avoids the
        # check-then-create race of a separate os.path.exists() test.
        os.makedirs(data_dir, exist_ok=True)

In [22]:
# Copy Train Images into one folder per Boredom level
for _level in ('b0', 'b1', 'b2', 'b3'):
    _selected = df_train.loc[df_train[_level] == 1, 'file']
    copy_files(_selected.to_list(), f"{out_dir}/train/{_level}/")

In [23]:
# Copy Test Images into one folder per Boredom level
for _level in ('b0', 'b1', 'b2', 'b3'):
    _selected = df_test.loc[df_test[_level] == 1, 'file']
    copy_files(_selected.to_list(), f"{out_dir}/test/{_level}/")

In [24]:
# Copy Validation Images into one folder per Boredom level
for _level in ('b0', 'b1', 'b2', 'b3'):
    _selected = df_val.loc[df_val[_level] == 1, 'file']
    copy_files(_selected.to_list(), f"{out_dir}/validation/{_level}/")