## Get Labels

Code to build dataframes that contain directory listings of the train, test and validation extracted JPEG frames, including labels.  
Code to build dataframes that contain directory listings of the train, test and validation extracted avi videos, including labels.  
Assumes the notebook is run from `Project/src/cnn`, with the DAiSEE data reachable at `../../data/DAiSEE/`.

In [41]:
import os
import shutil
import glob
from pathlib import Path
import pandas as pd
import numpy as np

In [42]:
# Show the working directory; the relative data paths used below resolve against it.
os.getcwd()

'/Users/chris/MIDS/w251/w251_ChrisSexton/Project/src/cnn'

In [43]:
# Name of the frame-rate specific sub-tree under the DAiSEE data directory.
fps = '2FPS'
# Extracted frames live here, one sub-directory per split.
frame_dir = f'../../data/DAiSEE/{fps}/dataImages/'
# Directory holding the <split>Labels.csv files.
label_path = '../../data/DAiSEE/Labels/'
# Root of the per-class image tree that is built further down.
out_dir = f'../../data/DAiSEE/{fps}/data/'

# Split names, spelled as in the label file names.
usage = ['Train', 'Test', 'Validation']

In [44]:
def get_labels(frame_dir, usage, label_dir=None):
    """Build a dataframe pairing each extracted JPEG frame with its clip labels.

    Args:
        frame_dir: Directory holding one sub-directory of ``.jpg`` frames per
            split (``<frame_dir>/<usage>``).
        usage: Split name; selects both the frame sub-directory and the label
            file ``<usage>Labels.csv``.
        label_dir: Directory containing the label CSV files. Defaults to the
            module-level ``label_path``.

    Returns:
        A dataframe with one row per frame whose clip appears in the label
        file. Columns: ``file`` (path as ``str``), ``root`` (name of the
        frame's parent directory), ``basefile`` (file name), ``sequence``
        (integer after the first ``_`` in the file name), ``basename`` (text
        before the first ``_``), followed by the label file's columns.
    """
    if label_dir is None:
        label_dir = label_path

    df_l = pd.read_csv(os.path.join(label_dir, usage + 'Labels.csv'))
    # Header names can carry stray whitespace (e.g. 'Frustration '), so
    # normalise all of them instead of special-casing a single column.
    df_l.columns = df_l.columns.str.strip()
    # ClipID is '<basename>.<3-letter extension>'; drop the extension.
    df_l['basename'] = df_l['ClipID'].str[:-4]

    # Get Data Files
    frame_paths = list((Path(frame_dir) / usage).glob('*.jpg'))
    df_j = pd.DataFrame({'file': frame_paths})
    df_j['root'] = df_j['file'].apply(lambda p: p.parent.name)
    df_j['basefile'] = df_j['file'].apply(lambda p: p.name)
    # Frame files are named '<basename>_<sequence>.jpg'.
    df_j['sequence'] = df_j['basefile'].apply(
        lambda name: int(name[name.find('_') + 1:-4]))
    df_j['basename'] = df_j['basefile'].apply(
        lambda name: name[:name.find('_')])

    # Merge once (inner join: frames without a label row are dropped).
    df = pd.merge(df_j, df_l, on='basename', how='inner')
    df['file'] = df['file'].apply(str)

    return df

In [45]:
# Build the labelled frame listing for each split in one unpacking assignment.
df_train, df_test, df_val = (
    get_labels(frame_dir, split) for split in ('Train', 'Test', 'Validation')
)

In [46]:
# Number of validation frames per Boredom level.
df_val['Boredom'].value_counts()

2    9500
0    8920
1    7520
3    2640
Name: Boredom, dtype: int64

In [47]:
# Write the dataframes to pickle in case we need them later.
# frame_dir already ends with a separator, so join instead of concatenating
# "/..." (which produced a doubled slash in the path).
df_train.to_pickle(os.path.join(frame_dir, "df_train.pkl"))
df_test.to_pickle(os.path.join(frame_dir, "df_test.pkl"))
df_val.to_pickle(os.path.join(frame_dir, "df_val.pkl"))

In [63]:
# (rows, columns) of the train, test and validation frames, one per line.
print(df_train.shape, df_test.shape, df_val.shape, sep='\n')

(107140, 14)
(35680, 14)
(28580, 14)


Unnamed: 0,file,root,basefile,sequence,basename,ClipID,Boredom,Engagement,Confusion,Frustration,b0,b1,b2,b3
0,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_1.jpg,1,2100592066,2100592066.avi,1,2,0,0,0,1,0,0
1,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_14.jpg,14,2100592066,2100592066.avi,1,2,0,0,0,1,0,0
2,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_15.jpg,15,2100592066,2100592066.avi,1,2,0,0,0,1,0,0
3,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_17.jpg,17,2100592066,2100592066.avi,1,2,0,0,0,1,0,0
4,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_2.jpg,2,2100592066,2100592066.avi,1,2,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107135,../../data/DAiSEE/2FPS/dataImages/Train/210061...,Train,2100611005_2.jpg,2,2100611005,2100611005.avi,0,2,0,0,1,0,0,0
107136,../../data/DAiSEE/2FPS/dataImages/Train/210061...,Train,2100611005_10.jpg,10,2100611005,2100611005.avi,0,2,0,0,1,0,0,0
107137,../../data/DAiSEE/2FPS/dataImages/Train/210061...,Train,2100611005_12.jpg,12,2100611005,2100611005.avi,0,2,0,0,1,0,0,0
107138,../../data/DAiSEE/2FPS/dataImages/Train/210061...,Train,2100611005_1.jpg,1,2100611005,2100611005.avi,0,2,0,0,1,0,0,0


In [49]:
def save_arrays(df, usage):
    """Save the frame paths and their four labels of *df* as NumPy files.

    Two files are written under the module-level ``frame_dir``:
    ``x_<usage>`` with the ``file`` column and ``y_<usage>`` with the
    Boredom, Engagement, Confusion and Frustration columns. ``usage`` is
    lower-cased before it is put into the file names.
    """
    label_columns = ['Boredom', 'Engagement', 'Confusion', 'Frustration']
    target_dir = str(frame_dir)
    split = usage.lower()

    paths = df['file'].to_numpy()
    labels = np.array(df[label_columns])

    # Paths are presumably stored as Python objects, hence allow_pickle.
    np.save(f"{target_dir}/x_{split}", paths, allow_pickle=True)
    np.save(f"{target_dir}/y_{split}", labels)

In [50]:
# Write the x_/y_ arrays for each split (file names use the lower-cased split name).
save_arrays(df_train, 'train')
save_arrays(df_test, 'test')
save_arrays(df_val, 'validation')

## Copy files to appropriate directories (train, test, validation)

### Create Class Subdirectories for ease of use with TensorFlow datasets

Initially treat as a binary problem (a frame cannot be a bit engaged and a bit bored).  
Therefore create binary class columns (`b0`–`b3`, one per Boredom level) based on boredom.  
If there are duplicates we are going to keep going, because one less image file should not make a difference. 


In [51]:
def class_encoder(df):
    """Return a copy of *df* with one-hot columns for the Boredom level.

    The columns ``b0``..``b3`` are always present and hold integer 0/1
    values, even when a level does not occur in *df*; without that, a split
    missing a level would lack the column and later ``df['b3']`` lookups
    would fail. Any other Boredom value found gets its own ``b_<value>``
    column after them. The input dataframe is not modified.

    Args:
        df: Dataframe with a ``Boredom`` column.

    Returns:
        A new dataframe: the columns of *df* followed by the indicator
        columns.
    """
    expected = [f'b_{level}' for level in range(4)]
    renames = {col: col.replace('_', '') for col in expected}  # b_0 -> b0

    dummies = pd.get_dummies(df['Boredom'], prefix='b', dtype=int)
    extras = [col for col in dummies.columns if col not in expected]
    # Fixed column order, with absent levels filled in as all-zero columns.
    dummies = dummies.reindex(columns=expected + extras, fill_value=0)
    dummies = dummies.rename(columns=renames)

    # Drop indicator columns left by an earlier call so that re-running the
    # encoding does not produce duplicate column names.
    stale = [col for col in dummies.columns if col in df.columns]
    return pd.concat([df.drop(columns=stale), dummies], axis=1)

In [52]:
# Add the Boredom indicator columns to every split in one unpacking assignment.
df_train, df_test, df_val = (
    class_encoder(frame) for frame in (df_train, df_test, df_val)
)

In [53]:
# Check values for 'Boredom': number of validation frames per level, ordered by level
df_val.groupby(['Boredom']).size()

Boredom
0    8920
1    7520
2    9500
3    2640
dtype: int64

In [54]:
def copy_files(source, destination):
    """Copy every file listed in *source* into the directory *destination*.

    Args:
        source: Iterable of paths of the files to copy.
        destination: Target directory; it is created (with parents) when it
            does not exist yet.

    Each file keeps its base name. Some frame file names are duplicated;
    rather than de-duplicating, a later file simply overwrites an earlier
    one of the same name, as there are more than enough images.
    """
    os.makedirs(destination, exist_ok=True)
    for f in source:
        destination_file = os.path.join(destination, os.path.basename(f))
        shutil.copy(os.fspath(f), destination_file)

In [57]:
# Create the <out_dir>/<split>/<class> directory tree that the images are copied into
cols = ['b0', 'b1', 'b2', 'b3']
dirs = ['train', 'test', 'validation']

for d in dirs:
    for c in cols:
        # out_dir already ends with a separator; join avoids a doubled slash.
        data_dir = os.path.join(out_dir, d, c)
        # exist_ok makes the cell safe to re-run.
        os.makedirs(data_dir, exist_ok=True)

In [58]:
# Show the root directory of the per-class image tree.
print(out_dir)

../../data/DAiSEE/2FPS/data/


In [59]:
# Copy train images into one sub-directory per Boredom class (sources are kept)
for c in ('b0', 'b1', 'b2', 'b3'):
    copy_files(df_train[df_train[c] == 1]['file'].to_list(),
               os.path.join(out_dir, 'train', c))

In [60]:
# Copy test images into one sub-directory per Boredom class (sources are kept)
for c in ('b0', 'b1', 'b2', 'b3'):
    # out_dir already ends with a separator; join avoids a doubled slash.
    copy_files(df_test[df_test[c] == 1]['file'].to_list(),
               os.path.join(out_dir, 'test', c))

In [61]:
# Copy validation images into one sub-directory per Boredom class (sources are kept)
for c in ('b0', 'b1', 'b2', 'b3'):
    # out_dir already ends with a separator; join avoids a doubled slash.
    copy_files(df_val[df_val[c] == 1]['file'].to_list(),
               os.path.join(out_dir, 'validation', c))