# Notebook to take the 2FPS data and make sure there are exactly 20 frames for each video
# This is for whole images only, not faces

In [4]:
import warnings
warnings.filterwarnings('ignore')

import os
from matplotlib import pyplot as plt
%matplotlib inline
import shutil
import glob
from pathlib import Path
import pandas as pd
import numpy as np
import time

from PIL import Image
import cv2
from numpy import asarray

from keras.utils import to_categorical

import os

import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.utils import class_weight

Using TensorFlow backend.


In [5]:
# Frame-rate variant of the extracted frames; it selects the data sub-folder below.
fps = '2FPS'

# Input frame folders, label CSV folder, and root of the class-sorted output tree.
frame_dir = f'../../data/DAiSEE/{fps}/dataImages/'
label_path = '../../data/DAiSEE/Labels/'
out_dir = f'../../data/DAiSEE/{fps}/data/'

# Names of the dataset splits.
usage = ['Train', 'Test', 'Validation']

In [11]:
def get_labels(frame_dir, usage, label_dir=None):
    """Join the extracted JPEG frames of one split with that split's clip labels.

    Args:
        frame_dir: Path prefix of the frame folders. ``usage`` is appended by
            plain string concatenation, so it should end with a separator.
        usage: Split name (e.g. 'Train'); selects both the frame sub-folder
            and the ``<usage>Labels.csv`` file.
        label_dir: Optional path prefix of the label CSV files. When omitted,
            the module-level ``label_path`` is used.

    Returns:
        DataFrame with one row per frame whose clip has a label row. Columns:
        ``file`` (path as str), ``root`` (name of the folder holding the
        frame), ``basefile`` (file name), ``sequence`` (int between the first
        underscore and the extension), ``basename`` (text before the first
        underscore), followed by the columns of the label CSV.
    """
    if label_dir is None:
        label_dir = label_path

    # Labels: dropping the last four characters of ClipID (its extension,
    # e.g. ".avi") gives the key shared with the frame file names.
    df_l = pd.read_csv(label_dir + usage + 'Labels.csv')
    df_l['basename'] = df_l['ClipID'].str[:-4]

    # Frames: file names are expected to look like "<basename>_<sequence>.jpg".
    df_j = pd.DataFrame(list(Path(frame_dir + usage).glob('*.jpg')), columns=['file'])
    df_j['root'] = df_j['file'].apply(lambda p: os.path.split(os.path.split(p)[0])[1])
    df_j['basefile'] = df_j['file'].apply(os.path.basename)
    df_j['sequence'] = df_j['basefile'].apply(lambda name: int(name[name.find('_') + 1:-4]))
    df_j['basename'] = df_j['basefile'].apply(lambda name: name[:name.find('_')])

    # One inner merge is enough; it keeps only frames that have a label row.
    df = pd.merge(df_j, df_l, on='basename', how='inner')

    # Normalise the 'Frustration ' header (trailing space) if the CSV has it.
    df = df.rename(columns={'Frustration ': 'Frustration'})
    # Store paths as plain strings rather than Path objects.
    df['file'] = df['file'].apply(str)

    return df

In [12]:
# Build the per-frame label table for each of the three dataset splits.
df_train, df_test, df_val = (
    get_labels(frame_dir, split_name)
    for split_name in ('Train', 'Test', 'Validation')
)

In [13]:
# Distribution of Boredom labels in the validation table. The table has one
# row per frame image, so these are frame counts rather than clip counts.
df_val['Boredom'].value_counts()

2    9500
0    8920
1    7520
3    2640
Name: Boredom, dtype: int64

In [15]:
# Report (rows, columns) of the train, test and validation tables, one per line.
print(df_train.shape, df_test.shape, df_val.shape, sep='\n')

(107140, 10)
(35680, 10)
(28580, 10)


In [16]:
# Preview the merged training table (rows are not yet sorted by clip/frame).
df_train.head()

Unnamed: 0,file,root,basefile,sequence,basename,ClipID,Boredom,Engagement,Confusion,Frustration
0,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_1.jpg,1,2100592066,2100592066.avi,1,2,0,0
1,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_14.jpg,14,2100592066,2100592066.avi,1,2,0,0
2,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_15.jpg,15,2100592066,2100592066.avi,1,2,0,0
3,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_17.jpg,17,2100592066,2100592066.avi,1,2,0,0
4,../../data/DAiSEE/2FPS/dataImages/Train/210059...,Train,2100592066_2.jpg,2,2100592066,2100592066.avi,1,2,0,0


In [17]:
# Order each table by clip (basename), then by frame number within the clip
for _split_df in (df_train, df_test, df_val):
    _split_df.sort_values(["basename", "sequence"], inplace=True)

In [22]:
# Confirm the rows are now ordered by basename and sequence.
df_train.head()

Unnamed: 0,file,root,basefile,sequence,basename,ClipID,Boredom,Engagement,Confusion,Frustration
87024,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_1.jpg,1,1100011002,1100011002.avi,0,2,0,0
87023,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_2.jpg,2,1100011002,1100011002.avi,0,2,0,0
87020,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_3.jpg,3,1100011002,1100011002.avi,0,2,0,0
87026,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_4.jpg,4,1100011002,1100011002.avi,0,2,0,0
87025,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_5.jpg,5,1100011002,1100011002.avi,0,2,0,0


In [23]:
def del_files(file_list):
    """Delete every path in ``file_list`` from disk on a best-effort basis.

    Paths that cannot be removed because of a filesystem error (already gone,
    permission denied, is a directory, ...) are skipped so that one bad entry
    does not stop the clean-up. Anything that is not a filesystem error, such
    as a non-path value in the list or a keyboard interrupt, is allowed to
    propagate instead of being silently swallowed.

    Args:
        file_list: Iterable of file paths to remove.
    """
    for f in file_list:
        try:
            os.remove(f)
        except OSError:
            # Best effort: skip files that are missing or cannot be deleted.
            continue

In [24]:
# Renumber the frames of each clip 0, 1, 2, ... following the current row order
for _split_df in (df_train, df_test, df_val):
    _split_df['frame_seq'] = _split_df.groupby('basename').cumcount()

In [25]:
# Smallest number of frames any single basename has, per split - should be 20
for _label, _split_df in (("train", df_train), ("test", df_test), ("val", df_val)):
    print(_label, _split_df.groupby(['basename']).size().min())

train 20
test 20
val 20


In [26]:
# List training clips with fewer than 20 frames. frame_seq counts from 0, so a
# clip with at least 20 frames has a maximum frame_seq of 19 or more; anything
# below that is short and is a candidate for deletion.
df_tmp = df_train.groupby('basename')['frame_seq'].max().reset_index()
df_tmp["frame_seq"] = pd.to_numeric(df_tmp["frame_seq"])
df_tmp.loc[df_tmp["frame_seq"].lt(19)]

Unnamed: 0,basename,frame_seq


In [64]:
# Remove every frame of one hard-coded clip (presumably the clip found to be
# short in an earlier run - confirm before re-running).
_clip_rows = df_train['basename'] == '2100552061'

# delete from filesystem
del_short_files = df_train.loc[_clip_rows, 'file'].to_list()
del_files(del_short_files)

# delete from dataframe
df_train.drop(df_train[_clip_rows].index, inplace=True)

In [55]:
# Trim clips that are too long: any frame with frame_seq above 19
# (the 21st frame onward) is removed from disk and from the tables.

# collect the surplus frame files of each split
train_del_files = df_train.loc[df_train['frame_seq'] > 19, 'file'].to_list()
test_del_files = df_test.loc[df_test['frame_seq'] > 19, 'file'].to_list()
val_del_files = df_val.loc[df_val['frame_seq'] > 19, 'file'].to_list()

# delete them from the filesystem
for _surplus in (train_del_files, test_del_files, val_del_files):
    del_files(_surplus)

# drop the matching rows from each dataframe
for _split_df in (df_train, df_test, df_val):
    _split_df.drop(_split_df[_split_df['frame_seq'] > 19].index, inplace=True)

In [27]:
# Re-check the smallest number of frames per basename in each split - should now be 20
for _label, _split_df in (("train", df_train), ("test", df_test), ("val", df_val)):
    print(_label, _split_df.groupby(['basename']).size().min())

train 20
test 20
val 20


In [28]:
# Replace each table's index with a fresh 0..n-1 range, discarding the old labels
for _split_df in (df_train, df_test, df_val):
    _split_df.reset_index(drop=True, inplace=True)

In [29]:
# Confirm the index now runs from 0 and that frame_seq has been added.
df_train.head()

Unnamed: 0,file,root,basefile,sequence,basename,ClipID,Boredom,Engagement,Confusion,Frustration,frame_seq
0,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_1.jpg,1,1100011002,1100011002.avi,0,2,0,0,0
1,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_2.jpg,2,1100011002,1100011002.avi,0,2,0,0,1
2,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_3.jpg,3,1100011002,1100011002.avi,0,2,0,0,2
3,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_4.jpg,4,1100011002,1100011002.avi,0,2,0,0,3
4,../../data/DAiSEE/2FPS/dataImages/Train/110001...,Train,1100011002_5.jpg,5,1100011002,1100011002.avi,0,2,0,0,4


In [30]:
# The tables are sorted and match the files left on disk,
# so pickle them next to the frame folders for later notebooks
for _split_df, _pickle_name in (
    (df_train, "/df_train.pkl"),
    (df_test, "/df_test.pkl"),
    (df_val, "/df_val.pkl"),
):
    _split_df.to_pickle(frame_dir + _pickle_name)

In [32]:
# Save as Arrays (do not shuffle)
def save_arrays(df, usage):
    """Save the frame paths and the four affect labels of ``df`` as NumPy files.

    Two files are written under the module-level ``frame_dir``, with ``usage``
    lower-cased in the name: ``x_<usage>`` holds the ``file`` column and
    ``y_<usage>`` holds the Boredom, Engagement, Confusion and Frustration
    columns. Row order is kept as it is in ``df``.
    """
    split = usage.lower()
    target = str(frame_dir)
    label_columns = ['Boredom', 'Engagement', 'Confusion', 'Frustration']

    paths = df['file'].to_numpy()
    labels = np.array(df[label_columns])

    # The path array is saved with allow_pickle=True; the label array uses the default.
    np.save(f"{target}/x_{split}", paths, allow_pickle=True)
    np.save(f"{target}/y_{split}", labels)

In [33]:
# Write the x_/y_ arrays for each split
for _split_df, _split_name in (
    (df_train, 'train'),
    (df_test, 'test'),
    (df_val, 'validation'),
):
    save_arrays(_split_df, _split_name)

# NOW COPY TO BOREDOM DIRECTORIES



In [34]:
def class_encoder(df):
    """Return a copy of ``df`` with one-hot columns b0..b3 for the 'Boredom' level.

    The four columns are always present: a level that does not occur in ``df``
    gets an all-zero column, so code that selects rows with ``df['b3'] == 1``
    works even for a split that has no rows at that level. Indicator columns
    for any other value found in 'Boredom' are kept under the name that
    ``get_dummies`` gives them.

    Args:
        df: DataFrame with a 'Boredom' column holding the levels 0-3.

    Returns:
        A new DataFrame made of the original columns followed by the
        indicator columns. The input frame is not modified.
    """
    y = pd.get_dummies(df['Boredom'], prefix='b')

    # Guarantee a column per expected level, keeping any unexpected extras.
    expected = ['b_0', 'b_1', 'b_2', 'b_3']
    extras = [col for col in y.columns if col not in expected]
    y = y.reindex(columns=expected + extras, fill_value=0)

    y = y.rename(columns={"b_0": "b0", "b_1": "b1", "b_2": "b2", "b_3": "b3"})
    return pd.concat([df, y], axis=1)

In [35]:
# Add the one-hot Boredom columns to every split
df_train, df_test, df_val = map(class_encoder, (df_train, df_test, df_val))

In [36]:
# Check values for 'Boredom': number of validation rows (frames) at each level
df_val.groupby(['Boredom']).size()

Boredom
0    8920
1    7520
2    9500
3    2640
dtype: int64

In [37]:
# Simple function to copy images to correct file structure
# As we have some duplicate file names, instead of fixing we will ignore as we have more than enough images
def copy_files(source, destination):
    """Copy each path in ``source`` into ``destination``, keeping its file name.

    Only the final path component is used for the target name, so two sources
    that share a file name land on the same target and the later copy
    replaces the earlier one.
    """
    for src_path in source:
        file_name = os.path.split(src_path)[1]
        shutil.copy(os.fspath(src_path), os.path.join(destination, file_name))

In [38]:
# Create the output folder tree: one folder per split, each holding one
# sub-folder per Boredom class column (b0..b3)
cols = ['b0', 'b1', 'b2', 'b3']
dirs = ['train', 'test', 'validation']

for d in dirs:
    for c in cols:
        data_dir = os.path.join(out_dir, d, c)
        # exist_ok avoids the separate existence check and makes re-runs harmless
        os.makedirs(data_dir, exist_ok=True)

In [39]:
# Show the output root that the class folders live under.
print(out_dir)

../../data/DAiSEE/2FPS/data/


In [40]:
# Copy the training frames into one folder per Boredom class (the source files stay in place)
for _class_col in ('b0', 'b1', 'b2', 'b3'):
    copy_files(
        df_train.loc[df_train[_class_col] == 1, 'file'].to_list(),
        out_dir + 'train/' + _class_col + '/',
    )

In [41]:
# Copy the test frames into one folder per Boredom class (the source files stay in place)
for _class_col in ('b0', 'b1', 'b2', 'b3'):
    copy_files(
        df_test.loc[df_test[_class_col] == 1, 'file'].to_list(),
        out_dir + '/test/' + _class_col + '/',
    )

In [42]:
# Copy the validation frames into one folder per Boredom class (the source files stay in place)
for _class_col in ('b0', 'b1', 'b2', 'b3'):
    copy_files(
        df_val.loc[df_val[_class_col] == 1, 'file'].to_list(),
        out_dir + '/validation/' + _class_col + '/',
    )