# Create arrays for RNN training
Arrays are saved under `2FPS/data`, i.e., alongside the extracted frame data.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
from matplotlib import pyplot as plt
%matplotlib inline
import shutil
import glob
from pathlib import Path
import pandas as pd
import numpy as np
import time
from PIL import Image

from keras.utils import to_categorical


In [2]:
# Sampling-rate folder of the extracted frames; it selects which dataset variant is indexed
fps = '2FPS'
# Root folder holding the per-split frame directories (note the trailing slash)
frame_dir = f'../../data/DAiSEE/{fps}/data/'

In [3]:
# Show the resolved frame directory as a quick sanity check
print(frame_dir)

../../data/DAiSEE/2FPS/data/


In [4]:
def get_data_frames (frame_dir, usage):
    """Index every ``.jpg`` frame of one data split in a sorted DataFrame.

    Frames are searched recursively under ``<frame_dir>/<usage>`` and are
    expected to be named ``<basename>_<sequence>.jpg``.

    Parameters
    ----------
    frame_dir : str or path-like
        Root folder containing the split folders. A trailing slash is optional.
    usage : str
        Name of the split folder to scan (e.g. ``'train'``).

    Returns
    -------
    pandas.DataFrame
        One row per frame with the columns ``file`` (``Path`` of the frame),
        ``root`` (name of the folder directly containing the frame),
        ``basefile`` (file name), ``sequence`` (integer after the last ``_``)
        and ``basename`` (text before the last ``_``). Rows are sorted by
        ``root``, ``basename`` and ``sequence``; the index is not reset.

    Raises
    ------
    ValueError
        If the sequence part of a file name is not an integer.
    """
    # Join with Path so the result does not depend on frame_dir ending in a slash
    split_dir = Path(frame_dir) / usage
    df = pd.DataFrame(list(split_dir.glob('**/*.jpg')), columns=['file'])

    df['root'] = df['file'].apply(lambda p: p.parent.name)
    df['basefile'] = df['file'].apply(lambda p: p.name)

    # Split on the LAST underscore of the stem so that extra underscores in the
    # basename do not break the integer conversion of the sequence number.
    stems = df['file'].apply(lambda p: p.stem)
    df['sequence'] = stems.apply(lambda s: int(s.rpartition('_')[2]))
    df['basename'] = stems.apply(lambda s: s.rpartition('_')[0] if '_' in s else s)

    # Numeric sort on sequence keeps frame 10 after frame 9 within each clip
    return df.sort_values(['root', 'basename', 'sequence'])

In [5]:
# Build one frame index per split; the folder names are train / test / validation
df_train, df_test, df_val = (
    get_data_frames(frame_dir, split) for split in ('train', 'test', 'validation')
)

In [6]:
# Give each sorted frame a fresh 0..n-1 index, discarding the old one
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [7]:
# Derive the integer label from the folder name by stripping the letter 'b' (e.g. 'b3' -> 3)
for split_df in (df_train, df_test, df_val):
    split_df['label'] = split_df['root'].str.replace('b', '', regex=False).astype(int)

In [8]:
# A negative n makes head() return every row except the last 10
df_train.head(-10)

Unnamed: 0,file,root,basefile,sequence,basename,label
0,../../data/DAiSEE/2FPS/data/train/b0/110001100...,b0,1100011002_1.jpg,1,1100011002,0
1,../../data/DAiSEE/2FPS/data/train/b0/110001100...,b0,1100011002_2.jpg,2,1100011002,0
2,../../data/DAiSEE/2FPS/data/train/b0/110001100...,b0,1100011002_3.jpg,3,1100011002,0
3,../../data/DAiSEE/2FPS/data/train/b0/110001100...,b0,1100011002_4.jpg,4,1100011002,0
4,../../data/DAiSEE/2FPS/data/train/b0/110001100...,b0,1100011002_5.jpg,5,1100011002,0
...,...,...,...,...,...,...
107145,../../data/DAiSEE/2FPS/data/train/b3/459999021...,b3,4599990211_6.jpg,6,4599990211,3
107146,../../data/DAiSEE/2FPS/data/train/b3/459999021...,b3,4599990211_7.jpg,7,4599990211,3
107147,../../data/DAiSEE/2FPS/data/train/b3/459999021...,b3,4599990211_8.jpg,8,4599990211,3
107148,../../data/DAiSEE/2FPS/data/train/b3/459999021...,b3,4599990211_9.jpg,9,4599990211,3


In [9]:
# Row counts are expected to be whole multiples of 20, so each quotient should end in .0
for split_df in (df_train, df_test, df_val):
    n_rows = split_df.shape[0]
    print(split_df.shape, 'divided by 20=', n_rows/20)

(107160, 6) divided by 20= 5358.0
(35680, 6) divided by 20= 1784.0
(28580, 6) divided by 20= 1429.0


In [10]:
def save_array(df, usage, out_dir=None, num_classes=4):
    """Save the frame paths and labels of one split as ``.npy`` files.

    Three files are written into the output folder, each named with the
    lower-cased ``usage``:

    * ``x_<usage>.npy``     - frame paths as plain strings
    * ``y_cat_<usage>.npy`` - labels one-hot encoded with ``to_categorical``
    * ``y_lab_<usage>.npy`` - integer labels

    Parameters
    ----------
    df : pandas.DataFrame
        Frame index with a ``file`` column (paths) and a ``label`` column.
    usage : str
        Split name used in the file names (lower-cased before use).
    out_dir : str or path-like, optional
        Destination folder. Defaults to the notebook-level ``frame_dir``.
    num_classes : int, optional
        Number of classes passed to ``to_categorical`` (default 4).
    """
    target_dir = frame_dir if out_dir is None else out_dir
    split = usage.lower()

    # Store paths as a plain unicode array instead of pickled pathlib objects:
    # a pickled PosixPath cannot be loaded on Windows (and vice versa), and a
    # string array can be read back without allow_pickle.
    filepath = np.asarray([os.fspath(p) for p in df['file']], dtype=str)
    y_arr = df['label'].to_numpy()
    y_arr_cat = to_categorical(y_arr, num_classes=num_classes)

    # os.path.join avoids the doubled slash when target_dir already ends in '/';
    # np.save appends the '.npy' extension itself.
    np.save(os.path.join(target_dir, f"x_{split}"), filepath, allow_pickle=True)
    np.save(os.path.join(target_dir, f"y_cat_{split}"), y_arr_cat, allow_pickle=True)
    np.save(os.path.join(target_dir, f"y_lab_{split}"), y_arr, allow_pickle=True)

In [11]:
# Write the path/label arrays for every split; the validation split is stored under the short name 'val'
for split_df, split_name in ((df_train, 'train'), (df_test, 'test'), (df_val, 'val')):
    save_array(split_df, split_name)

# Save arrays with the image pixels stored in the array (takes time)

In [None]:
def save_array_images(frame_dir, usage, size=(160, 160)):
    """Load every frame of one split, resize it and save all pixels in one array.

    Reads the path array ``<frame_dir>x_<usage>.npy``, opens each listed image,
    resizes it to ``size`` and writes the stacked result to
    ``<frame_dir>X_<usage>_img.npy``.

    Parameters
    ----------
    frame_dir : str
        Folder prefix holding the ``.npy`` files; it is concatenated directly
        with the file names, so it must end with a path separator.
    usage : str
        Split name used in the file names (e.g. ``'train'``).
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``Image.resize`` (default 160x160).

    Raises
    ------
    ValueError
        If a resized frame does not have the same array shape as the first
        frame (for example because the image modes differ).
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted file. It is kept because the path array may be an object array.
    file_arr = np.load(frame_dir + 'x_' + usage + '.npy', allow_pickle=True)
    file_str = file_arr.astype(str)

    def _load_frame(fname):
        # The context manager closes the underlying file as soon as the pixels are read
        with Image.open(fname) as img:
            return np.array(img.resize(size))

    if len(file_str) == 0:
        x = np.array([])
    else:
        # Preallocate the output and fill it frame by frame. Collecting all frames
        # in a list first would hold the whole dataset in memory twice.
        first = _load_frame(file_str[0])
        x = np.empty((len(file_str),) + first.shape, dtype=first.dtype)
        x[0] = first
        for i, fname in enumerate(file_str[1:], start=1):
            frame = _load_frame(fname)
            if frame.shape != first.shape:
                raise ValueError(
                    f"{fname}: frame shape {frame.shape} differs from {first.shape}"
                )
            x[i] = frame

    np.save(frame_dir + 'X_' + usage + '_img.npy', x, allow_pickle=True)

In [None]:
# Materialise the resized image arrays for each split (slow: every frame is opened)
for split_name in ('train', 'test', 'val'):
    save_array_images(frame_dir, split_name)