In [None]:
import os
import pickle
import datetime
import pandas as pd
import numpy as np
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Reshape, Permute
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D, Conv2D, DepthwiseConv2D, SeparableConv2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.backend import clear_session
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy import signal

# Action-unit (AU) intensity columns used as features, mapped to a short
# description of the facial movement and the face region (upper / lower).
au_columns_name = {
    'AU01_r': 'Inner brow raiser, upper',
    'AU02_r': 'Outer brow raiser, upper',
    'AU04_r': 'Brow lowerer, upper',
    'AU05_r': 'Upper lid raiser, upper',
    'AU06_r': 'Cheekraiser, upper',
    'AU07_r': 'Lid tightener, upper',
    'AU09_r': 'Nose wrinkler, lower',
    'AU10_r': 'Upper lip raiser, lower',
    'AU12_r': 'Lip corner puller, lower',
    'AU14_r': 'Dimpler, lower',
    'AU15_r': 'Lip corner depressor, lower',
    'AU17_r': 'Chin raiser, lower',
    'AU20_r': 'Lipstretcher, lower',
    'AU23_r': 'Lip tightener, lower',
    'AU25_r': 'Lips part, lower',
    'AU26_r': 'Jaw drop, lower',
    'AU45_r': 'Blink, upper',
}

def _lookup_fps(fps_dataset, subject, study, paradigm):
    """Return the FPS listed for one subject/study/paradigm recording."""
    # Compare as strings (case-insensitively for study/paradigm) so the lookup
    # works whether the CSV stores subjects as ints or strings and regardless
    # of the case the caller used for the paradigm name.
    matches = (
        (fps_dataset['Subject'].astype(str) == str(subject))
        & (fps_dataset['Study'].astype(str).str.upper() == str(study).upper())
        & (fps_dataset['Paradigm'].astype(str).str.upper() == str(paradigm).upper())
    )
    fps_values = fps_dataset.loc[matches, 'FPS']
    if fps_values.empty:
        raise ValueError(
            "No FPS entry for subject={!r}, study={!r}, paradigm={!r}".format(subject, study, paradigm))
    return int(fps_values.iloc[0])


def _read_csv_with_fallback(primary_path, fallback_path, **read_csv_kwargs):
    """Read ``primary_path``; if that file does not exist, read ``fallback_path``."""
    try:
        return pd.read_csv(primary_path, **read_csv_kwargs)
    except FileNotFoundError:
        # Only a missing file triggers the fallback; parse errors in an existing
        # file are real problems and must not be hidden behind a second attempt.
        return pd.read_csv(fallback_path, **read_csv_kwargs)


def generate_dataset(subjects, studies, paradigms, max_sequences=58, stack=False):
    """Append resampled action-unit windows and their trial codes to the globals ``X`` and ``Y``.

    For every subject/study/paradigm combination the trial table
    (``../dataset/csv_labels/...``) and the action-unit table (``../dataset/au/...``)
    are read. For each of the first 50 trials (fewer if the trial table is
    shorter), the rows from ``trial_start`` up to 1.5 seconds later (at that
    recording's own frame rate) are taken from the AU columns listed in
    ``au_columns_name`` and resampled with ``scipy.signal.resample`` to
    ``int(1.5 * max_fps)`` samples, where ``max_fps`` is the largest value in the
    ``FPS`` column of ``FPS_of_stutter_dataset.csv``. Every window therefore has
    the same length regardless of the recording's frame rate.

    Args:
        subjects: Iterable of subject identifiers (used as directory names).
        studies: Iterable of study identifiers (used as directory names).
        paradigms: Iterable of paradigm names; both lower- and upper-case
            directory spellings are tried.
        max_sequences: Not used by this function; kept so existing callers
            that pass it keep working.
        stack: When true, the global ``X`` is replaced by a single array built
            from all collected windows, with the trial axis first.

    Returns:
        None. Results are appended to the module-level lists ``X`` (float32
        arrays, one per trial) and ``Y`` (the trial's ``code`` value).

    Raises:
        ValueError: If the FPS table has no row for a requested recording.
    """
    global X
    global Y

    fps_dataset = pd.read_csv('../dataset/csv_labels/FPS_of_stutter_dataset.csv', index_col=0)
    # Target length is the same for every recording, so compute it once.
    max_fps = int(max(fps_dataset['FPS']))
    resampled_length = int(1.5 * max_fps)
    au_columns = sorted(au_columns_name)

    for subject in subjects:
        for study in studies:
            for paradigm in paradigms:
                print("{}/{}/{}".format(subject, study, paradigm))
                # Frame rate of THIS recording; it determines how many source
                # frames make up the 1.5 s window.
                fps = _lookup_fps(fps_dataset, subject, study, paradigm)

                trial_file = ('_').join(['trial', 'frames', subject, study.lower(), paradigm.lower()]) + '.csv'
                df_with_trials = _read_csv_with_fallback(
                    ('/').join(['../dataset', 'csv_labels', subject, study, paradigm.lower(), trial_file]),
                    ('/').join(['../dataset', 'csv_labels', subject, study, paradigm.upper(), trial_file]),
                    sep=',')

                # ', ' is a multi-character separator, which only the python
                # parser engine supports; request it explicitly.
                full_au_dataset = _read_csv_with_fallback(
                    ('/').join(['../dataset', 'au', subject, study, paradigm.lower(),
                                ('_').join([subject, paradigm.lower()])]) + '.csv',
                    ('/').join(['../dataset', 'au', subject, study, paradigm.upper(),
                                ('_').join([subject, paradigm.upper()])]) + '.csv',
                    sep=', ', engine='python')

                just_aus = full_au_dataset[au_columns]

                # At most the first 50 trials of each recording are used.
                for i in range(min(50, len(df_with_trials))):
                    window_start = int(df_with_trials['trial_start'].iloc[i])
                    window_end = int(window_start + 1.5 * fps)
                    window = just_aus.iloc[window_start:window_end].to_numpy(dtype=float)
                    # Resample every AU column along the time axis to the common length.
                    resampled = signal.resample(window, resampled_length, axis=0)

                    # Round the values to 3 decimals (ablation study for preprocessing maybe?)
                    signal_array = np.asarray(np.round(resampled, 3), np.float32)

                    X.append(signal_array)
                    Y.append(df_with_trials['code'].iloc[i])

    if stack:
        # Moving axes to DL format NHWC. However, here, we don't consider 'c (channels)'
        # because we do the permute and reshape operations during training in Keras.
        # The only change still needed to this X array is a new axis for the
        # channels to make it 4D NHWC or NCHW.
        X = np.dstack(X)
        X = np.moveaxis(X, [0, 1], [2, 1])

def stack_dataset(X):
    """Combine a sequence of per-trial arrays into one array with the trial axis first.

    The arrays are stacked along a new third axis with ``np.dstack`` and the
    axes are then rearranged so that, for 2-D inputs of shape ``(a, b)``, the
    result has shape ``(n_trials, b, a)``. No channel axis is added here; that
    is left to the permute/reshape layers applied at training time.
    """
    depth_stacked = np.dstack(X)
    return np.moveaxis(depth_stacked, source=(0, 1), destination=(2, 1))

def correct_dataset_labels(_X, _Y):
    """Collapse the labels to two classes and drop trials with unwanted labels.

    Label 0 is kept as 0, labels strictly between 0 and 4 become 1, and trials
    whose label is greater than 3 (and was not relabelled) are removed from both
    the samples and the labels. Any other label value is passed through as is.

    The inputs are not modified: the relabelling is done on a copy of ``_Y``,
    so the caller's original labels remain available afterwards.

    Args:
        _X: Sequence (or array) of samples, indexed in step with ``_Y``.
        _Y: Sequence (or array) of numeric labels.

    Returns:
        Tuple ``(samples, labels)`` of NumPy arrays containing only the kept trials.
    """
    labels = np.array(_Y)  # np.array copies, so the caller's labels stay intact
    relabel_to_one = (labels > 0) & (labels < 4)
    # Mirrors the if/elif of the relabelling rule: a value is only discarded
    # when it did not already qualify for relabelling.
    unwanted = (labels > 3) & ~relabel_to_one
    labels[relabel_to_one] = 1

    keep_flags = (~unwanted).tolist()
    # Samples beyond the end of the label list have no label to reject them.
    _X_ = [v for i, v in enumerate(_X) if i >= len(keep_flags) or keep_flags[i]]
    _Y_ = labels[~unwanted]
    return (np.asarray(_X_), np.asarray(_Y_))

# Module-level accumulators that generate_dataset appends to:
# X collects one array per trial, Y the matching trial code.
X, Y = [], []

# Upsampled dataset for 58 fps.
# This value is handed to generate_dataset as its max_sequences argument.
max_sequences = 442

In [None]:
# Subject 971, studies S3 and S5: all eight paradigms.
subjects, studies = ['971'], ['S3', 'S5']
paradigms = ['caw1', 'caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
generate_dataset(subjects=subjects, studies=studies, paradigms=paradigms, max_sequences=max_sequences)

In [None]:
# Subject 982, study S1: seven paradigms ('wag1' is not in the list).
subjects, studies = ['982'], ['S1']
paradigms = ['caw1', 'caw2', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
generate_dataset(subjects=subjects, studies=studies, paradigms=paradigms, max_sequences=max_sequences)

In [None]:
# Subject 982, study S2: only the five paradigms listed below.
subjects, studies = ['982'], ['S2']
paradigms = ['caw2', 'wag1', 'wag2', 'cw2', 'wg1']
generate_dataset(subjects=subjects, studies=studies, paradigms=paradigms, max_sequences=max_sequences)

In [None]:
# Subject 982, study S3: all eight paradigms.
subjects, studies = ['982'], ['S3']
paradigms = ['caw1', 'caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
generate_dataset(subjects=subjects, studies=studies, paradigms=paradigms, max_sequences=max_sequences)

In [None]:
# Subject 1214, study S1: all eight paradigms.
subjects, studies = ['1214'], ['S1']
paradigms = ['caw1', 'caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
generate_dataset(subjects=subjects, studies=studies, paradigms=paradigms, max_sequences=max_sequences)

In [None]:
# Convert the accumulated lists into arrays: the trial codes become an int32
# vector and the per-trial windows are stacked into a single array.
Y_array = np.asarray(Y, dtype=np.int32)
X_array = stack_dataset(X)

In [None]:
# Create label corrected dataset.
# A copy of the labels is handed over so that Y_array, which is pickled later
# as the raw labels, cannot be altered by the correction step.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array.copy())

print(X_array_corrected.shape)
# Two bins over [0, 1]: the first bin counts label 0 (fluent) and the last bin
# counts label 1 (stutter). Fixing bins and range keeps the counts correct even
# when only one of the two classes is present.
Y_hist = np.histogram(Y_array_corrected, bins=2, range=(0, 1))
fluent_count, stutter_count = Y_hist[0][0], Y_hist[0][-1]
Y_hist_sum = fluent_count + stutter_count
# Avoid dividing by zero when no trial is left after the correction.
fluent_percent = 100*(fluent_count/Y_hist_sum) if Y_hist_sum else 0.0
stutter_percent = 100*(stutter_count/Y_hist_sum) if Y_hist_sum else 0.0
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(fluent_count, fluent_percent, stutter_count, stutter_percent))

In [None]:
# Persist the arrays as float32 pickles.
# Raw data (labels as produced by the stacking step).
with open('../dataset/pickled_datasets/X_array_raw_upsampled_S1S2.pkl', 'wb') as f:
    pickle.dump(np.asarray(X_array, dtype=np.float32), f)
with open('../dataset/pickled_datasets/Y_array_raw_upsampled_S1S2.pkl', 'wb') as f:
    pickle.dump(np.asarray(Y_array, dtype=np.float32), f)

# Label corrected data.
with open('../dataset/pickled_datasets/X_array_corrected_upsampled_S1S2.pkl', 'wb') as f:
    pickle.dump(np.asarray(X_array_corrected, dtype=np.float32), f)
with open('../dataset/pickled_datasets/Y_array_corrected_upsampled_S1S2.pkl', 'wb') as f:
    pickle.dump(np.asarray(Y_array_corrected, dtype=np.float32), f)