We use this script to process the Gesture dataset and get the input files for the GRU-D models.

In [1]:
import os 
import numpy as np
import pandas as pd
from python_linq import From

In [2]:
seed = 42
np.random.seed(seed=seed)

In [3]:
gesture_root_folder = os.path.join('gesture_datas/')
grud_working_folder = '.' # # This should be the [WD] (working directory) of the GRU-D project.
grud_gesture_data_folder = os.path.join(grud_working_folder, 'data', 'gesture')
if not os.path.exists(grud_gesture_data_folder):
    os.makedirs(grud_gesture_data_folder)
print('The data files will be converted and saved here: ', grud_gesture_data_folder)

The data files will be converted and saved here:  ./data/gesture


In [4]:
def Load_data(path):
    all_files = os.listdir(path)
    files_to_load = From(all_files).where(lambda file: file[-3:]=='csv').where(lambda file: file[-5]=='w').toList()
    df_Big = pd.read_csv(path +files_to_load[0], sep=',')
    for file in files_to_load:
        df_file = pd.read_csv(path + file, sep=',')
        df_Big = df_Big.append(df_file)
    df_Big = df_Big.loc[df_Big['phase']!='Preparação']
    return df_Big
dataframe = Load_data(gesture_root_folder)

In [5]:
def Truncate(df):
    k = 0
    nb_rows = df.shape[0]
    df_to_fill = pd.DataFrame(columns=df.columns.values.tolist())
    while k < nb_rows:    
        row_considered = df.iloc[[k]]
        phase_considered = row_considered['phase'].values.tolist()[0]
        phase_tmp = phase_considered
        #print(phase_considered)
        #print(phase_tmp)
        i = 0
        while phase_tmp == phase_considered and i<30:
            if k + i + 1 < nb_rows:
                # Verifying if there is a next value
                i += 1
                phase_tmp = df.iloc[[i+k]]['phase'].values.tolist()[0]
            else:
                # End of while
                break
        if i > 7:
            # Add the row considered            
            df_to_fill = df_to_fill.append(row_considered)            
        if k + i + 1 < nb_rows:
            k += i
        else:
            # End of the dataframe
            break
    df_to_fill = df_to_fill.reset_index(drop=True)
    return df_to_fill
df_truncated = Truncate(dataframe)

In [6]:
def Creat_missing_values(df, nb_rows_to_del=1):
    nb_rows = df.shape[0]
    nb_columns = df.shape[1]
    index_still = np.arange(nb_rows)
    df_copy = df.copy()
    if nb_rows_to_del < nb_rows:
        while nb_rows-len(index_still)<nb_rows_to_del:
            index_del = np.random.randint(len(index_still)-1)
            timestamp_saved = df.iloc[index_del].__getitem__('timestamp')
            df_copy.iloc[index_del] = [np.nan for i in range(nb_columns)]
            df_copy.loc[index_del, 'timestamp'] = timestamp_saved
            index_still = np.delete(index_still,index_del)
    return df_copy
df_truncated_missing_data = Creat_missing_values(df_truncated)

In [7]:
def Creat_mask(df):
    mask = df_truncated_missing_data.copy()
    mask = mask.where(mask.isnull(),1)
    mask = mask.where(mask.isnull()!=True,0)
    return mask
Mask = Creat_mask(df_truncated_missing_data)

In [8]:
timestamp = df_truncated_missing_data.timestamp.values
processed_data = {
    'input': np.asarray(df_truncated_missing_data.loc[:,'lhx':'rwz'].values),
    'masking': np.asarray(Mask.loc[:,'lhx':'rwz'].values)
}
n = len(timestamp)

In [9]:
# process label
#phase = ['Rest', 'Preparation', 'Hold', 'Stroke', 'Retraction']
processed_label = {
    'label_phase': np.asarray(df_truncated_missing_data['phase'])
}

In [10]:
np.savez_compressed(os.path.join(grud_gesture_data_folder, 'data.npz'), timestamp=timestamp, **processed_data, **processed_label)

In [11]:
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
#https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data

In [12]:
#df_truncated_missing_data.loc[:,'lhx':'rwz']

In [13]:
processed_fold = {
    'fold_phase': np.empty([5,3], dtype=object)
}
# process statistics
# n_labeltype, 1, n_fold, (ts, nts), (mean, std)
processed_stats = {
    'mean_phase': np.empty([5,3], dtype=object),
    'std_phase': np.empty([5,3], dtype=object)
}

In [14]:
ts = TimeSeriesSplit(n_splits=5)
val_rate = 0.2
i=0
for train_index, test_index in ts.split(processed_data['input']): 
    X_train, X_test = processed_data['input'][train_index], processed_data['input'][test_index]
    X_train, X_val = X_train[0:int(len(X_train)*(1-val_rate))], X_train[int(len(X_train)*(1-val_rate)):]
    
    processed_fold['fold_phase'][i][0] = np.asarray(train_index[0:int(len(X_train)*(1-val_rate))])
    processed_fold['fold_phase'][i][1] = np.asarray(train_index[int(len(X_train)*(1-val_rate)):])
    processed_fold['fold_phase'][i][2] = np.asarray(test_index)
    
    processed_stats['mean_phase'][i][0] = np.asarray([np.mean(X_train[:,i]) for i in range(X_train.shape[1])])
    processed_stats['std_phase'][i][0] = np.asarray([np.std(X_train[:,i]) for i in range(X_train.shape[1])])
    i += 1


In [15]:
np.savez_compressed(os.path.join(grud_gesture_data_folder, 'fold.npz'), **processed_fold, **processed_stats)

In [16]:
data = np.load(os.path.join(grud_gesture_data_folder, 'data.npz'))
for k in data.keys():
    print(k, data[k].shape)

fold = np.load(os.path.join(grud_gesture_data_folder, 'fold.npz'))
for k in fold.keys():
    print(k, fold[k].shape)
    for f in fold[k]:
        print('\t', [x.shape for x in f])


timestamp (532,)
input (532, 18)
masking (532, 18)
label_phase (532,)
fold_phase (5, 3)
	 [(58,), (34,), (88,)]
	 [(115,), (65,), (88,)]
	 [(171,), (97,), (88,)]
	 [(227,), (129,), (88,)]
	 [(284,), (160,), (88,)]
mean_phase (5, 3)


AttributeError: 'NoneType' object has no attribute 'shape'