This notebook processes the Gesture dataset and produces the input files for the GRU-D models.

In [1]:
import os 
import numpy as np
import pandas as pd
from python_linq import From

In [2]:
# Seed NumPy's global random generator so the random draws made later in the
# notebook are reproducible from one run to the next.
seed = 42
np.random.seed(seed)

In [3]:
# Folder containing the raw Gesture CSV files (note the trailing path separator).
gesture_root_folder = 'gesture_datas/'
# This should be the [WD] (working directory) of the GRU-D project.
grud_working_folder = '.'
grud_gesture_data_folder = os.path.join(grud_working_folder, 'data', 'gesture')
# exist_ok=True creates the folder only when it is missing, so no separate
# existence check is needed.
os.makedirs(grud_gesture_data_folder, exist_ok=True)
print('The data files will be converted and saved here: ', grud_gesture_data_folder)

The data files will be converted and saved here:  ./data/gesture


In [4]:
def Load_data(path):
    """Load the selected CSV files found in ``path`` into one DataFrame.

    A file is selected when its name ends in ``csv`` and its fifth character
    from the end is ``w`` (for example a name ending in ``w.csv``).  Each
    selected file is read exactly once, in sorted file-name order so the row
    order is reproducible.  Rows whose ``phase`` equals ``'Preparação'`` are
    dropped.

    Parameters
    ----------
    path : str
        Folder holding the CSV files, with or without a trailing separator.

    Returns
    -------
    pandas.DataFrame
        The rows of all selected files stacked on top of each other.  Each
        file keeps its own row index, so index labels may repeat.

    Raises
    ------
    FileNotFoundError
        If ``path`` contains no file matching the selection rule.
    """
    files_to_load = sorted(
        name for name in os.listdir(path)
        # The length guard keeps name[-5] from failing on very short names.
        if len(name) >= 5 and name[-3:] == 'csv' and name[-5] == 'w'
    )
    if not files_to_load:
        raise FileNotFoundError('No matching CSV file found in ' + repr(path))
    # Read every file once and concatenate in a single call: this avoids
    # loading the first file twice and the removed DataFrame.append method.
    frames = [pd.read_csv(os.path.join(path, name), sep=',') for name in files_to_load]
    df_Big = pd.concat(frames)
    df_Big = df_Big.loc[df_Big['phase'] != 'Preparação']
    return df_Big
# Load the selected CSV files from the dataset folder into a single DataFrame.
dataframe = Load_data(gesture_root_folder)

In [5]:
def Truncate(df, max_run=30, min_run=7):
    """Thin out ``df`` by keeping at most one row per run of same-phase rows.

    Starting at the first row, the function looks ahead while the ``phase``
    value is unchanged, advancing at most ``max_run`` rows.  The row that
    starts the run is kept when the look-ahead advanced more than ``min_run``
    rows.  Scanning then resumes at the row where the look-ahead stopped, and
    ends once the look-ahead has reached the last row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``phase`` column; rows are processed in positional order.
    max_run : int, optional
        Maximum number of rows the look-ahead may advance (default 30).
    min_run : int, optional
        A starting row is kept only if the look-ahead advanced strictly more
        than this many rows (default 7).

    Returns
    -------
    pandas.DataFrame
        The kept rows, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``max_run`` is smaller than 1 (the scan could never advance).
    """
    if max_run < 1:
        raise ValueError('max_run must be at least 1')
    # Pull the phase column out once instead of one iloc lookup per row.
    phases = df['phase'].tolist()
    nb_rows = len(phases)
    kept_positions = []
    k = 0
    while k < nb_rows:
        phase_considered = phases[k]
        i = 0
        # Advance until the phase changes, max_run is reached, or there is no
        # next row.  i always moves by at least one step when a next row
        # exists, so the outer loop cannot stall (even on NaN phases).
        while i < max_run and k + i + 1 < nb_rows:
            i += 1
            if phases[k + i] != phase_considered:
                break
        if i > min_run:
            kept_positions.append(k)
        if k + i + 1 < nb_rows:
            k += i
        else:
            # End of the dataframe
            break
    # Select all kept rows at once; this preserves the column dtypes and
    # avoids growing a DataFrame row by row.
    return df.iloc[kept_positions].reset_index(drop=True)
# Thin out the loaded rows: at most one row is kept per run of same-phase rows.
df_truncated = Truncate(dataframe)

In [6]:
def Creat_missing_values(df, nb_rows_to_del=1):
    """Return a copy of ``df`` in which randomly chosen rows are set to NaN.

    Exactly ``nb_rows_to_del`` distinct rows are drawn (without replacement)
    from NumPy's global random state.  In each drawn row every column except
    ``timestamp`` is replaced by NaN; the ``timestamp`` column is left
    untouched.  ``df`` itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.  Rows are addressed by position, so any index works.
    nb_rows_to_del : int, optional
        Number of rows to blank out (default 1).  When it is not strictly
        between 0 and the number of rows, an unchanged copy is returned.

    Returns
    -------
    pandas.DataFrame
        The copy with the selected rows blanked out.
    """
    nb_rows = df.shape[0]
    df_copy = df.copy()
    if not 0 < nb_rows_to_del < nb_rows:
        return df_copy
    # Draw all row positions at once, without replacement, so the requested
    # number of distinct rows is always blanked and every row is eligible.
    rows_to_blank = np.random.choice(nb_rows, size=nb_rows_to_del, replace=False)
    # Positions of every column except 'timestamp', which must survive.
    cols_to_blank = np.flatnonzero(df_copy.columns != 'timestamp')
    df_copy.iloc[rows_to_blank, cols_to_blank] = np.nan
    return df_copy
# Blank out randomly chosen row(s) (one by default) to create missing values.
df_truncated_missing_data = Creat_missing_values(df_truncated)

In [7]:
def Creat_mask(df):
    """Build the observation mask of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing (NaN/None) entries.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df`` holding 1 where the
        value is present and 0 where it is missing.
    """
    # Work on the argument itself rather than on a notebook-level variable,
    # so the function gives the right mask for whatever frame it is handed.
    return df.notnull().astype(int)
# Build the mask: 1 where a value is present, 0 where it is missing.
Mask = Creat_mask(df_truncated_missing_data)

In [8]:
# Preview the columns 'lhx' through 'rwz' as a NumPy array.
df_truncated_missing_data.loc[:,'lhx':'rwz'].values


array([[5.347435, 4.363681, 1.501913, ..., 5.553945, 4.370456, 1.553521],
       [5.17276 , 4.29817 , 1.539378, ..., 5.101995, 4.275612, 1.545337],
       [4.900373, 4.314654, 1.532006, ..., 5.195107, 4.343143, 1.542306],
       ...,
       [2.020312, 4.206409, 2.098202, ..., 3.19627 , 3.894011, 2.259658],
       [2.066605, 4.205699, 2.088519, ..., 3.289339, 3.917616, 2.255383],
       [1.775075, 4.861233, 2.234605, ..., 3.33368 , 4.413923, 2.293885]])

In [9]:
# Timestamp column of the rows that will be saved.
timestamp = df_truncated_missing_data.timestamp
# The mask is cut to the same 'lhx'..'rwz' columns as the input and converted
# to a NumPy array, so 'input' and 'masking' have the same shape and type.
processed_data = {
    'input': df_truncated_missing_data.loc[:, 'lhx':'rwz'].values,
    'masking': Mask.loc[:, 'lhx':'rwz'].values
}
# Number of rows.
n = len(timestamp)

In [10]:
# Label section: the phase names are stored under the 'label_phase' key.
# NOTE(review): this saves the list of phase names, not one label per row;
# confirm that this is what the GRU-D loader expects.
processed_label = dict(
    label_phase=['Rest', 'Preparation', 'Hold', 'Stroke', 'Retraction'],
)
phase = processed_label['label_phase']

In [11]:
# Write the timestamps, the processed data and the labels to one compressed
# .npz archive in the GRU-D data folder.
np.savez_compressed(
    os.path.join(grud_gesture_data_folder, 'data.npz'),
    timestamp=timestamp,
    **processed_data,
    **processed_label
)

In [12]:
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
#https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data

In [33]:
# https://stackoverflow.com/questions/45115964/separate-pandas-dataframe-using-sklearns-kfold
# Split the rows in time order; only the first of the five splits is used here.
ts = TimeSeriesSplit(n_splits=5)
# Each item produced by split() is a (train indices, test indices) pair.
result = next(ts.split(processed_data['input']))
train_index, test_index = result
print(len(result))

# Index with the matching half of the pair: the train rows come from the
# train indices, the test rows from the test indices.
train = processed_data['input'][train_index]
test = processed_data['input'][test_index]

2
[[3.346799 2.259089 1.637492 ... 6.801356 1.902031 1.71084 ]
 [5.235381 3.599986 1.471026 ... 4.911345 3.791246 1.513937]
 [5.551727 4.258776 1.515576 ... 5.477768 4.145972 1.523574]
 ...
 [2.431286 3.679767 1.906436 ... 3.67384  2.37312  2.261567]
 [2.187767 3.539126 1.935147 ... 3.882411 2.075931 2.243435]
 [1.443359 5.224425 2.047259 ... 3.75924  4.946609 2.183301]]


In [None]:
#kf = TimeSeriesSplit(n_splits=2)
#for train, test in kf.split(processed_data['input']):
     #print("train:%s \ntest:%s" % (train, test))

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25)

In [None]:
#processed_fold = {
#    'fold_phase': fold_data['folds_ep_pha'][0][0]    
#}

In [None]:
#np.savez_compressed(os.path.join(grud_gesture_data_folder, 'fold.npz'), **processed_fold)