In [1]:
import os 
import numpy as np
import pandas as pd
from python_linq import From
from sklearn.model_selection import TimeSeriesSplit

In [2]:
seed = 42
np.random.seed(seed=seed)

In [3]:
gesture_root_folder = os.path.join('gesture_datas/')
grud_working_folder = '.' # # This should be the [WD] (working directory) of the GRU-D project.
grud_gesture_data_folder = os.path.join(grud_working_folder, 'data', 'gesture')
if not os.path.exists(grud_gesture_data_folder):
    os.makedirs(grud_gesture_data_folder)
print('The data files will be converted and saved here: ', grud_gesture_data_folder)

The data files will be converted and saved here:  ./data/gesture


In [4]:
def Load_data(path):
    all_files = os.listdir(path)
    files_to_load = From(all_files).where(lambda file: file[-3:]=='csv').where(lambda file: file[-5]=='w').toList()
    all_df = np.empty([len(files_to_load)], dtype=object)
    for i,file in enumerate(files_to_load):
        df_file = pd.read_csv(path + file, sep=',')
        index_to_change = df_file.loc[df_file['phase']=='Preparação'].index
        if index_to_change.size == 1:
            df_file = df_file.loc[df_file['phase']!='Preparação']
        all_df[i] = df_file
    return all_df
all_df = Load_data(gesture_root_folder)

In [5]:
def Truncate(df):
    k = 0
    nb_rows = df.shape[0]
    df_to_fill = pd.DataFrame(columns=df.columns.values.tolist())
    while k < nb_rows:    
        row_considered = df.iloc[[k]]
        phase_considered = row_considered['phase'].values.tolist()[0]
        phase_tmp = phase_considered
        #print(phase_considered)
        #print(phase_tmp)
        i = 0
        while phase_tmp == phase_considered and i<30:
            if k + i + 1 < nb_rows:
                # Verifying if there is a next value
                i += 1
                phase_tmp = df.iloc[[i+k]]['phase'].values.tolist()[0]
            else:
                # End of while
                break
        if i > 7:
            # Add the row considered            
            df_to_fill = df_to_fill.append(row_considered)            
        if k + i + 1 < nb_rows:
            k += i
        else:
            # End of the dataframe
            break
    df_to_fill = df_to_fill.reset_index(drop=True)
    return df_to_fill
all_df_truncated = [Truncate(dataframe) for dataframe in all_df]

In [6]:
def Creat_missing_values(df, nb_rows_to_del=1):
    nb_rows = df.shape[0]
    nb_columns = df.shape[1]
    index_still = np.arange(nb_rows)
    df_copy = df.copy()
    if nb_rows_to_del < nb_rows:
        while nb_rows-len(index_still)<nb_rows_to_del:
            index_del = np.random.randint(len(index_still)-1)
            timestamp_saved = df.iloc[index_del].__getitem__('timestamp')
            df_copy.iloc[index_del] = [np.nan for i in range(nb_columns)]
            df_copy.loc[index_del, 'timestamp'] = timestamp_saved
            index_still = np.delete(index_still,index_del)
    return df_copy
all_df_truncated_missing_data = [Creat_missing_values(df) for df in all_df_truncated]

In [7]:
def Creat_mask(df):
    mask = df.copy()
    mask = mask.where(mask.isnull(),1)
    mask = mask.where(mask.isnull()!=True,0)
    return mask
all_Mask = [Creat_mask(df) for df in all_df_truncated_missing_data]

In [8]:
all_timestamp = [df.timestamp.values for df in all_df_truncated_missing_data]
processed_data = {
    'input': np.asarray([df.loc[:,'lhx':'rwz'].values for df in all_df_truncated_missing_data]),
    'masking': np.asarray([Mask.loc[:,'lhx':'rwz'].values for Mask in all_Mask])
}

In [9]:
# process label
#phase = ['Rest', 'Preparation', 'Hold', 'Stroke', 'Retraction']
processed_label = {
    'label_phase': np.asarray([df['phase'].values for df in all_df_truncated_missing_data])
}

In [10]:
processed_label['label_phase'][0]

array(['Rest', 'Rest', 'Preparation', 'Preparation', 'Hold', 'Stroke',
       'Preparation', 'Retraction', 'Rest', 'Preparation', 'Preparation',
       'Hold', 'Hold', 'Preparation', 'Hold', 'Hold', 'Preparation',
       'Hold', 'Retraction', 'Preparation', 'Stroke', 'Stroke',
       'Preparation', 'Preparation', 'Preparation', 'Stroke', 'Hold',
       'Preparation', 'Hold', 'Preparation', 'Stroke', 'Retraction',
       'Rest', 'Preparation', 'Stroke', 'Preparation', 'Stroke', 'Stroke',
       nan, 'Hold', 'Preparation', 'Preparation', 'Stroke', 'Preparation',
       'Stroke', 'Hold', 'Stroke', 'Stroke', 'Preparation', 'Stroke',
       'Preparation', 'Hold', 'Stroke', 'Retraction'], dtype=object)

In [11]:
np.savez_compressed(os.path.join(grud_gesture_data_folder, 'data.npz'), timestamp=all_timestamp, **processed_data, **processed_label)

In [12]:
#n = processed_label['label_phase'].shape[0]
n = len(processed_label['label_phase'])
processed_fold = {
    'fold_phase': np.empty([n,5,3], dtype=object)
}
# process statistics
# n_labeltype, 1, n_fold, (ts, nts), (mean, std)
processed_stats = {
    'mean_phase': np.empty([n,5,3], dtype=object),
    'std_phase': np.empty([n,5,3], dtype=object)
}

In [13]:
ts = TimeSeriesSplit(n_splits=5)
val_rate = 0.2
for n, inputs in enumerate(processed_data['input']):
    for i,(train_index, test_index) in enumerate(ts.split(inputs)): 
        X_train, X_test = inputs[train_index], inputs[test_index]
        X_train, X_val = X_train[0:int(len(X_train)*(1-val_rate))], X_train[int(len(X_train)*(1-val_rate)):]

        processed_fold['fold_phase'][n][i][0] = np.asarray(train_index[0:int(len(X_train)*(1-val_rate))])
        processed_fold['fold_phase'][n][i][1] = np.asarray(train_index[int(len(X_train)*(1-val_rate)):])
        processed_fold['fold_phase'][n][i][2] = np.asarray(test_index)
        print(X_train.shape[1])
        processed_stats['mean_phase'][n][i][0] = np.asarray([np.mean(X_train[:,i]) for i in range(X_train.shape[1])])
        processed_stats['std_phase'][n][i][0] = np.asarray([np.std(X_train[:,i]) for i in range(X_train.shape[1])])

    

18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18


In [14]:
np.savez_compressed(os.path.join(grud_gesture_data_folder, 'fold.npz'), **processed_fold, **processed_stats)

In [15]:
data = np.load(os.path.join(grud_gesture_data_folder, 'data.npz'))
for k in data.keys():
    print(k, data[k].shape)
print(data['input'][1].shape)

fold = np.load(os.path.join(grud_gesture_data_folder, 'fold.npz'))
for k in fold.keys():
    print(k, fold[k].shape)
    for f in fold[k]:
        print('\t', [x.shape for x in f])

ValueError: Object arrays cannot be loaded when allow_pickle=False