We use this script to process the Gesture dataset and get the input files for the GRU-D models.

In [1]:
import os 
import numpy as np
import pandas as pd
from python_linq import From

In [2]:
seed = 42
np.random.seed(seed=seed)

In [3]:
gesture_root_folder = os.path.join('gesture_datas/')
grud_working_folder = '.' # # This should be the [WD] (working directory) of the GRU-D project.
grud_gesture_data_folder = os.path.join(grud_working_folder, 'data', 'gesture')
if not os.path.exists(grud_gesture_data_folder):
    os.makedirs(grud_gesture_data_folder)
print('The data files will be converted and saved here: ', grud_gesture_data_folder)

The data files will be converted and saved here:  ./data/gesture


In [4]:
def Load_data(path):
    all_files = os.listdir(path)
    files_to_load = From(all_files).where(lambda file: file[-3:]=='csv').where(lambda file: file[-5]=='w').toList()
    df_Big = pd.read_csv(path +files_to_load[0], sep=',')
    for file in files_to_load:
        df_file = pd.read_csv(path + file, sep=',')
        df_Big = df_Big.append(df_file)
    df_Big = df_Big.loc[df_Big['phase']!='Preparação']
    return df_Big
dataframe = Load_data(gesture_root_folder)

In [5]:
def Truncate(df):
    k = 0
    nb_rows = df.shape[0]
    df_to_fill = pd.DataFrame(columns=df.columns.values.tolist())
    while k < nb_rows:    
        row_considered = df.iloc[[k]]
        phase_considered = row_considered['phase'].values.tolist()[0]
        phase_tmp = phase_considered
        #print(phase_considered)
        #print(phase_tmp)
        i = 0
        while phase_tmp == phase_considered and i<30:
            if k + i + 1 < nb_rows:
                # Verifying if there is a next value
                i += 1
                phase_tmp = df.iloc[[i+k]]['phase'].values.tolist()[0]
            else:
                # End of while
                break
        if i > 7:
            # Add the row considered            
            df_to_fill = df_to_fill.append(row_considered)            
        if k + i + 1 < nb_rows:
            k += i
        else:
            # End of the dataframe
            break
    df_to_fill = df_to_fill.reset_index(drop=True)
    return df_to_fill
df_truncated = Truncate(dataframe)

In [6]:
def Creat_missing_values(df, nb_rows_to_del=1):
    nb_rows = df.shape[0]
    nb_columns = df.shape[1]
    index_still = np.arange(nb_rows)
    df_copy = df.copy()
    if nb_rows_to_del < nb_rows:
        while nb_rows-len(index_still)<nb_rows_to_del:
            index_del = np.random.randint(len(index_still)-1)
            timestamp_saved = df.iloc[index_del].__getitem__('timestamp')
            df_copy.iloc[index_del] = [np.nan for i in range(nb_columns)]
            df_copy.loc[index_del, 'timestamp'] = timestamp_saved
            index_still = np.delete(index_still,index_del)
    return df_copy
df_truncated_missing_data = Creat_missing_values(df_truncated)

In [7]:
def Creat_mask(df):
    mask = df_truncated_missing_data.copy()
    mask = mask.where(mask.isnull(),1)
    mask = mask.where(mask.isnull()!=True,0)
    return mask
Mask = Creat_mask(df_truncated_missing_data)

In [8]:
timestamp = df_truncated_missing_data.timestamp
processed_data = {
    'input': df_truncated_missing_data.loc[:,'lhx':'rwz'].values,
    'masking': Mask
}
n = len(timestamp)

In [9]:
# process label
phase = ['Rest', 'Preparation', 'Hold', 'Stroke', 'Retraction']
processed_label = {
    'label_phase': phase
}

In [10]:
np.savez_compressed(os.path.join(grud_gesture_data_folder, 'data.npz'), timestamp=timestamp, **processed_data, **processed_label)

In [11]:
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
#https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-of-time-series-data

In [43]:
df_truncated_missing_data.loc[:,'lhx':'rwz']


Unnamed: 0,lhx,lhy,lhz,rhx,rhy,rhz,hx,hy,hz,sx,sy,sz,lwx,lwy,lwz,rwx,rwy,rwz
0,1.197867,5.502980,2.154376,3.937746,5.555298,2.194409,2.844247,1.282464,2.274105,2.693945,3.979356,2.300664,1.309632,5.117686,2.186277,3.868249,5.108600,2.228558
1,1.129344,5.582613,2.184588,3.901658,5.536760,2.178253,2.708008,1.279634,2.258140,2.685528,3.943174,2.296258,1.260791,5.146002,2.208834,3.830482,5.085307,2.210379
2,1.159948,5.519786,2.111937,3.897376,5.569611,2.182385,2.741405,1.279580,2.251838,2.629318,3.974289,2.279994,1.228727,5.084608,2.147404,3.809897,5.125341,2.212972
3,0.975070,5.593150,2.128695,4.126638,4.465309,2.019147,2.702959,1.276405,2.243343,2.546966,4.011563,2.254505,1.068170,5.142811,2.140317,3.949232,4.350619,2.091816
4,1.468093,3.148694,2.046527,2.159844,3.994637,1.936775,2.558908,1.290702,2.238736,2.537210,4.032292,2.253146,1.490683,2.995434,2.089815,2.487680,3.987506,1.981446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,3.848081,5.284687,1.793734,5.208730,0.860326,1.794496,4.571526,1.841547,1.958877,4.520376,3.935662,1.959966,3.697005,5.004112,1.839327,5.301485,1.126448,1.807827
514,3.611807,5.205160,1.813496,5.396915,0.658939,1.783259,4.409449,1.825602,1.940068,4.486669,3.922371,1.949062,3.639745,4.962801,1.844618,5.419861,0.948358,1.798431
515,3.602747,5.023941,1.837341,5.193390,1.100939,1.857540,4.283885,1.874449,1.915989,4.465778,3.968906,1.944897,3.589914,4.961304,1.856823,5.411085,1.416187,1.841328
516,3.611109,5.022792,1.832637,5.263934,1.167015,1.806657,4.553945,1.605447,1.942065,4.468688,3.966625,1.945066,3.600027,4.942451,1.853232,5.370014,1.446505,1.811720


In [44]:
df_truncated_missing_data.loc[:,'lhx':'rwz'].values[4]
#processed_data['input']

array([1.468093, 3.148694, 2.046527, 2.159844, 3.994637, 1.936775,
       2.558908, 1.290702, 2.238736, 2.53721 , 4.032292, 2.253146,
       1.490683, 2.995434, 2.089815, 2.48768 , 3.987506, 1.981446])

In [62]:
# https://stackoverflow.com/questions/45115964/separate-pandas-dataframe-using-sklearns-kfold
#added some parameters
kf = KFold(n_splits = 5, shuffle = False, random_state = 2)
#ts = TimeSeriesSplit(n_splits=5)
#result = next(ts.split(processed_data['input']), None)
#print(result)
#(array([0, 2, 3, 5, 6, 7, 8, 9]), array([1, 4]))
i=0
for train_index, test_index in kf.split(processed_data['input']):
    print (i)
    i = i+1
    #fold_data['folds_ep_pha'][0][0]   
    #print(processed_data['input'])
    print("train_index:",processed_data['input'][train_index])
    #print("test_index:",test_index)
    print("\n")

index = kf.split(processed_data['input'] )
#train = processed_data['input'][index[0]]

#test =  processed_data['input'][result[1]]
#print(train)
#print(test)

0
[[1.197867 5.50298  2.154376 ... 3.868249 5.1086   2.228558]
 [1.129344 5.582613 2.184588 ... 3.830482 5.085307 2.210379]
 [1.159948 5.519786 2.111937 ... 3.809897 5.125341 2.212972]
 ...
 [3.602747 5.023941 1.837341 ... 5.411085 1.416187 1.841328]
 [3.611109 5.022792 1.832637 ... 5.370014 1.446505 1.81172 ]
 [3.896576 5.386982 1.763772 ... 5.238472 4.971535 1.822406]]
train_index: [[1.465642 6.041111 2.094026 ... 3.030067 4.4999   1.985247]
 [1.77075  5.44022  1.995184 ... 3.229025 4.118286 2.011081]
 [1.145497 5.608553 2.150756 ... 3.049983 4.046341 2.011206]
 ...
 [3.602747 5.023941 1.837341 ... 5.411085 1.416187 1.841328]
 [3.611109 5.022792 1.832637 ... 5.370014 1.446505 1.81172 ]
 [3.896576 5.386982 1.763772 ... 5.238472 4.971535 1.822406]]


1
[[1.197867 5.50298  2.154376 ... 3.868249 5.1086   2.228558]
 [1.129344 5.582613 2.184588 ... 3.830482 5.085307 2.210379]
 [1.159948 5.519786 2.111937 ... 3.809897 5.125341 2.212972]
 ...
 [3.602747 5.023941 1.837341 ... 5.411085 1.41618

In [None]:
#kf = TimeSeriesSplit(n_splits=2)
#for train, test in kf.split(processed_data['input']):
     #print("train:%s \ntest:%s" % (train, test))

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25)

In [None]:
#processed_fold = {
#    'fold_phase': fold_data['folds_ep_pha'][0][0]    
#}

In [None]:
#np.savez_compressed(os.path.join(grud_gesture_data_folder, 'fold.npz'), **processed_fold)

In [64]:
data = {"name":[]}
data["name"].append(5)
print(data)

{'name': [5]}
