In [1]:
import numpy as np
import pandas as pd
import rpy2.robjects as robjects

In [2]:
# Load the R workspace file into the embedded R session, then pull the
# `armdata` object out of it and convert it to a NumPy array.
r_load = robjects.r['load']
r_load("armdata.RData")

matrix = robjects.r["armdata"]
armdata = np.array(matrix)

In [3]:
# Flatten the array into one row per (experiment, person, attempt).
# Coordinate columns are laid out as x1..x100, then y1..y100, then z1..z100.
df_coords = [f"{axis_name}{step}" for axis_name in ['x', 'y', 'z'] for step in range(1, 101)]

df_data = []
# np.ndindex walks the first three dimensions with the last index varying
# fastest, i.e. the same order as three nested loops.
for exp, person, attempt in np.ndindex(*armdata.shape[:3]):
    trajectory = armdata[exp, person, attempt]
    # Identifiers are stored 1-based; the transpose puts all values of the
    # first coordinate first, then the second, then the third.
    df_data.append([exp + 1, person + 1, attempt + 1, *trajectory.T.ravel()])

df = pd.DataFrame(df_data, columns=['experiment', 'person', 'attempt', *df_coords])

In [4]:
# encode experiment
# Map each experiment number (1..16) to a `d` value and an obstacle label.
# Experiments are grouped in consecutive triples: 1-3 share ds[0], 4-6 share
# ds[1], and so on; within a triple the labels cycle through `obstacles`.
ds = [15, 22.5, 30, 37.5, 45]
obstacles = ['S', 'M', 'T']

encoder_d = {}
encoder_o = {}
for i in range(1, 17):
    # Compute the triple index directly instead of searching ranges: the
    # previous range test excluded the third experiment of every triple and
    # only produced the right value because `d` leaked from the prior iteration.
    # Experiment 16 lies outside the 5x3 grid; it is clamped to the last `d`
    # value, which is what the earlier code yielded as well.
    # NOTE(review): confirm that (45, 'S') is the intended encoding for experiment 16.
    group = min((i - 1) // 3, len(ds) - 1)
    encoder_d[i] = ds[group]
    encoder_o[i] = obstacles[(i - 1) % 3]

In [5]:
# Translate every row's experiment number into its `d` value and obstacle
# label through the lookup dictionaries, then attach both as new columns.
df_ds = [encoder_d[exp_id] for exp_id in df['experiment']]
df_obstacles = [encoder_o[exp_id] for exp_id in df['experiment']]

df['d'] = df_ds
df['obstacle'] = df_obstacles

In [6]:
# Preview the first rows to confirm the identifier, coordinate, `d` and `obstacle` columns.
df.head()

Unnamed: 0,experiment,person,attempt,x1,x2,x3,x4,x5,x6,x7,...,z93,z94,z95,z96,z97,z98,z99,z100,d,obstacle
0,1,1,1,0.318606,0.29345,0.258842,0.223056,0.202482,0.163079,0.11737,...,22.866134,22.648898,22.603012,22.593056,22.564686,22.559457,22.568451,22.582079,15.0,S
1,1,1,2,-0.235394,-0.280031,-0.294954,-0.329996,-0.366376,-0.376393,-0.405935,...,22.666885,22.597741,22.59348,22.569628,22.583162,22.559361,22.561933,22.560902,15.0,S
2,1,1,3,-0.351842,-0.419376,-0.462213,-0.448826,-0.483149,-0.512998,-0.538375,...,22.679918,22.641539,22.629874,22.628294,22.625198,22.643443,22.638235,22.628298,15.0,S
3,1,1,4,-0.025704,-0.080457,-0.151805,-0.172176,-0.216299,-0.254781,-0.350717,...,22.618492,22.619046,22.611242,22.619689,22.614906,22.610536,22.602206,22.610013,15.0,S
4,1,1,5,-0.549467,-0.633789,-0.651556,-0.678097,-0.691002,-0.718224,-0.71172,...,22.685877,22.613237,22.64012,22.671474,22.623875,22.644248,22.656119,22.639856,15.0,S


In [7]:
# check for nans: display every row that has at least one missing value
has_nan = df.isna().any(axis=1)
df[has_nan]

Unnamed: 0,experiment,person,attempt,x1,x2,x3,x4,x5,x6,x7,...,z93,z94,z95,z96,z97,z98,z99,z100,d,obstacle
480,5,9,1,,-0.0307,-0.046479,-0.07702,-0.188686,-0.313984,-0.43716,...,22.711403,22.627799,22.610329,22.609082,22.607265,22.607739,22.608525,22.610557,22.5,M
681,7,9,2,,-0.11307,-0.173303,-0.209398,-0.228355,-0.267421,-0.258362,...,22.630605,22.631671,22.636193,22.639771,22.649747,22.658039,22.655762,22.65536,30.0,S
980,10,9,1,,,-0.497727,-0.477967,-0.413835,-0.356579,-0.247389,...,23.348723,22.970558,22.683275,22.602706,22.597603,22.601928,22.609632,22.616469,37.5,S
1080,11,9,1,,,-0.327297,-0.2966,-0.27124,-0.203538,-0.102154,...,22.615596,22.606932,22.610162,22.625601,22.642909,22.651995,22.654424,22.665316,37.5,M
1280,13,9,1,,,,,0.268572,0.395516,0.578227,...,22.671183,22.617488,22.618179,22.62796,22.632765,22.64872,22.648138,22.639687,45.0,S
1381,14,9,2,,,-0.532806,-0.533457,-0.502349,-0.360284,-0.110534,...,22.618193,22.613213,22.607329,22.617776,22.607098,22.601933,22.606414,22.608949,45.0,M


In [8]:
# Total number of missing cells: per-column NaN counts summed over all columns.
df.isna().sum().sum()

36

In [9]:
def missing_values_table(df):
    """Summarise which columns of ``df`` contain missing values.

    Prints the total number of columns and how many of them have at least
    one missing value, then returns the per-column breakdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has missing values, indexed by column name,
        with the columns 'Missing Values' (count) and '% of Total Values'
        (share of rows, rounded to one decimal), sorted by percentage in
        descending order.
    """
    # Total missing values (computed once and reused for the percentage)
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually have missing values. Filtering on the
    # count rather than the percentage avoids listing every column when the
    # frame has zero rows (0/0 yields NaN, and NaN != 0 is True).
    has_missing = mis_val_table_ren_columns['Missing Values'] > 0

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [10]:
# Report which columns contain NaNs and what share of rows is affected.
missing_values_table(df)

Your selected dataframe has 305 columns.
There are 12 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
x1,6,0.4
y1,6,0.4
z1,6,0.4
x2,4,0.2
y2,4,0.2
z2,4,0.2
x3,1,0.1
x4,1,0.1
y3,1,0.1
y4,1,0.1


In [11]:
# replace nans with next non-nan value
# Back-fill along each row, one coordinate axis at a time, so a gap in an
# x column is only ever filled from a later x column (likewise for y and z).
# Unlike a manual forward search, this cannot run past column 100: a NaN with
# no later value on its axis simply stays NaN and is caught by the NaN check
# that follows.
for axis in ['x', 'y', 'z']:
    axis_cols = [f'{axis}{k}' for k in range(1, 101)]
    df[axis_cols] = df[axis_cols].bfill(axis=1)


In [12]:
# make sure all nans are removed: counts rows that still contain at least one NaN
df.isna().any(axis=1).sum()

0

In [13]:
# `clean_df` is a second name for the same DataFrame object as `df` (no copy is made).
# Write it to CSV without the index column.
clean_df = df
clean_df.to_csv('clean_data.csv', index=False)

In [14]:
# Reload the cleaned table from disk into `df_aug`.
df_aug = pd.read_csv('clean_data.csv')

In [15]:
# Preview the reloaded frame to confirm the CSV round trip kept the layout.
df_aug.head()

Unnamed: 0,experiment,person,attempt,x1,x2,x3,x4,x5,x6,x7,...,z93,z94,z95,z96,z97,z98,z99,z100,d,obstacle
0,1,1,1,0.318606,0.29345,0.258842,0.223056,0.202482,0.163079,0.11737,...,22.866134,22.648898,22.603012,22.593056,22.564686,22.559457,22.568451,22.582079,15.0,S
1,1,1,2,-0.235394,-0.280031,-0.294954,-0.329996,-0.366376,-0.376393,-0.405935,...,22.666885,22.597741,22.59348,22.569628,22.583162,22.559361,22.561933,22.560902,15.0,S
2,1,1,3,-0.351842,-0.419376,-0.462213,-0.448826,-0.483149,-0.512998,-0.538375,...,22.679918,22.641539,22.629874,22.628294,22.625198,22.643443,22.638235,22.628298,15.0,S
3,1,1,4,-0.025704,-0.080457,-0.151805,-0.172176,-0.216299,-0.254781,-0.350717,...,22.618492,22.619046,22.611242,22.619689,22.614906,22.610536,22.602206,22.610013,15.0,S
4,1,1,5,-0.549467,-0.633789,-0.651556,-0.678097,-0.691002,-0.718224,-0.71172,...,22.685877,22.613237,22.64012,22.671474,22.623875,22.644248,22.656119,22.639856,15.0,S


In [16]:
# Total number of missing cells in the reloaded frame. Uses the pandas
# reductions directly rather than np.sum on a DataFrame, which routes through
# the deprecated DataFrame.sum(axis=None) path in recent pandas versions.
df_aug.isna().sum().sum()

0

In [17]:
# Calculate curve length
# For every row, sum the Euclidean distances between consecutive (x, y, z)
# samples 1..100. Coordinates are read from `df_aug` (the frame the result is
# attached to) rather than from the earlier `df`, so this cell works when the
# notebook is resumed from 'clean_data.csv'.
coords = np.stack(
    [df_aug[[f'{axis}{k}' for k in range(1, 101)]].to_numpy() for axis in ('x', 'y', 'z')],
    axis=-1,
)  # shape: (rows, 100, 3)

# Differences between neighbouring samples -> (rows, 99, 3); the norm over the
# last axis gives each segment's length, and the row sum is the path length.
segment_lengths = np.linalg.norm(np.diff(coords, axis=1), axis=-1)
distances = segment_lengths.sum(axis=1).tolist()

In [18]:
# Attach the per-row curve lengths held in `distances` as a new column.
df_aug['curve_length'] = distances

In [19]:
# Preview the frame with the new `curve_length` column appended at the end.
df_aug.head()

Unnamed: 0,experiment,person,attempt,x1,x2,x3,x4,x5,x6,x7,...,z94,z95,z96,z97,z98,z99,z100,d,obstacle,curve_length
0,1,1,1,0.318606,0.29345,0.258842,0.223056,0.202482,0.163079,0.11737,...,22.648898,22.603012,22.593056,22.564686,22.559457,22.568451,22.582079,15.0,S,94.131094
1,1,1,2,-0.235394,-0.280031,-0.294954,-0.329996,-0.366376,-0.376393,-0.405935,...,22.597741,22.59348,22.569628,22.583162,22.559361,22.561933,22.560902,15.0,S,92.549212
2,1,1,3,-0.351842,-0.419376,-0.462213,-0.448826,-0.483149,-0.512998,-0.538375,...,22.641539,22.629874,22.628294,22.625198,22.643443,22.638235,22.628298,15.0,S,92.079312
3,1,1,4,-0.025704,-0.080457,-0.151805,-0.172176,-0.216299,-0.254781,-0.350717,...,22.619046,22.611242,22.619689,22.614906,22.610536,22.602206,22.610013,15.0,S,93.026979
4,1,1,5,-0.549467,-0.633789,-0.651556,-0.678097,-0.691002,-0.718224,-0.71172,...,22.613237,22.64012,22.671474,22.623875,22.644248,22.656119,22.639856,15.0,S,96.919909


In [24]:
# Reorder the columns so the last three come directly after the first three,
# ahead of everything that was in between.
columns = df_aug.columns.tolist()
leading, middle, trailing = columns[:3], columns[3:-3], columns[-3:]
new_columns = leading + trailing + middle

In [25]:
# Select the columns of `df_aug` in the order given by `new_columns`.
df3 = df_aug[new_columns]

In [26]:
# Preview the reordered frame.
df3.head()

Unnamed: 0,experiment,person,attempt,d,obstacle,curve_length,x1,x2,x3,x4,...,z91,z92,z93,z94,z95,z96,z97,z98,z99,z100
0,1,1,1,15.0,S,94.131094,0.318606,0.29345,0.258842,0.223056,...,23.729541,23.286825,22.866134,22.648898,22.603012,22.593056,22.564686,22.559457,22.568451,22.582079
1,1,1,2,15.0,S,92.549212,-0.235394,-0.280031,-0.294954,-0.329996,...,23.106681,22.835292,22.666885,22.597741,22.59348,22.569628,22.583162,22.559361,22.561933,22.560902
2,1,1,3,15.0,S,92.079312,-0.351842,-0.419376,-0.462213,-0.448826,...,23.369131,22.977192,22.679918,22.641539,22.629874,22.628294,22.625198,22.643443,22.638235,22.628298
3,1,1,4,15.0,S,93.026979,-0.025704,-0.080457,-0.151805,-0.172176,...,22.800929,22.705006,22.618492,22.619046,22.611242,22.619689,22.614906,22.610536,22.602206,22.610013
4,1,1,5,15.0,S,96.919909,-0.549467,-0.633789,-0.651556,-0.678097,...,22.604172,22.677976,22.685877,22.613237,22.64012,22.671474,22.623875,22.644248,22.656119,22.639856


In [27]:
# Write the reordered frame to 'df3.csv' without the index column.
df3.to_csv('df3.csv', index=False)