# Dataset creation without images
Since the time of this project is very limited, we decided not to use the frames to train our model. Instead, we will focus "only" on the MediaPipe and YOLOv7 outputs to create our dataset.

The final dataset will contains, for each frame :
- landmarks of each hands (21 landmarks for both hands = 42 landmarks)
- the hands centers (x and y coordinates of hand center for both hands)
- the coordinates of the closest object ==> [object_class, x, y, width, height, confidence]
- a pixelwise distance between each landmark and the object center position (2x21 landmarks = 42 pixelwise distances)

In [2]:
import numpy as np
import pandas as pd
import glob
import os
import pickle # to save the model
import math
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as pl
from PIL import Image
from tqdm import tqdm

In [3]:
# CONSTANTS
# Path to the all landmarks (2x21) file
file_name = os.path.join(".", "pickle_data", "300_300_AfC_AfP_hands_coordinates_21_landmarks.pickle")
with open(file_name, "rb") as f:
    hands_landmarks = pickle.load(f)

# Path to the hand center file
file_name = os.path.join(".", "pickle_data", "300_300_AfC_AfP_hands_coordinates_center.pickle")
with open(file_name, "rb") as f:
    hands_center = pickle.load(f)

# Path to the object detection results file
# load csv file
objects = pd.read_csv("objects_detected.csv")

# Path to the ground truth file
ground_truth_file = "Matlab_GroundTruth/ground_truth.csv"

# Loading all data
Getting all data ready to create a global dataset

In [4]:
# import ground truth csv file into a pandas dataframe
ground_truth = pd.read_csv(ground_truth_file)
print(ground_truth.shape)
print(ground_truth)

(28826, 11)
                Time  L_Reach  L_Grasp  L_Manipulation  L_Transport  L_Place   
0              0 sec        0        0               0            0        0  \
1      0.0071667 sec        0        0               0            0        0   
2        0.02385 sec        0        0               0            0        0   
3       0.040533 sec        0        0               0            0        0   
4       0.057217 sec        0        0               0            0        0   
...              ...      ...      ...             ...          ...      ...   
28821     480.82 sec        0        0               0            0        1   
28822     480.84 sec        0        0               0            0        1   
28823     480.85 sec        0        0               0            0        1   
28824     480.87 sec        0        0               0            0        1   
28825     480.89 sec        0        0               0            0        1   

       R_Reach  R_Grasp  R_

In [5]:
# drop the first column because time is not needed
ground_truth = ground_truth.drop(columns=['Time'])
print(ground_truth.shape)
print(ground_truth)

(28826, 10)
       L_Reach  L_Grasp  L_Manipulation  L_Transport  L_Place  R_Reach   
0            0        0               0            0        0        0  \
1            0        0               0            0        0        0   
2            0        0               0            0        0        0   
3            0        0               0            0        0        0   
4            0        0               0            0        0        0   
...        ...      ...             ...          ...      ...      ...   
28821        0        0               0            0        1        0   
28822        0        0               0            0        1        0   
28823        0        0               0            0        1        0   
28824        0        0               0            0        1        0   
28825        0        0               0            0        1        0   

       R_Grasp  R_Manipulation  R_Transport  R_Place  
0            0               0            0 

In [6]:
print(type(hands_landmarks))
print(type(hands_center))
print(type(objects))
print(type(ground_truth))

<class 'list'>
<class 'list'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [7]:
# convert everything to numpy arrays, except for ground truth --> we don't need it now
hands_landmarks = np.array(hands_landmarks)
hands_center = np.array(hands_center)
objects = np.array(objects)

  hands_landmarks = np.array(hands_landmarks)
  hands_center = np.array(hands_center)


In [8]:
print(type(hands_landmarks))
print(type(hands_center))
print(type(objects))
print(type(ground_truth))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [9]:
# check the shapes of the arrays
print(hands_landmarks.shape)
print(hands_center.shape)
print(objects.shape)
print(ground_truth.shape)

(28826, 3)
(28826, 3)
(28853, 1)
(28826, 10)


In [10]:
# drop the last element of objects to match the shape of the other arrays
objects = objects[:-27]


In [11]:
print(objects.shape)

(28826, 1)


In [12]:
# show 5 elements of each array
print(" ============================================ HANDS LANDMARKS ============================================")
print(hands_landmarks[:5])
print(" ============================================ HANDS CENTER ============================================")
print(hands_center[:5])
print(" ============================================ OBJECTS ============================================")
print(objects[100:105]) # there is no objects detected in the first 5 frames so I show the 100th to 105th elements

[[0
  list([[203.678, 215.0048, 2.0850416149187367e-07], [201.9033, 205.3035, 0.004473347682505846], [196.5171, 195.1516, 0.006036499049514532], [191.7402, 188.7383, 0.005319010931998491], [188.219, 183.4591, 0.0038206814788281918], [195.5639, 194.5877, 0.002629668451845646], [189.9175, 183.8406, -0.0069520906545221806], [185.9305, 175.2332, -0.014755076728761196], [182.4908, 168.4838, -0.01992180198431015], [194.1577, 194.4168, -0.005564892198890448], [187.8342, 181.7611, -0.014680745080113411], [183.0777, 172.2711, -0.02269960567355156], [178.5262, 163.9938, -0.028351115062832832], [192.2044, 195.2947, -0.014078854583203793], [186.5482, 182.9543, -0.02173532545566559], [183.351, 173.7823, -0.027611026540398598], [180.3378, 166.1863, -0.03172048181295395], [189.8526, 197.0832, -0.022334376350045204], [186.6848, 187.9192, -0.027938470244407654], [184.999, 181.1177, -0.030382338911294937], [183.4057, 175.4153, -0.03252025693655014]])
  None]
 [1
  list([[210.2895, 194.1755, -2.314900768

# Objects selection
The list of detected object has already been filtred to keep only "interesting" ones.

Anyway, we want to keep only one object per frame : the one which is the closest to the hand.

To do so, we need to calculate pixelwise distance between the object center position and the hands center.

In [13]:
# since the detected objects are stored in a list of strings, we need to convert them to a list of lists
# each list contains the detected objects in a frame
# we can't split on "," because some objects have "," in their coordinates
# so we will split on "]," since it is the first character of each object coordinates list

frame_with_multiple_objects = []

# check if the frame contains multiple objects (if so, the string will be bigger than 60 characters)
for i in range(len(objects)):

    if len(objects[i][0]) > 60 and len(objects[i][0]) < 130: # contains exactly 2 objects
        # we need to remove the first and last characters of the string which are "[" and "]" to avoid problems when splitting
        objects[i] = objects[i][0][1:-1] # remove the first and last characters
        splitted_objects = objects[i][0].split("],")
        splitted_objects[0] += "]" # add "]" to the end of the first element
        splitted_objects = [i] + splitted_objects # add the frame number to the beginning of the list
        frame_with_multiple_objects.append(splitted_objects)

    elif len(objects[i][0]) > 130: # contains 3 objects
        objects[i] = objects[i][0][1:-1] # remove the first and last characters
        splitted_objects = objects[i][0].split("],")
        splitted_objects[0] += "]" # add "]" to the end of the first element
        splitted_objects[1] += "]" # add "]" to the end of the second element
        splitted_objects = [i] + splitted_objects # add the frame number to the beginning of the list
        frame_with_multiple_objects.append(splitted_objects)

In [14]:
print("There are {} frames with multiple objects".format(len(frame_with_multiple_objects)))
print(frame_with_multiple_objects[802])

There are 3569 frames with multiple objects
[11325, "['45', 267.8907, 159.1407, 52.9689, 42.6564, '0.818359']", " ['72', 87.1875, 101.4843, 67.5, 71.7189, '0.824707']", " ['43', 285.4689, 203.4375, 12.1875, 30.0, '0.853516']"]


In [15]:
# convert the list of lists to a numpy array
frame_with_multiple_objects = np.array(frame_with_multiple_objects)
print(frame_with_multiple_objects.shape)
print(frame_with_multiple_objects[0])

(3569,)
[464, "['50', 189.6093, 175.7814, 35.1564, 32.8125, '0.842285']", " ['72', 208.5936, 137.5782, 181.875, 142.9686, '0.880371']"]


  frame_with_multiple_objects = np.array(frame_with_multiple_objects)


In [16]:
# transform the string to a list of floats
def evaluate(element):
    if isinstance(element, str):
        return eval(element)
    else:
        return element

In [17]:
# convert the objects coordinates to a list of lists of floats instead of strings

for i in tqdm(range(len(objects))):
    for j in range(len(objects[i])):
        if len(objects[i][j]) > 0: # if the frame contains objects
            objects[i][j] = evaluate(objects[i][j])

100%|██████████| 28826/28826 [00:01<00:00, 22532.66it/s]


In [18]:
print(objects[1000][0][0][1])
print(type(objects[1000][0][0][1]))

96.5625
<class 'float'>


In [19]:
# convert the objects coordinates to a list of lists of floats instead of strings
for i in tqdm(range(len(frame_with_multiple_objects))):
    for j in range(len(frame_with_multiple_objects[i])):
        if j != 0:
            # transform the string to a list of floats and add it to the list
            frame_with_multiple_objects[i][j] = evaluate(frame_with_multiple_objects[i][j])

100%|██████████| 3569/3569 [00:00<00:00, 18595.41it/s]


In [20]:
print(frame_with_multiple_objects[0][1])
print(type(frame_with_multiple_objects[0][1]))


['50', 189.6093, 175.7814, 35.1564, 32.8125, '0.842285']
<class 'list'>


In [21]:
def compute_euclidian_d(x1, y1, x2, y2):
    return math.sqrt((float(x2)-float(x1))**2 + (float(y2)-float(y1))**2)

In [22]:
a = compute_euclidian_d(0, 0, 300, 300)
print(a)

424.26406871192853


In [23]:
# for each frame with multiple objects, check wich object is the closest to the hand center and keep only that object
# we will use the euclidean distance to find the closest object

# create a list to store the closest object for each frame
closest_object = []

# initiate obj coordinates
obj_1_x_coord = None
obj_1_y_coord = None
obj_2_x_coord = None
obj_2_y_coord = None
obj_3_x_coord = None
obj_3_y_coord = None

# initiate hands center coordinates
x1_center = None
y1_center = None
x2_center = None
y2_center = None

# loop through each frame with multiple objects
for i in range(len(frame_with_multiple_objects)):
    # get the frame number
    frame_number = frame_with_multiple_objects[i][0]
    print(frame_number)

    # get the first hand center coordinates of the frame
    first_hand_center = hands_center[frame_number][1]
    if first_hand_center is not None:
        x1_center = first_hand_center[0]
        y1_center = first_hand_center[1]

    # get the second hand center coordinates of the frame
    second_hand_center = hands_center[frame_number][2]
    if second_hand_center is not None:
        x2_center = second_hand_center[0]
        y2_center = second_hand_center[1]

    # get the objects coordinates of the frame (in this array, a frame can have 2 or 3 detected objects) 
    nb_obj_detected = (len(frame_with_multiple_objects[i]))
    
    if nb_obj_detected == (2 + 1): # +1 because the first element of the list is the frame number
        obj_1_x_coord = frame_with_multiple_objects[i][1][1]
        obj_1_y_coord = frame_with_multiple_objects[i][1][2]
        obj_2_x_coord = frame_with_multiple_objects[i][2][1]
        obj_2_y_coord = frame_with_multiple_objects[i][2][2]
    elif nb_obj_detected == (3 + 1): # +1 because the first element of the list is the frame number
        obj_1_x_coord = frame_with_multiple_objects[i][1][1]
        obj_1_y_coord = frame_with_multiple_objects[i][1][2]
        obj_2_x_coord = frame_with_multiple_objects[i][2][1]
        obj_2_y_coord = frame_with_multiple_objects[i][2][2]
        obj_3_x_coord = frame_with_multiple_objects[i][3][1]
        obj_3_y_coord = frame_with_multiple_objects[i][3][2]

    # if we have the object coordinates
    if (obj_1_x_coord is not None and obj_1_y_coord is not None) and (obj_2_x_coord is not None and obj_2_y_coord is not None):

        # if we have the coordinates of both hand centers
        if (x1_center is not None and y1_center is not None) and (x2_center is not None and y2_center is not None):
            euclidean_distance_both_hands = []

            # calculate the euclidean distance between the first hand center and each object
            euclidean_distance_both_hands.append(compute_euclidian_d(x1_center, y1_center, obj_1_x_coord, obj_1_y_coord)) # 1st obj with 1st hand
            euclidean_distance_both_hands.append(compute_euclidian_d(x2_center, y2_center, obj_1_x_coord, obj_1_y_coord)) # 1st obj with 2nd hand
            euclidean_distance_both_hands.append(compute_euclidian_d(x1_center, y1_center, obj_2_x_coord, obj_2_y_coord)) # 2nd obj with 1st hand
            euclidean_distance_both_hands.append(compute_euclidian_d(x2_center, y2_center, obj_2_x_coord, obj_2_y_coord)) # 2nd obj with 2nd hand
            if nb_obj_detected == 3 + 1:
                euclidean_distance_both_hands.append(compute_euclidian_d(x1_center, y1_center, obj_3_x_coord, obj_3_y_coord)) # 3rd obj with 1st hand
                euclidean_distance_both_hands.append(compute_euclidian_d(x2_center, y2_center, obj_3_x_coord, obj_3_y_coord)) # 3rd obj with 2nd hand

            # calculate the mean euclidean distance between each object and the two hands
            mean_euclidean_distance_both_hands = []
            mean_euclidean_distance_both_hands.append((euclidean_distance_both_hands[0]+euclidean_distance_both_hands[1])/2) # 1st obj
            mean_euclidean_distance_both_hands.append((euclidean_distance_both_hands[2]+euclidean_distance_both_hands[3])/2) # 2nd obj
            if nb_obj_detected == 3 + 1:
                mean_euclidean_distance_both_hands.append((euclidean_distance_both_hands[4]+euclidean_distance_both_hands[5])/2) # 3rd obj

            # find the closest object to the two hands
            closest_object_both_hands = mean_euclidean_distance_both_hands.index(min(mean_euclidean_distance_both_hands))

            # add the frame number
            closest_object_both_hands = [frame_number, closest_object_both_hands]

            # store the closest object to the two hands
            closest_object.append(closest_object_both_hands)

        # if we have the coordinates of only one hand center (it must be the first hand center)
        elif x1_center is not None and y1_center is not None:
            euclidean_distance_one_hand = []

            # calculate the euclidean distance between the first hand center and each object
            euclidean_distance_one_hand.append(compute_euclidian_d(x1_center, y1_center, obj_1_x_coord, obj_1_y_coord)) # 1st obj
            euclidean_distance_one_hand.append(compute_euclidian_d(x1_center, y1_center, obj_2_x_coord, obj_2_y_coord)) # 2nd obj
            if nb_obj_detected == 3 + 1:
                euclidean_distance_one_hand.append(compute_euclidian_d(x1_center, y1_center, obj_3_x_coord, obj_3_y_coord)) # 3rd obj

            # find the closest object to the first hand
            closest_object_one_hand = euclidean_distance_one_hand.index(min(euclidean_distance_one_hand))

            # add the frame number
            closest_object_one_hand = [frame_number, closest_object_one_hand]

            # store the closest object to the first hand
            closest_object.append(closest_object_one_hand)

print(len(closest_object))
print(closest_object)

464
465
466
492
493
494
506
507
897
898
899
1158
1163
1164
1165
1166
1167
1168
1169
1170
1171
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1214
1215
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1249
1250
1252
1253
1255
1265
1266
1267
1268
1269
1275
1290
1291
1292
1293
1294
1295
1296
1297
1298
1336
1337
1342
1343
1344
1345
1348
1349
1350
1353
1354
1355
1356
1357
1358
1359
1360
1361
1376
1442
1443
1447
1449
1450
1451
1452
1453
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
3414
3417
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3

In [24]:
# check which object is the closest to the two hands, just by curiosity

count_1st_obj = 0
count_2nd_obj = 0
count_3rd_obj = 0

for i in range(len(closest_object)):
    if closest_object[i][1] == 0:
        count_1st_obj += 1
    elif closest_object[i][1] == 1:
        count_2nd_obj += 1
    elif closest_object[i][1] == 2:
        count_3rd_obj += 1

print("1st obj: ", count_1st_obj)
print("2nd obj: ", count_2nd_obj)
print("3rd obj: ", count_3rd_obj)
print("total: ", count_1st_obj+count_2nd_obj+count_3rd_obj)

1st obj:  2205
2nd obj:  1331
3rd obj:  33
total:  3569


Now, we know which objects are the closest one to the hands (for the frame with multiples objects detected).

In [25]:
print(objects[11326])
print(len(objects[11326][0]),objects[11326][0])

[(['43', 288.0468, 205.3125, 11.7188, 28.125, '0.811035'], ['72', 89.2968, 101.4843, 66.0939, 71.7189, '0.813965'])]
2 (['43', 288.0468, 205.3125, 11.7188, 28.125, '0.811035'], ['72', 89.2968, 101.4843, 66.0939, 71.7189, '0.813965'])


In [26]:
# Store the closest object to the two hands and erase the other objects detected
for i in tqdm(range(len(frame_with_multiple_objects))):
    
    # get the frame number
    frame_number = closest_object[i][0]

    # remove all the previous objects detected
    objects[frame_number][0] = tuple()

    # convert the tuple into a list to be able to modify it
    objects[frame_number][0] = list(objects[frame_number][0]) 

    # store the closest object to the two hands
    if closest_object[i][1] == 0:
        objects[frame_number][0].append(frame_with_multiple_objects[i][0])

    elif closest_object[i][1] == 1:
        objects[frame_number][0].append(frame_with_multiple_objects[i][1])

    elif closest_object[i][1] == 2:
        objects[frame_number][0].append(frame_with_multiple_objects[i][2])

100%|██████████| 3569/3569 [00:00<00:00, 256291.45it/s]


In [27]:
# Verify that there is only one object detected per frame
for i in tqdm(range(len(objects))):
    for j in range(len(objects[i])):
        if len(objects[i][j]) > 1:
            print("ERROR : frame number: ", i, "has more than one object detected")


100%|██████████| 28826/28826 [00:00<00:00, 558174.25it/s]


In [28]:
print(objects[11326])
print(len(objects[11326][0]),objects[11326][0])

[list([['43', 288.0468, 205.3125, 11.7188, 28.125, '0.811035']])]
1 [['43', 288.0468, 205.3125, 11.7188, 28.125, '0.811035']]


# Remove frames additional number

In [29]:
# remove the frames number in hands_landmarks and hands_center
hands_landmarks = [i[1:] for i in hands_landmarks]
hands_center = [i[1:] for i in hands_center]

# Replace "None" value by vector full of zero
None values could be misinterpreted by the futur model, it would be better to set them to 0 and to keep an identical shape

## Replace None values by vector full of 0 values in hands_center

In [30]:
print(hands_center[5])
print("=====================================")
print(hands_center[6])
print("=====================================")
print(hands_center[100])

[None None]
[list([245.9678828716278, 186.20791912078857]) None]
[list([158.8541579246521, 181.53979897499084])
 list([227.0016610622406, 209.63350653648376])]


In [31]:
# create a list of 2 0 values
zeros = np.zeros((2))
zeros = zeros.tolist()

for i in range(len(hands_center)):
    for j in range(len(hands_center[i])):
        if hands_center[i][j] is None:
            hands_center[i][j] = zeros

In [32]:
print(hands_center[5])
print("=====================================")
print(hands_center[6])
print("=====================================")
print(hands_center[100])

[list([0.0, 0.0]) list([0.0, 0.0])]
[list([245.9678828716278, 186.20791912078857]) list([0.0, 0.0])]
[list([158.8541579246521, 181.53979897499084])
 list([227.0016610622406, 209.63350653648376])]


## Replace empty values by vector full of 0 values in objects

In [33]:
print(objects[0]) # this should be replaced by zeros
print(objects[78]) # idk where this 'b' comes from but it needs to be replaced by a 39
print(objects[100]) # the only correct value
print(objects[465]) # this should be replaced by zeros

[list([])]
[list([['39', 11.9531, 113.6718, 17.3438, 25.7812, '0.819336']])]
[list([['39', 34.2189, 110.625, 13.125, 24.375, '0.85791']])]
[list([465])]


In [34]:
# Also, remove the confidence value of the object detection (not needed anymore)
# And change the 'b' value of the object_class of the first object detected by 39 

# create a list of 5 0 values
zeros = np.zeros((5))
zeros = zeros.tolist()

for i in range(len(objects)):
    for j in range(len(objects[i])):
        if objects[i][j] == []:
            objects[i][j] = [zeros]
        else:
            for k in range(len(objects[i][j])):
                if not isinstance(objects[i][j][k], list):
                    objects[i][j] = [zeros]
                else:
                    for l in range(len(objects[i][j][k])):
                        if objects[i][j][k][l] == 'b':
                            objects[i][j][k][l] = 39

                        # convert all string value into float value
                        if isinstance(objects[i][j][k][l], str):
                            objects[i][j][k][l] = float(objects[i][j][k][l])

                        # delete the confidence value of the object detection
                        if l == 5:
                            objects[i][j][k].pop(l)

In [35]:
print(objects[0]) # this should be replaced by zeros
print(objects[78]) # idk where this 'b' comes from but it needs to be replaced by a 39
print(objects[100]) # the only correct value
print(objects[465]) # this should be replaced by zeros

[list([[0.0, 0.0, 0.0, 0.0, 0.0]])]
[list([[39.0, 11.9531, 113.6718, 17.3438, 25.7812]])]
[list([[39.0, 34.2189, 110.625, 13.125, 24.375]])]
[list([[0.0, 0.0, 0.0, 0.0, 0.0]])]


## Replace None values by vector full of 0 values in hands_landmarks

In [36]:
print(hands_landmarks[5])
print("=====================================")
print(hands_landmarks[6])
print("=====================================")
print(hands_landmarks[100])

[None None]
[list([[268.2697, 208.0679, -1.1411800926452997e-07], [252.2939, 207.6888, 0.007134987972676754], [237.4896, 200.1762, 0.015723353251814842], [227.4309, 194.35, 0.018801052123308182], [219.2556, 190.2804, 0.02206707000732422], [234.9532, 186.6881, 0.037710245698690414], [222.0792, 179.9712, 0.03476671129465103], [212.066, 177.2227, 0.02700057066977024], [204.9813, 175.884, 0.020737046375870705], [237.6811, 182.2558, 0.02590193599462509], [223.978, 175.1319, 0.02283995784819126], [212.9706, 172.2056, 0.012809130363166332], [204.5574, 171.4614, 0.005743152461946011], [241.7076, 178.5331, 0.010987917892634869], [229.3826, 170.0865, 0.00432335352525115], [220.1773, 167.3723, -0.002652241848409176], [213.0593, 166.4911, -0.0066759311594069], [247.2278, 175.4948, -0.004696916323155165], [238.2604, 166.3414, -0.011616628617048264], [231.4839, 164.1376, -0.01298348419368267], [226.1307, 164.752, -0.013128921389579773]])
 None]
[list([[156.4496, 203.1393, -5.24644470090152e-08], [17

In [37]:
# create a list of 3*21 0 values
zeros = np.zeros((21, 3))
zeros = zeros.tolist()

for i in range(len(hands_landmarks)):
    for j in range(len(hands_landmarks[i])):
        if hands_landmarks[i][j] is None:
            hands_landmarks[i][j] = zeros

In [38]:
print(hands_landmarks[5])
print("=====================================")
print(hands_landmarks[6])
print("=====================================")
print(hands_landmarks[100])

[list([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
 list([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])]
[list([[268.2697, 208.0679, -1.1411800926452997e-07], [252.2939, 207.6888, 0.007134987972676754], [237.4896, 200.1762, 0.015723353251814842], [227.4309, 194.35, 0.018801052123308182], [219.2556, 190.2804, 0.02206707000732422], [234.9532, 186.6881, 0.037710245698690414]

## Replace 0.0 values by "phantom" values
(I still want to keep the 0.0 vector values transformation because its ensure me that each row as the same structure)

It seems that replacing void or None value by 0.0 is not the correct thing to do, for several reasons.

    1. The model will probably not understand no object is detected in the frame, I will probably understand that there is an object and this object is at the position (0.0 ; 0.0).
    2. With zeros values, we can't compute the distance between hands and object

The "phantom" value of an inexistant object is the coordinate which is the furthest away from the hands. In other words, if we don't detect any object, we will simulate that we detect one but very far away from the hands, this way the model should understand that there is no interaction between the hands and the object.

Naturally, the furthest point from a hand depends on the position of the hand. But in all cases, it should be in one of the corner.
The image format use for the hands detection is 300x300, same for the object detection (it was 640x640 but I have adapted the distances to match the hands pixels one). So we have a square of 300x300 pixels, which can be divided in 4 quadrants :

    - A = top left
    - B = top right
    - C = bottom left
    - D = bottom right

If the hand center is in the quandrant A, the furthest point of the hand is the very bottom right corner. In other words, if the x and y values of the center of the hand are respectively (X < 150 && Y > 150) => phantom value = (0, 300) = very bottom right corner. (same logic for the other quadrant)

In [39]:
# count number of frame with at least one hand detected but no object detected
counter_A = 0
counter_B = 0
for i in range(len(objects)):
    if sum(hands_center[i][0][0:2]) > 0 and sum(objects[i][0][0][0:4]) == 0:
        counter_A += 1
    elif sum(hands_center[i][0][0:2]) == 0 and sum(objects[i][0][0][0:4]) == 0:
        counter_B += 1

print("There is ", counter_A, " frames with at least one hand detected but no object detected")
print("There is ", counter_B, " frames with no hand detected and no object detected")

There is  7676  frames with at least one hand detected but no object detected
There is  2312  frames with no hand detected and no object detected


In [40]:
# PHANTOM CONSTANTS
# if hand is in quandrant A, then the object is in quadrant D (bottom right)
phantom_x_A = 300
phantom_y_A = 0
# if hand is in quandrant B, then the object is in quadrant C (bottom left)
phantom_x_B = 0
phantom_y_B = 0
# if hand is in quandrant C, then the object is in quadrant B (top left)
phantom_x_C = 300
phantom_y_C = 300
# if hand is in quandrant D, then the object is in quadrant A (top right)
phantom_x_D = 0
phantom_y_D = 300

# pixel value of the center of the screen
screen_center = 150

# in YOLOv7, the 4th class is the 'airplane' object class, 
#since I am sure that there is no airplane in the video, I will use this class to add the phantom object
phantom_class = 4 

# phantom object size (in pixel) is 1x1 (since we don't know the size of the object)
phantom_l, phantom_w = 1, 1

# first we need to check if one hand is detected AND the object is missing
# if so, we need to add the phantom object
for i in range(len(objects)):
    if sum(hands_center[i][0][0:2]) > 0 and sum(objects[i][0][0][0:4]) == 0:
        # if the hand is in the top left corner (quad A)
        if hands_center[i][0][0] < screen_center and hands_center[i][0][1] > screen_center:
            objects[i] = [[[phantom_class, phantom_x_A, phantom_y_A, phantom_l, phantom_w]]]
        # if the hand is in the top right corner (quad B)
        elif hands_center[i][0][0] > screen_center and hands_center[i][0][1] > screen_center:
            objects[i] = [[[phantom_class, phantom_x_B, phantom_y_B, phantom_l, phantom_w]]]
        # if the hand is in the bottom left corner (quad C)
        elif hands_center[i][0][0] < screen_center and hands_center[i][0][1] < screen_center:
            objects[i] = [[[phantom_class, phantom_x_C, phantom_y_C, phantom_l, phantom_w]]]
        # if the hand is in the bottom right corner (quad D)
        elif hands_center[i][0][0] > screen_center and hands_center[i][0][1] < screen_center:
            objects[i] = [[[phantom_class, phantom_x_D, phantom_y_D, phantom_l, phantom_w]]]

    # if no hand is detected AND the object is missing the hand center value is set to (0,0) = quadrant C, 
    # so we need to add the phantom object in quadrant B
    elif sum(hands_center[i][0][0:2]) == 0 and sum(objects[i][0][0][0:4]) == 0:
        objects[i] = [[[phantom_class, phantom_x_C, phantom_y_C, phantom_l, phantom_w]]]

In [41]:
print(objects[:10])

[[list([[4, 0, 0, 1, 1]])]
 [list([[4, 0, 0, 1, 1]])]
 [list([[4, 300, 300, 1, 1]])]
 [list([[4, 0, 0, 1, 1]])]
 [list([[4, 0, 0, 1, 1]])]
 [list([[4, 300, 300, 1, 1]])]
 [list([[4, 0, 0, 1, 1]])]
 [list([[4, 0, 0, 1, 1]])]
 [list([[4, 300, 300, 1, 1]])]
 [list([[4, 0, 300, 1, 1]])]]


### Every single frame contains now a real object or a "phantom" one

# Calculate distances between landmarks and objects
Since we know which object is the closest one to then hand, we can calculate the exact distance between the hands coordinates and the object position (to better understand the interaction between hands and object).

In other words, we want to calculate and store the euclidian distance between each landmarks hand the object detected on every frame.

In [42]:
# check if each frame has a hand detected
#zero_hand_counter = 0
#one_hand_counter = 0
#two_hands_counter = 0

#for i in range(len(hands_landmarks)):
#    if (hands_landmarks[i][0] is None) and (hands_landmarks[i][1] is None):
#        zero_hand_counter += 1
#    elif (hands_landmarks[i][0] is not None) and (hands_landmarks[i][1] is None):
#        one_hand_counter += 1
#    elif (hands_landmarks[i][0] is not None) and (hands_landmarks[i][1] is not None):
#        two_hands_counter += 1
#    elif (hands_landmarks[i][0] is None) and (hands_landmarks[i][1] is not None):
#        print("ERROR : hands 2 should not be detected if hands 1 is not detected")

#print("zero_hand_counter: ", zero_hand_counter)
#print("one_hand_counter: ", one_hand_counter)
#print("two_hands_counter: ", two_hands_counter)
#print("total with at least one hand: ",one_hand_counter+two_hands_counter)

In [43]:
print(hands_landmarks[5])
hand_x = hands_landmarks[5][0][0][0]
hand_y = hands_landmarks[5][0][0][1]

print(objects[5])
obj_x = objects[5][0][0][1]
obj_y = objects[5][0][0][2]

print("These are the value to compute : ", hand_x, obj_x, hand_y, obj_y)
print(compute_euclidian_d(obj_x, obj_y, hand_x, hand_y))

[list([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
 list([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])]
[list([[4, 300, 300, 1, 1]])]
These are the value to compute :  0.0 300 0.0 300
424.26406871192853


In [44]:
# compute the euclidean distance between all the landmarks and the object

all_distance_hands_obj = []
obj_counter = 0

for i in tqdm(range(len(objects))):

    distance_hands_obj = []

    # get the coordinates of the object
    obj_x_coord = objects[i][0][0][1]
    obj_y_coord = objects[i][0][0][2]

    # list of first and second hand coordinates
    hand_1_x_coord = []
    hand_1_y_coord = []
    hand_2_x_coord = []
    hand_2_y_coord = []

    # boolean value to skip further computation no hand is detected
    no_hand_detected = False

    first_hand_distance_hands_obj = []
    second_hand_distance_hands_obj = []

    for landmark_1 in range(len(hands_landmarks[i][0])):
        hand_1_x_coord.append(hands_landmarks[i][0][landmark_1][0]) # save x coord of each landmark
        hand_1_y_coord.append(hands_landmarks[i][0][landmark_1][1]) # save y coord of each landmark

        # compute the euclidean distance between each landmark of first hand and the object
    for k in range(len(hand_1_x_coord)):
        first_hand_distance_hands_obj.append(compute_euclidian_d(hand_1_x_coord[k], hand_1_y_coord[k], obj_x_coord, obj_y_coord))

        
    for landmark_2 in range(len(hands_landmarks[i][1])):
        hand_2_x_coord.append(hands_landmarks[i][1][landmark_2][0])
        hand_2_y_coord.append(hands_landmarks[i][1][landmark_2][1])

    # compute the euclidean distance between each landmark of second hand and the object
    for l in range(len(hand_2_x_coord)):
        second_hand_distance_hands_obj.append(compute_euclidian_d(hand_2_x_coord[l], hand_2_y_coord[l], obj_x_coord, obj_y_coord))

    distance_hands_obj.append([first_hand_distance_hands_obj])
    distance_hands_obj.append([second_hand_distance_hands_obj])
        
    all_distance_hands_obj.append(distance_hands_obj)

100%|██████████| 28826/28826 [00:03<00:00, 8989.66it/s] 


In [85]:
print(hands_landmarks[0])
print(objects[0])

print(all_distance_hands_obj[0])

[list([[203.678, 215.0048, 2.0850416149187367e-07], [201.9033, 205.3035, 0.004473347682505846], [196.5171, 195.1516, 0.006036499049514532], [191.7402, 188.7383, 0.005319010931998491], [188.219, 183.4591, 0.0038206814788281918], [195.5639, 194.5877, 0.002629668451845646], [189.9175, 183.8406, -0.0069520906545221806], [185.9305, 175.2332, -0.014755076728761196], [182.4908, 168.4838, -0.01992180198431015], [194.1577, 194.4168, -0.005564892198890448], [187.8342, 181.7611, -0.014680745080113411], [183.0777, 172.2711, -0.02269960567355156], [178.5262, 163.9938, -0.028351115062832832], [192.2044, 195.2947, -0.014078854583203793], [186.5482, 182.9543, -0.02173532545566559], [183.351, 173.7823, -0.027611026540398598], [180.3378, 166.1863, -0.03172048181295395], [189.8526, 197.0832, -0.022334376350045204], [186.6848, 187.9192, -0.027938470244407654], [184.999, 181.1177, -0.030382338911294937], [183.4057, 175.4153, -0.03252025693655014]])
 list([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], 

## Replace None values by vector full of 0 values in all_distance_hands_obj

In [47]:
# convert all_distance_hands_obj into a numpy array
print(type(all_distance_hands_obj))
print(type(all_distance_hands_obj[0]))

all_distance_hands_obj = np.array(all_distance_hands_obj)

for i in tqdm(range(len(all_distance_hands_obj))):
        all_distance_hands_obj[i] = np.array(all_distance_hands_obj[i])
        
print(type(all_distance_hands_obj))
print(type(all_distance_hands_obj[0]))

<class 'list'>
<class 'list'>


100%|██████████| 28826/28826 [00:00<00:00, 343187.64it/s]

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>





In [48]:
print(all_distance_hands_obj[0])
print("=====================================")
print(all_distance_hands_obj[101])
print("=====================================")
print(all_distance_hands_obj[100])

[[[296.16176611 287.94872749 276.95327688 269.04730101 262.83765585
   275.87970562 264.32181714 255.49329778 248.37407865 274.76372503
   261.37862224 251.38571187 242.41611028 274.01195453 261.29008173
   252.62200418 245.23378319 273.65269494 264.88646677 258.89814842
   253.78766374]]

 [[  0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.        ]]]
[[[161.36944332 151.86903297 143.07825996 138.39222847 135.04675649
   145.34030276 141.3307551  139.63442375 138.35876792 150.08369788
   144.34560236 142.79132095 143.59978858 154.58435186 148.26192968
   146.9521194  147.79517981 158.37655556 152.93494241 151.88182172
   152.82609893]]

 [[116.14814795 116.14814795 116.14814795 116.14814795 116.14814795
   116.14814795 116.14814795 116.14814795 116.14814795 116.14814795
   116.14814795 116.148

In [49]:
# create a list of 2 * 21 0 values
both_distances_missing = np.zeros((2, 21))
both_distances_missing = both_distances_missing.tolist()

# create a list of 21 0 values
one_distance_missing = np.zeros((21))
one_distance_missing = one_distance_missing.tolist()

for i in range(len(all_distance_hands_obj)):
    for j in range(len(all_distance_hands_obj[i])):
        if all_distance_hands_obj[i][j] is None:
            all_distance_hands_obj[i][j] = both_distances_missing
        else:
            for k in range(len(all_distance_hands_obj[i][j])):
                for l in range(len(all_distance_hands_obj[i][j][k])):
                    if all_distance_hands_obj[i][j][k][l] is None:
                        all_distance_hands_obj[i][j][k] = one_distance_missing


In [50]:
print(all_distance_hands_obj[0])
print("=====================================")
print(all_distance_hands_obj[101])
print("=====================================")
print(all_distance_hands_obj[100])

[[[296.16176611 287.94872749 276.95327688 269.04730101 262.83765585
   275.87970562 264.32181714 255.49329778 248.37407865 274.76372503
   261.37862224 251.38571187 242.41611028 274.01195453 261.29008173
   252.62200418 245.23378319 273.65269494 264.88646677 258.89814842
   253.78766374]]

 [[  0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.           0.           0.           0.           0.
     0.        ]]]
[[[161.36944332 151.86903297 143.07825996 138.39222847 135.04675649
   145.34030276 141.3307551  139.63442375 138.35876792 150.08369788
   144.34560236 142.79132095 143.59978858 154.58435186 148.26192968
   146.9521194  147.79517981 158.37655556 152.93494241 151.88182172
   152.82609893]]

 [[116.14814795 116.14814795 116.14814795 116.14814795 116.14814795
   116.14814795 116.14814795 116.14814795 116.14814795 116.14814795
   116.14814795 116.148

# All data have been created/cleaned => Dataset can be assembled
The distances between all landmarks and the objects have been calculated, it means we have all the data we need to create our dataset.

In [51]:
print("size of the array of landmarks of both hands : ", len(hands_landmarks))
print("size of the array of closest objects : ", len(objects))
print("size of the array of distances between hands and objects : ", len(all_distance_hands_obj))
print("size of the array of hands center : ", len(hands_center))

size of the array of landmarks of both hands :  28826
size of the array of closest objects :  28826
size of the array of distances between hands and objects :  28826
size of the array of hands center :  28826


In [52]:
# check the structure of each array
print("structure of hands_landmarks : ", hands_landmarks[100])
print("=====================================")
print("structure of objects : ", objects[100])
print("=====================================")
print("structure of all_distance_hands_obj : ", all_distance_hands_obj[100])
print("=====================================")
print("structure of hands_center : ", hands_center[100])

structure of hands_landmarks :  [list([[156.4496, 203.1393, -5.24644470090152e-08], [172.7615, 199.2163, 0.00021449675841722637], [181.5464, 192.3535, 0.002985759172588587], [186.0438, 186.4534, 0.005181078799068928], [188.1296, 183.3973, 0.008739686571061611], [168.5563, 174.1744, 0.009405670687556267], [178.5483, 172.27, 0.01178621593862772], [183.0276, 178.2377, 0.008706944063305855], [183.9778, 183.4503, 0.00658069085329771], [160.8451, 173.8956, 0.009582112543284893], [171.6887, 171.7744, 0.015201371163129807], [176.9907, 176.6825, 0.011294025927782059], [178.1609, 182.1347, 0.006742431316524744], [155.6191, 176.5092, 0.010058070532977581], [165.1456, 173.4283, 0.016024170443415642], [170.2548, 177.3312, 0.013728313148021698], [171.458, 182.3851, 0.0104950787499547], [152.8007, 179.9805, 0.010449445806443691], [160.712, 176.3714, 0.015952087938785553], [165.0577, 178.748, 0.019001390784978867], [166.7083, 182.813, 0.020856449380517006]])
 list([[238.2149, 230.1531, -1.722870024423

In [53]:
# create final dataset with all the features
final_dataset = []

for i in tqdm(range(len(hands_landmarks))): # I choose hands_landmarks but it could be any other array since they all have the same size

    clean_hands_landmarks = []
    clean_hands_landmarks.append(hands_landmarks[i])

    clean_objects = []
    if len(objects[i][0]) != 0: 
        if isinstance(objects[i][0][0], list):
            clean_objects.append(objects[i][0][0])

    clean_all_distance_hands_obj = []
    clean_all_distance_hands_obj.append(all_distance_hands_obj[i])

    clean_hands_center = []
    clean_hands_center.append(hands_center[i])

    final_dataset.append([clean_hands_landmarks, clean_objects, clean_all_distance_hands_obj, clean_hands_center])

100%|██████████| 28826/28826 [00:00<00:00, 228080.48it/s]


In [54]:
print(len(final_dataset))
print(final_dataset[100])

28826
[[array([list([[156.4496, 203.1393, -5.24644470090152e-08], [172.7615, 199.2163, 0.00021449675841722637], [181.5464, 192.3535, 0.002985759172588587], [186.0438, 186.4534, 0.005181078799068928], [188.1296, 183.3973, 0.008739686571061611], [168.5563, 174.1744, 0.009405670687556267], [178.5483, 172.27, 0.01178621593862772], [183.0276, 178.2377, 0.008706944063305855], [183.9778, 183.4503, 0.00658069085329771], [160.8451, 173.8956, 0.009582112543284893], [171.6887, 171.7744, 0.015201371163129807], [176.9907, 176.6825, 0.011294025927782059], [178.1609, 182.1347, 0.006742431316524744], [155.6191, 176.5092, 0.010058070532977581], [165.1456, 173.4283, 0.016024170443415642], [170.2548, 177.3312, 0.013728313148021698], [171.458, 182.3851, 0.0104950787499547], [152.8007, 179.9805, 0.010449445806443691], [160.712, 176.3714, 0.015952087938785553], [165.0577, 178.748, 0.019001390784978867], [166.7083, 182.813, 0.020856449380517006]]),
       list([[238.2149, 230.1531, -1.7228700244231732e-07], 

In [55]:
# convert the final dataset into a dataframe
df = pd.DataFrame(final_dataset, columns = ['hands_landmarks', 'objects', 'distance_landmarks_obj', 'hands_center'])
df.head()

Unnamed: 0,hands_landmarks,objects,distance_landmarks_obj,hands_center
0,"[[[[203.678, 215.0048, 2.0850416149187367e-07]...","[[4, 0, 0, 1, 1]]",[[[[296.16176611 287.94872749 276.95327688 269...,"[[[195.09132027626038, 199.27747249603271], [0..."
1,"[[[[210.2895, 194.1755, -2.3149007688516576e-0...","[[4, 0, 0, 1, 1]]",[[[[286.2268307 280.1363043 272.99221645 266...,"[[[208.6511743068695, 179.4220232963562], [0.0..."
2,"[[[[95.7721, 153.131, 1.4491298827579158e-07],...","[[4, 300, 300, 1, 1]]",[[[[251.55424524 245.39239099 249.03576008 252...,"[[[90.08440017700195, 129.03473675251007], [0...."
3,"[[[[166.9457, 171.6965, 9.100789100102702e-08]...","[[4, 0, 0, 1, 1]]",[[[[239.47975877 241.96706373 242.00371242 241...,"[[[168.38631391525269, 161.72376036643982], [2..."
4,"[[[[236.6061, 209.4084, 1.01826573839503e-08],...","[[4, 0, 0, 1, 1]]",[[[[315.96570154 312.99429868 309.4229624 304...,"[[[242.26854801177979, 193.36541175842285], [0..."


In [56]:
# check how many frames have no object detected
nb_frame_no_obj = len(df[df['objects'].map(lambda d: len(d)) == 0])
nb_frame_with_obj = len(df[df['objects'].map(lambda d: len(d)) != 0])

print("number of frames with no object detected : ", nb_frame_no_obj)
print("number of frames with object detected : ", nb_frame_with_obj)

number of frames with no object detected :  0
number of frames with object detected :  28826


In [57]:
print(df.shape)
print(ground_truth.shape)

(28826, 4)
(28826, 10)


In [58]:
# merge df with ground truth to get the label of each frame
merge_df = df.merge(ground_truth, left_index=True, right_index=True)
merge_df.head()

Unnamed: 0,hands_landmarks,objects,distance_landmarks_obj,hands_center,L_Reach,L_Grasp,L_Manipulation,L_Transport,L_Place,R_Reach,R_Grasp,R_Manipulation,R_Transport,R_Place
0,"[[[[203.678, 215.0048, 2.0850416149187367e-07]...","[[4, 0, 0, 1, 1]]",[[[[296.16176611 287.94872749 276.95327688 269...,"[[[195.09132027626038, 199.27747249603271], [0...",0,0,0,0,0,0,0,0,0,0
1,"[[[[210.2895, 194.1755, -2.3149007688516576e-0...","[[4, 0, 0, 1, 1]]",[[[[286.2268307 280.1363043 272.99221645 266...,"[[[208.6511743068695, 179.4220232963562], [0.0...",0,0,0,0,0,0,0,0,0,0
2,"[[[[95.7721, 153.131, 1.4491298827579158e-07],...","[[4, 300, 300, 1, 1]]",[[[[251.55424524 245.39239099 249.03576008 252...,"[[[90.08440017700195, 129.03473675251007], [0....",0,0,0,0,0,0,0,0,0,0
3,"[[[[166.9457, 171.6965, 9.100789100102702e-08]...","[[4, 0, 0, 1, 1]]",[[[[239.47975877 241.96706373 242.00371242 241...,"[[[168.38631391525269, 161.72376036643982], [2...",0,0,0,0,0,0,0,0,0,0
4,"[[[[236.6061, 209.4084, 1.01826573839503e-08],...","[[4, 0, 0, 1, 1]]",[[[[315.96570154 312.99429868 309.4229624 304...,"[[[242.26854801177979, 193.36541175842285], [0...",0,0,0,0,0,0,0,0,0,0


In [59]:
# create a new dataframe with the only the frames with at least one object detected
df_with_obj = merge_df[merge_df['objects'].map(lambda d: len(d)) != 0]
print(df_with_obj.shape)
df_with_obj.head()

(28826, 14)


Unnamed: 0,hands_landmarks,objects,distance_landmarks_obj,hands_center,L_Reach,L_Grasp,L_Manipulation,L_Transport,L_Place,R_Reach,R_Grasp,R_Manipulation,R_Transport,R_Place
0,"[[[[203.678, 215.0048, 2.0850416149187367e-07]...","[[4, 0, 0, 1, 1]]",[[[[296.16176611 287.94872749 276.95327688 269...,"[[[195.09132027626038, 199.27747249603271], [0...",0,0,0,0,0,0,0,0,0,0
1,"[[[[210.2895, 194.1755, -2.3149007688516576e-0...","[[4, 0, 0, 1, 1]]",[[[[286.2268307 280.1363043 272.99221645 266...,"[[[208.6511743068695, 179.4220232963562], [0.0...",0,0,0,0,0,0,0,0,0,0
2,"[[[[95.7721, 153.131, 1.4491298827579158e-07],...","[[4, 300, 300, 1, 1]]",[[[[251.55424524 245.39239099 249.03576008 252...,"[[[90.08440017700195, 129.03473675251007], [0....",0,0,0,0,0,0,0,0,0,0
3,"[[[[166.9457, 171.6965, 9.100789100102702e-08]...","[[4, 0, 0, 1, 1]]",[[[[239.47975877 241.96706373 242.00371242 241...,"[[[168.38631391525269, 161.72376036643982], [2...",0,0,0,0,0,0,0,0,0,0
4,"[[[[236.6061, 209.4084, 1.01826573839503e-08],...","[[4, 0, 0, 1, 1]]",[[[[315.96570154 312.99429868 309.4229624 304...,"[[[242.26854801177979, 193.36541175842285], [0...",0,0,0,0,0,0,0,0,0,0


In [60]:
# dataframe with columns which contain multidimensional arrays (hands_landmarks, objects, distance_landmarks_obj, hands_center) are not readable by the model
# so we need to convert them into a readable format

In [61]:
# check the shape of the dataframe
print(df_with_obj.shape)
df_with_obj.head()

(28826, 14)


Unnamed: 0,hands_landmarks,objects,distance_landmarks_obj,hands_center,L_Reach,L_Grasp,L_Manipulation,L_Transport,L_Place,R_Reach,R_Grasp,R_Manipulation,R_Transport,R_Place
0,"[[[[203.678, 215.0048, 2.0850416149187367e-07]...","[[4, 0, 0, 1, 1]]",[[[[296.16176611 287.94872749 276.95327688 269...,"[[[195.09132027626038, 199.27747249603271], [0...",0,0,0,0,0,0,0,0,0,0
1,"[[[[210.2895, 194.1755, -2.3149007688516576e-0...","[[4, 0, 0, 1, 1]]",[[[[286.2268307 280.1363043 272.99221645 266...,"[[[208.6511743068695, 179.4220232963562], [0.0...",0,0,0,0,0,0,0,0,0,0
2,"[[[[95.7721, 153.131, 1.4491298827579158e-07],...","[[4, 300, 300, 1, 1]]",[[[[251.55424524 245.39239099 249.03576008 252...,"[[[90.08440017700195, 129.03473675251007], [0....",0,0,0,0,0,0,0,0,0,0
3,"[[[[166.9457, 171.6965, 9.100789100102702e-08]...","[[4, 0, 0, 1, 1]]",[[[[239.47975877 241.96706373 242.00371242 241...,"[[[168.38631391525269, 161.72376036643982], [2...",0,0,0,0,0,0,0,0,0,0
4,"[[[[236.6061, 209.4084, 1.01826573839503e-08],...","[[4, 0, 0, 1, 1]]",[[[[315.96570154 312.99429868 309.4229624 304...,"[[[242.26854801177979, 193.36541175842285], [0...",0,0,0,0,0,0,0,0,0,0


In [62]:
print(len(hands_landmarks[100]))
print(len(hands_landmarks[100][1]))

2
21


In [63]:
print((hands_landmarks[100]))
print("=====================================")
print((hands_landmarks[100][0]))
print("=====================================")
print((hands_landmarks[100][0][0]))
print("=====================================")
print(hands_landmarks[100][0][0][0])
print(hands_landmarks[100][0][0][1])
print(hands_landmarks[100][0][0][2])

[list([[156.4496, 203.1393, -5.24644470090152e-08], [172.7615, 199.2163, 0.00021449675841722637], [181.5464, 192.3535, 0.002985759172588587], [186.0438, 186.4534, 0.005181078799068928], [188.1296, 183.3973, 0.008739686571061611], [168.5563, 174.1744, 0.009405670687556267], [178.5483, 172.27, 0.01178621593862772], [183.0276, 178.2377, 0.008706944063305855], [183.9778, 183.4503, 0.00658069085329771], [160.8451, 173.8956, 0.009582112543284893], [171.6887, 171.7744, 0.015201371163129807], [176.9907, 176.6825, 0.011294025927782059], [178.1609, 182.1347, 0.006742431316524744], [155.6191, 176.5092, 0.010058070532977581], [165.1456, 173.4283, 0.016024170443415642], [170.2548, 177.3312, 0.013728313148021698], [171.458, 182.3851, 0.0104950787499547], [152.8007, 179.9805, 0.010449445806443691], [160.712, 176.3714, 0.015952087938785553], [165.0577, 178.748, 0.019001390784978867], [166.7083, 182.813, 0.020856449380517006]])
 list([[238.2149, 230.1531, -1.7228700244231732e-07], [220.9242, 230.17, -0

In [64]:
print(all_distance_hands_obj[100])
print("=====================================")
print(all_distance_hands_obj[100][0])
print("=====================================")
print(all_distance_hands_obj[100][0][0])
print("=====================================")
print(all_distance_hands_obj[100][0][0][0])

[[[153.29461741 164.44594994 168.47830712 169.70782689 170.24779359
   148.61044135 156.94292507 163.4487883  166.52703219 141.55339399
   150.45662176 157.31300068 160.72627837 138.12579907 145.21038277
   151.51067028 154.86795188 137.37477451 142.55908761 147.5111342
   150.87925125]]

 [[236.43463094 221.69771328 206.49681761 198.67213817 196.846348
   203.37638145 194.93111036 193.79714715 195.02742304 209.11130218
   202.84971937 202.86053458 205.92020852 215.08750122 209.38835463
   209.17691318 211.54551948 220.11285719 214.18845034 213.72478601
   215.89946494]]]
[[153.29461741 164.44594994 168.47830712 169.70782689 170.24779359
  148.61044135 156.94292507 163.4487883  166.52703219 141.55339399
  150.45662176 157.31300068 160.72627837 138.12579907 145.21038277
  151.51067028 154.86795188 137.37477451 142.55908761 147.5111342
  150.87925125]]
[153.29461741 164.44594994 168.47830712 169.70782689 170.24779359
 148.61044135 156.94292507 163.4487883  166.52703219 141.55339399
 150.

In [65]:
print(hands_landmarks[100])
print("=====================================")
print(objects[100])
print("=====================================")
print(all_distance_hands_obj[100])
print("=====================================")
print(hands_center[100])

[list([[156.4496, 203.1393, -5.24644470090152e-08], [172.7615, 199.2163, 0.00021449675841722637], [181.5464, 192.3535, 0.002985759172588587], [186.0438, 186.4534, 0.005181078799068928], [188.1296, 183.3973, 0.008739686571061611], [168.5563, 174.1744, 0.009405670687556267], [178.5483, 172.27, 0.01178621593862772], [183.0276, 178.2377, 0.008706944063305855], [183.9778, 183.4503, 0.00658069085329771], [160.8451, 173.8956, 0.009582112543284893], [171.6887, 171.7744, 0.015201371163129807], [176.9907, 176.6825, 0.011294025927782059], [178.1609, 182.1347, 0.006742431316524744], [155.6191, 176.5092, 0.010058070532977581], [165.1456, 173.4283, 0.016024170443415642], [170.2548, 177.3312, 0.013728313148021698], [171.458, 182.3851, 0.0104950787499547], [152.8007, 179.9805, 0.010449445806443691], [160.712, 176.3714, 0.015952087938785553], [165.0577, 178.748, 0.019001390784978867], [166.7083, 182.813, 0.020856449380517006]])
 list([[238.2149, 230.1531, -1.7228700244231732e-07], [220.9242, 230.17, -0

In [66]:
print(hands_center[100][0])   

[158.8541579246521, 181.53979897499084]


In [67]:
# count the number of columns wich should contain the future dataframe
nb_total_columns = 0
nb_columns_hands_landmarks = 0
nb_columns_objects = 0
nb_columns_distance_landmarks_obj = 0
nb_columns_hands_center = 0

for i in range(len(hands_landmarks[100])):
    for j in range(len(hands_landmarks[100][i])):
        for k in range(len(hands_landmarks[100][i][j])):
            nb_columns_hands_landmarks += 1

for i in range(len(objects[100])):
    for j in range(len(objects[100][i])):
        for k in range(len(objects[100][i][j])):
            nb_columns_objects += 1

for i in range(len(all_distance_hands_obj[100])):
    for j in range(len(all_distance_hands_obj[100][i])):
        for k in range(len(all_distance_hands_obj[100][i][j])):
            nb_columns_distance_landmarks_obj += 1

for i in range(len(hands_center[100])):
    for j in range(len(hands_center[100][i])):
        nb_columns_hands_center += 1

nb_total_columns = nb_columns_hands_landmarks + nb_columns_objects + nb_columns_distance_landmarks_obj + nb_columns_hands_center

print("number of columns for hands_landmarks : ", nb_columns_hands_landmarks)
print("number of columns for objects : ", nb_columns_objects)
print("number of columns for distance_landmarks_obj : ", nb_columns_distance_landmarks_obj)
print("number of columns for hands_center : ", nb_columns_hands_center)
print("total number of columns : ", nb_total_columns)

number of columns for hands_landmarks :  126
number of columns for objects :  5
number of columns for distance_landmarks_obj :  42
number of columns for hands_center :  4
total number of columns :  177


In [68]:
# create a final dataset with all the features
# for the model to understand the data, we need to flatten the array
# we also need to add a new column to the dataset to indicate the frame number of the data

final_dataset = []

for i in tqdm(range(len(hands_landmarks))):

    temp = []
    col_names = []

    # loop through the hands center
    for j in range(len(hands_center[i])):
        for k in range(len(hands_center[i][j])):
            if k == 0:
                center_name = "center_{}_X".format(j)
                col_names.append(center_name)
            elif k == 1:
                center_name = "center_{}_Y".format(j)
                col_names.append(center_name)

            val_center = hands_center[i][j][k]
            temp.append(val_center)

    # loop through the objects
    for j in range(len(objects[i])):
        for k in range(len(objects[i][j])):
            for l in range(len(objects[i][j][k])):
                if l == 0: # obj class
                    obj_name = "obj_class"
                    col_names.append(obj_name)
                elif l == 1: # obj center x
                    obj_name = "obj_X"
                    col_names.append(obj_name)
                elif l == 2: # obj center y
                    obj_name = "obj_Y"
                    col_names.append(obj_name)
                elif l == 3: # obj width
                    obj_name = "obj_width"
                    col_names.append(obj_name)
                elif l == 4: # obj height
                    obj_name = "obj_height"
                    col_names.append(obj_name)

                val_obj = objects[i][j][k][l]
                temp.append(val_obj)

    # loop through the hands landmarks
    for j in range(len(hands_landmarks[i])):
        for k in range(len(hands_landmarks[i][j])):
            for l in range(len(hands_landmarks[i][j][k])):
                if l == 0: # x coordinate
                    landmark_name = "landmark_{}_{}_X".format(j, k)
                    col_names.append(landmark_name)
                elif l == 1: # y coordinate
                    landmark_name = "landmark_{}_{}_Y".format(j, k)
                    col_names.append(landmark_name)
                elif l == 2: # z coordinate
                    landmark_name = "landmark_{}_{}_Z".format(j, k)
                    col_names.append(landmark_name)
            
                val_landmarks = hands_landmarks[i][j][k][l]
                temp.append(val_landmarks)

    # loop through the distance between hands and objects
    for j in range(len(all_distance_hands_obj[i])):
        for k in range(len(all_distance_hands_obj[i][j])):
            for l in range(len(all_distance_hands_obj[i][j][k])):
                dist_name = "dist_{}_{}".format(j, l)
                col_names.append(dist_name)

                val_dist = all_distance_hands_obj[i][j][k][l]
                temp.append(val_dist)


    # add the new variables to the final dataset
    final_dataset.append(temp)

100%|██████████| 28826/28826 [00:08<00:00, 3428.22it/s]


In [69]:
print(len(col_names))
print(col_names)

177
['center_0_X', 'center_0_Y', 'center_1_X', 'center_1_Y', 'obj_class', 'obj_X', 'obj_Y', 'obj_width', 'obj_height', 'landmark_0_0_X', 'landmark_0_0_Y', 'landmark_0_0_Z', 'landmark_0_1_X', 'landmark_0_1_Y', 'landmark_0_1_Z', 'landmark_0_2_X', 'landmark_0_2_Y', 'landmark_0_2_Z', 'landmark_0_3_X', 'landmark_0_3_Y', 'landmark_0_3_Z', 'landmark_0_4_X', 'landmark_0_4_Y', 'landmark_0_4_Z', 'landmark_0_5_X', 'landmark_0_5_Y', 'landmark_0_5_Z', 'landmark_0_6_X', 'landmark_0_6_Y', 'landmark_0_6_Z', 'landmark_0_7_X', 'landmark_0_7_Y', 'landmark_0_7_Z', 'landmark_0_8_X', 'landmark_0_8_Y', 'landmark_0_8_Z', 'landmark_0_9_X', 'landmark_0_9_Y', 'landmark_0_9_Z', 'landmark_0_10_X', 'landmark_0_10_Y', 'landmark_0_10_Z', 'landmark_0_11_X', 'landmark_0_11_Y', 'landmark_0_11_Z', 'landmark_0_12_X', 'landmark_0_12_Y', 'landmark_0_12_Z', 'landmark_0_13_X', 'landmark_0_13_Y', 'landmark_0_13_Z', 'landmark_0_14_X', 'landmark_0_14_Y', 'landmark_0_14_Z', 'landmark_0_15_X', 'landmark_0_15_Y', 'landmark_0_15_Z',

In [70]:
# convert final_dataset_v3 to a dataframe
final_df = pd.DataFrame(final_dataset)

In [71]:
print(final_df.shape)
final_df.head()

(28826, 177)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,167,168,169,170,171,172,173,174,175,176
0,195.09132,199.277472,0.0,0.0,4.0,0.0,0.0,1.0,1.0,203.678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,208.651174,179.422023,0.0,0.0,4.0,0.0,0.0,1.0,1.0,210.2895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,90.0844,129.034737,0.0,0.0,4.0,300.0,300.0,1.0,1.0,95.7721,...,424.264069,424.264069,424.264069,424.264069,424.264069,424.264069,424.264069,424.264069,424.264069,424.264069
3,168.386314,161.72376,217.941821,175.335138,4.0,0.0,0.0,1.0,1.0,166.9457,...,273.582723,276.196636,275.466223,271.688934,275.365014,277.632284,275.863013,273.418585,276.209308,278.149293
4,242.268548,193.365412,0.0,0.0,4.0,0.0,0.0,1.0,1.0,236.6061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# add the column names to the dataframe
final_df.columns = col_names

# merge with the ground truth dataframe
final_df = pd.concat([final_df, ground_truth], axis=1)

In [73]:
print(final_df.shape)
final_df.head()

(28826, 187)


Unnamed: 0,center_0_X,center_0_Y,center_1_X,center_1_Y,obj_class,obj_X,obj_Y,obj_width,obj_height,landmark_0_0_X,...,L_Reach,L_Grasp,L_Manipulation,L_Transport,L_Place,R_Reach,R_Grasp,R_Manipulation,R_Transport,R_Place
0,195.09132,199.277472,0.0,0.0,4.0,0.0,0.0,1.0,1.0,203.678,...,0,0,0,0,0,0,0,0,0,0
1,208.651174,179.422023,0.0,0.0,4.0,0.0,0.0,1.0,1.0,210.2895,...,0,0,0,0,0,0,0,0,0,0
2,90.0844,129.034737,0.0,0.0,4.0,300.0,300.0,1.0,1.0,95.7721,...,0,0,0,0,0,0,0,0,0,0
3,168.386314,161.72376,217.941821,175.335138,4.0,0.0,0.0,1.0,1.0,166.9457,...,0,0,0,0,0,0,0,0,0,0
4,242.268548,193.365412,0.0,0.0,4.0,0.0,0.0,1.0,1.0,236.6061,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# save the final dataframe to a csv file
final_df.to_csv("final_dataframe.csv", index=False)

In [75]:
# load the final dataframe
final_df = pd.read_csv("final_dataframe.csv")
print(final_df.shape)
final_df.head()

(28826, 187)


Unnamed: 0,center_0_X,center_0_Y,center_1_X,center_1_Y,obj_class,obj_X,obj_Y,obj_width,obj_height,landmark_0_0_X,...,L_Reach,L_Grasp,L_Manipulation,L_Transport,L_Place,R_Reach,R_Grasp,R_Manipulation,R_Transport,R_Place
0,195.09132,199.277472,0.0,0.0,4.0,0.0,0.0,1.0,1.0,203.678,...,0,0,0,0,0,0,0,0,0,0
1,208.651174,179.422023,0.0,0.0,4.0,0.0,0.0,1.0,1.0,210.2895,...,0,0,0,0,0,0,0,0,0,0
2,90.0844,129.034737,0.0,0.0,4.0,300.0,300.0,1.0,1.0,95.7721,...,0,0,0,0,0,0,0,0,0,0
3,168.386314,161.72376,217.941821,175.335138,4.0,0.0,0.0,1.0,1.0,166.9457,...,0,0,0,0,0,0,0,0,0,0
4,242.268548,193.365412,0.0,0.0,4.0,0.0,0.0,1.0,1.0,236.6061,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# print a row where center_0_X is 0.0
test = final_df[final_df["center_0_X"] == 0.0]
print(test.shape)

print(test.head())

(6714, 187)
    center_0_X  center_0_Y  center_1_X  center_1_Y  obj_class  obj_X  obj_Y   
5          0.0         0.0         0.0         0.0        4.0  300.0  300.0  \
8          0.0         0.0         0.0         0.0        4.0  300.0  300.0   
12         0.0         0.0         0.0         0.0        4.0  300.0  300.0   
17         0.0         0.0         0.0         0.0        4.0  300.0  300.0   
18         0.0         0.0         0.0         0.0        4.0  300.0  300.0   

    obj_width  obj_height  landmark_0_0_X  ...  L_Reach  L_Grasp   
5         1.0         1.0             0.0  ...        0        0  \
8         1.0         1.0             0.0  ...        0        0   
12        1.0         1.0             0.0  ...        0        0   
17        1.0         1.0             0.0  ...        0        0   
18        1.0         1.0             0.0  ...        0        0   

    L_Manipulation  L_Transport  L_Place  R_Reach  R_Grasp  R_Manipulation   
5                0        

In [77]:
# print distance columns in test
test_dist = test.filter(regex=("dist_.*"))
print(test_dist.head())

      dist_0_0    dist_0_1    dist_0_2    dist_0_3    dist_0_4    dist_0_5   
5   424.264069  424.264069  424.264069  424.264069  424.264069  424.264069  \
8   424.264069  424.264069  424.264069  424.264069  424.264069  424.264069   
12  424.264069  424.264069  424.264069  424.264069  424.264069  424.264069   
17  424.264069  424.264069  424.264069  424.264069  424.264069  424.264069   
18  424.264069  424.264069  424.264069  424.264069  424.264069  424.264069   

      dist_0_6    dist_0_7    dist_0_8    dist_0_9  ...   dist_1_11   
5   424.264069  424.264069  424.264069  424.264069  ...  424.264069  \
8   424.264069  424.264069  424.264069  424.264069  ...  424.264069   
12  424.264069  424.264069  424.264069  424.264069  ...  424.264069   
17  424.264069  424.264069  424.264069  424.264069  ...  424.264069   
18  424.264069  424.264069  424.264069  424.264069  ...  424.264069   

     dist_1_12   dist_1_13   dist_1_14   dist_1_15   dist_1_16   dist_1_17   
5   424.264069  424.264069

In [78]:
# print first row of test 
a = test.iloc[0]
print(a[:15])
print("_________________________")
print(a[135:160])

center_0_X          0.0
center_0_Y          0.0
center_1_X          0.0
center_1_Y          0.0
obj_class           4.0
obj_X             300.0
obj_Y             300.0
obj_width           1.0
obj_height          1.0
landmark_0_0_X      0.0
landmark_0_0_Y      0.0
landmark_0_0_Z      0.0
landmark_0_1_X      0.0
landmark_0_1_Y      0.0
landmark_0_1_Z      0.0
Name: 5, dtype: float64
_________________________
dist_0_0     424.264069
dist_0_1     424.264069
dist_0_2     424.264069
dist_0_3     424.264069
dist_0_4     424.264069
dist_0_5     424.264069
dist_0_6     424.264069
dist_0_7     424.264069
dist_0_8     424.264069
dist_0_9     424.264069
dist_0_10    424.264069
dist_0_11    424.264069
dist_0_12    424.264069
dist_0_13    424.264069
dist_0_14    424.264069
dist_0_15    424.264069
dist_0_16    424.264069
dist_0_17    424.264069
dist_0_18    424.264069
dist_0_19    424.264069
dist_0_20    424.264069
dist_1_0     424.264069
dist_1_1     424.264069
dist_1_2     424.264069
dist_1_3     4