forked from IcarusWizard/PixelwiseRegression
-
Notifications
You must be signed in to change notification settings - Fork 0
/
depth_hands_tracking.py
372 lines (299 loc) · 14.7 KB
/
depth_hands_tracking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
import argparse
import os
import numpy as np
import cv2 as cv
import torch as tr
from matplotlib import pyplot as plt
import pyzed.sl as sl
# python -m pip install --ignore-installed /home/labcom/Documents/PixelwiseRegression/pyzed-4.0-cp310-cp310-linux_x86_64.whl
from rgb_hands_tracker import RGB_Hands_Tracker
from depth_hands_tracker import Depth_Hands_Tracker
from utils import center_crop, norm_img
# Physical pixel pitch of the ZED sensor for each capture resolution, used to
# convert calibration focal lengths from pixels into metric units.
# NOTE(review): presumably millimetres (2K/1080 -> 2 um pitch with binning at
# lower resolutions) per the ZED datasheet — confirm units against how
# left_focal is consumed by Depth_Hands_Tracker.
PIXEL_SIZE_2K = 0.002
PIXEL_SIZE_1080 = 0.002
PIXEL_SIZE_720 = 0.004
PIXEL_SIZE_WVGA = 0.008
def make_parser():
    """Build the command-line argument parser for the hands-tracking script.

    Returns:
        argparse.ArgumentParser configured with the camera options
        (number of hands, resolution, frame rate, depth mode, display
        flags) and the pixelwise-regression model options.
    """
    parser = argparse.ArgumentParser("hands tracking")
    # Camera / tracking options.
    parser.add_argument("-nh", "--num_hands", type=int, default=4,
                        help="Maximum number of hands to track on the video.")
    parser.add_argument("-res", "--resolution", type=int, default=1080,
                        help="Height component of the RGB cameras resolution among 4 options : 376, 720, 1080, 1242.")
    parser.add_argument("-fps", "--frame_rate", type=int, default=30,
                        help="Frame rate in fps for the RGB cameras among 4 options which are not available for all resolutions : "
                             "15(for all resolutions), 30(for 376, 720 and 1080 resolutions), 60(for 376 and 720 resolutions) and 100(only for the 376 resolution).")
    parser.add_argument("-dm", "--depth_mode", type=str, default="ULTRA",
                        help="Depth Computation Mode to use among 4 options : PERFORMANCE, QUALITY, ULTRA and NEURAL.")
    parser.add_argument("-d", "--display", action="store_true",
                        help="To display the camera streams.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Whether you want to display the results.")
    # Depth-model options (PixelwiseRegression checkpoint selection and shape).
    parser.add_argument("--subject", type=int, default=0,
                        help="the subject of model file")
    parser.add_argument("--dataset", type=str, default="HAND17",
                        help="choose from MSRA and HAND17")
    parser.add_argument("--label_size", type=int, default=64)
    parser.add_argument("--cube_size", type=int, default=10000)
    parser.add_argument("--norm_method", type=str, default="instance",
                        help="choose from batch and instance")
    parser.add_argument("--heatmap_method", type=str, default="softmax",
                        help="choose from softmax and sumz")
    parser.add_argument("--gpu_id", type=str, default="0")
    parser.add_argument("--stages", type=int, default=2)
    parser.add_argument("--features", type=int, default=128)
    parser.add_argument("--level", type=int, default=4)
    return parser
def measure_phalanxes(hands_landmarks_pts, display=False) :
    """Measure the length of every phalanx of every tracked hand.

    Landmarks follow the MediaPipe 21-point hand layout: wrist at index 0,
    then 4 joints per finger (indices 4*f+1 .. 4*f+4 for finger f). A phalanx
    is the segment between two consecutive joints of the same finger, so the
    metacarpal (wrist-to-knuckle) segment is deliberately excluded.

    Args:
        hands_landmarks_pts: array-like of shape (num_hands, 21, 3+) holding
            3D landmark coordinates; only the first 3 components are used.
        display: if True, print the measured sizes.

    Returns:
        np.ndarray of shape (num_hands, 5, 3) with the Euclidean length of
        the 3 phalanxes of each of the 5 fingers of each hand.
    """
    # np.asarray makes plain Python lists work too (the original relied on
    # ndarray multi-axis indexing and also left its enumerate values unused).
    pts = np.asarray(hands_landmarks_pts, dtype=float)
    # Finger joints 1..20 regrouped as (num_hands, 5 fingers, 4 joints, xyz).
    finger_joints = pts[:, 1:21, :3].reshape(len(pts), 5, 4, 3)
    # Consecutive-joint differences -> 3 phalanx vectors per finger; their
    # norms are the phalanx lengths. Vectorized equivalent of the original
    # triple Python loop.
    phalanxes_size = np.linalg.norm(np.diff(finger_joints, axis=2), axis=-1)
    if display :
        print("\n[MEASURE PHALANXES]\tPhalanxes sizes :\n", phalanxes_size)
    return phalanxes_size
def compute_palms_cross_products(hands_pts, display=False) :
    """Compute 4 cross products of palm vectors for each hand.

    For each hand, four vectors are taken from the wrist (landmark 0) to the
    base knuckles of the index (5), middle (9), ring (13) and pinky (17)
    fingers; pairwise cross products of those vectors give (non-normalised)
    estimates of the palm plane normal.

    Args:
        hands_pts: array-like of shape (num_hands, 21, 3) of 3D landmarks.
        display: if True, print the computed cross products.

    Returns:
        np.ndarray of shape (num_hands, 4, 3) containing, per hand:
        O5xO9, O5xO13, O5xO17, O9xO13.
    """
    hands_pts = np.asarray(hands_pts, dtype=float)
    # np.empty instead of np.ones: every entry is overwritten below, so the
    # original's ones-initialisation was misleading.
    palm_cross_products = np.empty((len(hands_pts), 4, 3), dtype=float)
    for idx_hand, hand_pts in enumerate(hands_pts) :
        wrist = hand_pts[0]
        o5 = hand_pts[5] - wrist
        o9 = hand_pts[9] - wrist
        o13 = hand_pts[13] - wrist
        o17 = hand_pts[17] - wrist
        palm_cross_products[idx_hand, 0] = np.cross(o5, o9)
        palm_cross_products[idx_hand, 1] = np.cross(o5, o13)
        palm_cross_products[idx_hand, 2] = np.cross(o5, o17)
        palm_cross_products[idx_hand, 3] = np.cross(o9, o13)
    if display :
        print("\n[COMPUTE PALMS CROSS PRODUCTS]\tPalms cross products :\n", palm_cross_products)
    return palm_cross_products
def get_hands_close_up(bgr_img, depth_img, hand_landmarks, focal_length=None, cube_size=5000, display=False, clean=False) :
    """Crop a square close-up around the given hand landmarks from both the
    BGR and the depth frame.

    Args:
        bgr_img: full-size BGR frame (H, W, 3).
        depth_img: full-size depth frame (H, W), aligned with bgr_img.
        hand_landmarks: per-joint landmark array; only columns [:2] (pixel
            coordinates) are used here.
        focal_length: unused in this implementation.
        cube_size: unused in this implementation.
        display: if True, show the two crops and wait for a key press.
        clean: if True, replace +/-inf and NaN depth values with 0 first.

    Returns:
        (bgr_hand_close_up, depth_hand_close_up, hands_frames) where
        hands_frames is an int32 array of 4 values: crop origin row, crop
        origin col, crop side length, and a 128/side scale factor.
        NOTE(review): because hands_frames is int32, slot [3] = 128/side
        truncates to 0 whenever the side exceeds 128 px — looks like a bug;
        a float container (or returning the scale separately) was probably
        intended. Also, the caller get_full_size_uvd reads hand_frame[4]
        and [5], which do not exist in this 4-element frame — confirm the
        intended frame layout.
    """
    img_shape = np.shape(depth_img)
    # NOTE(review): nb_hands is computed but never used.
    nb_hands = np.shape(hand_landmarks)[0]
    hands_frames = np.zeros((4), dtype=np.int32)
    if clean :
        # depth_img != depth_img is the classic NaN test; invalid depth
        # (NaN, +/-inf) is zeroed out.
        depth_img = np.where((depth_img == -np.inf) | (depth_img == np.inf) | (depth_img != depth_img), 0.0, depth_img)
    # Crop origin: min landmark coordinate minus a 200 px margin.
    # NOTE(review): assumes hand_landmarks[:, :2] is ordered (row, col) to
    # match the [origin0:end0, origin1:end1] image slicing below — if the
    # landmarks are (u, v) = (col, row) the axes are swapped; confirm against
    # the RGB tracker's export format.
    origin = np.array((np.nanmin(hand_landmarks[:, :2], 0))-200, dtype=np.int32)
    # Square side: largest landmark extent from the origin, plus margin.
    sides_size = int(np.max((np.nanmax(hand_landmarks[:, :2], 0))- origin, 0)+200)
    # Clamp the origin into the image.
    hands_frames[0] = origin[0] if origin[0] >= 0 else 0
    hands_frames[1] = origin[1] if origin[1] >= 0 else 0
    # If the square would overflow the frame, shrink it to the tightest fit.
    cond = (origin + sides_size >= img_shape)
    if np.any(cond) :
        hands_frames[2] = np.nanmin(img_shape - origin)
    else :
        hands_frames[2] = sides_size
    # Scale factor from crop size to the model's 128 px input.
    # NOTE(review): int32 truncation — 0 for any crop larger than 128 px.
    hands_frames[3] = 128/hands_frames[2]
    end = origin + hands_frames[2]
    depth_hand_close_up = depth_img[hands_frames[0] : end[0], hands_frames[1] : end[1]]
    bgr_hand_close_up = bgr_img[hands_frames[0] : end[0], hands_frames[1] : end[1]]
    if display :
        # Colourise the depth crop for a quick visual check (blocks on a key).
        dist = depth_hand_close_up.astype(np.uint8)
        dist = cv.applyColorMap(dist, cv.COLORMAP_RAINBOW)
        cv.imshow("Hand close-up on the depth frame", dist)
        cv.imshow("Hand close-up on the bgr frame", bgr_hand_close_up)
        cv.waitKey()
    return bgr_hand_close_up, depth_hand_close_up, hands_frames
def get_full_size_uvd(hands_frames, depth_hands_landmarks, resolution) :
    """Map normalised model landmark predictions back to pixel UVD values.

    Args:
        hands_frames: per-hand crop descriptors produced by get_hands_close_up.
        depth_hands_landmarks: per-hand landmarks in [-1, 1] (u, v, d);
            mutated in place.
        resolution: NOTE(review): accepted but never used.

    Returns:
        depth_hands_landmarks cast to int32.
    """
    for hand_frame, depth_hand_landmarks in zip(hands_frames, depth_hands_landmarks) :
        # u, v: [-1, 1] -> [0, 128] pixel coordinates of the close-up crop.
        depth_hand_landmarks[:, :2] = (depth_hand_landmarks[:, :2] + 1) / 2 * 128
        # NOTE(review): hand_frame[4] and hand_frame[5] are out of range —
        # get_hands_close_up returns 4-element frames, so this line raises
        # IndexError; presumably a depth offset/scale pair was meant to be
        # stored in extra frame slots. Confirm the intended frame layout
        # before using this function.
        depth_hand_landmarks[:, 2] = (depth_hand_landmarks[:, 2] + 1) * hand_frame[5] / 2 + hand_frame[4]
    return depth_hands_landmarks.astype(np.int32)
if __name__ == "__main__":
    # Collect the parameters given as arguments.
    args = make_parser().parse_args()

    # Maximum number of hands to track on the video.
    num_hands = int(args.num_hands)

    # Acquisition and model instructions.
    resolution = args.resolution
    fps = args.frame_rate
    depth_mode = args.depth_mode
    display = args.display
    verbose = args.verbose
    cube_size = args.cube_size
    model_parameters = {
        "stage" : args.stages,
        "label_size" : args.label_size,
        "features" : args.features,
        "level" : args.level,
        "norm_method" : args.norm_method,
        "heatmap_method" : args.heatmap_method,
    }

    # Select the pretrained depth model matching the requested dataset.
    if args.dataset == 'HAND17' :
        skeleton_mode = 1
        model_name = "HAND17_default_final.pt"
    elif args.dataset == 'MSRA' :
        skeleton_mode = 0
        model_name = "MSRA_default_subject{}_final.pt".format(args.subject)
    else :
        # Fix: the original left skeleton_mode/model_name undefined for any
        # other value and crashed later with a NameError.
        raise ValueError("Unknown dataset '{}': choose from MSRA and HAND17".format(args.dataset))

    # Fix: os.makedirs creates the full tree and tolerates partially existing
    # directories; the original skipped creating "predict" whenever the
    # dataset directory already existed.
    os.makedirs(os.path.join("skeleton", args.dataset, "predict"), exist_ok=True)

    assert os.path.exists('Model'), "Please put the models in ./Model folder"

    # Create a Camera object.
    zed = sl.Camera()

    # Map the requested image height to the ZED resolution mode and the
    # matching physical pixel size.
    zed_resolutions = {
        376 : (sl.RESOLUTION.VGA, PIXEL_SIZE_WVGA),
        720 : (sl.RESOLUTION.HD720, PIXEL_SIZE_720),
        1080 : (sl.RESOLUTION.HD1080, PIXEL_SIZE_1080),
        1242 : (sl.RESOLUTION.HD2K, PIXEL_SIZE_2K),
    }
    camera_resolution, pixel_size = zed_resolutions.get(resolution, (sl.RESOLUTION.HD1080, PIXEL_SIZE_1080))

    # Create an InitParameters object and set configuration parameters.
    # Fix: the original hard-coded HD1080 and NEURAL and silently ignored the
    # --resolution, --frame_rate and --depth_mode command-line arguments.
    init_params = sl.InitParameters()
    init_params.camera_resolution = camera_resolution
    init_params.camera_fps = fps
    init_params.coordinate_units = sl.UNIT.MILLIMETER  # Set coordinate units
    init_params.depth_mode = getattr(sl.DEPTH_MODE, depth_mode.upper(), sl.DEPTH_MODE.NEURAL)
    init_params.coordinate_system = sl.COORDINATE_SYSTEM.RIGHT_HANDED_Y_UP

    # Open the ZED m camera.
    err = zed.open(init_params)
    if err != sl.ERROR_CODE.SUCCESS:
        # Fix: report why the camera could not open instead of exiting silently.
        print("[MAIN]\tFailed to open the ZED camera : {}".format(err))
        exit(1)

    # Get ZED camera information; focal lengths are converted from pixels to
    # metric units using the sensor pixel size of the selected resolution.
    camera_info = zed.get_camera_information()
    calibration = camera_info.camera_configuration.calibration_parameters
    left_focal = np.array([calibration.left_cam.fx, calibration.left_cam.fy]) * pixel_size
    right_focal = np.array([calibration.right_cam.fx, calibration.right_cam.fy]) * pixel_size

    # 2D viewer utilities.
    full_resolution = camera_info.camera_configuration.resolution
    display_resolution = sl.Resolution(min(full_resolution.width, 1920), min(full_resolution.height, 1080))
    image_scale = [display_resolution.width / full_resolution.width,
                   display_resolution.height / full_resolution.height]

    # Create ZED objects filled in the main loop.
    left_frame = sl.Mat()
    depth = sl.Mat()
    point_cloud = sl.Mat()

    # Initialise the hands trackers: MediaPipe-style RGB tracking plus the
    # pixelwise-regression depth model.
    rgb_hands_tracker = RGB_Hands_Tracker(num_hands)
    depth_hands_tracker = Depth_Hands_Tracker(model_name, model_parameters, left_focal, cube_size, args.gpu_id)

    print("[MAIN]\tRunning Hands Tracking Press 'q' to quit")
    while True :
        # Capture the frame from the ZED m camera.
        if zed.grab() == sl.ERROR_CODE.SUCCESS:
            # Retrieve left frame.
            zed.retrieve_image(left_frame, sl.VIEW.LEFT, sl.MEM.CPU, display_resolution)
            # Retrieve depth Mat. Depth is aligned on the left frame.
            zed.retrieve_measure(depth, sl.MEASURE.DEPTH)
            # Retrieve colored point cloud. Point cloud is aligned on the left image.
            zed.retrieve_measure(point_cloud, sl.MEASURE.XYZ)

            # Update OCV view.
            left_frame_ocv = left_frame.get_data()
            depth_ocv = depth.get_data()
            point_cloud_np = point_cloud.get_data()
            left_frame_ocv = cv.cvtColor(left_frame_ocv, cv.COLOR_BGRA2BGR)

            # Detect and/or track the hands on the current frame.
            rgb_hands_tracker.update(left_frame_ocv)
            hands_scores, hands_sides, hands_landmarks, hands_world_landmarks = rgb_hands_tracker.draw_and_export_data(left_frame_ocv, draw=False)
            nb_detected_hands = np.count_nonzero(hands_scores)

            depth_hands_landmarks = np.empty(np.shape(hands_landmarks), dtype=np.float32)
            hands_frames = np.empty((nb_detected_hands, 4), dtype=np.int32)
            if nb_detected_hands :
                for idx in range(nb_detected_hands) :
                    # Crop a close-up around each detected hand and run the
                    # depth landmark regression on it.
                    bgr_hand_close_up, depth_hand_close_up, hands_frames[idx] = get_hands_close_up(left_frame_ocv, depth_ocv, hands_landmarks[idx], clean=True, display=False)
                    depth_hand_close_up = np.array([[depth_hand_close_up]])
                    depth_hands_landmarks[idx] = depth_hands_tracker.estimate(depth_hand_close_up)
                    hand_close_up_512 = depth_hands_tracker.draw_skeleton(depth_hand_close_up, idx, skeleton_mode=skeleton_mode)
                    hand_close_up_512 = np.clip(hand_close_up_512, 0, 1)
                    # Fix: debug output is now gated behind --verbose (the flag
                    # was parsed but never used).
                    if verbose :
                        print("\n###[MAIN]\tShape of depth_hands_landmarks : ", np.shape(depth_hands_landmarks), "\n###\tdepth_hands_landmarks = ", depth_hands_landmarks)
                    # Fix: cv.imshow only accepts (window_name, image); the
                    # original passed 4 positional arguments, which raises a
                    # TypeError on the first detected hand.
                    window_name = ("[HANDS TRACKING ON DEPTH FRAMES HAND N°{}] Press 'q' to quit / Press 'p' to play/pause / "
                                   "Press 'n' to print the measured norms of the phalanxes / "
                                   "Press 'c' to print the cross products of the palm / "
                                   "Press 'w' to print the world coordinates of the hands").format(idx)
                    cv.imshow(window_name, hand_close_up_512)

            key = cv.waitKey(1)
            if key == ord('q') :
                break
            elif key == ord('p') :
                # Pause until 'p' is pressed again.
                while True :
                    button = cv.waitKey(1)
                    if button == ord('p'):
                        break

    # Release the ZED buffers and shut everything down.
    left_frame.free(sl.MEM.CPU)
    depth.free(sl.MEM.CPU)
    point_cloud.free(sl.MEM.CPU)
    zed.close()
    cv.destroyAllWindows()