In [None]:
!pip install mediapipe

# 1. Import

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp
import random
from tqdm.notebook import tqdm

# 2. Keypoints using MediaPipe Hands and Pose

In [2]:
# Short aliases for the MediaPipe solution modules referenced in the cells below.
mp_hands, mp_pose = mp.solutions.hands, mp.solutions.pose
mp_drawing, mp_drawing_styles = mp.solutions.drawing_utils, mp.solutions.drawing_styles

In [3]:
def mediapipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and hand back a BGR frame plus the results.

    Args:
        image: frame in BGR channel order (it is converted with COLOR_BGR2RGB
            before being passed to the model).
        model: any object exposing ``process(rgb_image)``.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is the RGB copy converted
        back to BGR and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is marked read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [4]:
def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark sets from ``results`` onto ``image``.

    Args:
        image: frame handed to ``mp_drawing.draw_landmarks`` for every landmark set.
        results: object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` (the attribute
            names of a MediaPipe Holistic result). NOTE(review): the Hands
            results used elsewhere in this notebook are read through
            ``multi_hand_landmarks`` instead, so they do not fit this helper.

    Returns:
        None.
    """
    # The notebook never defines an `mp_holistic` alias, so the connection sets
    # are resolved from the holistic solution module directly.
    holistic = mp.solutions.holistic
    mp_drawing.draw_landmarks(image, results.face_landmarks, holistic.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, holistic.HAND_CONNECTIONS)

In [5]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks with a distinct colour scheme per body part.

    Args:
        image: frame handed to ``mp_drawing.draw_landmarks`` for every landmark set.
        results: object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` (the attribute
            names of a MediaPipe Holistic result).

    Returns:
        None.
    """
    # The notebook never defines an `mp_holistic` alias, so the connection sets
    # are resolved from the holistic solution module directly.
    holistic = mp.solutions.holistic
    spec = mp_drawing.DrawingSpec

    # Draw face connections (second colour clamped to 255, the largest 8-bit value)
    mp_drawing.draw_landmarks(image, results.face_landmarks, holistic.FACEMESH_TESSELATION,
                             spec(color=(80,110,10), thickness=1, circle_radius=1),
                             spec(color=(80,255,121), thickness=1, circle_radius=1)
                             )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, holistic.POSE_CONNECTIONS,
                             spec(color=(80,22,10), thickness=2, circle_radius=4),
                             spec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, holistic.HAND_CONNECTIONS,
                             spec(color=(121,22,76), thickness=2, circle_radius=4),
                             spec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, holistic.HAND_CONNECTIONS,
                             spec(color=(245,117,66), thickness=2, circle_radius=4),
                             spec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

# 3. Extract Keypoint Values

In [6]:
def extract_keypoints(image,hands,pose):
    """Run both detectors on ``image`` and pack their landmarks into one flat vector.

    Args:
        image: frame passed unchanged to ``hands.process`` and ``pose.process``.
        hands: object whose ``process`` result exposes ``multi_hand_landmarks``.
        pose: object whose ``process`` result exposes ``pose_landmarks``.

    Returns:
        1-D array laid out as ``[pose | first hand | second hand]``. The pose
        part holds (x, y, z, visibility) per landmark, or ``33*4`` zeros when no
        pose was found. Each hand part holds (x, y, z) per landmark, or ``21*3``
        zeros for a missing hand. Hand slots are filled in detection order, not
        by handedness.
    """
    hand_result = hands.process(image)
    pose_result = pose.process(image)

    hand_vectors = []
    detected = hand_result.multi_hand_landmarks
    if detected:
        for hand in detected:
            if hand:
                coords = [[pt.x, pt.y, pt.z] for pt in hand.landmark]
                hand_vectors.append(np.array(coords).flatten())
            else:
                hand_vectors.append(np.zeros(21*3))

    # Append two empty slots so the first two entries always exist.
    hand_vectors.extend([np.zeros(21*3), np.zeros(21*3)])
    lh, rh = hand_vectors[0], hand_vectors[1]

    body = pose_result.pose_landmarks
    if body:
        ps = np.array([[pt.x, pt.y, pt.z, pt.visibility] for pt in body.landmark]).flatten()
    else:
        ps = np.zeros(33*4)

    return np.concatenate([ps, lh, rh])

In [7]:
def extract_keypoints_aug(image,hands,pose,xval,yval):
    """Extract keypoints like ``extract_keypoints`` but translate x/y by a fixed offset.

    Args:
        image: frame passed unchanged to ``hands.process`` and ``pose.process``.
        hands: object whose ``process`` result exposes ``multi_hand_landmarks``.
        pose: object whose ``process`` result exposes ``pose_landmarks``.
        xval: amount added to every landmark x coordinate.
        yval: amount added to every landmark y coordinate.

    Returns:
        1-D array laid out as ``[pose | first hand | second hand]`` with the
        offsets applied. A landmark whose x or y is exactly 0 is copied
        unshifted. Missing pose / hands are ``33*4`` / ``21*3`` zeros.
    """
    hand_result = hands.process(image)
    pose_result = pose.process(image)

    def shift(pt):
        # Landmarks sitting exactly on a zero coordinate are left where they are.
        if pt.x == 0 or pt.y == 0:
            return pt.x, pt.y
        return pt.x + xval, pt.y + yval

    hand_vectors = []
    for hand in (hand_result.multi_hand_landmarks or []):
        if hand.landmark:
            rows = []
            for pt in hand.landmark:
                new_x, new_y = shift(pt)
                rows.append([new_x, new_y, pt.z])
            hand_vectors.append(np.array(rows).flatten())
        else:
            hand_vectors.append(np.zeros(21*3))

    # Append two empty slots so the first two entries always exist.
    hand_vectors.extend([np.zeros(21*3), np.zeros(21*3)])
    lh, rh = hand_vectors[0], hand_vectors[1]

    body = pose_result.pose_landmarks
    if body:
        rows = []
        for pt in body.landmark:
            new_x, new_y = shift(pt)
            rows.append([new_x, new_y, pt.z, pt.visibility])
        ps = np.array(rows).flatten()
    else:
        ps = np.zeros(33*4)

    return np.concatenate([ps, lh, rh])

In [8]:
def extract_keypoints_flip_aug(image,hands,pose,xval,yval):
    """Run detection once and return plain, translated and mirrored keypoint vectors.

    Args:
        image: frame passed unchanged to ``hands.process`` and ``pose.process``.
        hands: object whose ``process`` result exposes ``multi_hand_landmarks``.
        pose: object whose ``process`` result exposes ``pose_landmarks``.
        xval: x offset used for the translated variant.
        yval: y offset used for the translated variant.

    Returns:
        Tuple ``(plain, translated, mirrored)`` of 1-D arrays, each laid out as
        ``[pose | first hand | second hand]``:
          * plain      - coordinates as detected;
          * translated - x + xval, y + yval;
          * mirrored   - x replaced by 1 - x.
        In the translated and mirrored variants a landmark whose x or y is
        exactly 0 is copied unchanged. Missing pose / hands are ``33*4`` /
        ``21*3`` zeros.

    NOTE(review): mirroring only rewrites x. Hand slots and pose landmark
    indices keep their original order - confirm that is the intended flip.
    """
    hand_result = hands.process(image)
    pose_result = pose.process(image)
    detected = hand_result.multi_hand_landmarks
    body = pose_result.pose_landmarks

    def unchanged(pt):
        return pt.x, pt.y

    def translated(pt):
        if pt.x == 0 or pt.y == 0:
            return pt.x, pt.y
        return pt.x + xval, pt.y + yval

    def mirrored(pt):
        if pt.x == 0 or pt.y == 0:
            return pt.x, pt.y
        return 1 - pt.x, pt.y

    def pack(remap, hand_is_usable):
        # Build one [pose | hand | hand] vector with `remap` applied to every (x, y).
        slots = []
        if detected:
            for hand in detected:
                if hand_is_usable(hand):
                    rows = [[*remap(pt), pt.z] for pt in hand.landmark]
                    slots.append(np.array(rows).flatten())
                else:
                    slots.append(np.zeros(21*3))
        # Two trailing empty slots guarantee slots[0] and slots[1] exist.
        slots = slots + [np.zeros(21*3), np.zeros(21*3)]
        if body:
            pose_vec = np.array([[*remap(pt), pt.z, pt.visibility] for pt in body.landmark]).flatten()
        else:
            pose_vec = np.zeros(33*4)
        return np.concatenate([pose_vec, slots[0], slots[1]])

    # The plain pass tests the hand object itself, the other two test its landmark list.
    plain_vec = pack(unchanged, lambda hand: bool(hand))
    shifted_vec = pack(translated, lambda hand: bool(hand.landmark))
    mirrored_vec = pack(mirrored, lambda hand: bool(hand.landmark))
    return plain_vec, shifted_vec, mirrored_vec

In [49]:
# Smoke test: extract_keypoints takes (image, hands, pose). The former one-argument
# call on an undefined `results` object could not run on a fresh kernel.
# A blank RGB frame is used so the cell needs no external file; it is presumably
# detection-free, which would make the output an all-zero vector.
with mp_hands.Hands(model_complexity=0,min_detection_confidence=0.5,min_tracking_confidence=0.5) as hands:
    with mp_pose.Pose(min_detection_confidence=0.5,min_tracking_confidence=0.5) as pose:
        result_test = extract_keypoints(np.zeros((480, 640, 3), dtype=np.uint8), hands, pose)

In [31]:
# Raise NumPy's summarisation threshold to infinity so arrays are printed in full.
np.set_printoptions(threshold=np.inf)

In [53]:
# Smoke test: extract_keypoints_aug takes (image, hands, pose, xval, yval). The former
# call passed an undefined `results` object and omitted the detectors.
# A blank RGB frame is used so the cell needs no external file.
with mp_hands.Hands(model_complexity=0,min_detection_confidence=0.5,min_tracking_confidence=0.5) as hands:
    with mp_pose.Pose(min_detection_confidence=0.5,min_tracking_confidence=0.5) as pose:
        result_test_aug = extract_keypoints_aug(np.zeros((480, 640, 3), dtype=np.uint8), hands, pose,
                                                random.uniform(0.01,0.05),random.uniform(0.01,0.05))

# 4. Setup

In [31]:
# Folder holding the MS-ASL annotation files and the cropped videos
DATA_PATH = 'MSASL'
# Folder that receives the exported per-frame keypoint arrays
SAVE_PATH = 'keypoints_noface_msasl'

# Every sequence is zero-padded or cut to this many frames when the dataset is assembled
MAX_SEQ_LENGTH = 256

In [32]:
import json

In [50]:
import json

# Read the class names with the JSON parser instead of splitting the raw text on
# commas, and close the file as soon as it has been read.
# Assumes MSASL_classes.json is a JSON array of strings - TODO confirm.
with open(os.path.join(DATA_PATH,"MSASL_classes.json"),'r') as file:
    data = [str(word).strip() for word in json.load(file)]

In [11]:
# Persist the class-name list to msasl_actions_list.npy so it can be reloaded
# without re-reading the JSON file
np.save('msasl_actions_list.npy',data)

In [33]:
# Load the class-name list back from msasl_actions_list.npy as a NumPy array;
# the cells below index it with the integer label of each annotation
actions = np.load('msasl_actions_list.npy')

In [34]:
# Number of class names loaded
len(actions)

1000

In [31]:
# Create the export root and one sub-folder per class name.
# exist_ok=True makes the cell safe to re-run when the folders already exist.
os.makedirs(SAVE_PATH,exist_ok=True)
for action in actions:
    os.makedirs(os.path.join(SAVE_PATH, action),exist_ok=True)

# 5. Extract Keypoints & Augment for the MS-ASL Test Set

In [35]:
# Export per-frame keypoints for every available test-set clip whose label is below 100.
# Each frame writes three files into SAVE_PATH/<label>/:
#   <clip>_<frame>.npy      keypoints as detected
#   <clip>_<frame>_AUG.npy  keypoints translated by a small random offset
#   <clip>_<frame>_MIR.npy  keypoints mirrored along x
with open(os.path.join(DATA_PATH,"MSASL_test.json"),'r') as annotation_file:
    data_file = json.load(annotation_file)

video_dir = os.path.join(DATA_PATH, "cropped_videos_testset")
prev_id = ""
num = 1
for item in tqdm(data_file):
    url_id = item["url"][-11:]
    label = actions[int(item["label"])]
    # Consecutive annotations that share a url id are numbered 1, 2, 3, ...
    # The counter advances for every annotation, including the ones skipped below.
    if url_id == prev_id:
        num+=1
    else:
        num=1
    prev_id = url_id

    if int(item["label"]) >= 100:
        continue
    clip_name = url_id+str(num)
    video_path = os.path.join(video_dir, clip_name+".mp4")
    if not os.path.exists(video_path):
        continue
    # The first frame file doubles as the "already processed" marker.
    if os.path.exists(os.path.join(SAVE_PATH,label,clip_name+"_0.npy")):
        continue
    os.makedirs(os.path.join(SAVE_PATH,label),exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    try:
        # The detectors are created once per clip rather than once per frame:
        # this avoids reloading the models for every frame and lets them carry
        # state from one frame of the clip to the next.
        # NOTE(review): landmarks may differ slightly from data exported with
        # per-frame detectors.
        with mp_hands.Hands(model_complexity=0,min_detection_confidence=0.5,min_tracking_confidence=0.5) as hands:
            with mp_pose.Pose(min_detection_confidence=0.5,min_tracking_confidence=0.5) as pose:
                while cap.isOpened():
                    # Capture frame-by-frame
                    ret, image = cap.read()
                    if not ret:
                        break

                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    image.flags.writeable = False

                    # Make detections
                    # NOTE(review): a new random offset is drawn for every frame,
                    # so the _AUG variant jitters within a clip - confirm intended.
                    keypoints, keypoints_aug, keypoints_flipped = extract_keypoints_flip_aug(image, hands, pose,random.uniform(0.01,0.03),random.uniform(0.01,0.03))

                    # Export keypoints
                    frame_prefix = os.path.join(SAVE_PATH,label,clip_name+"_"+str(frame_count))
                    np.save(frame_prefix+".npy", keypoints)
                    np.save(frame_prefix+"_AUG.npy", keypoints_aug)
                    np.save(frame_prefix+"_MIR.npy", keypoints_flipped)
                    frame_count+=1
    finally:
        # Always give the video handle back, even if detection or saving fails.
        cap.release()


  0%|          | 0/4172 [00:00<?, ?it/s]

# 6. Preprocess Data and Create Labels and Features

In [24]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [18]:
# Map each of the first 100 class names to its position in `actions`
label_map = dict(zip(actions[:100], range(100)))
print(label_map)

{'hello': 0, 'nice': 1, 'teacher': 2, 'eat': 3, 'no': 4, 'happy': 5, 'like': 6, 'orange': 7, 'want': 8, 'deaf': 9, 'school': 10, 'sister': 11, 'finish': 12, 'white': 13, 'bird': 14, 'what': 15, 'tired': 16, 'friend': 17, 'sit': 18, 'mother': 19, 'yes': 20, 'student': 21, 'learn': 22, 'spring': 23, 'good': 24, 'fish': 25, 'again': 26, 'sad': 27, 'table': 28, 'need': 29, 'where': 30, 'father': 31, 'milk': 32, 'cousin': 33, 'brother': 34, 'paper': 35, 'forget': 36, 'nothing': 37, 'book': 38, 'girl': 39, 'fine': 40, 'black': 41, 'boy': 42, 'lost': 43, 'family': 44, 'hearing': 45, 'bored': 46, 'please': 47, 'water': 48, 'computer': 49, 'help': 50, 'doctor': 51, 'yellow': 52, 'write': 53, 'hungry': 54, 'but': 55, 'drink': 56, 'bathroom': 57, 'man': 58, 'how': 59, 'understand': 60, 'red': 61, 'beautiful': 62, 'sick': 63, 'blue': 64, 'green': 65, 'english': 66, 'name': 67, 'you': 68, 'who': 69, 'same': 70, 'nurse': 71, 'day': 72, 'now': 73, 'brown': 74, 'thanks': 75, 'hurt': 76, 'here': 77, 'g

In [19]:
# Assemble one fixed-length window per clip and per variant.
# For every clip three windows are appended back to back, in this order:
# plain, "_AUG", "_MIR" - each followed by the clip's label index.
variant_suffixes = ("", "_AUG", "_MIR")
sequences, labels = [], []
for action in tqdm(label_map.keys()):
    action_dir = os.path.join(SAVE_PATH, action)
    if not os.path.exists(action_dir):
        continue
    # File names look like <11-char id><clip number>_<frame>[_AUG|_MIR].npy.
    # The first 12 characters cover the id plus the first digit of the clip
    # number; whatever precedes the next "_" is the rest of the clip number.
    # Files that are not .npy exports are ignored.
    sequence_list = np.array([name[:12] + name[12:].split("_")[0]
                              for name in os.listdir(action_dir)
                              if name.endswith(".npy")])
    for sequence in np.unique(sequence_list):
        for suffix in variant_suffixes:
            window = []
            for frame_num in range(0,MAX_SEQ_LENGTH):
                frame_path = os.path.join(action_dir, sequence+"_"+"{}{}.npy".format(frame_num, suffix))
                if os.path.exists(frame_path):
                    res = np.load(frame_path)
                else: # clip is shorter than the sequence length: zero padding
                    res = np.zeros((258,),dtype='float64')
                window.append(res)
            sequences.append(window)
            labels.append(label_map[action])

  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
# Stack all windows into a single array (one row of frames per window)
X = np.array(sequences)

In [21]:
# (number of windows, MAX_SEQ_LENGTH frames, 258 keypoint values per frame)
X.shape

(10173, 256, 258)

In [25]:
# Integer class ids as an (N, 1) column.
# The previous one-hot encode followed by argmax returned the same ids after
# allocating an N x 100 matrix, so the ids are taken from `labels` directly.
y = np.asarray(labels, dtype=np.intp).reshape(-1, 1)

In [26]:
# Number of samples per class id over the whole dataset
info = [int(np.count_nonzero(y == class_id)) for class_id in range(len(actions[:100]))]

In [27]:
# Split by source clip instead of by sample. X holds three consecutive windows
# per clip (plain, _AUG, _MIR); a per-sample random split would put near-duplicate
# variants of one clip into both the training and the test set.
VARIANTS_PER_CLIP = 3
assert len(X) % VARIANTS_PER_CLIP == 0, "expected plain/AUG/MIR triplets in X"
clip_ids = np.arange(len(X)) // VARIANTS_PER_CLIP
train_clips, test_clips = train_test_split(np.unique(clip_ids), test_size=0.1, random_state=42)

# Shuffle the sample order inside each split so the sets are not grouped by class.
shuffler = np.random.RandomState(42)
train_idx = shuffler.permutation(np.flatnonzero(np.isin(clip_ids, train_clips)))
test_idx = shuffler.permutation(np.flatnonzero(np.isin(clip_ids, test_clips)))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [28]:
# Number of test-set samples per class id
info = [int(np.count_nonzero(y_test == class_id)) for class_id in range(len(actions[:100]))]

In [29]:
# Number of class ids counted (one entry per class)
len(info)

100

In [30]:
# Write the four split arrays into the transformer_data_msasl folder
out_dir = "transformer_data_"+"msasl"
os.makedirs(out_dir,exist_ok=True)
for file_name, split_array in (
    ("train_data_new.npy", X_train),
    ("train_labels_new.npy", y_train),
    ("test_data_new.npy", X_test),
    ("test_labels_new.npy", y_test),
):
    np.save(os.path.join(out_dir, file_name), split_array)