In [1]:
import tensorflow as tf
import mediapipe as mp
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from sklearn.model_selection import train_test_split
import time
import cv2
import os

In [2]:
# Short aliases for the MediaPipe solution modules used in this notebook.
mp_drawings = mp.solutions.drawing_utils  # landmark drawing helpers
mp_holistic = mp.solutions.holistic  # connection sets used by rendering()
mp_hands = mp.solutions.hands  # hand tracker used by the capture loops

## Test

In [3]:
def rendering(image, results):
    """Draw the face, pose and both hand landmark sets of `results` onto `image`.

    The face contours use custom drawing specs; pose and hands rely on the
    default styling of `draw_landmarks`. Nothing is returned.
    """
    face_landmark_spec = mp_drawings.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    face_connection_spec = mp_drawings.DrawingSpec(color=(80, 121, 10), thickness=1, circle_radius=1)
    mp_drawings.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                               face_landmark_spec, face_connection_spec)

    # Remaining landmark sets, drawn in this order with default styling.
    default_styled = (
        ('pose_landmarks', 'POSE_CONNECTIONS'),
        ('left_hand_landmarks', 'HAND_CONNECTIONS'),
        ('right_hand_landmarks', 'HAND_CONNECTIONS'),
    )
    for landmarks_attr, connections_attr in default_styled:
        mp_drawings.draw_landmarks(image,
                                   getattr(results, landmarks_attr),
                                   getattr(mp_holistic, connections_attr))


In [15]:
# Live preview: track a single hand from the default webcam until 'q' is pressed.
cap = cv2.VideoCapture(0)

try:
    with mp_hands.Hands(min_detection_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera busy or unplugged); `frame`
                # is None here and cv2.flip would raise.
                break

            # Flip horizontally (flipCode=1).
            frame = cv2.flip(frame, 1)

            # OpenCV delivers BGR; the detector is fed an RGB copy.
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # `result` stays a notebook-level name: the next cell inspects
            # the detection from the last processed frame.
            result = hands.process(image)

            # Draw the detected hand on the frame that is displayed.
            if result.multi_hand_landmarks:
                for landmark in result.multi_hand_landmarks:
                    mp_drawings.draw_landmarks(
                        frame, landmark, mp_hands.HAND_CONNECTIONS,
                        mp_drawings.DrawingSpec(color=(100, 131, 25), thickness=2, circle_radius=2),
                        mp_drawings.DrawingSpec(color=(22, 55, 200), thickness=3, circle_radius=2))

            cv2.imshow('Hand tracking', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [421]:
# Disabled holistic-based extractor, kept only as a string literal: nothing in
# it is executed. The hand-only extract_keypoints() defined in this notebook
# is the version actually used.
# NOTE(review): the pose fallback below uses 31 rows while MediaPipe Pose
# reports 33 landmarks -- confirm before re-enabling this code.
'''def extract_keypoints(results):
    """face = []
    if results.face_landmarks:
        for res in results.face_landmarks.landmark:
            face.append(np.array([res.x, res.y, res.z]))"""
    
    #face = np.array(face).flatten()
    if results.left_hand_landmarks:
        lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark])
    else:
        lh = np.zeros((21, 3))
    if results.right_hand_landmarks:
        rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark])
    else:
        rh = np.zeros((21, 3))
    if results.pose_landmarks:
        pose = [np.array([res.x, res.y, res.z]) for res in results.pose_landmarks.landmark]
        #pose = np.array(pose).flatten()
    else:
        pose = np.zeros(shape=(31, 3))
    
    return tf.concat((lh, rh, pose), 0)'''

In [16]:
def extract_keypoints(results):
    """Collect the (x, y, z) coordinates of every detected hand landmark.

    Returns a list holding one 3-element NumPy array per landmark, with the
    hands concatenated in detection order. The list is empty when
    `results.multi_hand_landmarks` is falsy (no hand detected).
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return []
    return [
        np.array([point.x, point.y, point.z])
        for hand_landmarks in detected_hands
        for point in hand_landmarks.landmark
    ]

In [17]:
# Landmarks of the last processed frame as a tensor, displayed as the cell output.
hand = tf.convert_to_tensor(extract_keypoints(result))
hand

<tf.Tensor: shape=(21, 3), dtype=float64, numpy=
array([[ 1.02330431e-01,  8.77934337e-01,  7.28771738e-07],
       [ 2.01304868e-01,  8.51286411e-01, -3.86849605e-02],
       [ 2.88881660e-01,  7.78087914e-01, -6.07262403e-02],
       [ 3.54907185e-01,  7.20797896e-01, -8.24246258e-02],
       [ 4.20482635e-01,  6.95796847e-01, -1.05033420e-01],
       [ 2.64046460e-01,  5.62440634e-01, -3.38571854e-02],
       [ 3.06724012e-01,  4.41207528e-01, -6.76996857e-02],
       [ 3.34738851e-01,  3.62298191e-01, -9.63834599e-02],
       [ 3.60291630e-01,  2.93383986e-01, -1.17833674e-01],
       [ 2.06111848e-01,  5.31630337e-01, -4.23693061e-02],
       [ 2.43705526e-01,  3.85574400e-01, -7.21050724e-02],
       [ 2.71365404e-01,  2.88217336e-01, -9.90345553e-02],
       [ 2.96253532e-01,  2.01953590e-01, -1.19571425e-01],
       [ 1.43166170e-01,  5.36946058e-01, -5.67498207e-02],
       [ 1.70969039e-01,  3.91911119e-01, -8.83900821e-02],
       [ 1.88378930e-01,  2.94533193e-01, -1.144369

# Data collection

In [70]:
# Root folder for the saved keypoint arrays (one sub-folder per action).
data = 'MP_DATA'
# Gesture classes to record; their order matches the indices in label_map.
actions = np.array(['hello', 'yes', 'iloveyou'])
# Number of frames captured (and files written) per action.
sequence_length  = 500

In [71]:
# Record `sequence_length` frames of hand keypoints for every action.
# Each frame is saved as <data>/<action>/<sequence>.npy. Press 'q' to abort.
cap = cv2.VideoCapture(0)
stop_requested = False  # set when the user quits or the camera stops delivering frames

try:
    # max_num_hands=1 matches the other capture loops and keeps every sample
    # at 21 landmarks, which is what the loading step (len(res) == 21) keeps.
    with mp_hands.Hands(min_detection_confidence=0.5, max_num_hands=1) as hands:
        for action in actions:
            # np.save does not create missing folders.
            os.makedirs(os.path.join(data, action), exist_ok=True)

            for sequence in range(sequence_length):
                ret, frame = cap.read()
                if not ret:
                    # `frame` is None on a failed read; stop instead of crashing in cv2.flip.
                    print(f'Camera read failed while collecting {action} frame:{sequence}')
                    stop_requested = True
                    break

                # Flip horizontally (flipCode=1).
                frame = cv2.flip(frame, 1)

                # OpenCV delivers BGR; the detector is fed an RGB copy.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Detect the hand and draw it on the displayed frame.
                result = hands.process(image)
                if result.multi_hand_landmarks:
                    for landmark in result.multi_hand_landmarks:
                        mp_drawings.draw_landmarks(
                            frame, landmark, mp_hands.HAND_CONNECTIONS,
                            mp_drawings.DrawingSpec(color=(100, 131, 25), thickness=2, circle_radius=2),
                            mp_drawings.DrawingSpec(color=(22, 55, 200), thickness=3, circle_radius=2))

                # The first frame and the halfway frame are held on screen for
                # two seconds; the first-frame caption wins if both coincide.
                at_midpoint = sequence != 0 and sequence == sequence_length // 2
                if at_midpoint:
                    caption = 'Transition'
                else:
                    caption = f'Collecting for {action} frame:{sequence}'
                cv2.putText(frame, caption, (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0),
                            4, cv2.LINE_AA)
                cv2.imshow('frame', frame)
                if sequence == 0 or at_midpoint:
                    cv2.waitKey(2000)

                # Persist this frame's keypoints (an empty array when no hand was found).
                keypoints = extract_keypoints(result)
                np_path = os.path.join(data, action, str(sequence))
                np.save(np_path, keypoints)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    stop_requested = True
                    break

            # A plain `break` only leaves the inner loop; without this check
            # recording would simply continue with the next action.
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [72]:
# Class index for each action name, derived from `actions` so the mapping
# cannot drift out of sync with the list of recorded gestures.
label_map = {str(action): index for index, action in enumerate(actions)}

In [73]:
# Load every saved frame and keep only those holding exactly 21 landmark rows,
# together with the class index of the action they were recorded for.
window, labels = [], []
for action in actions:
    for sequence in range(sequence_length):
        sample_path = os.path.join(data, action, f'{sequence}.npy')
        res = np.load(sample_path)
        if len(res) != 21:
            continue
        window.append(res)
        labels.append(label_map[action])

In [74]:
# One-hot encode the class indices and stack the samples into a single array.
# num_classes is pinned to len(actions): to_categorical otherwise infers the
# width from the largest label present, which would shrink the encoding if the
# highest-indexed action had no usable samples.
# The ndim guard keeps the cell safe to re-run once `labels` is already one-hot.
labels = np.asarray(labels)
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=len(actions)).astype(int)
window = np.asarray(window)

In [75]:
# Number of samples that passed the 21-landmark filter.
len(labels)

1499

In [76]:
# Print the first one-hot label (prints nothing when there are no labels).
for first_label in labels[:1]:
    print(first_label)

[1 0 0]


In [77]:
# Hold out 20% of the samples for validation. The fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel restarts.
# NOTE(review): the samples are consecutive webcam frames, so a random split
# likely puts near-identical frames in both sets -- treat the held-out accuracy
# as optimistic until it is checked on a separately recorded session.
X_train, X_test, y_train, y_test = train_test_split(window, labels, test_size=0.20, random_state=42)

In [78]:
# Size of the training split.
len(X_train)

1199

In [79]:
# Wrap both splits as tf.data datasets yielding (points, one-hot label) pairs.
train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_test, y_test))
)

In [80]:
# Show one (points, label) pair from the training dataset.
for sample in train_dataset.take(1):
    print(*sample)

tf.Tensor(
[[ 8.22325230e-01  5.38365602e-01  2.61576616e-07]
 [ 7.75764465e-01  5.22445917e-01 -1.62287373e-02]
 [ 7.34960735e-01  4.82295632e-01 -2.46598721e-02]
 [ 7.08441257e-01  4.43525702e-01 -3.21732908e-02]
 [ 6.81060910e-01  4.21327233e-01 -4.01151702e-02]
 [ 7.48155296e-01  3.83358181e-01 -1.18763344e-02]
 [ 7.23598719e-01  3.22783887e-01 -2.55192965e-02]
 [ 7.10560143e-01  2.84206659e-01 -3.79504040e-02]
 [ 7.01931119e-01  2.49311119e-01 -4.75268848e-02]
 [ 7.78792202e-01  3.64237815e-01 -1.61658023e-02]
 [ 7.61977077e-01  2.90060490e-01 -2.67705806e-02]
 [ 7.53168166e-01  2.42945865e-01 -3.80699039e-02]
 [ 7.46467471e-01  2.02110603e-01 -4.71088998e-02]
 [ 8.09985459e-01  3.63802731e-01 -2.32575089e-02]
 [ 8.06159794e-01  2.92079836e-01 -3.63337807e-02]
 [ 8.05096745e-01  2.45436385e-01 -4.62223962e-02]
 [ 8.02924573e-01  2.04259083e-01 -5.36865965e-02]
 [ 8.40804100e-01  3.77762944e-01 -3.18750292e-02]
 [ 8.45517159e-01  3.23270559e-01 -4.38410901e-02]
 [ 8.48888040e-01  2

In [81]:
def augment(points, label):
    """Randomly perturb one training sample.

    Adds uniform noise in [-0.005, 0.005) to every coordinate and then
    shuffles `points` along its first axis. `label` is returned unchanged.

    The noise takes its shape and dtype from `points` itself, so the function
    also works for float32 inputs and inside `Dataset.map` when the static
    shape is not fully known.
    """
    # jitter points
    noise = tf.random.uniform(tf.shape(points), -0.005, 0.005, dtype=points.dtype)
    # shuffle points
    points = tf.random.shuffle(points + noise)
    return points, label

In [82]:
# Samples per batch for both pipelines.
batch_size = 128
# Training pipeline: full-buffer shuffle, per-sample augmentation, then batching.
train_ds = train_dataset.shuffle(len(train_dataset)).map(augment).batch(batch_size)
# Validation pipeline: batching only, no shuffling or augmentation.
val_ds = val_dataset.batch(batch_size)

In [83]:
# Number of training batches per epoch.
len(train_ds)

10

# Model Building

In [84]:
# Width of the softmax output: one unit per recorded action.
NUM_CLASSES = len(actions)

In [85]:
def conv_bn(x, filters):
    """Apply Conv1D (kernel size 1) -> BatchNormalization -> ReLU to `x`.

    `filters` is the number of output channels of the convolution.
    """
    convolved = layers.Conv1D(filters, kernel_size=1, padding='same')(x)
    normalized = layers.BatchNormalization(momentum=0.0)(convolved)
    return layers.Activation('relu')(normalized)

def dense_bn(x, filters):
    """Apply Dense -> BatchNormalization -> ReLU to `x`.

    `filters` is the number of units of the dense layer.
    """
    projected = layers.Dense(filters)(x)
    normalized = layers.BatchNormalization(momentum=0.0)(projected)
    return layers.Activation('relu')(normalized)

In [87]:
# T-net layers for pointnet
def tnet(inputs, num_features):
    """Predict a (num_features x num_features) transform and apply it to `inputs`.

    A small conv/dense stack with global max pooling regresses the matrix
    entries, which are reshaped and multiplied with `inputs` along the
    feature axis.

    Args:
        inputs: Keras tensor whose last axis has `num_features` entries.
        num_features: size of the square transform matrix.

    Returns:
        The transformed tensor produced by the `Dot` layer.
    """
    # Initialise the predicted transform as the identity matrix: a zero kernel
    # plus an identity bias makes the block pass `inputs` through unchanged at
    # the start of training instead of applying a random transform.
    identity_bias = keras.initializers.Constant(np.eye(num_features).flatten())
    # NOTE(review): the orthogonality regulariser of the original PointNet
    # recipe is not defined in this notebook, so it is still left out.

    x = conv_bn(inputs, 32)
    x = conv_bn(x, 64)
    x = conv_bn(x, 512)
    x = layers.GlobalMaxPooling1D()(x)
    x = dense_bn(x, 256)
    x = dense_bn(x, 128)
    x = layers.Dense(num_features * num_features,
                     kernel_initializer='zeros',
                     bias_initializer=identity_bias)(x)
    feat_T = layers.Reshape((num_features, num_features))(x)

    # Apply affine transformation to input features
    return layers.Dot(axes=(2, 1))([inputs, feat_T])

In [88]:
# Assemble the PointNet-style classifier over 21 landmarks x 3 coordinates.
inputs = keras.Input(shape=(21,3))

# Input transform followed by the first pointwise feature layers.
x = tnet(inputs, 3)
for width in (32, 32):
    x = conv_bn(x, width)

# Feature transform followed by the wider pointwise layers.
x = tnet(x, 32)
for width in (32, 64, 512):
    x = conv_bn(x, width)

# Pool over the landmark axis, then classify with two regularised dense blocks.
x = layers.GlobalMaxPooling1D()(x)
for width in (256, 128):
    x = dense_bn(x, width)
    x = layers.Dropout(0.3)(x)

outputs = layers.Dense(NUM_CLASSES, activation ='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name='pointnet')
model.summary()

Model: "pointnet"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 21, 3)]      0           []                               
                                                                                                  
 conv1d_44 (Conv1D)             (None, 21, 32)       128         ['input_5[0][0]']                
                                                                                                  
 batch_normalization_68 (BatchN  (None, 21, 32)      128         ['conv1d_44[0][0]']              
 ormalization)                                                                                    
                                                                                                  
 activation_68 (Activation)     (None, 21, 32)       0           ['batch_normalization_68[0

 activation_77 (Activation)     (None, 21, 512)      0           ['batch_normalization_77[0][0]'] 
                                                                                                  
 global_max_pooling1d_13 (Globa  (None, 512)         0           ['activation_77[0][0]']          
 lMaxPooling1D)                                                                                   
                                                                                                  
 dense_39 (Dense)               (None, 256)          131328      ['global_max_pooling1d_13[0][0]']
                                                                                                  
 batch_normalization_78 (BatchN  (None, 256)         1024        ['dense_39[0][0]']               
 ormalization)                                                                                    
                                                                                                  
 activatio

In [89]:
# TensorBoard logging into 'logdir'. Despite its name, `callbacks` holds this
# single callback, not a list.
callbacks = keras.callbacks.TensorBoard(log_dir='logdir')
# Stop training once val_loss has failed to improve for 2 epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=2)

In [90]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [91]:
# Train for at most 20 epochs; early stopping on val_loss may end the run sooner.
model.fit(train_ds, epochs=20, validation_data=val_ds, callbacks=[callbacks, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x1834a2c7940>

In [92]:
# Predicted action name for every held-out sample; show the first ten.
predictions = actions[model.predict(X_test).argmax(axis=1)]
predictions[:10]



array(['yes', 'yes', 'hello', 'iloveyou', 'yes', 'iloveyou', 'hello',
       'hello', 'yes', 'iloveyou'], dtype='<U8')

In [93]:
# Loss and accuracy on the held-out split.
model.evaluate(X_test, y_test)



[0.003853917121887207, 1.0]

In [94]:
# Ground-truth action names of the held-out samples, for comparison with the predictions.
actual = actions[y_test.argmax(axis=1)]
actual[:10]

array(['yes', 'yes', 'hello', 'iloveyou', 'yes', 'iloveyou', 'hello',
       'hello', 'yes', 'iloveyou'], dtype='<U8')

In [98]:
# Persist the trained model under 'hand_pose_model'.
model.save('hand_pose_model')



INFO:tensorflow:Assets written to: hand_pose_model\assets


INFO:tensorflow:Assets written to: hand_pose_model\assets


In [546]:
# Reload the saved model so the live demo below can run without retraining.
model = tf.keras.models.load_model('hand_pose_model')

In [100]:
# Live demo: classify the hand pose in every webcam frame until 'q' is pressed.
cap = cv2.VideoCapture(0)

try:
    with mp_hands.Hands(min_detection_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera busy or unplugged); `frame`
                # is None here and cv2.flip would raise.
                break

            # Flip horizontally (flipCode=1).
            frame = cv2.flip(frame, 1)

            # OpenCV delivers BGR; the detector is fed an RGB copy.
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            result = hands.process(image)

            # Only draw and classify when a hand was detected.
            if result.multi_hand_landmarks:
                for landmark in result.multi_hand_landmarks:
                    mp_drawings.draw_landmarks(
                        frame, landmark, mp_hands.HAND_CONNECTIONS,
                        mp_drawings.DrawingSpec(color=(100, 131, 25), thickness=2, circle_radius=2),
                        mp_drawings.DrawingSpec(color=(22, 55, 200), thickness=3, circle_radius=2))

                keypoints = extract_keypoints(result)

                # verbose=0: predict() otherwise prints a progress bar per frame.
                # Local names keep the `predictions` array of the evaluation
                # cell from being overwritten by this loop.
                class_probabilities = model.predict(tf.expand_dims(keypoints, 0), verbose=0)
                predicted_action = actions[np.argmax(class_probabilities)]
                cv2.putText(frame, str(predicted_action), (200, 150), cv2.FONT_HERSHEY_DUPLEX, 1, (255, 255, 0), 2)

            cv2.imshow('Hand tracking', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()











