In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_holistic = mp.solutions.holistic
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [4]:
def mp_detect(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False                  
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [5]:
def mp_draw(image, results):
    mp_drawing.draw_landmarks(
        image,
        results.face_landmarks,
        mp_holistic.FACEMESH_CONTOURS,
        landmark_drawing_spec=None,
        connection_drawing_spec=mp_drawing_styles
        .get_default_face_mesh_contours_style())
    mp_drawing.draw_landmarks(
        image,
        results.pose_landmarks,
        mp_holistic.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles
        .get_default_pose_landmarks_style())
    mp_drawing.draw_landmarks(
        image,
        results.right_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles
        .get_default_hand_landmarks_style())
    mp_drawing.draw_landmarks(
        image,
        results.left_hand_landmarks,
        mp_holistic.HAND_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles
        .get_default_hand_landmarks_style())

In [6]:
MP_POSE_KPS = 33
MP_HAND_KPS = 21

def mp_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() \
    if results.pose_landmarks \
    else np.zeros(MP_POSE_KPS*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() \
    if results.left_hand_landmarks \
    else np.zeros(MP_HAND_KPS*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() \
    if results.right_hand_landmarks \
    else np.zeros(MP_HAND_KPS*3)
    return np.concatenate([pose, lh, rh])

In [15]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('dataset') 

# Actions that we try to detect
actions = np.array(['hello', 'my', 'name', 'I', 'learning', 'sign', 'friend', 'iloveyou'])

# Thirty videos worth of data
no_sequences = 20

# Videos are going to be 30 frames in length
sequence_length = 30

last_file_number = 20
file_nums = range(last_file_number, last_file_number+no_sequences)

In [42]:
for action in actions: 
    for sequence in file_nums:
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [43]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # NEW LOOP
    # Loop through actions
    for action in actions:
        # Loop through sequences aka videos
        for sequence in file_nums:
            # Loop through video length aka sequence length
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()

                # Make detections
                image, results = mp_detect(frame, holistic)

                # Draw landmarks
                mp_draw(image, results)
                
                # NEW Apply wait logic
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(2000)
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                
                # NEW Export keypoints
                keypoints = mp_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Break gracefully
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [16]:
label_map = {label:num for num, label in enumerate(actions)}

In [31]:
sequences, labels = [], []
for word in os.listdir(DATA_PATH):
    word_dir = os.path.join(DATA_PATH,word)
    for sequence in os.listdir(word_dir):
        sequence_dir = os.path.join(word_dir,sequence)
        window = []
        if int(sequence) in range(2):
            continue
        for frame_num in os.listdir(sequence_dir):
            file = os.path.join(sequence_dir, frame_num)
            res = np.load(file)
            window.append(res)
        print(len(window))
        sequences.append(window)
        labels.append(label_map[word])

30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
0
0
0
0
0
0
0
0
0
30
0
0
0
0
0
0
0
0
0
0
30
0
30
30
30
30
30


In [32]:
sequences, labels = [], []
for action in actions:
    for sequence in range(2,no_sequences):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [33]:
X = np.array(sequences)
Y = to_categorical(labels).astype(int)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05)

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [35]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [36]:
def create_lstm_model():
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, activation='tanh', input_shape=(X.shape[1],X.shape[-1])))
    model.add(LSTM(128, return_sequences=True, activation='tanh'))
    model.add(LSTM(64, return_sequences=False, activation='tanh'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(actions.shape[0], activation='softmax'))
    return model
model = create_lstm_model()

In [37]:
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(136, 30, 258) (8, 30, 258) (136, 8) (8, 8)


In [87]:
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(269, 30, 258) (15, 30, 258) (269, 8) (15, 8)


In [88]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [89]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_18 (LSTM)              (None, 30, 64)            82688     
                                                                 
 lstm_19 (LSTM)              (None, 30, 128)           98816     
                                                                 
 lstm_20 (LSTM)              (None, 64)                49408     
                                                                 
 dense_18 (Dense)            (None, 64)                4160      
                                                                 
 dense_19 (Dense)            (None, 32)                2080      
                                                                 
 dense_20 (Dense)            (None, 8)                 264       
                                                                 
Total params: 237,416
Trainable params: 237,416
Non-tr

In [90]:
model.fit(X_train, Y_train, epochs=160, callbacks=[tb_callback]);

Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160

KeyboardInterrupt: 

In [35]:
# model.save('best.h5')

In [2]:
from tensorflow import keras
model = keras.models.load_model('best.h5')

In [7]:
from collections import deque
# 1. New detection variables
sequence = []
sentence = []
predictions = deque(maxlen=10)
threshold = 0.8

out = cv2.VideoWriter('outpy.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 10, (640,480))
cap = cv2.VideoCapture(0)
# Set mediapipe model 
actions = np.array(['hello', 'my', 'sign', 'I', 'learning', 'friend', 'sign',  'I love you!'])
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
        print(frame.shape)

        # Make detections
        image, results = mp_detect(frame, holistic)
        
        # Draw landmarks
        mp_draw(image, results)
        
        # 2. Prediction logic
        keypoints = mp_keypoints(results)
#         sequence.insert(0,keypoints)
#         sequence = sequence[:30]
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            cache = np.unique(predictions)
            if len(cache) < 2 and cache[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold: 
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

                if len(sentence) > 5: 
                    sentence = sentence[-5:]

            # Viz probabilities
            # image = prob_viz(res, actions, image, colors)

        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        out.write(image)
        # Show to screen
        cv2.imshow('MediaPipe Holistic', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    out.release()
    cv2.destroyAllWindows()

(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
I
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 640, 3)
hello
(480, 