**1-Import and Install Dependencies**

In [4]:
%config IPCompleter.greedy=True

In [5]:
# %pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe scikit-learn matplotlib

In [6]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp 
from mediapipe.python.solutions import drawing_utils as mp_drawing         #Drawing utilities
from mediapipe.python.solutions import holistic as mp_holistic             #Holistic model
from mediapipe.python.solutions import hands as mp_hands                   # Hand info
from mediapipe.python.solutions import face_mesh  as mp_face_mesh          # Face info
from mediapipe.python.solutions import pose as mp_pose                     # Pose info

**2-Keypoints using MP Holistic**

In [7]:
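# Equivalent access through the top-level package (the explicit imports in section 1 are used instead):
#mp_holistic = mp.solutions.holistic #Holistic model
#mp_drawing = mp.solutions.drawing_utils #Drawing utilities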

In [8]:
def mediapipe_detection(image, model):
    """Run ``model`` on one BGR frame and return the frame together with the results.

    Args:
        image: frame in OpenCV's BGR channel order.
        model: object exposing ``process(rgb_image)``.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is the frame after a
        BGR -> RGB -> BGR round trip and ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)    # the model is fed RGB
    rgb_frame.flags.writeable = False                     # read-only while the model runs
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True                      # writeable again
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [9]:
def drawing_the_landmarks(image , results):
    """Draw the face, pose, right-hand and left-hand landmarks of ``results`` on ``image``.

    Every group is passed to ``mp_drawing.draw_landmarks`` with the same two
    drawing specs. Nothing is returned; the drawing is expected to happen on
    ``image`` itself.
    """
    # (attribute of `results`, connection set), in the order they are drawn.
    landmark_groups = (
        ('face_landmarks', mp_face_mesh.FACEMESH_CONTOURS),
        ('pose_landmarks', mp_pose.POSE_CONNECTIONS),
        ('right_hand_landmarks', mp_hands.HAND_CONNECTIONS),
        ('left_hand_landmarks', mp_hands.HAND_CONNECTIONS),
    )
    for attr_name, connections in landmark_groups:
        landmark_spec = mp_drawing.DrawingSpec(color=(0, 70, 255), thickness=1, circle_radius=1)
        connection_spec = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=1, circle_radius=1)
        mp_drawing.draw_landmarks(image, getattr(results, attr_name), connections,
                                  landmark_spec, connection_spec)

In [10]:
# Live preview: show the webcam feed with the detected landmarks until 'q' is pressed.
cap = cv2.VideoCapture(0)                                  # Open the default webcam
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():                              # as long as the cam is open
            ret, frame = cap.read()                        # Read the cam feed
            if not ret:
                # No frame was delivered (camera busy or unplugged). Stop here
                # instead of passing None to cv2.flip, which would raise.
                break
            frame = cv2.flip(frame, 1)                     # Mirror the frame horizontally
            image, results = mediapipe_detection(frame, holistic)  # Make detection

            drawing_the_landmarks(image, results)
            cv2.imshow('OpenCV Feed', image)               # Show feed
            if cv2.waitKey(10) & 0xFF == ord('q'):         # press 'q' to end feed
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

**3-Extract Keypoint Values**

In [11]:
def Extract_Keypoints(results):
    """Flatten one frame of holistic results into a fixed-length feature vector.

    Layout of the returned 1-D array (1662 values in total):
        pose        33 landmarks x (x, y, z, visibility) = 132
        face       468 landmarks x (x, y, z)             = 1404
        left hand   21 landmarks x (x, y, z)             = 63
        right hand  21 landmarks x (x, y, z)             = 63

    A group that was not detected is replaced by zeros of that group's own
    size, so the vector length does not depend on what was detected.
    (The zero-fill sizes for face and pose used to be swapped, which produced
    a vector of the wrong length whenever only one of the two was missing.)

    Args:
        results: object with ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes,
            each either falsy or exposing a ``.landmark`` iterable.

    Returns:
        ``numpy.ndarray`` with pose, face, left-hand and right-hand values
        concatenated in that order.
    """
    def _flatten(landmark_list, n_landmarks, fields):
        # Detected: one row per landmark, flattened. Missing: zeros of the same length.
        if landmark_list:
            return np.array([[getattr(point, field) for field in fields]
                             for point in landmark_list.landmark]).flatten()
        return np.zeros(n_landmarks * len(fields))

    xyz = ('x', 'y', 'z')
    Pose_marks = _flatten(results.pose_landmarks, 33, xyz + ('visibility',))
    Face_marks = _flatten(results.face_landmarks, 468, xyz)
    L_Hand_marks = _flatten(results.left_hand_landmarks, 21, xyz)
    R_Hand_marks = _flatten(results.right_hand_landmarks, 21, xyz)
    return np.concatenate([Pose_marks, Face_marks, L_Hand_marks, R_Hand_marks])

**4-Setup Folders for Collection**

In [12]:
# Root folder of the exported NumPy keypoint arrays.
DATA_PATH = os.path.join('MP_data')

# Actions (gesture classes) that we try to detect.
actions = np.array(['Hello', 'Good', 'Excellent'])

# Number of recorded videos per action.
no_sequences = 30

# Number of frames in every recorded video.
sequences_lenght = 30

In [13]:
# One-time setup, left disabled: creates DATA_PATH/<action>/<sequence>/ for every
# action and sequence number. Uncomment and run once before collecting data.
# NOTE(review): the bare `except: pass` hides every error, not only "folder already
# exists"; `os.makedirs(..., exist_ok=True)` would be a narrower alternative.
# for action in actions:
#     for sequence in range(no_sequences):
#         try:
#             os.makedirs(os.path.join(DATA_PATH,action,str(sequence)))
#         except:
#             pass

**5-Collect Keypoint Values for Training and Testing**

In [14]:
# Data collection, left disabled: for every action it records `no_sequences` videos of
# `sequences_lenght` frames and saves each frame's keypoint vector with np.save under
# DATA_PATH/<action>/<sequence>/<frame_num>.
# NOTE(review): if this cell is re-enabled, `ret` is not checked before cv2.flip, and the
# 'q' `break` only leaves the innermost frame loop, so collection continues with the next sequence.
# cap = cv2.VideoCapture(0)                             # Open our Webcam

# with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic :  # Set mediapipe model

#     for action in actions:                                         # for the Action named ...
#         for sequence in range(no_sequences):                       # for the sequence number ...
#             for frame_num in range(sequences_lenght):              # for the frame number ...


#                 ret, frame = cap.read()                            # Read the cam feed
#                 frame = cv2.flip(frame,1)                          # Flip the cam feed on the y axis

#                 image , results = mediapipe_detection(frame,holistic) # Make detection of the pose, face, and hands coordinations

#                 drawing_the_landmarks(image , results)                # Draw the marks of the detection




#                 #Wait for sometime at the start of every new sequence

#                 if frame_num == 0:
#                     cv2.putText(image,'Starting Collection', (120,200),
#                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0),4,cv2.LINE_AA)
#                     cv2.putText(image,'Collecting Frames for {} Video Number {}'.format(action,sequence), (15,12),
#                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255),1,cv2.LINE_AA)
#                     cv2.imshow('OpenCV Feed',image)                    # Show feed
#                     cv2.waitKey(2000)
#                 else:

#                     cv2.putText(image,'Collecting Frames for {} Video Number {}'.format(action,sequence), (15,12),
#                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255),1,cv2.LINE_AA)
#                     cv2.imshow('OpenCV Feed',image)                    # Show feed

#                 #Extract and Save the landmarks as numpy arrays
#                 Key_Points_Export = Extract_Keypoints(results)
#                 npy_Path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
#                 np.save(npy_Path, Key_Points_Export)




#                 if cv2.waitKey(10) & 0xFF == ord('q'):             #press 'q' to end feed
#                     break

#     cap.release()                                          #End Cap
#     cv2.destroyAllWindows()  

**6-Preprocess Data and Create Labels and Features**

In [15]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [16]:
# Map every action name to its integer class index (its position in `actions`).
Label_map = dict(zip(actions, range(len(actions))))

In [17]:
# Display the action-name -> class-index mapping.
Label_map

{'Hello': 0, 'Good': 1, 'Excellent': 2}

In [18]:
# Load the saved keypoints: `sequences` gets one list of per-frame vectors per video,
# `Label` gets the class index of the action that video belongs to.
sequences, Label = [], []
for action in actions:
    for sequence in range(no_sequences):
        video_dir = os.path.join(DATA_PATH, action, str(sequence))
        frames = [
            np.load(os.path.join(video_dir, '{}.npy'.format(frame_num)))
            for frame_num in range(sequences_lenght)
        ]
        sequences.append(frames)
        Label.append(Label_map[action])

In [19]:

# Sanity check of the loaded data: (videos, frames per video, values per frame).
np.array(sequences).shape

(90, 30, 1662)

In [20]:
# One integer class label per loaded video.
np.array(Label).shape

(90,)

In [21]:
# Stack the loaded videos into the feature array used for training and testing.
x = np.array(sequences)
x.shape

(90, 30, 1662)

In [22]:
# One-hot encode the integer class labels and store them as integers.
y = to_categorical(Label).astype(int)

In [23]:
# Expect (number of videos, number of actions).
y.shape

(90, 3)

In [24]:
# Hold out 5% of the videos for testing.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the same split
# (and therefore the same evaluation numbers). stratify keeps the class proportions in
# the very small test set, so no action is left out of it by chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.05, random_state=42, stratify=Label
)

In [25]:
# Size of the training split: (training videos, number of actions).
y_train.shape

(85, 3)

**7-Build and Train LSTM Neural Network**

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Dense
from tensorflow.keras.callbacks import TensorBoard

In [27]:
# TensorBoard callback pointed at the 'Logs' folder; it is passed to model.fit below.
logging_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir = logging_dir)


In [28]:
# Classifier: three stacked LSTM layers over the (30 frames, 1662 values) input,
# followed by two dense layers and a softmax with one output per action.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),   # only the final state goes on
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [29]:
# Take the optimizer from the same tf.keras namespace as the model's layers
# (`tensorflow.keras.models` / `tensorflow.keras.layers`). Mixing in the standalone
# `keras` package can fail on TensorFlow versions where the two are separate packages.
from tensorflow import keras
base_lr = 0.005
# SGD with momentum; global_clipnorm rescales the gradients whenever their global norm
# exceeds 10. NOTE(review): presumably kept to stop the ReLU LSTM stack from diverging.
optimizer = keras.optimizers.SGD(
    learning_rate=base_lr, momentum=0.85, global_clipnorm=10.0
)

In [30]:
# Multi-class setup: categorical cross-entropy on the one-hot targets, tracking categorical accuracy.
model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['categorical_accuracy'])

In [None]:
# Train for 300 epochs on the training split; tb_callback receives the training logs.
model.fit(x_train,y_train,epochs=300,callbacks=[tb_callback])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

**8-Make Predictions**

In [None]:
# Predict on the held-out videos; one row of model outputs per test video.
res_mod = model.predict(x_test)

In [None]:
# Spot check: predicted action for the test video at index 2 (an arbitrary sample).
actions[np.argmax(res_mod[2])]

In [None]:
# True action for the same test video (index 2), to compare with the prediction above.
actions[np.argmax(y_test[2])]

**9-Save the Model and Reload Its Weights**

In [None]:
# Save the trained model to 'Action.h5' in the working directory.
model.save('Action.h5')

In [31]:
# Load the weights stored in 'Action.h5' into the model built above.
# NOTE(review): presumably this lets the notebook continue without re-running training - confirm.
model.load_weights('Action.h5')

**10-Evaluation using Confusion Matrix and Accuracy**

In [32]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [33]:
# Reduce the one-hot targets and the model outputs to plain class-index lists.
ytrue = np.argmax(y_test, axis=1).tolist()
ytry = np.argmax(model.predict(x_test), axis=1).tolist()



In [34]:
# One 2x2 confusion matrix per class, computed from the true and predicted class indices.
multilabel_confusion_matrix(ytrue,ytry)

array([[[3, 0],
        [0, 2]],

       [[3, 0],
        [0, 2]],

       [[4, 0],
        [0, 1]]], dtype=int64)

In [35]:
# Fraction of test videos whose predicted class matches the true class.
accuracy_score(ytrue,ytry)

1.0

**11-Test in Real Time**

In [36]:
# Bar colors for the probability overlay; reused cyclically when there are more classes than colors.
colors = [(245,117,16),(117,245,16),(16,117,145)]
def prob_viz(res, actions, input_frame, colors):
    """Draw one horizontal probability bar with its label per class on a copy of the frame.

    Args:
        res: iterable of per-class probabilities; a probability of 1.0 gives a 100 px bar.
        actions: class labels, indexed like ``res``.
        input_frame: frame to annotate; it is copied, not modified.
        colors: bar colors. When there are more classes than colors the list
            is cycled (previously this raised ``IndexError``).

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num * 40                      # 40 px row per class, starting at y = 60
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    return output_frame

In [37]:
# Real-time recognition: keep a rolling window of keypoint vectors, classify it on every
# frame, and append an action to the on-screen sentence once the prediction is stable.
Sequence = []          # rolling window of the latest per-frame keypoint vectors
sentence = []          # accepted actions shown in the top banner (last 5 kept)
predictions = []       # class indices of the most recent window predictions
threshold = 0.6        # minimum class probability for a prediction to be accepted
stable_window = 10     # number of recent predictions that must agree

cap = cv2.VideoCapture(0)                                  # Open our Webcam
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():                              # as long as the cam is open
            ret, frame = cap.read()                        # Read the cam feed
            if not ret:
                # No frame delivered: stop instead of passing None to cv2.flip.
                break
            frame = cv2.flip(frame, 1)
            image, results = mediapipe_detection(frame, holistic)  # Make detection

            keypoints = Extract_Keypoints(results)
            Sequence.append(keypoints)
            # The window length must match the number of frames per training video.
            Sequence = Sequence[-sequences_lenght:]
            if len(Sequence) == sequences_lenght:
                Res = model.predict(np.expand_dims(Sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(Res))
                predictions.append(best)
                predictions = predictions[-stable_window:]  # bounded history

                # Accept only when every recent prediction is the same class. The old
                # check `np.unique(...)[0] == argmax` compared against the smallest
                # recent class index, so it did not test for agreement.
                if len(set(predictions)) == 1 and Res[best] > threshold:
                    if not sentence or actions[best] != sentence[-1]:
                        sentence.append(actions[best])
                sentence = sentence[-5:]

                image = prob_viz(Res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640,40), (245,117,16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)               # Show feed
            if cv2.waitKey(10) & 0xFF == ord('q'):         # press 'q' to end feed
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()




In [None]:
# Manual cleanup cell: releases the webcam and closes any OpenCV window.
# NOTE(review): presumably kept for the case where a capture loop above was interrupted - confirm.
cap.release()                                          #End Cap
cv2.destroyAllWindows()