In [1]:
import torch
import torch.nn as nn
import cv2
import mediapipe as mp
from model import SignDetection

In [2]:
import os
import numpy as np
from collections import deque

In [3]:
# Locations of the trained checkpoint and of the rendered output video.
MODEL_PATH = os.path.join('model', 'model')
RENDER_PATH = os.path.join('renders')

# Prefer the GPU, but fall back to the CPU so the checkpoint can still be
# loaded on a machine without CUDA instead of crashing on `.cuda()`.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): torch.load unpickles a complete model object, which can run
# arbitrary code -- only load checkpoints from a trusted source. The
# `SignDetection` import at the top of the notebook is presumably required so
# the pickled class can be resolved (TODO confirm).
gesture_model = torch.load(
    os.path.join(MODEL_PATH, 'model_2000.pt'),
    map_location=DEVICE,
).to(DEVICE)

# Inference only: switch layers such as dropout / batch-norm to evaluation mode.
gesture_model.eval()

In [4]:
# Human-readable gesture labels. A label's position in this list is used as
# the class index when decoding the model's prediction, so the order matters.
actions = [
    'I am OK',
    'STOP !',
    'Descend',
    'I am not OK',
    'Ascend',
]

In [5]:
# Short aliases for the MediaPipe solution modules used by the capture loop.
mp_holistic = mp.solutions.holistic    # face / pose tracker and its connection sets
mp_hands = mp.solutions.hands          # hand-landmark detector and HAND_CONNECTIONS
mp_draw = mp.solutions.drawing_utils   # draw_landmarks helper for the wireframe overlay

In [8]:
# Live gesture recognition: read webcam frames, extract hand landmarks with
# MediaPipe, classify a sliding window of frames with the gesture model,
# overlay the recognised label and record the annotated stream to a video file.
cap = cv2.VideoCapture(0)
frame_deque = deque(maxlen=20)       # sliding window of per-frame landmark vectors fed to the model
conclusion_deque = deque(maxlen=10)  # most recent predicted labels, used to steady the on-screen text

# draw_frames toggles the MediaPipe wireframe overlay (face, body and hands) on
# the video. It is purely cosmetic: the hand landmarks fed to the model are
# extracted either way, so it does not affect detection quality.
draw_frames = False

os.makedirs(RENDER_PATH, exist_ok=True)  # make sure the output directory exists before opening the writer
vid_filename = os.path.join(RENDER_PATH, 'render' + '.mp4')
fourcc_codec = cv2.VideoWriter_fourcc(*'MPEG')

# Some capture backends report 0 FPS; fall back to 30 so the writer gets a usable rate.
# The 0.8 factor is kept from the original code -- presumably it compensates for
# the per-frame processing time (TODO confirm).
fps = (cap.get(cv2.CAP_PROP_FPS) or 30.0) * 0.8

# Size the writer from the actual capture resolution: frames whose size differs
# from the writer's frame size are not encoded correctly. VideoWriter needs ints.
dimensions = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
if min(dimensions) <= 0:
    dimensions = (640, 480)  # camera did not report a resolution; keep the previous default
output = cv2.VideoWriter(vid_filename, fourcc_codec, fps, dimensions)

# Run inference on whatever device the model already lives on.
device = next(gesture_model.parameters()).device

try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands, \
            mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            isframe, frame = cap.read()
            if not isframe:
                # A failed read returns frame=None; stop instead of crashing in cvtColor.
                print('No frame')
                break

            # MediaPipe expects RGB input; OpenCV delivers BGR.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame.flags.writeable = False  # read-only while MediaPipe processes the frame
            grid = hands.process(frame)
            # The holistic result is only needed for the optional face/pose overlay.
            whole = holistic.process(frame) if draw_frames else None
            frame.flags.writeable = True

            grids = grid.multi_hand_landmarks
            if draw_frames:
                mp_draw.draw_landmarks(frame, whole.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
                mp_draw.draw_landmarks(frame, whole.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

            if grids:
                if draw_frames:
                    for landmark in grids:
                        mp_draw.draw_landmarks(frame, landmark, mp_hands.HAND_CONNECTIONS)

                hand1 = np.array([[lm.x, lm.y, lm.z] for lm in grids[0].landmark]).flatten()
                if len(grids) > 1:
                    hand2 = np.array([[lm.x, lm.y, lm.z] for lm in grids[1].landmark]).flatten()
                else:
                    hand2 = np.zeros((21, 3)).flatten()  # 21 landmarks each having x,y,z coordinates
                f_grid = np.hstack((hand1, hand2))
                frame_deque.append(f_grid)

                if len(frame_deque) == frame_deque.maxlen:
                    # LSTM model needs the input as a 3-D tensor - batch X frames X token_no. per frame
                    window = np.expand_dims(np.array(frame_deque), axis=0)
                    input_gest = torch.from_numpy(window).float().to(device)
                    with torch.no_grad():  # inference only; no autograd graph needed
                        predicted_label = gesture_model(input_gest)
                    label = actions[torch.argmax(predicted_label, dim=1).item()]
                    conclusion_deque.append(label)
                    # Highest raw model output; kept for inspection, not drawn on the frame.
                    prob = np.round(torch.max(predicted_label).cpu().detach().numpy(), 3)

                    # Only show the label when it is the most frequent prediction in the
                    # recent window. (np.unique(...)[0] alone is merely the alphabetically
                    # first label, not the dominant one.) Ties resolve to the first label
                    # in sorted order.
                    seen_labels, seen_counts = np.unique(list(conclusion_deque), return_counts=True)
                    text = label if label == seen_labels[np.argmax(seen_counts)] else ''
                    cv2.putText(frame, text, (60, 65), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 0), 2, cv2.LINE_AA)

            # Back to BGR for OpenCV's writer and display window.
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            output.write(frame)
            cv2.imshow('Output', frame)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera, finalise the video file and close the window,
    # even if the loop raised.
    cap.release()
    output.release()
    cv2.destroyAllWindows()

In [None]:
# Manual safety net: run this cell if the capture loop was interrupted, so the
# camera is freed, the recorded video file is finalised and the preview window
# is closed. Releasing an already-released capture or writer is harmless.
cap.release()
output.release()
cv2.destroyAllWindows()