# 1. Importing and Installing dependencies

In [1]:
!pip install tensorflow==2.13.0 opencv-python mediapipe scikit-learn matplotlib 
!pip install pyttsx3

import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import time
from mediapipe.python.solutions.face_mesh_connections import FACEMESH_CONTOURS
from mediapipe.python.solutions.face_mesh_connections import FACEMESH_TESSELATION





# 2. Using mp Holistic to capture keypoints

In [2]:
mp_holistic=mp.solutions.holistic #out of several models like hand, face mesh, iris we are choosing the holistic model
mp_drawing=mp.solutions.drawing_utils #Drawing utilities

def mediapipe_detection(image,model):#format for image is rgb
    image=cv.cvtColor(image,cv.COLOR_BGR2RGB) #convert image from bgr to rgb
    image.flags.writeable=False                #image is no longer writeable
    results=model.process(image)               #make prediction
    image.flags.writeable=True                 #now img is writeable
    image=cv.cvtColor(image,cv.COLOR_RGB2BGR)#change color to original format
    return image, results
    
def draw_styled_landmarks(image,results):
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_holistic.FACEMESH_CONTOURS,
                             mp_drawing.DrawingSpec(color=(80,110,10),thickness=1,circle_radius=1),
                             mp_drawing.DrawingSpec(color=(80,256,121),thickness=1,circle_radius=1) 
                             )
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10),thickness=2,circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121),thickness=2,circle_radius=2) 
                             )
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76),thickness=2,circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250),thickness=2,circle_radius=2)
                             )
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66),thickness=2,circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230),thickness=2,circle_radius=2)
                             )
    


# 3. Extract key points

In [3]:
def extract_keyPoints(results):
    pose=np.array([[ res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh=np.array([[res.x,res.y,res.z]for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh=np.array([[res.x,res.y,res.z]for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    face=np.array([[res.x,res.y,res.z]for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    return np.concatenate([pose,face,lh,rh])


# 4. Setting up folders for collection of key points

In [4]:
DATA_PATH=os.path.join('MP_Data')#path for exported data
actions=np.array(['hello','thanks','sorry'])#actions that we are detecting
no_seq=30 #thirty videos data
seq_len=30 #each vid has 30 frames 
for action in actions:
    for seq in range(no_seq):
        try:
            os.makedirs(os.path.join(DATA_PATH,action,str(seq)))
            #print(f"Created directory: ")
        except:
            pass
            #print(f"Directory already exists: ")

# 5. Collect keypoint values for training & testing

# 6. Preprocess Data & Create Labels and Features

In [5]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
label_map={label:num for num,label in enumerate(actions)}
sequences, labels=[],[]
for action in actions:
    for seq in range(no_seq):
        window=[]
        for frame_num in range(seq_len):
            res=np.load(os.path.join(DATA_PATH,action,str(seq),"{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

X=np.array(sequences)
y=to_categorical(labels).astype(int)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.05)


label_map

{'hello': 0, 'thanks': 1, 'sorry': 2}

In [6]:
cap.release() 
cv.destroyAllWindows()

cap.release()
cv.destroyAllWindows()

# 7. Build and train LSTM neural network

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model
log_dir=os.path.join('Logs')
tb_callback=TensorBoard(log_dir=log_dir)
model= Sequential()
model.add(LSTM(64,return_sequences=True,activation='relu',input_shape=(30,1662)))
model.add(LSTM(128,return_sequences=True,activation='relu'))
model.add(LSTM(64,return_sequences=False,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(actions.shape[0],activation='softmax'))
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['categorical_accuracy'])
#model.fit(X_train, y_train,epochs=2000,callbacks=[tb_callback])
model = load_model('action.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 64)            442112    
                                                                 
 lstm_1 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596675 (2.28 MB)
Trainable params: 596675 

# 8. Make Predictions

In [7]:
res= model.predict(X_test)
actions[np.argmax(res[2])]



'thanks'

In [22]:
actions[np.argmax(y_test[2])]

'thanks'

### we can see that prediction is matching 

In [8]:
actions[np.argmax(res[2])]==actions[np.argmax(y_test[2])]

True

# 10. Evaluation using Confusion Matrix & Accuracy

In [7]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
yhat=model.predict(X_train)
ytrue=np.argmax(y_train,axis=1).tolist()
yhat=np.argmax(yhat,axis=1).tolist()
accuracy_score(ytrue,yhat)*100



91.76470588235294

# 11. Real time 

In [8]:
import pyttsx3

# Initialize the TTS engine
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # Speed of speech (words per minute)
engine.setProperty('volume', 1.0)  # Volume level (0.0 to 1.0)


In [9]:
sequence=[]
sentence=[]
predictions=[]
threshold=0.75

cap=cv.VideoCapture(0)#substitute 0 with video if instead of camera you want to use video
with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        #read frame
        ret,frame=cap.read() #reads image in bgr format

        #make detection
        image,results=mediapipe_detection(frame, holistic)
        #print(results)
        #draw landmarks
        draw_styled_landmarks(frame,results)
        
        keypoints=extract_keyPoints(results)
        sequence.append(keypoints)
        sequence=sequence[-30:]
        
        if len(sequence)==30:
            res=model.predict(np.expand_dims(sequence,axis=0))[0]
                #print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
        if len(predictions)>10:
            if np.unique(predictions[-10:])[0]==np.argmax(res):
                if res[np.argmax(res)]>threshold:
                    if len(sentence)>0:
                        if actions[np.argmax(res)]!=sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                            engine.say(sentence[-1])
                            engine.runAndWait()
                    else:
                        sentence.append(actions[np.argmax(res)])
                        engine.say(sentence[-1])
                        engine.runAndWait()
                        
        
                    
        if len(sentence)>5:
            sentence=sentence[-5:]
            
        #image=prob_viz(res,actions,frame,colors)
        
        cv.rectangle(frame,(0,0),(640,40),(245,117,16),-1)
        cv.putText(frame,' '.join(sentence),(3,30),cv.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv.LINE_AA)
        engine.say
                    
                    
        #show to screen with name opencv feed
        cv.imshow('OpenCV feed',frame)
        if cv.waitKey(10)& 0xFF==ord('q'):
            break
    cap.release()
    cv.destroyAllWindows()

