In [1]:
!pip install tensorflow==2.8.0 tensorflow-gpu==2.8.0 opencv-python sklearn matplotlib




In [2]:
!pip install mediapipe



In [13]:
!pip list

Package                      Version
---------------------------- -------------------
absl-py                      1.4.0
asttokens                    2.2.1
astunparse                   1.6.3
attrs                        23.1.0
backcall                     0.2.0
cachetools                   5.3.1
certifi                      2023.5.7
cffi                         1.15.1
charset-normalizer           3.2.0
colorama                     0.4.6
comm                         0.1.3
contourpy                    1.1.0
cycler                       0.11.0
debugpy                      1.6.7
decorator                    5.1.1
executing                    1.2.0
flatbuffers                  23.5.26
fonttools                    4.41.0
gast                         0.5.4
google-auth                  2.22.0
google-auth-oauthlib         0.4.6
google-pasta                 0.2.0
grpcio                       1.56.0
h5py                         3.9.0
idna                         3.4
ipykernel                    6

You should consider upgrading via the 'C:\Users\SRIJAN\Dropbox\PC\Documents\Programing\Projects\Python\Deep_learning_proj\Sign language detection\signlanguage\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
import cv2
import numpy as np
from matplotlib import pyplot as plt
import os
import time 
import mediapipe as mp # mediapipe will give us the keypoints from our image


# Keypoints using MediaPipe Holistic

In [2]:
mp_holistic=mp.solutions.holistic #brings in our holistic model- used to make our detection
mp_drawing=mp.solutions.drawing_utils #drawing utilities- used to draw them

In [3]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one OpenCV frame.

    Args:
        image: Frame in BGR channel order (as read by OpenCV).
        model: Object exposing ``process(rgb_image)``, e.g. an open Holistic instance.

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR and
        whatever ``model.process`` returned for it.
    """
    # OpenCV delivers BGR, the model is fed RGB.
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The frame is read-only while the model runs and writable again afterwards
    # (MediaPipe's own examples do this as a performance hint).
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand a BGR frame back so it can be drawn on and shown with OpenCV.
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [4]:
def draw_landmarks(image, results):
    """Draw the face, pose and hand landmarks found in ``results`` onto ``image``.

    Args:
        image: BGR frame to draw on.
        results: Holistic output exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        None. The drawing calls are made directly on ``image``.
    """
    # Face contours use custom specs: the first DrawingSpec styles the landmark
    # dots, the second styles the connecting lines. Channel values are kept
    # within the valid 0-255 range.
    mp_drawing.draw_landmarks(
        image,
        results.face_landmarks,
        mp_holistic.FACEMESH_CONTOURS,
        mp_drawing.DrawingSpec(color=(80, 255, 121), thickness=1, circle_radius=1),
        mp_drawing.DrawingSpec(color=(110, 255, 121), thickness=1, circle_radius=1),
    )
    # Pose and both hands use MediaPipe's default drawing style.
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [22]:
# Live preview: read webcam frames, run Holistic on each one and show the drawn landmarks.
# Press "q" in the window to stop. `results` keeps the detection for the last processed frame.
cap = cv2.VideoCapture(0)  # device 0 is the default webcam
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered (camera busy or unplugged); stop instead of
                # passing an empty frame to the colour conversion.
                print('Could not read a frame from the webcam; stopping.')
                break

            # Make detections and draw them on the frame.
            image, results = mediapipe_detection(frame, holistic)
            draw_landmarks(image, results)

            cv2.imshow('Open Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame failed to process.
    cap.release()
    cv2.destroyAllWindows()

# Extract keypoints

In [29]:
def extract_keypoints(results):
    """Flatten one Holistic result into a single 1-D feature vector.

    Each landmark group is turned into a flat array of its coordinates. A group
    that is missing (falsy) is replaced by zeros of the expected length so the
    output length stays fixed:

    * pose:       33 landmarks * (x, y, z, visibility) = 132 values
    * face:       468 landmarks * (x, y, z)            = 1404 values
    * left hand:  21 landmarks * (x, y, z)             = 63 values
    * right hand: 21 landmarks * (x, y, z)             = 63 values

    Returns:
        ``np.ndarray`` ordered as pose, face, left hand, right hand.
    """

    def xyz_or_zeros(group, landmark_count):
        # Coordinates of a detected group, or a zero placeholder of the same size.
        if group:
            return np.array([[lm.x, lm.y, lm.z] for lm in group.landmark]).flatten()
        return np.zeros(landmark_count * 3)

    # Pose is the only group that also carries a visibility score per landmark.
    if results.pose_landmarks:
        pose = np.array(
            [[lm.x, lm.y, lm.z, lm.visibility] for lm in results.pose_landmarks.landmark]
        ).flatten()
    else:
        pose = np.zeros(33 * 4)

    lh = xyz_or_zeros(results.left_hand_landmarks, 21)
    rh = xyz_or_zeros(results.right_hand_landmarks, 21)
    face = xyz_or_zeros(results.face_landmarks, 468)

    return np.concatenate([pose, face, lh, rh])

In [10]:
# Sanity check: flatten the `results` left over from the last processed webcam frame.
result_test=extract_keypoints(results)

In [11]:
# Save the sample keypoint vector to disk; np.save appends ".npy", so this writes "0.npy".
np.save('0',result_test)


# Folders for collection

In [13]:
# Root folder for the exported keypoint data (one .npy file per frame).
data_path = 'MP_Data'

# Actions (signs) that the model will learn to detect.
actions = np.array(['hello', 'thanks', 'iloveyou'])

# Number of sequences (videos) recorded per action.
no_seq = 30

# Number of frames in each sequence.
seq_len = 30

# Key difference from object detection: action detection classifies a
# sequence of multiple frames rather than a single frame.

In [13]:
# Total number of values in the full dataset:
# 1662 values per frame * 30 frames per sequence * 30 sequences per action * 3 actions
1662*30*30*3

4487400

In [None]:
# Create one folder per action and sequence: MP_Data/hello/0 ... MP_Data/hello/29,
# and likewise for the other actions. Each sequence folder will later hold
# one .npy file per frame (30 frames per sequence).
for action in actions:
    for seq in range(no_seq):
        # exist_ok=True makes the cell safe to re-run, while real failures
        # (e.g. permission errors) still raise instead of being printed and ignored.
        os.makedirs(os.path.join(data_path, action, str(seq)), exist_ok=True)

# Collect keypoint values for training and testing

In [None]:
# Record the training data: for every action, capture `no_seq` sequences of `seq_len`
# webcam frames and save the extracted keypoints of each frame to
# MP_Data/<action>/<sequence>/<frame>.npy. Press "q" in the window to abort the whole run.
cap = cv2.VideoCapture(0)  # device 0 is the default webcam
stop_requested = False
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action in actions:
            for seq in range(no_seq):
                for frame_num in range(seq_len):
                    ret, frame = cap.read()
                    if not ret:
                        # Fail loudly: silently skipping frames would leave an incomplete dataset.
                        raise RuntimeError(
                            f'Could not read a webcam frame for {action}, sequence {seq}, frame {frame_num}')

                    # Make detections and draw them on the frame.
                    image, results = mediapipe_detection(frame, holistic)
                    draw_landmarks(image, results)

                    # Status overlay; the first frame of a sequence also gets a start banner.
                    if frame_num == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                        cv2.putText(image, f'Collecting frames for {action} video number {seq}', (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (125, 255, 178), 1, cv2.LINE_AA)
                    else:
                        cv2.putText(image, f'Collecting frames for {action} video number {seq}', (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2, cv2.LINE_AA)

                    # Show the frame before pausing so the banner is actually visible
                    # during the 2-second break between sequences.
                    cv2.imshow('Open Feed', image)
                    if frame_num == 0:
                        cv2.waitKey(2000)

                    # Export the keypoints of this frame.
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(data_path, action, str(seq), str(frame_num))
                    np.save(npy_path, keypoints)

                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break
                # Propagate the quit request out of the nested loops.
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

# Preprocess data and create labels and features

In [16]:
!pip install scikit-learn



In [9]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [14]:
# Map each action name to its integer class index, following the order of `actions`.
label_map = dict(zip(actions, range(len(actions))))

In [None]:
# Display the action -> class index mapping.
label_map

In [15]:
# Load every saved frame back from disk.
# `sequences` gets one entry per recorded sequence (a list of `seq_len` keypoint arrays);
# `labels` gets the matching integer class index for each sequence.
sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(data_path, action)
    for seq in range(no_seq):
        # One window = all frames of one sequence, in frame order.
        window = [
            np.load(os.path.join(action_dir, str(seq), f'{frame_num}.npy'))
            for frame_num in range(seq_len)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [16]:
# Model input (features, not labels): one entry per sequence, each holding its frames of keypoints.
# With the settings above: 90 sequences (3 actions * 30), 30 frames each, 1662 values per frame.
x=np.array(sequences)

In [17]:
# One-hot encode the integer class labels with Keras and cast the result to int.
y=to_categorical(labels).astype(int)

In [None]:
# One-hot rows follow the order of `actions`: [1,0,0] = hello, [0,1,0] = thanks, [0,0,1] = iloveyou.
y

In [18]:
# Hold out 5% of the sequences for testing.
# - random_state makes the split reproducible across kernel restarts.
# - stratify keeps the class proportions as even as possible in the very small test set;
#   it is given the integer class index recovered from the one-hot rows.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.05, random_state=42, stratify=np.argmax(y, axis=1))


# Train the LSTM NN

In [19]:
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.callbacks import TensorBoard,EarlyStopping

In [20]:
# Training callbacks.
# TensorBoard writes its event files under ./logs.
log_dir = 'logs'
tb_callback = TensorBoard(log_dir=log_dir)

# Stops training once the training categorical accuracy has not improved for 6 epochs.
early = EarlyStopping(patience=6, monitor='categorical_accuracy')

In [48]:
# Sequence classifier: three stacked LSTM layers followed by a small dense head.
# Input is one sequence of 30 frames with 1662 keypoint values per frame.
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per frame; the last LSTM returns only its final output.
# The softmax layer has one unit per action.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.018),
    Dense(actions.shape[0], activation='softmax'),
])




In [49]:
# Multi-class setup: Adam optimizer, categorical cross-entropy loss on the one-hot labels,
# and categorical accuracy as the reported metric.
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [50]:
# Train for up to 200 epochs, logging to TensorBoard.
# NOTE(review): the recorded run below was interrupted manually (KeyboardInterrupt after
# epoch 63), and the `early` EarlyStopping callback defined earlier is not passed here —
# confirm whether that is intended.
model.fit(X_train,y_train,epochs=200,callbacks=[tb_callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200


KeyboardInterrupt: 

In [25]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596,675
Trainable params: 596,675
Non-tr

# Make predictions


In [31]:
# Softmax outputs for the held-out sequences: one row per sample, one column per action.
res=model.predict(X_test)

In [34]:
# Display the predicted class probabilities.
res

array([[3.4855652e-01, 3.1455502e-01, 3.3688846e-01],
       [9.7081135e-04, 1.6120598e-02, 9.8290861e-01],
       [1.8294715e-05, 9.9941278e-01, 5.6892732e-04],
       [1.6121319e-05, 9.9933547e-01, 6.4838812e-04],
       [8.6418194e-01, 1.2694535e-01, 8.8727018e-03]], dtype=float32)

In [None]:
# Display the predicted class probabilities (repeat of the earlier display cell).
res

In [64]:
# Predicted action for test sample 4: the action with the highest probability.
actions[np.argmax(res[4])]

'hello'

In [46]:
# Shape of the held-out labels: (number of test samples, number of actions).
y_test.shape

(5, 3)

In [63]:
# Ground-truth action for test sample 4, decoded from its one-hot label.
actions[np.argmax(y_test[4])]

'hello'

#### Save the model

In [51]:
# Save the trained model to an HDF5 file so it can be reloaded without retraining.
model.save('action2.h5')

In [6]:
import tensorflow as tf

In [52]:
# Reload the saved model from disk; evaluation and real-time detection use this copy.
action_model = tf.keras.models.load_model('action2.h5')



# Evaluate with a confusion matrix and accuracy

In [21]:
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score

In [68]:
# Class probabilities for the held-out sequences, predicted by the reloaded model.
yhat=action_model.predict(X_test)

In [69]:
# Convert one-hot labels and predicted probabilities to integer class indices.
# The predictions are recomputed here rather than overwriting `yhat` in place from its
# previous value, so this cell gives the same result however many times it is run.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(action_model.predict(X_test), axis=1).tolist()

In [32]:
# One 2x2 confusion matrix per class, in class-index order (hello, thanks, iloveyou).
# scikit-learn lays each matrix out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(ytrue,yhat)

array([[[3, 0],
        [2, 0]],

       [[0, 4],
        [0, 1]],

       [[3, 0],
        [2, 0]]], dtype=int64)

In [70]:
# Fraction of held-out sequences whose predicted class matches the true class.
accuracy_score(ytrue,yhat)

0.6

# Test in real time

In [53]:
# Real-time detection: keep a sliding window of the most recent frames, classify it with
# the trained model and show the recognised actions as a running sentence.
# Press "q" in the window to stop.
sequence = []      # keypoint vectors of the most recent frames, oldest first
sentence = []      # recognised actions, most recent last
threshold = 0.73   # minimum probability for a prediction to be accepted
window_size = 30   # must match the sequence length the model was trained on

cap = cv2.VideoCapture(0)  # device 0 is the default webcam
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of processing an empty frame.
                print('Could not read a frame from the webcam; stopping.')
                break

            # Make detections and draw them on the frame.
            image, results = mediapipe_detection(frame, holistic)
            draw_landmarks(image, results)

            # Append the newest frame at the end so the window is in chronological
            # order, the same order the training sequences were loaded in.
            sequence.append(extract_keypoints(results))
            sequence = sequence[-window_size:]

            # Predict only once a full window of real frames is available, rather
            # than padding the window with copies of the first frame.
            if len(sequence) == window_size:
                res = action_model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)

                # Accept confident predictions and skip immediate repeats.
                if res[best] > threshold:
                    predicted = actions[best]
                    if not sentence or predicted != sentence[-1]:
                        sentence.append(predicted)

                # Show at most the five most recent actions.
                sentence = sentence[-5:]

            # Banner with the recognised sentence (drawn 640 px wide — presumably the
            # frame width; confirm for other camera resolutions).
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('Open Feed', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()