### Problem Statement

### Import and install dependencies

In [None]:
!pip install tensorflow opencv-python mediapipe scikit-learn matplotlib

In [None]:
import time
import numpy as np
from matplotlib import pyplot as plt
import os#Work with filepaths
import cv2#Open Cv
import mediapipe as mp

### Extracting keypoints using MediaPipe Holistic

In [None]:
#Setting up MediaPipe Holistic
#The holistic module is used to make the detections,
#the drawing utilities are used to draw the detected points.
#Both are bound to short names so the cells below can access them easily.
mp_holistic=mp.solutions.holistic#Holistic model
mp_drawing=mp.solutions.drawing_utils#Drawing utilities

In [None]:
#Run a MediaPipe model on a single OpenCV frame
def mediapipe_detection(image,model):
    """Run ``model`` on one frame and return the frame together with the detections.

    Args:
        image: a frame as delivered by OpenCV, i.e. in BGR channel order.
        model: object exposing ``process(image)``, e.g. the holistic model.

    Returns:
        Tuple ``(image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    #The model is fed an RGB copy of the frame
    rgb_frame=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    #The RGB frame is flagged read-only while the model processes it
    rgb_frame.flags.writeable=False
    results=model.process(rgb_frame)
    #Back to BGR so the caller can keep using OpenCV drawing/display functions
    bgr_frame=cv2.cvtColor(rgb_frame,cv2.COLOR_RGB2BGR)
    bgr_frame.flags.writeable=True
    return bgr_frame,results

In [None]:
#Draw every detected landmark group onto the image with the default style
def draw_landmarks(image,results):
    """Draw face, pose and both hand landmark groups of ``results`` onto ``image``.

    Args:
        image: frame to draw on.
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    #Each entry pairs a landmark group with the connection set that says
    #which landmarks are linked to which other landmarks
    landmark_groups=(
        (results.face_landmarks,mp_holistic.FACEMESH_TESSELATION),
        (results.pose_landmarks,mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks,connections in landmark_groups:
        mp_drawing.draw_landmarks(image,landmarks,connections)

In [None]:
#Inspect the face-mesh connection set that the drawing helpers pass to draw_landmarks
mp_holistic.FACEMESH_TESSELATION

In [None]:
#Formatted draw style landmarks
def draw_styled_landmarks(image,result):
    """Draw face, pose and hand landmarks on ``image`` with one colour scheme per group.

    The landmarks are read from the ``result`` argument (not from a global), so the
    function works no matter what the caller's variable is called.

    Args:
        image: frame to draw on.
        result: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    spec=mp_drawing.DrawingSpec
    #(landmarks, connections, landmark style, connection style) for each group
    styled_groups=(
        #Face (colour components are kept within the 8-bit range 0-255)
        (result.face_landmarks,mp_holistic.FACEMESH_TESSELATION,
         spec(color=(80,110,10),thickness=1,circle_radius=1),
         spec(color=(80,255,121),thickness=1,circle_radius=1)),
        #Pose
        (result.pose_landmarks,mp_holistic.POSE_CONNECTIONS,
         spec(color=(80,22,10),thickness=2,circle_radius=4),
         spec(color=(80,44,121),thickness=2,circle_radius=2)),
        #Left hand
        (result.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
         spec(color=(121,22,76),thickness=2,circle_radius=4),
         spec(color=(121,44,250),thickness=2,circle_radius=2)),
        #Right hand
        (result.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS,
         spec(color=(245,117,66),thickness=2,circle_radius=4),
         spec(color=(245,66,230),thickness=2,circle_radius=2)),
    )
    for landmarks,connections,landmark_style,connection_style in styled_groups:
        mp_drawing.draw_landmarks(image,landmarks,connections,landmark_style,connection_style)

In [None]:
#Preview: read webcam frames with OpenCV, run the holistic model and show the landmarks
cap=cv2.VideoCapture(0)#0 selects the capture device

try:
    #Accessing the holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():#Loop for as long as the capture device is open
            #Read feed
            ret,grabbed=cap.read()
            if not ret:
                #No frame was delivered; stop instead of passing None to the detector
                break
            #Only replace `frame` with a valid frame, later cells display the last one
            frame=grabbed

            #Make detections
            image,results=mediapipe_detection(frame,holistic)

            #Draw landmarks
            draw_styled_landmarks(image,results)

            #Render to screen
            cv2.imshow("OpenCv Feed",image)

            #Pressing q ends the feed
            if cv2.waitKey(10) & 0xFF==ord('q'):
                break
finally:
    #Always free the webcam and close the window, even if an error occurred
    cap.release()
    cv2.destroyAllWindows()

In [None]:
#Number of face landmarks in the last processed frame (0 when no face was detected)
len(results.face_landmarks.landmark) if results.face_landmarks else 0

In [None]:
#Show the last captured frame with the default landmark style
#Draw on a copy so `frame` stays untouched and re-running the cell does not stack overlays
annotated=frame.copy()
draw_landmarks(annotated,results)
#OpenCV frames are BGR, matplotlib expects RGB
plt.imshow(cv2.cvtColor(annotated,cv2.COLOR_BGR2RGB))

In [None]:
#Show the last captured frame with the styled landmarks
#Draw on a copy so `frame` stays untouched and re-running the cell does not stack overlays
annotated_styled=frame.copy()
draw_styled_landmarks(annotated_styled,results)
#OpenCV frames are BGR, matplotlib expects RGB
plt.imshow(cv2.cvtColor(annotated_styled,cv2.COLOR_BGR2RGB))

### Extract Keypoints

In [None]:
#Number of face landmarks in the last processed frame (0 when no face was detected)
len(results.face_landmarks.landmark) if results.face_landmarks else 0

In [None]:
#Flatten the landmarks of one detection result into a single feature vector
def extract_keypoints(results):
    """Return all landmark coordinates of ``results`` as one flat numpy array.

    Pose landmarks contribute ``x, y, z, visibility``; face and hand landmarks
    contribute ``x, y, z``. A group that is missing (falsy) is replaced by a
    zero vector of fixed length: 132 for pose, 1404 for face, 63 per hand.

    Args:
        results: detection result exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each present
            group exposes its points through ``.landmark``.

    Returns:
        1-D array ordered as pose, face, left hand, right hand.
    """
    def _flatten(group,fields,empty_size):
        #A missing group is represented by zeros
        if not group:
            return np.zeros(empty_size)
        rows=[[getattr(point,name) for name in fields] for point in group.landmark]
        #Flatten so the group becomes one 1-D array
        return np.array(rows).flatten()

    xyz=('x','y','z')
    pose=_flatten(results.pose_landmarks,xyz+('visibility',),132)
    left_hand=_flatten(results.left_hand_landmarks,xyz,63)
    right_hand=_flatten(results.right_hand_landmarks,xyz,63)
    face=_flatten(results.face_landmarks,xyz,1404)
    #Single concatenated array: pose, face, left hand, right hand
    return np.concatenate([pose,face,left_hand,right_hand])

In [None]:
#Length of the feature vector of one frame; the zero fallbacks in extract_keypoints
#add up to 132 (pose) + 1404 (face) + 63 (left hand) + 63 (right hand) = 1662
extract_keypoints(results).shape

### Setup folders for collection

In [None]:
#Root folder for the exported keypoint data (numpy arrays)
DATA_PATH=os.path.join('MP_data')

#Actions we are going to try to detect
actions=np.array(['hello','thanks','loveyou'])
#Number of videos (sequences) recorded per action
no_sequences=30
#Number of frames in each video
sequence_length=30

In [None]:
#Create one folder per action and video: DATA_PATH/<action>/<sequence number>
for action in actions:
    for sequence in range(no_sequences):
        #makedirs also creates the intermediate folders.
        #exist_ok=True makes re-running the cell safe, while real failures
        #(e.g. missing permissions) are still reported instead of being swallowed
        os.makedirs(os.path.join(DATA_PATH,action,str(sequence)),exist_ok=True)

### Collecting data for training and testing

In [None]:
#Record the data set: for every action, no_sequences videos of sequence_length frames each.
#The keypoints of every frame are saved as DATA_PATH/<action>/<sequence>/<frame_num>.npy
cap=cv2.VideoCapture(0)#0 selects the capture device
#Set when the user presses q or the camera stops delivering frames
stop_requested=False

try:
    #Accessing the holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        #Loop through actions: hello, thanks etc
        for action in actions:
            #Loop through the videos of this action
            for sequence in range(no_sequences):
                #Loop through the frames of this video
                for frame_num in range(sequence_length):
                    #Read feed
                    ret,frame=cap.read()
                    if not ret:
                        #No frame was delivered; stop instead of passing None to the detector
                        stop_requested=True
                        break

                    #Make detections
                    image,results=mediapipe_detection(frame,holistic)

                    #Draw landmarks
                    draw_styled_landmarks(image,results)

                    #The first frame of every video announces the new recording
                    if frame_num==0:
                        cv2.putText(image,'STARTING COLLECION',(120,200),
                                       cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),4,cv2.LINE_AA)
                    #Tell the user which video is being recorded
                    cv2.putText(image,f'Collecting frames for {action} Video number {sequence}'
                                    ,(15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)

                    #Render BEFORE waiting so the announcement is visible during the pause
                    cv2.imshow("OpenCv Feed",image)

                    #Extract the keypoints and save them for this frame
                    keypoints=extract_keypoints(results)
                    npy_path=os.path.join(DATA_PATH,action,str(sequence),str(frame_num))
                    np.save(npy_path,keypoints)

                    #Pause 2 seconds at the start of every video, 10 ms otherwise
                    wait_ms=2000 if frame_num==0 else 10
                    #Pressing q stops the whole collection, not just the current video
                    if cv2.waitKey(wait_ms) & 0xFF==ord('q'):
                        stop_requested=True
                        break
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    #Always free the webcam and close the window, even if an error occurred
    cap.release()
    cv2.destroyAllWindows()

if stop_requested:
    print("Collection stopped early - the data set is incomplete")

In [None]:
#Manual cleanup: run this cell to release the camera if the collection was interrupted
cap.release()#It releases the webcam
cv2.destroyAllWindows()#Destroy the cv window

### Preprocessing Data and Creating Labels

In [None]:
#To split data for training and testing 
from sklearn.model_selection import train_test_split
#To convert data into one encoded data
from tensorflow.keras.utils import to_categorical

In [None]:
#Label map: every action gets its position in `actions` as integer class id
label_map=dict(zip(actions,range(len(actions))))

In [None]:
#Dictionary mapping each action label to its integer id
label_map

In [None]:
#In preprocessing we combine the per-frame .npy files into a single array

In [None]:
#Two parallel lists: sequences holds the feature data (x), labels the class ids (y)
sequences=[]
labels=[]
#Go through every action and every one of its videos
for action in actions:
    for sequence in range(no_sequences):
        #A window is the list of saved keypoint arrays of one video, in frame order
        window=[
            np.load(os.path.join(DATA_PATH,action,str(sequence),f"{frame_num}.npy"))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
#Stack all windows into one array: (number of sequences, frames per sequence, keypoints per frame)
X=np.array(sequences)
X.shape

In [None]:
#One-hot encode the integer labels
y=to_categorical(labels).astype(int)
y
#Here
#[1,0,0]-hello
#[0,1,0]-thanks
#[0,0,1]-loveyou

In [None]:
#Forming training and testing partition; random_state fixes the shuffle so the split is repeatable
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.04,random_state=1)
#NOTE(review): with 3 actions x 30 videos = 90 sequences, a 4% hold-out leaves only
#about 4 test samples - confirm against the shapes shown in the next cells

In [None]:
#Training features: (training sequences, frames per sequence, keypoints per frame)
X_train.shape

In [None]:
#Test features: (test sequences, frames per sequence, keypoints per frame)
X_test.shape

In [None]:
#Training labels: one one-hot row per training sequence
y_train.shape

In [None]:
#Test labels: one one-hot row per test sequence
y_test.shape

### Building and Training LSTM Neural Network

In [None]:
#Importing sequential model,LSTM layer and dense layer
from tensorflow.keras.models import Sequential #Allows us to build sequential neural model
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.callbacks import TensorBoard
#tensorboard --logdir=. use this to check logs
#It allows us to logging inside tensorboard to trace and moniter our model as it is training

In [None]:
#TensorBoard allows us to monitor the accuracy while the model is training
#Create a log directory and set up the TensorBoard callback that writes to it
log_dir=os.path.join('Logs')
tb_callback=TensorBoard(log_dir=log_dir)

In [None]:
#Sequential model: three stacked LSTM layers followed by a dense classification head.
#The first two LSTM layers return the full sequence for the following LSTM layer,
#the third returns only its last output because dense layers come next.
model=Sequential([
    LSTM(64,return_sequences=True,activation='relu',input_shape=(30,1662)),
    LSTM(128,return_sequences=True,activation='relu'),
    LSTM(64,return_sequences=False,activation='relu'),
    Dense(64,activation='relu'),
    Dense(32,activation='relu'),
    #Output layer: one softmax unit per action
    Dense(actions.shape[0],activation='softmax'),
])

In [None]:
#categorical_crossentropy is the loss for a multi-class model with one-hot labels
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['categorical_accuracy'])

In [None]:
#Fit and train the model on the training split; tb_callback writes the logs for TensorBoard
#NOTE(review): epochs=2000 is a fixed value without validation data or early stopping - confirm this is intended
model.fit(X_train,y_train,epochs=2000,callbacks=[tb_callback])

In [None]:
#Print an overview of the layers of the model
model.summary()

### Making Prediction

In [None]:
#Predicted class probabilities (softmax output) for every test sequence
res=model.predict(X_test)

In [None]:
#Predicted action of the first test sequence
actions[np.argmax(res[0])]

In [None]:
#Actual action of the first test sequence
actions[np.argmax(y_test[0])]

In [None]:
#Compare prediction and actual label for up to the first five test sequences.
#The loop index selects the sample, and the range is capped at the number of
#predictions so a small test split does not cause an IndexError.
for i in range(min(5,len(res))):
    print(f"\nPrediction:{actions[np.argmax(res[i])]}\tActual Value:{actions[np.argmax(y_test[i])]}")

### Save Weights

In [None]:
#Persist the trained model to action.h5
model.save('action.h5')

In [None]:
#Load the weights back from action.h5 into the existing model object
model.load_weights('action.h5')

### Evaluation using confusion matrix and accuracy

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
#Predicted class probabilities for the test split
yhat = model.predict(X_test)

In [None]:
#Extracting the class ids: index of the largest entry of every one-hot label / probability row
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
#NOTE(review): yhat is rebound from probabilities to class ids here, so this cell
#cannot be re-run on its own - re-run the predict cell first

In [None]:
#Confusion matrices computed from the true and predicted class ids
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
#Accuracy of the predicted class ids on the test split
accuracy_score(ytrue, yhat)

### Testing in Realtime

In [None]:
#To render the probabilities
#One colour per action; prob_viz cycles through the list if there are more actions than colours
colors=[(245,117,16),(117,245,16),(16,117,245)]
def prob_viz(res,actions,input_frame,colors):
    """Draw one horizontal probability bar with its label per action onto a copy of the frame.

    Args:
        res: iterable of per-action probabilities; a bar is ``int(prob*100)`` pixels long.
        actions: labels, indexed like ``res``.
        input_frame: image to annotate; it is copied and not modified.
        colors: sequence of colour tuples for the bars. It is reused cyclically
            when it is shorter than ``res``.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame=input_frame.copy()#Copy of the frame
    for num,prob in enumerate(res):
        #Each action gets its own 40 pixel high row, starting at y=60
        top=60+num*40
        #Modulo keeps the lookup valid when more actions than colours are given
        bar_color=colors[num%len(colors)]
        cv2.rectangle(output_frame,(0,top),(int(prob*100),top+30),bar_color,-1)
        cv2.putText(output_frame,str(actions[num]),(0,top+25),cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv2.LINE_AA)
    return output_frame

In [None]:
#prob_viz expects the probabilities of ONE prediction. When the cells are run in order,
#res holds one row per test sample, so the first row is used;
#np.atleast_2d also accepts a single 1-D prediction.
#The frame is in OpenCV's BGR order and is converted to RGB for matplotlib.
plt.imshow(cv2.cvtColor(prob_viz(np.atleast_2d(res)[0],actions,image,colors),cv2.COLOR_BGR2RGB))

In [None]:
#New detection variables
sequence=[]#Keypoints of the most recent frames, used as model input
sentence=[]#History of detected actions
threshold=0.7#Only accept a prediction if its probability is above this value
res=None#Latest prediction; stays None until a full window of frames was collected

#Read webcam frames with OpenCV
cap=cv2.VideoCapture(0)#0 selects the capture device

try:
    #Accessing the holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():#Loop for as long as the capture device is open
            #Read feed
            ret,frame=cap.read()
            if not ret:
                #No frame was delivered; stop instead of passing None to the detector
                break

            #Make detections
            image,results=mediapipe_detection(frame,holistic)

            #Draw landmarks
            draw_styled_landmarks(image,results)

            #Prediction logic: keep a sliding window of the last sequence_length frames
            keypoints=extract_keypoints(results)
            sequence.append(keypoints)
            sequence=sequence[-sequence_length:]

            #Predict only once a full window is available
            if len(sequence)==sequence_length:
                res=model.predict(np.expand_dims(sequence,axis=0))[0]
                best=np.argmax(res)

                #Compare the probability itself with the threshold
                #(calling .any() first would turn it into a bool and disable the threshold)
                if res[best]>threshold:
                    #Append only if the action differs from the last detected one
                    if not sentence or actions[best]!=sentence[-1]:
                        sentence.append(actions[best])

                #Keep only the last 5 detections
                if len(sentence)>5:
                    sentence=sentence[-5:]

            #Rendering to show the prediction
            cv2.rectangle(image,(0,0),(640,40),(245,117,16),-1)
            cv2.putText(image,' '.join(sentence),(3,30),cv2.FONT_HERSHEY_SIMPLEX,
                           1,(255,255,255),2,cv2.LINE_AA)

            #Probabilities are drawn as soon as the first prediction exists
            if res is not None:
                image=prob_viz(res,actions,image,colors)

            #Render to screen
            cv2.imshow("OpenCv Feed",image)

            #Pressing q ends the feed
            if cv2.waitKey(10) & 0xFF==ord('q'):
                break
finally:
    #Always free the webcam and close the window, even if an error occurred
    cap.release()
    cv2.destroyAllWindows()

In [None]:
#Manual cleanup: run this cell to release the camera if the feed was interrupted
cap.release()#It releases the webcam
cv2.destroyAllWindows()#Destroy the cv window

In [None]:
#Largest entry of the latest prediction
res[np.argmax(res)]

In [None]:
#Index of the largest entry of the latest prediction
np.argmax(res)