In [1]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp
import sklearn
#import tensorflow

import tensorflow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM ,Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model

from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [2]:
# Short aliases for the MediaPipe solution modules used by the cells below.
# mp_holistic supplies the Holistic model class and the POSE_/HAND_CONNECTIONS constants.
mp_holistic = mp.solutions.holistic #makes detections
# mp_drawing supplies draw_landmarks and DrawingSpec for rendering the detections.
mp_drawing = mp.solutions.drawing_utils #draws detections

In [3]:
# Load the saved Keras model from the working directory; it is used for sign
# prediction in the final cells. NOTE: mediapipe_detection also has a parameter
# called `model`, which refers to a MediaPipe model, not to this Keras model.
model = load_model("model_new_final.h5")

In [4]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a BGR frame.

    Parameters
    ----------
    image : BGR frame (as delivered by OpenCV).
    model : object exposing ``process(rgb_image)``, e.g. a Holistic instance.

    Returns
    -------
    tuple
        ``(bgr_image, results)`` where ``bgr_image`` is the frame converted
        back to BGR and ``results`` is whatever ``model.process`` returned.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model runs, then restore it.
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True

    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), results

In [5]:
def draw_landmarks(image, results):
    """Draw the detected pose and both hands onto ``image``.

    Parameters
    ----------
    image : BGR frame to annotate; the drawing helper writes onto this array.
    results : detection result exposing ``pose_landmarks``,
        ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # One spec for the landmark dots and one for the connecting lines, shared by
    # all three drawings. 255 is the maximum of an 8-bit colour channel; the
    # value 256 used before was out of range.
    landmark_spec = mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1)
    connection_spec = mp_drawing.DrawingSpec(color=(80, 255, 121), thickness=1, circle_radius=1)

    targets = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmark_list, connections in targets:
        mp_drawing.draw_landmarks(image, landmark_list, connections, landmark_spec, connection_spec)
    

In [6]:
def extract_keypoints(results):
    """Flatten pose and hand landmarks into a single feature vector.

    The vector holds the first 69 pose values (x, y, z of the first 23 pose
    landmarks), followed by 63 left-hand values and 63 right-hand values.
    A missing landmark group is replaced by zeros of the matching length.

    Parameters
    ----------
    results : object exposing ``pose_landmarks``, ``left_hand_landmarks`` and
        ``right_hand_landmarks``; each is either falsy or has a ``landmark``
        sequence whose items carry ``x``, ``y`` and ``z``.

    Returns
    -------
    numpy.ndarray
        1-D array of length 195.
    """
    def _flat_xyz(group, landmark_count):
        # Zeros stand in for a body part that was not detected in this frame.
        if not group:
            return np.zeros(landmark_count * 3)
        return np.array([[point.x, point.y, point.z] for point in group.landmark]).flatten()

    pose = _flat_xyz(results.pose_landmarks, 33)[:69]
    left_hand = _flat_xyz(results.left_hand_landmarks, 21)
    right_hand = _flat_xyz(results.right_hand_landmarks, 21)
    return np.concatenate([pose, left_hand, right_hand])

In [7]:
# Folder (relative to the working directory) where the recorded keypoint
# frames are written and later read back, one sub-folder per sequence.
# An alternative location is kept commented out for reference:
#Data_Path = os.path.join("test video_all/test video")
Data_Path = os.path.join("test video")

In [8]:
# Sign vocabulary. The prediction cell indexes this array with the argmax of
# the model output, so the order here presumably has to match the class order
# the model was trained with — TODO confirm against the training notebook.
actions = np.array(["Book","Do","Eat","Go","Good","Hello","Home","Hungry","I","Morning","No","Not","Pizza" , "Place" ,"Read","School","Student","Teacher","Thank You", "This" , "Tomorrow" ,"Want", "What", "Yes", "Yesterday","You"])


In [9]:
# Number of sign classes in the vocabulary (shown as the cell output).
len(actions)

26

In [10]:
# Maps each action label to its integer index in `actions`.
# NOTE(review): not referenced by any other cell visible in this notebook.
label_map = {label:num for num , label in enumerate(actions)}

In [11]:
# Number of sign sequences (one per word) recorded and predicted per run.
# The recording, preprocessing and prediction loops all iterate over range(length).
length = 3

In [12]:
# Create one output folder per sequence. exist_ok=True keeps the cell safe to
# re-run while still surfacing real failures (e.g. permission errors), which
# the previous bare `except: pass` silently swallowed.
for sequence in range(length):
    os.makedirs(os.path.join(Data_Path, str(sequence)), exist_ok=True)

**Emotion detection part**

In [13]:
# Face Mesh model instance used by face_emotion for the frown heuristic.
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)


# NOTE: mp_drawing is re-bound here to the same module it was assigned to earlier.
mp_drawing = mp.solutions.drawing_utils
# NOTE(review): mp_drawing_styles is not used by any active code visible in this notebook.
mp_drawing_styles = mp.solutions.drawing_styles

# Landmark indices into face_landmarks.landmark (MediaPipe Face Mesh numbering).
# NOTE(review): the exact anatomical position of each index is taken from the
# variable names — confirm against the Face Mesh landmark map.
RIGHT_INNER_EYELID = 133 
LEFT_INNER_EYELID = 362   

# Same kind of index as above; lower-case names are kept because face_emotion refers to them.
right_inner_eyebrow = 55
left_inner_eyebrow = 285

# The distance between these two points is used to normalise the brow distances.
RIGHT_EYE_OUTER = 33   # Right outer eye corner
LEFT_EYE_OUTER = 263   # Left outer eye corner

In [14]:
def face_emotion(frame, frown_threshold=0.16):
    """Classify a frame as a question ("?") or a statement (".") from the brows.

    For each side of the face, the pixel distance between the inner-eyelid
    landmark and the inner-eyebrow landmark is divided by the pixel distance
    between the two outer eye corners. When both normalised distances are at
    or below ``frown_threshold`` the face is treated as frowning.

    Only the first detected face is evaluated.

    Parameters
    ----------
    frame : BGR image array of shape (height, width, channels), or None.
    frown_threshold : float, optional
        Upper bound of the normalised brow-to-eyelid distance that still
        counts as a frown. Defaults to 0.16.

    Returns
    -------
    str or None
        "?" when frowning, "." otherwise, and None when ``frame`` is None or
        no face was detected.
    """
    if frame is None:
        return None

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    face_results = face_mesh.process(rgb_frame)

    if not face_results.multi_face_landmarks:
        # No face in this frame: callers only compare the result against "?".
        return None

    face_landmarks = face_results.multi_face_landmarks[0]
    h, w, _ = frame.shape

    def pixel_point(index):
        """Return landmark ``index`` as an integer (x, y) pixel position."""
        landmark = face_landmarks.landmark[index]
        return np.array([int(landmark.x * w), int(landmark.y * h)])

    # Brow-to-eyelid distance on each side of the face.
    dist_right = np.linalg.norm(pixel_point(RIGHT_INNER_EYELID) - pixel_point(right_inner_eyebrow))
    dist_left = np.linalg.norm(pixel_point(LEFT_INNER_EYELID) - pixel_point(left_inner_eyebrow))

    # Distance between the outer eye corners, used to make the measure
    # independent of how close the face is to the camera.
    eye_distance = np.linalg.norm(pixel_point(RIGHT_EYE_OUTER) - pixel_point(LEFT_EYE_OUTER))
    if eye_distance == 0:
        # Degenerate detection: avoid dividing by zero and report "no frown",
        # which is also what the inf/nan comparison produced before.
        return "."

    norm_dist_right = dist_right / eye_distance
    norm_dist_left = dist_left / eye_distance

    if norm_dist_right <= frown_threshold and norm_dist_left <= frown_threshold:
        return "?"
    return "."
            

In [17]:
# Record `length` sequences of 30 frames each from the default webcam, saving
# the extracted keypoints of every frame to Data_Path/<sequence>/<frame>.npy.
# During the last sequence the frames classified as frowning are counted.
cap = cv2.VideoCapture(0)
frown_count = 0
stop_requested = False  # set when the user presses 'q'

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open the webcam (device 0)")

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        for sequence in range(length):

            for frame_num in range(30):

                ret, frame = cap.read()
                if not ret:
                    # Without a frame every following call would fail on None.
                    raise RuntimeError(
                        "Failed to read frame {} of sequence {} from the webcam".format(frame_num, sequence)
                    )

                # The facial expression is only evaluated while the last sign is recorded.
                if sequence == (length-1):
                    emotion = face_emotion(frame)
                    if emotion == "?":
                        frown_count += 1

                image, results = mediapipe_detection(frame, holistic)

                draw_landmarks(image, results)

                if frame_num == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting Video Number {}'.format(sequence), (15,12),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                cv2.imshow('OpenCV Feed', image)
                if frame_num == 0:
                    # Pause so the signer can get ready before the sequence starts.
                    cv2.waitKey(1500)

                keypoints = extract_keypoints(results)

                npy_path = os.path.join(Data_Path, (str(sequence)), str(frame_num))
                np.save(npy_path, keypoints)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    stop_requested = True
                    break

            if stop_requested:
                # 'q' aborts the whole recording, not just the current sequence.
                print("Recording aborted by user; the saved sequences are incomplete.")
                break
finally:
    # Always free the camera and close the preview window, even on errors.
    cap.release()
    cv2.destroyAllWindows()
    

In [27]:
# The last sign is recorded over 30 frames; if more than 15 of them (a
# majority) showed a frown, the sentence is treated as a question.
suffix = "?" if frown_count > 15 else "."

In [28]:
# Debug output: number of frames of the last sequence that were classified as frowning.
print(frown_count)

19


In [29]:
# Make sure no OpenCV preview window is left open (the capture cell also closes them).
cv2.destroyAllWindows()

In [30]:
def normalize_keypoints(keypoints, center_keypoint, reference_distance):
    """Express keypoints relative to a centre point and scale them.

    Parameters
    ----------
    keypoints : 1-D array whose length is a multiple of 3 (x, y, z triples).
    center_keypoint : array of 3 values subtracted from every triple.
    reference_distance : scalar every relative coordinate is divided by;
        the caller is responsible for passing a non-zero value.

    Returns
    -------
    numpy.ndarray
        1-D array of the same length as ``keypoints``.
    """
    # (n, 3) view -> shift to the centre -> scale -> back to a flat vector.
    return ((keypoints.reshape(-1, 3) - center_keypoint) / reference_distance).flatten()

In [31]:
# First pass: normalise every saved frame as a whole (pose and hands alike)
# relative to the nose and scaled by the shoulder distance.
# NOTE: `sequences` is rebuilt from scratch by the later preprocessing cell,
# so the list produced here is overwritten before it reaches the model.
sequences = []

for sequence in range(length):
    window = []
    for frame_num in range(30):
        frame = np.load(os.path.join(Data_Path, str(sequence), f"{frame_num}.npy"))

        # Pose landmark 0 is used as the centre; landmarks 11 and 12 as the shoulders.
        center_keypoint = frame[:3]  # Nose keypoint (x, y, z)
        left_shoulder, right_shoulder = frame[33:36], frame[36:39]

        # Fall back to 1 when the shoulder distance is 0 to avoid dividing by zero.
        reference_distance = np.linalg.norm(left_shoulder - right_shoulder) or 1

        frame = normalize_keypoints(frame, center_keypoint, reference_distance)
        window.append(frame)
    sequences.append(window)

In [32]:
import numpy as np
from scipy.ndimage import uniform_filter1d  # For temporal smoothing

# Function to calculate relative hand keypoints
def preprocess_hand_keypoints(hand_keypoints):
    """Express one hand's keypoints relative to its first keypoint (the wrist).

    Parameters
    ----------
    hand_keypoints : 1-D array of x, y, z triples for one hand.

    Returns
    -------
    numpy.ndarray
        Flat array of wrist-relative coordinates, or 63 zeros when the input
        contains only zeros (i.e. no hand was detected).
    """
    # An all-zero vector means the hand was missing in this frame.
    if not np.any(hand_keypoints):
        return np.zeros(21 * 3)

    wrist = hand_keypoints[0:3]
    return (hand_keypoints.reshape(-1, 3) - wrist).flatten()


In [33]:



# Preprocessing: nose-centred, shoulder-scaled pose + wrist-relative hands,
# followed by temporal smoothing of every sequence.
sequences, labels = [], []

for sequence in range(length):
    window = []
    for frame_num in range(30):
        frame = np.load(os.path.join(Data_Path, str(sequence), f"{frame_num}.npy"))

        # Nose (pose landmark 0) is the centre; the distance between the
        # shoulders (pose landmarks 11 and 12) is the scale, with 1 as the
        # fallback when that distance is 0.
        center_keypoint = frame[:3]
        left_shoulder, right_shoulder = frame[33:36], frame[36:39]
        reference_distance = np.linalg.norm(left_shoulder - right_shoulder) or 1

        # Saved frame layout: 69 pose values, 63 left-hand values, 63 right-hand values.
        pose_part, left_hand, right_hand = frame[:69], frame[69:132], frame[132:]

        full_frame = np.concatenate([
            normalize_keypoints(pose_part, center_keypoint, reference_distance),
            preprocess_hand_keypoints(left_hand),
            preprocess_hand_keypoints(right_hand),
        ])
        window.append(full_frame)

    # Stack the 30 frames so the filter can run along the time axis.
    window = np.array(window)

    # Moving average over 3 consecutive frames.
    smoothed_window = uniform_filter1d(window, size=3, axis=0)

    sequences.append(smoothed_window)



In [34]:
# Sanity check: expected shape is (length, 30, 195), i.e. sequences x frames x
# features, where 195 = 69 pose values + 63 left-hand + 63 right-hand values.
print(np.array(sequences).shape)

(3, 30, 195)


In [35]:
# Predict one word per recorded sequence, then join the words into an
# upper-case raw sentence terminated by the question/statement suffix.
words = []

for sign in range(length):
    res = model.predict(np.expand_dims(sequences[sign], axis=0))
    best = np.argmax(res)
    words.append(str(actions[best]))
    # Predicted label together with the model's confidence for it.
    print(actions[best] , np.max(res))

raw = " ".join(words).upper() + suffix
print(raw)

You 0.9990669
Want 0.8951767
Eat 0.9998957
YOU WANT EAT?


In [None]:
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain_community.llms import Ollama
from langchain_openai import OpenAI
from langchain.chains import LLMChain


from openai import OpenAI

# OpenAI-compatible client pointed at the Hugging Face inference endpoint.
# `OpenAI` here is the class from the `openai` package: that import comes after
# the `langchain_openai` one and therefore shadows it.
# The token is read from the HF_TOKEN environment variable so that no secret
# has to be pasted into (and saved with) the notebook; as before, an empty
# string is used when nothing is configured.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.environ.get("HF_TOKEN", ""),
)



# Chat prompt: a system instruction, a task description, five worked examples
# and finally the raw sentence produced by the prediction cell.
# NOTE(review): the five examples are sent as consecutive "assistant" turns
# without a "user" turn in between — confirm the target model handles this
# as intended few-shot context.
# NOTE(review): the examples quote the raw input ('...'), the final user
# message does not — confirm this mismatch is intentional.
messages = [
    {"role": "system", "content": "You are an AI skilled at translating raw sign language input into grammatically correct English sentences. Remember that when a word is repeated twice, it means that the word is in plural form not that it is 2 in quantity."},
    {"role": "user", "content": "Translate the following sign language into proper English sentences."},
    
    # Worked examples: each shows a raw input and its expected translation.
    {"role": "assistant", "content": "Raw Input: 'HOME RAIN HEAVY.'\nTranslation: 'It is raining heavily in my home area.'"},

    {"role": "assistant", "content": "Raw Input: 'I TOMORROW EAT FRUIT FRUIT.'\nTranslation: 'Tomorrow I will eat fruits.'"},
    
    {"role": "assistant", "content": "Raw Input: 'CLASS STUDENTS SIT.'\nTranslation: 'There are students sitting in the class.'"},
    
    {"role": "assistant", "content": "Raw Input: 'I TONIGHT HOME GO LATE.'\nTranslation: 'I will go home late tonight.'"},

    {"role": "assistant", "content": "Raw Input: 'YOU HUNGRY?'\nTranslation: 'Are you feeling hungry?'"},
    
    # The sentence to translate, taken from the `raw` variable.
    {"role": "user", "content": f"Raw Input: {raw}"},
]


# Ask the chat model for the translation. Despite its name, `llm` holds the
# completion response, not a model object.
# NOTE(review): max_tokens=20 caps the reply length; longer translations may
# be cut off — confirm this limit is sufficient.
llm = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=messages,
    max_tokens=20,
)

# Text of the first (and only requested) choice.
final = str(llm.choices[0].message.content)
print(final)

from gtts import gTTS

import os

# Language code passed to gTTS for the synthesised speech.
language = 'en'


# Build the speech object from the model's reply. `final` is passed unchanged,
# so any prefix in the reply (e.g. "Translation:") is part of the spoken text.
myobj = gTTS(text=final, lang=language, slow=False)


# Write the audio to the working directory, overwriting any previous file.
myobj.save("welcome.mp3")


# Open the file via the shell; the command's exit status becomes the cell output.
# NOTE(review): `start` is presumably the Windows shell command, so this line
# likely only works on Windows — confirm the target platform.
os.system("start welcome.mp3")


Translation: 'Do you want to eat?'


0