# Real-Time Arabic Sign Language Translation

This notebook runs real-time Arabic Sign Language (ArSL) translation on frames from the default camera, using a trained ArSL model.

In [None]:
# Required imports
import mediapipe as mp
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer
from PIL import Image, ImageFont, ImageDraw
from arabic_reshaper import reshape
from bidi.algorithm import get_display

## Custom TemporalAttention Layer

The model is loaded with `TemporalAttention` registered as a custom object, so this layer class must be defined before the model is loaded.

In [None]:
# Define custom temporal attention layer
class TemporalAttention(Layer):
    """Attention pooling that collapses axis 1 of the input.

    A single learned projection scores every position along axis 1
    (presumably the time/frame axis - confirm against the training code).
    The softmax-normalised scores weight the input, which is then summed
    over that axis.
    """

    def __init__(self, **kwargs):
        # No layer-specific options: everything is forwarded to the base Layer.
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the (last_dim, 1) projection used to score each position."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, 1),
            initializer='normal',
        )

    def call(self, x):
        """Return the attention-weighted sum of `x` over axis 1."""
        # One scalar score per position, squashed by tanh
        scores = tf.tanh(tf.matmul(x, self.W))
        # Normalise the scores along axis 1
        weights = tf.nn.softmax(scores, axis=1)
        # Weight every position, then aggregate the weighted features over axis 1
        return tf.reduce_sum(x * weights, axis=1)

## Arabic Support

In [None]:
# Default Arabic font file. This is a machine-specific location: pass
# `font_path=` to display_arabic_text (or edit this constant) on other machines.
ARABIC_FONT_PATH = r"F:\SignComm\model test\V1\font\NotoSansArabic-VariableFont_wdth,wght.ttf"

# Fonts already loaded, keyed by (path, size), so the font file is not
# re-read from disk on every video frame.
_ARABIC_FONT_CACHE = {}


def _get_arabic_font(font_path, font_size):
    """Return the TrueType font for (font_path, font_size), loading it only once."""
    key = (font_path, font_size)
    if key not in _ARABIC_FONT_CACHE:
        _ARABIC_FONT_CACHE[key] = ImageFont.truetype(font_path, font_size)
    return _ARABIC_FONT_CACHE[key]


# Function to display arabic text on frame
def display_arabic_text(frame, text, font_path=ARABIC_FONT_PATH, font_size=30,
                        position=(50, 100), color=(0, 255, 0)):
    """Draw Arabic `text` onto an OpenCV BGR `frame`, modifying it in place.

    The frame is converted to a PIL image, the text is reshaped and reordered
    for right-to-left display, drawn, and the result is written back into
    `frame`.

    Args:
        frame: BGR image array as produced by OpenCV; overwritten in place.
        text: Arabic string to render.
        font_path: Path to a TrueType font file with Arabic glyphs.
        font_size: Font size passed to PIL.
        position: (x, y) position passed to PIL's `draw.text`.
        color: RGB fill colour (the drawing happens on the RGB PIL image).

    Returns:
        None. The result is written into `frame`.

    Raises:
        OSError: If the font file cannot be opened (raised by PIL).
    """
    # Convert OpenCV frame (BGR) to a PIL image (RGB)
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)

    # Cached lookup: loading the font for every frame is needlessly slow
    font = _get_arabic_font(font_path, font_size)

    # Join the letters into their contextual forms, then apply the bidi
    # algorithm so the right-to-left text is drawn in visual order
    bidi_text = get_display(reshape(text))

    draw.text(position, bidi_text, font=font, fill=color)

    # Convert back to BGR and copy into the caller's array in place
    frame[:] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

In [None]:
# Function to get the Arabic label from the index
def get_arabic_label(index):
    """Return the Arabic label stored at position `index`.

    The order of the list defines the index-to-label mapping, so it must not
    be changed. Raises IndexError when `index` is outside the list.
    """
    vocabulary = [
        "اسمك ايه ؟",
        "اشاره",
        "الحمدلله",
        "السلام عليكم",
        "الصم",
        "اللغه العربيه",
        "ان شاء الله",
        "انا",
        "انت",
        "ايه ؟",
        "برنامج",
        "تخرج",
        "جميل",
        "دكتور",
        "شكرا",
        "طالب",
        "عامل ايه ؟",
        "فكرة",
        "في",
        "كلية حاسبات و معلومات",
        "مترجم",
        "مجتمع",
        "مساعده",
        "مشروع",
        "ناجح",
        "هدف",
        "وعليكم السلام",
        "و",
    ]
    label = vocabulary[index]
    return label

## MediaPipe Setup for Landmark Extraction

In [None]:
# Create the MediaPipe hand and pose solutions once, at module level, so the
# per-frame extraction function can reuse them instead of rebuilding them.
# Both solutions use the same detection / tracking confidence thresholds.
_MP_CONFIDENCE = dict(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7,
)
mp_hands = mp.solutions.hands.Hands(**_MP_CONFIDENCE)
mp_pose = mp.solutions.pose.Pose(**_MP_CONFIDENCE)

# Function to extract landmarks from a single frame
def extract_landmarks_single(frame):
    """Extract pose and hand landmarks from one BGR frame as a flat array.

    The result concatenates, in this order, the pose landmarks, the hand
    MediaPipe labels "Left", and the other hand, each landmark contributing
    (x, y, z). A missing pose is filled with 33 zero landmarks and a missing
    hand with 21 zero landmarks.

    Uses the module-level `mp_hands` and `mp_pose` solutions.

    Returns:
        The flattened landmark array, or None when neither hand has any
        non-zero landmark value.
    """

    def _xyz(landmark_list):
        # One [x, y, z] triple per landmark
        return [[point.x, point.y, point.z] for point in landmark_list.landmark]

    # MediaPipe expects RGB input; OpenCV frames are BGR
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    hands_out = mp_hands.process(rgb)
    pose_out = mp_pose.process(rgb)

    # Pose landmarks, zero-filled when no pose was found
    if pose_out.pose_landmarks:
        pose_points = _xyz(pose_out.pose_landmarks)
    else:
        pose_points = [[0, 0, 0]] * 33

    # Hand landmarks, zero-filled until a detection replaces them
    left_points = [[0, 0, 0]] * 21
    right_points = [[0, 0, 0]] * 21

    if hands_out.multi_hand_landmarks:
        detections = zip(hands_out.multi_hand_landmarks, hands_out.multi_handedness)
        for hand, handedness in detections:
            points = _xyz(hand)
            # Any label other than "Left" is stored as the right hand
            if handedness.classification[0].label == "Left":
                left_points = points
            else:
                right_points = points

    # Frames without any hand data are reported as None
    if not (np.any(left_points) or np.any(right_points)):
        return None

    return np.array(pose_points + left_points + right_points).flatten()

## Real-Time Translation Function

In [None]:
# Function for real-time translation
def real_time_translation(model, seq_length=30, camera_index=0):
    """Translate signs from a live camera feed and show the result in a window.

    Frames are read from the camera and converted to landmark vectors with
    `extract_landmarks_single`. Frames in which hands were detected are
    collected in a sliding window of the `seq_length` most recent vectors.
    Once the window is full, the model predicts on it for every further
    hand frame and the predicted Arabic label is drawn onto that frame.
    Frames without hands show a "Show Hands" hint and leave the window
    unchanged. Press 'q' in the display window to stop.

    Args:
        model: Model whose `predict` accepts an array built from one window
            (a batch of one sequence of landmark vectors) and returns
            scores whose argmax is an index into the label list.
        seq_length: Number of landmark frames per prediction window.
        camera_index: Device index passed to `cv2.VideoCapture`; 0 is the
            default camera.

    Returns:
        None.
    """
    cap = cv2.VideoCapture(camera_index)  # Open the requested camera
    buffer = []  # Sliding window of the most recent landmark vectors

    # try/finally guarantees the camera and the window are released even if
    # prediction fails or the kernel is interrupted mid-loop.
    try:
        while cap.isOpened():
            ret, frame = cap.read()  # Read frame from the camera
            if not ret:
                break  # Stop when no frame could be read

            # Process the frame to get hand and pose landmarks
            processed_frame = extract_landmarks_single(frame)

            if processed_frame is None:
                # No hands in this frame: prompt the user
                cv2.putText(frame, "Show Hands", (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1,
                            (0, 0, 255), 2)
            else:
                # Only non-None frames are ever appended, so trimming to the
                # most recent `seq_length` entries is all that is needed
                buffer.append(processed_frame)
                buffer = buffer[-seq_length:]

                if len(buffer) == seq_length:
                    # verbose=0 suppresses the per-call progress bar, which
                    # would otherwise be printed for every frame
                    prediction = model.predict(np.array([buffer]), verbose=0)
                    arabic_word = get_arabic_label(np.argmax(prediction))
                    display_arabic_text(frame, arabic_word)

            # Display the frame
            cv2.imshow('Translation', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):  # Exit if 'q' is pressed
                break
    finally:
        cap.release()  # Release the camera
        cv2.destroyAllWindows()  # Close all windows

## Load Model and Run Translation

In [None]:
# Function to load the model
def load_model(path):
    """Load the Keras model stored at `path`.

    `TemporalAttention` is registered as a custom object so Keras can resolve
    the layer by name while deserialising the model.
    """
    custom_layers = {'TemporalAttention': TemporalAttention}
    loaded = tf.keras.models.load_model(path, custom_objects=custom_layers)
    return loaded

In [None]:
# Path to the trained model file - update this to point at your own model
MODEL_PATH = 'best_model2.keras'

# Load the model, then start the live translation loop (press 'q' to quit)
model = load_model(MODEL_PATH)
real_time_translation(model)