In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install mediapipe tensorflow opencv-python arabic-reshaper python-bidi numpy scikit-learn

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Instal

In [3]:
import mediapipe as mp
import cv2
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Flatten
from arabic_reshaper import reshape
from bidi.algorithm import get_display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Layer, LSTM, Dense, Permute, Multiply, Flatten
from tensorflow.keras.layers import Bidirectional
import random
import shutil
from pathlib import Path
import logging

1. Preprocessing with MediaPipe

In [4]:
# Shared MediaPipe Hands instance, created once and reused for every frame.
mp_hands = mp.solutions.hands.Hands()


def extract_landmarks(video_path):
    """Extract per-frame hand landmarks from a video file.

    Each frame is run through the module-level MediaPipe ``mp_hands`` detector.
    Frames in which no hand is detected are dropped. For a kept frame, the
    feature vector is the 21 (x, y, z) landmarks of the left hand followed by
    the 21 landmarks of the right hand, flattened to 126 values; a hand that
    was not detected is represented by zeros.

    Args:
        video_path: Path of the video file to read with ``cv2.VideoCapture``.

    Returns:
        ``np.ndarray`` with one row of 126 values per kept frame. When no
        frame contained a hand (or no frame could be read) the array is empty
        and a warning is printed.
    """
    cap = cv2.VideoCapture(video_path)
    sequence = []  # one flattened landmark vector per frame with hands

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # end of stream or read failure

            # MediaPipe expects RGB, OpenCV decodes to BGR.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            hand_results = mp_hands.process(frame_rgb)

            # Frames without any detected hand are not part of the sequence.
            if not hand_results.multi_hand_landmarks:
                continue

            # Zero placeholders keep the vector layout fixed when only one
            # hand is visible.
            left_hand = [[0, 0, 0]] * 21
            right_hand = [[0, 0, 0]] * 21

            for hand, handedness in zip(hand_results.multi_hand_landmarks,
                                        hand_results.multi_handedness):
                coords = [[lmk.x, lmk.y, lmk.z] for lmk in hand.landmark]
                if handedness.classification[0].label == "Left":
                    left_hand = coords
                else:
                    right_hand = coords

            sequence.append(np.array(left_hand + right_hand).flatten())
    finally:
        # Always free the capture handle, even if decoding or detection raises.
        cap.release()

    if len(sequence) == 0:
        print(f"Warning: No hands detected in {video_path}")
    return np.array(sequence)

2. Data Conversion to Numpy Arrays

In [5]:
def sequence_sampling(sequence, target_length):
    """Resize a landmark sequence to exactly ``target_length`` frames.

    Args:
        sequence: 2-D ``np.ndarray`` of shape (frames, features).
        target_length: Number of frames the result must have.

    Returns:
        When the input has more frames than ``target_length``, the rows picked
        at evenly spaced positions between the first and last frame. Otherwise
        a new zero-filled float array of shape (target_length, features) whose
        leading rows are the input frames.
    """
    n_frames = len(sequence)

    if n_frames > target_length:
        # Evenly spaced integer positions covering the whole clip.
        picks = np.linspace(0, n_frames - 1, target_length, dtype=int)
        return sequence[picks]

    # Short (or exactly sized) clip: copy into a zero-padded buffer.
    out = np.zeros((target_length, sequence.shape[1]))
    out[:n_frames] = sequence
    return out

def process_dataset(input_dir, output_dir, sign_list, seq_length=30):
    """Convert the videos of selected signs into fixed-length landmark arrays.

    The expected layout is ``input_dir/<sign>/<signer>/<video file>``. Every
    video with a supported extension is passed to ``extract_landmarks``; the
    result is resized to ``seq_length`` frames with ``sequence_sampling`` and
    saved as ``output_dir/<sign>/<signer>_<video stem>.npy``. Videos in which
    no hands were detected are skipped.

    Args:
        input_dir: Path to the dataset directory.
        output_dir: Base directory for the processed output.
        sign_list: Sign (folder) names to process, a subset of the dataset.
        seq_length: Target number of frames per saved sequence.

    Returns:
        None. Progress and warnings are printed.
    """
    # A tuple so str.endswith can test all extensions in one call.
    allowed_extensions = ('.mp4', '.avi', '.mov')

    for sign_name in sign_list:
        sign_path = os.path.join(input_dir, sign_name)
        if not os.path.isdir(sign_path):
            print(f"Warning: Sign '{sign_name}' not found in {input_dir}")
            continue

        sign_output_dir = os.path.join(output_dir, sign_name)
        os.makedirs(sign_output_dir, exist_ok=True)

        print(f"Processing sign: {sign_name}")
        saved_count = 0

        # Sorted listings make the processing order (and the log) repeatable.
        for signer_name in sorted(os.listdir(sign_path)):
            signer_path = os.path.join(sign_path, signer_name)
            if not os.path.isdir(signer_path):
                continue

            for video_file in sorted(os.listdir(signer_path)):
                if not video_file.lower().endswith(allowed_extensions):
                    continue  # not a video

                video_path = os.path.join(signer_path, video_file)
                sequence = extract_landmarks(video_path)

                if len(sequence) == 0:
                    print(f"  Skipping {video_file} - no hands detected")
                    continue

                # Downsample long clips evenly / zero-pad short ones.
                padded_sequence = sequence_sampling(sequence, seq_length)

                # The signer name is part of the file name so that clips from
                # different signers with the same video name do not collide.
                base_name = os.path.splitext(video_file)[0]
                numpy_filename = f"{signer_name}_{base_name}.npy"
                np.save(os.path.join(sign_output_dir, numpy_filename), padded_sequence)
                saved_count += 1

        print(f"  Saved {saved_count} sequence(s) for '{sign_name}'")

    print(f"Processing complete. Output saved to {output_dir}")

In [6]:
# Full list of the 28 sign folder names, kept for reference only:
# sign_list = [
#     "اسمك ايه ؟", "اشاره", "الحمدلله", "السلام عليكم", "الصم", "اللغه العربيه",
#     "ان شاء الله", "انا", "انت", "ايه ؟", "برنامج", "تخرج", "جميل", "دكتور",
#     "شكرا", "طالب", "عامل ايه ؟", "فكرة", "في", "كلية حاسبات و معلومات",
#     "مترجم", "مجتمع", "مساعده", "مشروع", "ناجح", "هدف", "و", "وعليكم السلام"
# ]

# The signs are split into four groups of seven (per the original note, one
# group per notebook). Only list1 is active here; the others stay commented out.
list1 = ['اسمك ايه ؟', 'اشاره', 'الحمدلله', 'السلام عليكم', 'الصم', 'اللغه العربيه', 'ان شاء الله']
# list2 = ['انا', 'انت', 'ايه ؟', 'برنامج', 'تخرج', 'جميل', 'دكتور']
# list3 = ['شكرا', 'طالب', 'عامل ايه ؟', 'فكرة', 'في', 'كلية حاسبات و معلومات', 'مترجم']
# list4 = ['مجتمع', 'مساعده', 'مشروع', 'ناجح', 'هدف', 'و', 'وعليكم السلام']

In [None]:
# Dataset locations on the mounted Google Drive.
input_video_dir = r"/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/Videos"  # Raw videos (28 subfolders)
output_numpy_dir = r"/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/processed_videos"  # Processed numpy data
# model_save_path = "/model.h5"     # Trained model path (left disabled)

# Convert the videos of the signs in list1 into 30-frame landmark arrays.
# TODO: process_dataset should report progress while it runs.
process_dataset(input_dir= input_video_dir, output_dir = output_numpy_dir, sign_list = list1, seq_length=30)

Processing sign: اسمك ايه ؟


3. Data loading

In [None]:


def load_data(numpy_dir, label_mapping, test_size=0.15, val_size=0.15, random_state=42,
              exclude_substrings=("Mohamed",)):
    """Load the saved landmark arrays and split them into train/val/test sets.

    ``numpy_dir`` is expected to contain one sub-directory per sign, each
    holding ``.npy`` files. Sub-directories whose name is not a key of
    ``label_mapping`` are skipped with a message instead of raising
    ``KeyError``, so a processed directory may hold more signs than the model
    is trained on. Directory listings are sorted so that, for a given
    ``random_state``, the split does not depend on filesystem ordering.

    Args:
        numpy_dir: Directory with one sub-directory of ``.npy`` files per sign.
        label_mapping: Dict mapping sign (folder) name to integer class index.
        test_size: Fraction of all samples held out for testing.
        val_size: Fraction of all samples held out for validation.
        random_state: Seed forwarded to both ``train_test_split`` calls.
        exclude_substrings: Files whose name contains any of these substrings
            are ignored. The default keeps the previous behaviour of leaving
            out files containing "Mohamed".

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``; the ``y``
        arrays are one-hot encoded with ``len(label_mapping)`` columns.

    Raises:
        ValueError: If no sample file was found for the given mapping.
    """
    X = []  # feature arrays, one per sample
    y = []  # integer class index per sample

    for sign_name in sorted(os.listdir(numpy_dir)):
        sign_dir = os.path.join(numpy_dir, sign_name)
        if not os.path.isdir(sign_dir):
            continue

        if sign_name not in label_mapping:
            print(f"Skipping '{sign_name}': not present in label_mapping")
            continue
        class_idx = label_mapping[sign_name]

        for file in sorted(os.listdir(sign_dir)):
            if not file.endswith(".npy"):
                continue
            if any(token in file for token in exclude_substrings):
                continue

            X.append(np.load(os.path.join(sign_dir, file)))
            y.append(class_idx)

    if not X:
        raise ValueError(
            f"No samples found in {numpy_dir} for the given label_mapping"
        )

    X = np.array(X)
    num_classes = len(label_mapping)
    y = to_categorical(y, num_classes=num_classes)

    # First split: carve out the test set.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Second split: val_size is a fraction of the full dataset, so rescale it
    # relative to what is left after removing the test set.
    val_size_adjusted = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size_adjusted, random_state=random_state
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    return X_train, X_val, X_test, y_train, y_val, y_test

4. LSTM Model with Attention

In [None]:


class TemporalAttention(Layer):
    """Attention pooling that collapses axis 1 of its input.

    A single learned projection scores every step along axis 1; the scores are
    softmax-normalised over that axis and used to form a weighted sum of the
    input. In ``build_model`` the input is the output of the recurrent stack,
    so axis 1 is presumably the time axis of a (batch, time, features) tensor.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # One scoring weight per feature of the last axis.
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, 1),
            initializer='normal',
        )

    def call(self, x):
        # Unnormalised score per step, squashed by tanh.
        scores = tf.tanh(tf.matmul(x, self.W))
        # Normalise the scores across axis 1 so they sum to one.
        weights = tf.nn.softmax(scores, axis=1)
        # Weighted sum over axis 1.
        return tf.reduce_sum(x * weights, axis=1)

def build_model(input_shape, num_classes):
    """Build and compile the BiLSTM + temporal-attention classifier.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        num_classes: Number of output classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Model`` using categorical cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    sequence_in = Input(shape=input_shape)

    # Two stacked bidirectional LSTMs; both keep the full sequence so the
    # attention layer can weigh every step.
    encoded = Bidirectional(LSTM(256, return_sequences=True))(sequence_in)
    encoded = Bidirectional(LSTM(128, return_sequences=True))(encoded)

    # Collapse the sequence into a single vector.
    pooled = TemporalAttention()(encoded)

    # Class probabilities.
    class_probs = Dense(num_classes, activation='softmax')(pooled)

    classifier = Model(sequence_in, class_probs)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

5. Training and Evaluation

In [None]:
def train_model(X_train, X_val, X_test, y_train, y_val, y_test, num_classes, epochs=5, batch_size=8):
    """Train the classifier and report its accuracy on the test set.

    The best model by validation accuracy is checkpointed to
    ``model_checkpoints/best_model.keras`` and the learning rate is halved
    when validation accuracy stops improving for three epochs.

    Args:
        X_train, X_val, X_test: Feature arrays for the three splits.
        y_train, y_val, y_test: One-hot label arrays for the three splits.
        num_classes: Number of output classes.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, history)``: the model as it stands after the last
        epoch and the ``History`` object returned by ``fit``.
    """
    # The sample shape is everything after the batch axis.
    model = build_model(X_train.shape[1:], num_classes)

    os.makedirs('model_checkpoints', exist_ok=True)

    # Keep only the weights with the best validation accuracy on disk.
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        'model_checkpoints/best_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    )
    # Halve the learning rate when validation accuracy plateaus.
    reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.5,
        patience=3,
        verbose=1,
    )

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        callbacks=[checkpoint_cb, reduce_lr_cb],
    )

    # Held-out evaluation; the loss value is not reported.
    _, test_acc = model.evaluate(X_test, y_test, verbose=1)
    print(f"\nTest accuracy: {test_acc:.4f}")

    return model, history

In [None]:
# Label mapping for the full 28-sign dataset (sign folder name -> class index).
label_mapping_main = {
    'اسمك ايه ؟': 0,
    'اشاره': 1,
    'الحمدلله': 2,
    'السلام عليكم': 3,
    'اللغه العربيه': 4,
    'ان شاء الله': 5,
    'انا': 6,
    'انت': 7,
    'ايه ؟': 8,
    'برنامج': 9,
    'تخرج': 10,
    'جميل': 11,
    'دكتور': 12,
    'شكرا': 13,
    'الصم': 14,
    'طالب': 15,
    'عامل ايه ؟': 16,
    'فكرة': 17,
    'في': 18,
    'كلية حاسبات و معلومات': 19,
    'مترجم': 20,
    'مجتمع': 21,
    'مساعده': 22,
    'مشروع': 23,
    'ناجح': 24,
    'هدف': 25,
    'وعليكم السلام': 26,
    'و': 27,
}

# Label mapping for the 7-sign subset actually trained here (the same signs
# as list1).
# NOTE(review): index 6 is 'الصم' in this mapping but 'انا' in
# label_mapping_main and in get_arabic_label, so predictions of a model
# trained with this mapping must be decoded with this mapping.
label_mapping = {
    'اسمك ايه ؟': 0,
    'اشاره': 1,
    'الحمدلله': 2,
    'السلام عليكم': 3,
    'اللغه العربيه': 4,
    'ان شاء الله': 5,
    'الصم': 6,
}

# Read the arrays from the directory the preprocessing step writes to on the
# mounted Drive. The previous value was a Windows-local E:\ path that does not
# exist in this Colab runtime.
processed_data_dir = r"/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/processed_videos"

# File the trained model is written to; the real-time cell loads it via this name.
model_save_path = 'arsl_recognition_model_2.keras'

X_train, X_val, X_test, y_train, y_val, y_test = load_data(
    numpy_dir=processed_data_dir,
    label_mapping=label_mapping,
    test_size=0.15,
    val_size=0.15,
    random_state=42
)

# Number of classes follows from the label mapping.
num_classes = len(label_mapping)
print(f"Training with {num_classes} classes")


model, history = train_model(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    num_classes=num_classes,
    epochs=10,  # Increase for better results
    batch_size=8
)

# Final evaluation on the test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Final Model Accuracy: {test_accuracy*100:.2f}%")

# Save the final model.
model.save(model_save_path)
print("Model saved successfully")

In [None]:
# Learning curves recorded during training: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Training vs. validation accuracy per epoch.
acc_ax.plot(history.history['accuracy'])
acc_ax.plot(history.history['val_accuracy'])
acc_ax.set_title('Model Accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.legend(['Train', 'Validation'])

# Training vs. validation loss per epoch.
loss_ax.plot(history.history['loss'])
loss_ax.plot(history.history['val_loss'])
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Validation'])

fig.tight_layout()
plt.show()

6. Arabic Support

In [None]:
def display_arabic_text(frame, text):
    """Draw ``text`` onto ``frame`` in green at position (50, 100).

    The text is first passed through ``arabic_reshaper.reshape`` and then
    through ``bidi``'s ``get_display`` before being drawn with ``cv2.putText``.
    The frame is modified in place; nothing is returned.

    NOTE(review): ``cv2.putText`` draws with OpenCV's built-in Hershey fonts,
    which may not contain Arabic glyphs - confirm the rendered output.
    """
    shaped = reshape(text)          # join letters into their contextual forms
    visual = get_display(shaped)    # reorder for right-to-left display
    cv2.putText(
        frame,
        visual,
        (50, 100),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        2,
    )

In [None]:
def get_arabic_label(index, mapping=None):
    """Return the Arabic sign name for a predicted class index.

    Args:
        index: Class index, e.g. the argmax of a model prediction.
        mapping: Optional dict of sign name -> class index, as used for
            training. When given, the label is looked up in this mapping.
            Pass it for models trained on a subset of the signs, because the
            subset's indices differ from the full ordering (for example, the
            7-class ``label_mapping`` in this notebook assigns index 6 to a
            different sign than the 28-class ordering does).

    Returns:
        The sign name. Without ``mapping`` the 28-class ordering is used.

    Raises:
        IndexError: If ``index`` has no label.
    """
    if mapping is not None:
        for label, class_idx in mapping.items():
            if class_idx == index:
                return label
        raise IndexError(f"class index {index} is not present in the given mapping")

    # Default ordering: identical to the 28-class training mapping.
    arabic_labels = [ "اسمك ايه ؟", "اشاره", "الحمدلله","السلام عليكم","اللغه العربيه","ان شاء الله","انا","انت","ايه ؟","برنامج","تخرج",
    "جميل","دكتور","شكرا","الصم","طالب","عامل ايه ؟","فكرة","في","كلية حاسبات و معلومات","مترجم","مجتمع","مساعده","مشروع","ناجح","هدف","وعليكم السلام","و"]
    return arabic_labels[index]

7. Real-Time Translation

In [None]:
# Helpers for real-time translation.

# MediaPipe Hands instance created at module level and shared by every call below.
mp_hands = mp.solutions.hands.Hands()


def extract_landmarks_single(frame):
    """Extract the hand-landmark feature vector of a single BGR frame.

    Args:
        frame: Image as delivered by OpenCV (converted from BGR to RGB here).

    Returns:
        A flat ``np.ndarray`` of 126 values (21 left-hand landmarks followed
        by 21 right-hand landmarks, each as x, y, z; zeros for a hand that was
        not detected), or ``None`` when every value is zero, i.e. no hand
        landmarks were found.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    detection = mp_hands.process(rgb)

    # Zero placeholders keep the layout fixed when a hand is missing.
    left_hand = [[0,0,0]]*21
    right_hand = [[0,0,0]]*21

    if detection.multi_hand_landmarks:
        detected = zip(detection.multi_hand_landmarks, detection.multi_handedness)
        for hand, handedness in detected:
            points = [[p.x, p.y, p.z] for p in hand.landmark]
            if handedness.classification[0].label == "Left":
                left_hand = points
            else:
                right_hand = points

    frame_data = np.array(left_hand + right_hand).flatten()

    # Report "nothing detected" as None so callers can skip the frame.
    if np.any(left_hand) or np.any(right_hand):
        return frame_data
    return None

In [None]:
def real_time_translation(model, seq_length=30, label_mapping=None):
    """Run live sign translation on the default camera until 'q' is pressed.

    Frames with detected hands are collected in a sliding window of the most
    recent ``seq_length`` frames. Once the window is full, every new frame
    triggers a prediction whose label is drawn onto the displayed frame.
    Frames without hands show a "Show Hands" hint and do not enter the window.

    Args:
        model: Trained classifier exposing ``predict``.
        seq_length: Number of frames per model input.
        label_mapping: Optional dict of sign name -> class index used when
            training ``model``. When given, predictions are decoded with it;
            otherwise ``get_arabic_label`` is used. Pass it for models trained
            on a subset of the signs, whose indices differ from the full
            ordering.

    Returns:
        None.
    """
    index_to_label = None
    if label_mapping is not None:
        index_to_label = {idx: name for name, idx in label_mapping.items()}

    cap = cv2.VideoCapture(0)  # default camera
    buffer = []  # sliding window of landmark vectors

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # camera stopped delivering frames

            processed_frame = extract_landmarks_single(frame)

            if processed_frame is None:
                # No hands in view: prompt the user.
                cv2.putText(frame, "Show Hands", (50,50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1,
                            (0,0,255), 2)
            else:
                buffer.append(processed_frame)
                # Only non-None frames are ever appended, so trimming to the
                # most recent seq_length entries is all that is needed.
                buffer = buffer[-seq_length:]

                if len(buffer) == seq_length:
                    # verbose=0: avoid a progress bar for every frame.
                    prediction = model.predict(np.array([buffer]), verbose=0)
                    class_idx = int(np.argmax(prediction))
                    if index_to_label is None:
                        arabic_word = get_arabic_label(class_idx)
                    else:
                        arabic_word = index_to_label[class_idx]
                    display_arabic_text(frame, arabic_word)

            cv2.imshow('Translation', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):  # quit on 'q'
                break
    finally:
        # Always free the camera and close the window, even on an exception.
        cap.release()
        cv2.destroyAllWindows()

8. Save/Load Model

In [None]:
def save_model(model, path):
    """Save ``model`` to ``path`` by calling ``model.save(path)``."""
    model.save(path)

def load_model(path):
    """Load a saved Keras model that contains the ``TemporalAttention`` layer.

    Args:
        path: Location of the saved model.

    Returns:
        The model returned by ``tf.keras.models.load_model``.
    """
    # The custom layer has to be supplied so Keras can rebuild it on load.
    custom_layers = {'TemporalAttention': TemporalAttention}
    return tf.keras.models.load_model(path, custom_objects=custom_layers)

In [None]:
# model_save_path was never assigned in this notebook (its only definition is
# commented out), so bind it to the file the training cell saves the model to.
model_save_path = 'arsl_recognition_model_2.keras'
model = load_model(model_save_path)

# NOTE(review): real_time_translation decodes predictions with
# get_arabic_label, which assumes the 28-class ordering, whereas the model
# saved above was trained with the 7-class label_mapping - confirm the labels.
# NOTE(review): this opens camera 0 and a display window; presumably it has to
# run on a local machine rather than in a hosted runtime - confirm.
real_time_translation(model)