In [None]:
import mediapipe as mp
import cv2
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Flatten
from arabic_reshaper import reshape
from bidi.algorithm import get_display

1. Preprocessing with MediaPipe

In [None]:
# Function to extract pose and hand landmarks from a video
def extract_landmarks(video_path):
    """Extract per-frame pose and hand landmarks from a video file.

    Every frame is run through MediaPipe Hands and MediaPipe Pose. A frame's
    feature vector is the flattened concatenation of 33 pose landmarks,
    21 left-hand landmarks and 21 right-hand landmarks, each stored as
    (x, y, z), i.e. 75 * 3 = 225 values. A missing pose or hand is zero-filled.

    Frames in which no hand was detected are dropped. (The previous check
    compared Python lists with ``!=``, which is always True, so no frame was
    ever filtered out.)

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        np.ndarray with one row of 225 values per kept frame. If no frame
        contained a hand, a warning is printed and an empty array is returned.
    """
    def xyz_rows(landmark_list):
        # One [x, y, z] row per landmark
        return [[lmk.x, lmk.y, lmk.z] for lmk in landmark_list.landmark]

    sequence = []   # One flattened feature vector per kept frame
    cap = cv2.VideoCapture(video_path)
    try:
        # The context managers close the MediaPipe graphs once the video is done
        with mp.solutions.hands.Hands() as hands, mp.solutions.pose.Pose() as pose:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break    # No more frames

                # Convert frame to RGB for MediaPipe processing
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                hand_results = hands.process(frame_rgb)
                pose_results = pose.process(frame_rgb)

                # Pose landmarks (33), zero-padded if no pose was detected
                if pose_results.pose_landmarks:
                    pose_data = xyz_rows(pose_results.pose_landmarks)
                else:
                    pose_data = [[0, 0, 0]] * 33

                # Hand landmarks (21 each), zero-padded until a hand is found
                left_hand = [[0, 0, 0]] * 21
                right_hand = [[0, 0, 0]] * 21
                if hand_results.multi_hand_landmarks:
                    for hand, handedness in zip(hand_results.multi_hand_landmarks,
                                                hand_results.multi_handedness):
                        if handedness.classification[0].label == "Left":
                            left_hand = xyz_rows(hand)
                        else:
                            right_hand = xyz_rows(hand)

                # Keep the frame only if at least one hand has non-zero landmarks
                # (same test as extract_landmarks_single)
                if np.any(left_hand) or np.any(right_hand):
                    sequence.append(np.array(pose_data + left_hand + right_hand).flatten())
    finally:
        # Always free the capture, including when processing raises
        cap.release()

    if len(sequence) == 0:
        print(f"Warning: No hands detected in {video_path}")
    return np.array(sequence)  # Return the sequence of landmarks

In [None]:
# Build the MediaPipe Hands and Pose detectors once, outside the function.
# Note: despite the names, these are detector *instances*, not the
# mp.solutions.hands / mp.solutions.pose modules.
mp_hands, mp_pose = mp.solutions.hands.Hands(), mp.solutions.pose.Pose()

# Function to extract landmarks from a single frame
def extract_landmarks_single(frame):
    """Extract pose and hand landmarks from one BGR frame.

    Uses the module-level ``mp_hands`` and ``mp_pose`` detector instances.
    (The previous version called ``mp_hands.Hands()`` / ``mp_pose.Pose()`` on
    those instances, which raises AttributeError because they are already
    constructed detectors.)

    Args:
        frame: Image passed to ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)``.

    Returns:
        1-D np.ndarray of 225 values (33 pose + 21 left-hand + 21 right-hand
        landmarks, each as x, y, z; missing parts are zero-filled), or None
        when neither hand has any non-zero landmark.
    """
    def xyz_rows(landmark_list):
        # One [x, y, z] row per landmark
        return [[lmk.x, lmk.y, lmk.z] for lmk in landmark_list.landmark]

    # Convert frame to RGB for MediaPipe
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Run the shared detectors; reading module globals needs no `global` statement
    hand_results = mp_hands.process(frame_rgb)
    pose_results = mp_pose.process(frame_rgb)

    # Pose landmarks (33), zero-padded if no pose was detected
    if pose_results.pose_landmarks:
        pose_data = xyz_rows(pose_results.pose_landmarks)
    else:
        pose_data = [[0, 0, 0]] * 33

    # Hand landmarks (21 each), zero-padded until a hand is found
    left_hand = [[0, 0, 0]] * 21
    right_hand = [[0, 0, 0]] * 21
    if hand_results.multi_hand_landmarks:
        for hand, handedness in zip(hand_results.multi_hand_landmarks,
                                    hand_results.multi_handedness):
            if handedness.classification[0].label == "Left":
                left_hand = xyz_rows(hand)
            else:
                right_hand = xyz_rows(hand)

    # Frames without any hand landmarks are reported as None
    if not (np.any(left_hand) or np.any(right_hand)):
        return None
    return np.array(pose_data + left_hand + right_hand).flatten()

2. Data Conversion to NumPy Arrays

In [None]:
# Function to process all videos in a directory and save as numpy arrays
def process_dataset(input_dir, output_dir, seq_length=30):
    """Convert every video under ``input_dir`` into a fixed-length .npy file.

    Expected layout: ``input_dir/<sign_name>/<signer_name>/<video file>``.
    Entries that are not directories at the first two levels, and files whose
    extension is not .mp4/.avi/.mov (case-insensitive), are ignored. Videos
    for which ``extract_landmarks`` returns no frames are skipped.

    Args:
        input_dir: Root directory containing one folder per sign.
        output_dir: Directory that receives the .npy files (created if missing).
        seq_length: Number of frames stored per video; longer sequences are
            truncated, shorter ones are zero-padded at the end.
    """
    os.makedirs(output_dir, exist_ok=True)
    video_suffixes = ('.mp4', '.avi', '.mov')
    feature_count = 75 * 3  # 33 pose + 21*2 hands, 3 coordinates each

    for sign_name in os.listdir(input_dir):
        sign_dir = os.path.join(input_dir, sign_name)
        if not os.path.isdir(sign_dir):
            continue

        for signer_name in os.listdir(sign_dir):
            signer_dir = os.path.join(sign_dir, signer_name)
            if not os.path.isdir(signer_dir):
                continue

            for video_file in os.listdir(signer_dir):
                # str.endswith accepts a tuple of suffixes
                if not video_file.lower().endswith(video_suffixes):
                    continue

                frames = extract_landmarks(os.path.join(signer_dir, video_file))
                frame_total = len(frames)
                if frame_total == 0:
                    continue  # No hands detected anywhere in this video

                if frame_total > seq_length:
                    # Too long: keep only the leading seq_length frames
                    fixed = frames[:seq_length]
                else:
                    # Too short (or exact): copy into a zero-filled buffer
                    fixed = np.zeros((seq_length, feature_count))
                    fixed[:frame_total] = frames

                # File name encodes sign, signer and the original video name
                stem = os.path.splitext(video_file)[0]
                target = os.path.join(output_dir, f"{sign_name}_{signer_name}_{stem}.npy")
                np.save(target, fixed)

In [None]:
# Locations used by the preprocessing step (machine-specific absolute paths)
input_video_dir = r"C:\Users\DELL\Desktop\ArSL_Model\ArSL_Dataset\Videos"  # raw videos, 28 sign subfolders
output_numpy_dir = r"C:\Users\DELL\Desktop\ArSL_Model\output_dir"  # destination for the .npy sequences
model_save_path = r"C:\Users\DELL\Desktop\ArSL_Model\model.h5"     # where the trained model is meant to be stored

# Convert every video into a 30-frame landmark sequence
process_dataset(input_video_dir, output_numpy_dir, 30)

3. Data Loading

In [1]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Flatten
# from arabic_reshaper import reshape
# from bidi.algorithm import get_display

In [2]:
def find_full_key_by_first_word(first_word, mapping=None):
    """Return the first mapping key whose first whitespace-separated word matches.

    Args:
        first_word: Word to compare against the first word of each key.
        mapping: Dict whose keys are searched, in iteration order. When
            omitted, the module-level ``label_mapping`` is used (it must have
            been defined before this function is called).

    Returns:
        The matching key, or None if no key starts with ``first_word``.
    """
    if mapping is None:
        # Backward-compatible fallback to the notebook-global mapping
        mapping = label_mapping

    for key in mapping:
        # Split the key into words; keys that are empty/whitespace never match
        words = key.split()
        if words and words[0] == first_word:
            return key
    return None

# Function to load data from numpy arrays and prepare for model training
def load_data(numpy_dir, label_mapping, test_size=0.2, random_state=None):
    """Load the saved .npy sequences and split them into train/test sets.

    The label of a file is the text before the first "_" in its name. It is
    matched against the first word of each key of ``label_mapping``; the
    first matching key in mapping order wins. The lookup uses the
    ``label_mapping`` argument itself, not a notebook-global mapping.

    Files that do not end in ".npy" are ignored. Files whose name starts with
    "اسمك" are also skipped; this is kept from the original code and is
    presumably deliberate -- confirm.

    Args:
        numpy_dir: Directory containing the .npy files.
        label_mapping: Dict mapping sign names to integer class indices.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split (default None keeps the split random).

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``;
        the labels are one-hot encoded with 28 columns.

    Raises:
        KeyError: If a file's label matches no key of ``label_mapping``.
        ValueError: If the directory contains no usable .npy file.
    """
    num_classes = 28  # Width of the one-hot encoding; matches the 28-sign dataset

    # first word of each key -> class index; setdefault keeps the first key
    # seen, matching a front-to-back scan of the mapping
    first_word_to_class = {}
    for key, class_idx in label_mapping.items():
        words = key.split()
        if words:
            first_word_to_class.setdefault(words[0], class_idx)

    X = []  # List to hold feature data
    y = []  # List to hold label data

    # Sorted so the sample order (and a seeded split) does not depend on the
    # arbitrary order of os.listdir
    for file in sorted(os.listdir(numpy_dir)):
        if not file.endswith(".npy") or file.startswith("اسمك"):
            continue

        sign_word = file.split("_")[0]
        if sign_word not in first_word_to_class:
            raise KeyError(
                f"No label in label_mapping starts with {sign_word!r} (file: {file})"
            )

        X.append(np.load(os.path.join(numpy_dir, file)))
        y.append(first_word_to_class[sign_word])

    if not X:
        raise ValueError(f"No usable .npy files found in {numpy_dir}")

    X = np.array(X)  # Convert feature list to numpy array
    y = to_categorical(y, num_classes=num_classes)  # One-hot encode the labels
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

4. LSTM Model with Attention

In [3]:
from tensorflow.keras.layers import Layer, LSTM, Dense, Permute, Multiply, Flatten
from tensorflow.keras.layers import Bidirectional

# Define custom temporal attention layer
class TemporalAttention(Layer):
    """Attention pooling over axis 1 of the input.

    A single learned vector scores each step along axis 1, the scores are
    softmax-normalised along that axis, and the input is summed along it
    using those weights. In build_model the input is the sequence output of
    a bidirectional LSTM, so axis 1 is the time axis there.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # One scoring weight per feature of the last axis
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, 1),
            initializer='normal',
        )

    def call(self, x):
        # Score of each step: tanh of the projection onto W
        scores = tf.tanh(tf.matmul(x, self.W))
        # Normalise the scores along axis 1 so they sum to 1 per sample
        weights = tf.nn.softmax(scores, axis=1)
        # Weighted sum along axis 1 collapses that axis
        return tf.reduce_sum(x * weights, axis=1)

# Function to build the LSTM model with attention
def build_model(input_shape, num_classes):
    """Build and compile the BiLSTM + temporal-attention classifier.

    Args:
        input_shape: Shape of one sample (without the batch axis), passed to
            ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    sequence_in = Input(shape=input_shape)

    # Two stacked bidirectional LSTMs (256 then 128 units), both returning
    # the full sequence so the attention layer can weight every step
    encoded = sequence_in
    for units in (256, 128):
        encoded = Bidirectional(LSTM(units, return_sequences=True))(encoded)

    # Collapse the sequence axis with learned attention weights
    pooled = TemporalAttention()(encoded)

    # Per-class probabilities
    class_probs = Dense(num_classes, activation='softmax')(pooled)

    classifier = Model(sequence_in, class_probs)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier
# print("ran")

5. Training and Evaluation

In [10]:
# Function to train and evaluate the model
def train_model(X_train, y_train, X_test, y_test, epochs=100, batch_size=64):
    """Build the model and fit it on the training split.

    The number of output classes is taken from the width of the one-hot
    ``y_train`` instead of a hard-coded 28, so the model always matches the
    labels it is given.

    Args:
        X_train: Training features; ``X_train.shape[1:]`` is the model input shape.
        y_train: One-hot training labels, shape (samples, num_classes).
        X_test: Features used as validation data during fitting.
        y_test: One-hot labels used as validation data during fitting.
        epochs: Number of training epochs (default 100).
        batch_size: Mini-batch size (default 64).

    Returns:
        The fitted model as it stands after the last epoch. The best
        checkpoint is written separately to 'best_model.keras' and is not
        reloaded here.
    """
    num_classes = y_train.shape[1]
    model = build_model(X_train.shape[1:], num_classes)

    # Only checkpointing is active: early stopping is intentionally disabled
    # so that training always runs for the full number of epochs.
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
    ]

    # NOTE(review): the test split doubles as validation data, so checkpoint
    # selection is tuned on the same data used for the final evaluation.
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=epochs,
              batch_size=batch_size,
              verbose=1,  # show progress
              callbacks=callbacks)
    return model   # Return the trained model

In [11]:
# Sign name -> class index; the index is the position in this ordered list
label_mapping = {
    sign: index
    for index, sign in enumerate([
        'اسمك ايه ؟',
        'اشاره',
        'الحمدلله',
        'السلام عليكم',
        'الصم',
        'اللغه العربيه',
        'ان شاء الله',
        'انا',
        'انت',
        'ايه ؟',
        'برنامج',
        'تخرج',
        'جميل',
        'دكتور',
        'شكرا',
        'طالب',
        'عامل ايه ؟',
        'فكرة',
        'في',
        'كلية حاسبات و معلومات',
        'مترجم',
        'مجتمع',
        'مساعده',
        'مشروع',
        'ناجح',
        'هدف',
        'و',
        'وعليكم السلام',
    ])
}

# Pre-processed landmark arrays (Kaggle input dataset)
output_numpy_dir = "/kaggle/input/processed-arsl-dataset/processed_npy_arrays"

# Split the arrays into train/test sets and fit the classifier
X_train, X_test, y_train, y_test = load_data(output_numpy_dir, label_mapping)
model = train_model(X_train, y_train, X_test, y_test)

# Report accuracy of the returned model on the held-out split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Final Model Accuracy: {accuracy*100:.2f}%")

Epoch 1/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.4668 - loss: 1.6701 - val_accuracy: 0.8540 - val_loss: 0.4562
Epoch 2/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8612 - loss: 0.4335 - val_accuracy: 0.9221 - val_loss: 0.2692
Epoch 3/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9062 - loss: 0.2911 - val_accuracy: 0.9312 - val_loss: 0.2228
Epoch 4/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9378 - loss: 0.1923 - val_accuracy: 0.9403 - val_loss: 0.1891
Epoch 5/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9565 - loss: 0.1426 - val_accuracy: 0.9474 - val_loss: 0.1602
Epoch 6/100
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9592 - loss: 0.1228 - val_accuracy: 0.9661 - val_loss: 0.1128
Epoch 7/100
[1m