In [None]:
# !pip install mediapipe tensorflow opencv-python arabic-reshaper python-bidi

In [None]:
import mediapipe as mp
import cv2
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Flatten
from arabic_reshaper import reshape
from bidi.algorithm import get_display

1. Preprocessing with MediaPipe

In [None]:
# Locations under the mounted Google Drive (/content/drive): the raw sign videos
# and the directory that receives the extracted landmark arrays (.npy files).
input_video_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/Videos"
output_numpy_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/output_dir"
import os
import json
# Add this recovery function BEFORE calling process_dataset()
def recover_checkpoint(output_dir, checkpoint_file, batch_size, input_dir=None):
    """Rebuild the processing checkpoint from the .npy files already on disk.

    Every ``.npy`` file found directly inside ``output_dir`` is recorded as
    processed, and the checkpoint JSON is (over)written with the keys
    ``processed``, ``total_videos``, ``version`` and ``batch_counter``.

    Args:
        output_dir: Directory that holds the already generated ``.npy`` files.
        checkpoint_file: Path of the JSON checkpoint to write.
        batch_size: Number of videos per processing batch; used to estimate
            how many full batches have been completed.
        input_dir: Root directory of the source videos, used only to count the
            expected videos. Defaults to the dataset path on Google Drive.

    Returns:
        None. The result is written to ``checkpoint_file`` and summarised on stdout.
    """
    if input_dir is None:
        input_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/Videos"

    # Sorted so the checkpoint content is deterministic across runs.
    existing_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.npy'))

    # Derive the expected output name of every video, using the same naming
    # rule as the processing step (relative path, separators/spaces -> '_').
    video_extensions = ('.mp4', '.avi', '.mov')
    expected_outputs = []
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.lower().endswith(video_extensions):
                rel_path = os.path.relpath(os.path.join(root, file), input_dir)
                expected_outputs.append(
                    rel_path.replace(os.path.sep, '_').replace(' ', '_') + '.npy'
                )

    processed_count = len(existing_files)
    checkpoint = {
        'processed': existing_files,
        'total_videos': len(expected_outputs),
        'version': 2,
        # Number of *complete* batches implied by the files found on disk.
        'batch_counter': processed_count // batch_size,
    }

    with open(checkpoint_file, 'w') as f:
        json.dump(checkpoint, f)

    # batch_counter is already a batch count; it must not be divided again.
    print(f"Recovered: {processed_count} files | Estimated batches: {checkpoint['batch_counter']}")

# Checkpoint location and batch size used by the recovery step below.
checkpoint_file = '/content/drive/MyDrive/SignComm_Dataset/processing_checkpoint.json'
batch_size = 5  # Match your actual batch size

# Rebuild the checkpoint from the .npy files found in output_numpy_dir.
# NOTE(review): recovery only runs when a checkpoint file already exists, so a
# missing/deleted checkpoint is NOT rebuilt here - confirm this is intended.
if os.path.exists(checkpoint_file):
    recover_checkpoint(output_numpy_dir, checkpoint_file, batch_size)

# Then run processing as normal (left disabled; run_processing() in the next cell does this)
# process_dataset(input_video_dir, output_numpy_dir, seq_length=30, batch_size=batch_size)

In [None]:
# All-in-one cell for running the dataset processing
def run_processing():
    """Install dependencies, mount Drive when on Colab, and convert all videos to landmark arrays.

    Each video under the dataset directory is turned into a fixed-length
    ``(seq_length, 225)`` float32 array of MediaPipe pose + hand landmarks and
    saved as ``.npy``. Progress is checkpointed after every batch so the run
    can be resumed.

    Returns:
        set[str]: Output file names recorded as processed (including those
        loaded from an existing checkpoint).
    """
    # 1. Install required packages
    import sys
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'mediapipe', 'tensorflow', 'opencv-python',
                          'arabic-reshaper', 'python-bidi', 'scikit-learn', 'psutil'])

    # 2. Mount Google Drive if in Colab
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print("Google Drive mounted successfully")
    except ImportError:
        print("Not running in Colab, skipping drive mount")

    # 3. Import necessary libraries (done here so they resolve after the install step).
    #    Only modules that are actually used are imported: an unused import of a
    #    package that is not in the install list above could fail needlessly.
    import mediapipe as mp
    import cv2
    import numpy as np
    import os
    import json
    import gc
    import psutil

    # 4. Configure MediaPipe (solution modules; model instances are created per batch)
    mp_hands = mp.solutions.hands
    mp_pose = mp.solutions.pose

    # 5. Memory monitoring function
    def print_memory_usage():
        """Print the resident memory of the current process in MB."""
        process = psutil.Process(os.getpid())
        print(f"Memory usage: {process.memory_info().rss / 1024 ** 2:.2f} MB")

    # 6. Landmark extraction with preallocated arrays
    def extract_landmarks(video_path, hands_model, pose_model):
        """Extract per-frame landmarks from one video.

        Feature layout per frame (225 values): pose 0-98, left hand 99-161,
        right hand 162-224; missing detections stay zero.

        Returns:
            np.ndarray of shape (frames, 225), or None when the video cannot be
            opened, an error occurs, or no hand is detected in any frame.
        """
        cap = cv2.VideoCapture(video_path)
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 10)

        num_features = 75 * 3  # 33 pose + 21*2 hands
        initial_capacity = 100  # Initial frame buffer size
        sequence = np.zeros((initial_capacity, num_features), dtype=np.float32)
        current_frame = 0
        hands_detected = False

        try:
            # Report unreadable files explicitly instead of the misleading
            # "No hands detected" message they would otherwise produce.
            if not cap.isOpened():
                print(f"Could not open video: {video_path}")
                return None

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Process frame
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                hand_results = hands_model.process(frame_rgb)
                pose_results = pose_model.process(frame_rgb)

                # Initialize landmark data
                frame_data = np.zeros(num_features, dtype=np.float32)

                # Pose landmarks (0-98)
                if pose_results.pose_landmarks:
                    pose_flat = np.array(
                        [[lmk.x, lmk.y, lmk.z] for lmk in pose_results.pose_landmarks.landmark]
                    ).flatten()
                    frame_data[:99] = pose_flat

                # Hands (99-224)
                if hand_results.multi_hand_landmarks:
                    for hand, handedness in zip(hand_results.multi_hand_landmarks,
                                              hand_results.multi_handedness):
                        hand_flat = np.array(
                            [[lmk.x, lmk.y, lmk.z] for lmk in hand.landmark]
                        ).flatten()
                        # Left hand (99-161), Right hand (162-224)
                        start_idx = 99 if handedness.classification[0].label == "Left" else 162
                        frame_data[start_idx:start_idx+63] = hand_flat
                        hands_detected = True

                # Double the buffer when it is full
                if current_frame >= sequence.shape[0]:
                    new_sequence = np.zeros((sequence.shape[0]*2, num_features), dtype=np.float32)
                    new_sequence[:sequence.shape[0]] = sequence
                    sequence = new_sequence

                sequence[current_frame] = frame_data
                current_frame += 1

        except Exception as e:
            print(f"Error processing frame: {str(e)}")
            return None
        finally:
            cap.release()

        if not hands_detected:
            print(f"No hands detected in {video_path}")
            return None

        return sequence[:current_frame]

    # 7. Dataset processing with batch handling and relaxed checkpoint handling
    def process_dataset(input_dir, output_dir, seq_length=30, batch_size=10,
                      checkpoint_file='processing_checkpoint.json'):
        """Convert every video under ``input_dir`` to a padded/trimmed .npy array.

        Videos whose output name is already listed in the checkpoint are
        skipped. Videos that yield no sequence are not recorded, so they are
        retried on the next run.

        Returns:
            set[str]: Output names recorded as processed.
        """
        os.makedirs(output_dir, exist_ok=True)
        allowed_extensions = ('.mp4', '.avi', '.mov')

        # Load or initialize checkpoint
        if os.path.exists(checkpoint_file):
            try:
                with open(checkpoint_file, 'r') as f:
                    checkpoint = json.load(f)

                # Handle legacy formats
                if isinstance(checkpoint, list):  # Old list-only format
                    checkpoint = {'processed': checkpoint, 'version': 1}
                elif 'total_videos' in checkpoint:  # Remove deprecated field
                    del checkpoint['total_videos']

                processed_videos = set(checkpoint.get('processed', []))

            except Exception as e:
                print(f"Error loading checkpoint: {str(e)}, starting fresh")
                processed_videos = set()
        else:
            processed_videos = set()

        # Always build fresh video list (handles dataset changes)
        video_paths = []
        for root, _, files in os.walk(input_dir):
            for file in files:
                if file.lower().endswith(allowed_extensions):
                    video_path = os.path.join(root, file)
                    rel_path = os.path.relpath(video_path, input_dir)
                    output_name = rel_path.replace(os.path.sep, '_').replace(' ', '_') + '.npy'
                    video_paths.append((video_path, output_name))
        # os.walk order is filesystem dependent; sort for a reproducible batch order.
        video_paths.sort()

        # Filter out already processed videos
        unprocessed_videos = [(vp, on) for vp, on in video_paths if on not in processed_videos]
        print(f"Found {len(video_paths)} total videos ({len(unprocessed_videos)} remaining)")

        # Ceiling division: exact batch count even when the total is a multiple of batch_size.
        total_batches = (len(unprocessed_videos) + batch_size - 1) // batch_size

        # Process in batches
        for batch_idx in range(0, len(unprocessed_videos), batch_size):
            current_batch = unprocessed_videos[batch_idx:batch_idx + batch_size]
            batch_number = (batch_idx // batch_size) + 1
            print(f"\nProcessing batch {batch_number}/{total_batches}")
            print_memory_usage()

            # Initialize MediaPipe models per batch so their memory is released between batches.
            # NOTE(review): the same model instances are reused for all videos of a batch;
            # confirm that carrying tracking state from one video to the next is acceptable.
            hands = mp_hands.Hands()
            pose = mp_pose.Pose()
            batch_processed = []

            try:
                for video_path, output_name in current_batch:
                    try:
                        print(f"Processing: {os.path.basename(video_path)}")
                        sequence = extract_landmarks(video_path, hands, pose)
                        if sequence is None:
                            continue

                        # Pad with zero frames or trim to exactly seq_length frames
                        padded = np.zeros((seq_length, sequence.shape[1]), dtype=np.float32)
                        if len(sequence) > seq_length:
                            padded = sequence[:seq_length]
                        else:
                            padded[:len(sequence)] = sequence

                        # Save results
                        np.save(os.path.join(output_dir, output_name), padded)
                        batch_processed.append(output_name)

                    except Exception as e:
                        print(f"Error processing {video_path}: {str(e)}")

                # Update checkpoint after batch
                processed_videos.update(batch_processed)
                with open(checkpoint_file, 'w') as f:
                    json.dump({
                        'processed': list(processed_videos),
                        'version': 2
                    }, f)
                print(f"Completed batch {batch_number}")
                print_memory_usage()

            finally:
                # Release resources
                hands.close()
                pose.close()
                del hands
                del pose
                gc.collect()

        print(f"\nProcessing completed. Total videos processed: {len(processed_videos)}")
        return processed_videos

    # 8. Set paths and run processing
    input_video_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/Videos"
    output_numpy_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/output_dir"

    return process_dataset(input_video_dir, output_numpy_dir, seq_length=30, batch_size=5, checkpoint_file="/content/drive/MyDrive/SignComm_Dataset/processing_checkpoint.json")

# Run the whole preprocessing pipeline; the result is the set of output
# (.npy) file names that the checkpoint records as processed.
processed_videos = run_processing()





# input_video_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/Videos"
# output_numpy_dir = "/content/drive/MyDrive/SignComm_Dataset/ArSL_Dataset/output_dir"

2. Data Conversion to Numpy Arrays

In [None]:
# Function to process all videos in a directory and save as numpy arrays
def process_dataset(input_dir, output_dir, seq_length=30, batch_size=10, checkpoint_file='processing_checkpoint.json'):
  """Convert every video in ``input_dir/<sign>/<signer>/`` into a padded .npy array.

  Each video becomes a ``(seq_length, 225)`` array saved as
  ``<sign>_<signer>_<video stem>.npy`` in ``output_dir``. Videos whose output
  file already exists, or whose output name is listed in the checkpoint, are
  skipped. The checkpoint is rewritten after every successfully saved video.

  NOTE(review): this function calls ``extract_landmarks(video_path)``, which is
  expected to be defined at module level by another cell and to return the
  per-frame landmark sequence - confirm it is available before running.

  Args:
    input_dir: Root directory with one sub-directory per sign, each holding
      one sub-directory per signer.
    output_dir: Destination directory (created if missing).
    seq_length: Number of frames kept per video (zero padded / truncated).
    batch_size: Number of videos per reported batch.
    checkpoint_file: JSON file with the processed output names. Both the
      plain list format and the ``{'processed': [...]}`` dict format are read;
      the list format is written.

  Returns:
    set[str]: Output names recorded as processed.
  """
  # Imported locally so the function does not depend on imports made in other cells.
  import gc
  import json

  os.makedirs(output_dir, exist_ok=True)
  allowed_extensions = ('.mp4', '.avi', '.mov')
  num_features = 75 * 3  # 33 pose + 21*2 hands

  # Load checkpoint if exists
  processed_videos = set()
  if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
      checkpoint = json.load(f)
    # The dict format stores the names under 'processed'; iterating the dict
    # itself would only yield its keys.
    if isinstance(checkpoint, dict):
      checkpoint = checkpoint.get('processed', [])
    processed_videos = set(checkpoint)
    print(f"Loaded checkpoint with {len(processed_videos)} processed videos")

  # Get all video paths first (sorted for a reproducible processing order)
  all_videos = []
  for sign_name in sorted(os.listdir(input_dir)):
    sign_path = os.path.join(input_dir, sign_name)
    if not os.path.isdir(sign_path):
      continue

    for signer_name in sorted(os.listdir(sign_path)):
      signer_path = os.path.join(sign_path, signer_name)
      if not os.path.isdir(signer_path):
        continue

      for video_file in sorted(os.listdir(signer_path)):
        if not video_file.lower().endswith(allowed_extensions):
          continue

        video_path = os.path.join(signer_path, video_file)
        output_name = f"{sign_name}_{signer_name}_{os.path.splitext(video_file)[0]}.npy"
        output_path = os.path.join(output_dir, output_name)

        # Skip if already processed
        if os.path.exists(output_path) or output_name in processed_videos:
          continue

        all_videos.append((video_path, sign_name, signer_name, video_file))

  total_videos = len(all_videos)
  print(f"Found {total_videos} videos to process")

  # Process in batches to limit memory usage
  for i in range(0, total_videos, batch_size):
    batch = all_videos[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}/{(total_videos + batch_size - 1)//batch_size}: {i}-{min(i+batch_size, total_videos)} of {total_videos} videos")

    for video_index, (video_path, sign_name, signer_name, video_file) in enumerate(batch):
      try:
        print(f"  Processing video {i + video_index + 1}/{total_videos}: {video_file}")

        # Extract landmarks
        sequence = extract_landmarks(video_path)

        # Treat both "no result" (None) and an empty sequence as "nothing usable".
        if sequence is None or len(sequence) == 0:
          print(f"  Skipping {video_path} - no hands detected")
          continue

        # Zero-pad or truncate to exactly seq_length frames
        padded_sequence = np.zeros((seq_length, num_features))
        frames_kept = min(len(sequence), seq_length)
        padded_sequence[:frames_kept] = sequence[:frames_kept]

        # Save numpy array
        output_name = f"{sign_name}_{signer_name}_{os.path.splitext(video_file)[0]}.npy"
        np.save(os.path.join(output_dir, output_name), padded_sequence)

        # Add to processed videos and update checkpoint
        processed_videos.add(output_name)
        with open(checkpoint_file, 'w') as f:
          json.dump(sorted(processed_videos), f)

      except Exception as e:
        print(f"  Error processing {video_path}: {str(e)}")

      finally:
        # Drop the per-video arrays and reclaim memory after every video
        sequence = None
        padded_sequence = None
        gc.collect()

  print(f"Processing completed. Total videos processed: {len(processed_videos)}")
  return processed_videos

In [None]:
# Set the paths.
# NOTE: these are placeholders and must be replaced with real directories; they
# also overwrite the Drive paths assigned to the same names in an earlier cell.
input_video_dir = "/path/to/your/raw/videos"  # Raw videos (28 subfolders)
output_numpy_dir = "/path/to/processed/data"  # Processed numpy data

# Process all videos (default batch size and default checkpoint file in the working directory)
process_dataset(input_video_dir, output_numpy_dir, seq_length=30)

-----------------------------END OF PREPROCESSING AND DATA CONVERSION TO NUMPY----------------------------------------------

3. Data loading

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def find_full_key_by_first_word(first_word, mapping=None):
    """Return the first label whose first word matches ``first_word``.

    Args:
        first_word: Label prefix taken from a file name. If it contains several
            whitespace-separated words, only the first one is compared, so both
            ``"ان"`` and ``"ان شاء الله"`` resolve to the same label.
        mapping: Mapping whose keys are the full labels. Defaults to the
            module-level ``label_mapping``.

    Returns:
        The matching full label, or ``None`` when nothing matches or
        ``first_word`` is empty.
    """
    if mapping is None:
        mapping = label_mapping  # fall back to the notebook-wide mapping

    if not first_word:
        return None
    tokens = first_word.split()
    if not tokens:
        return None
    target = tokens[0]

    for key in mapping:
        words = key.split()
        if words and words[0] == target:
            return key
    return None

# Function to load data from numpy arrays and prepare for model training
def load_data(numpy_dir, label_mapping, test_size=0.2, random_state=42):
    """Load the landmark arrays and split them into train and test sets.

    The label of a file is the part of its name before the first ``_``; it is
    resolved to a key of ``label_mapping`` by comparing first words.

    Args:
        numpy_dir: Directory containing the ``.npy`` landmark arrays.
        label_mapping: Dict mapping the full label text to its class index.
        test_size: Fraction of samples placed in the test split.
        random_state: Seed for the split so that runs are reproducible; pass
            ``None`` for a different random split on every call.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``; the labels are one-hot encoded.

    Raises:
        KeyError: If a file name does not correspond to any label.
        ValueError: If no usable ``.npy`` file is found.
    """
    # Resolve labels against the mapping that was passed in (first match wins).
    first_word_to_key = {}
    for key in label_mapping:
        words = key.split()
        if words:
            first_word_to_key.setdefault(words[0], key)

    X = []  # feature arrays
    y = []  # class indices

    # Sorted so that, together with random_state, the split is reproducible.
    for file in sorted(os.listdir(numpy_dir)):
        # Skip non-numpy files. Files starting with "اسمك" are excluded as well.
        # NOTE(review): that exclusion drops every sample of that sign - confirm it is intended.
        if not file.endswith(".npy") or file.startswith("اسمك"):
            continue

        # The label is the part of the file name before the first underscore
        prefix_words = file.split("_")[0].split()
        label = first_word_to_key.get(prefix_words[0]) if prefix_words else None
        if label is None:
            raise KeyError(f"No label in label_mapping matches file '{file}'")

        X.append(np.load(os.path.join(numpy_dir, file)))
        y.append(label_mapping[label])

    if not X:
        raise ValueError(f"No usable .npy files found in '{numpy_dir}'")

    X = np.array(X)
    # One-hot width follows the mapping instead of a hard-coded class count.
    num_classes = max(label_mapping.values()) + 1
    y = to_categorical(y, num_classes=num_classes)
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

4. LSTM Model with Attention

In [None]:
from tensorflow.keras.layers import Layer, LSTM, Dense, Permute, Multiply, Flatten
from tensorflow.keras.layers import Bidirectional

# Define custom temporal attention layer
class TemporalAttention(Layer):
    """Attention pooling over axis 1 of the input.

    A single learned projection scores every step along axis 1; the scores are
    softmax-normalised along that axis and used to form a weighted sum, so
    axis 1 is removed from the output. (In ``build_model`` the input comes from
    an LSTM with ``return_sequences=True``.)
    """

    def __init__(self, **kwargs):
        super(TemporalAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # One scoring vector mapping the last (feature) dimension to a scalar
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, 1),
            initializer='normal',
        )

    def call(self, x):
        # Score each step, squash with tanh, normalise along axis 1
        scores = tf.tanh(tf.matmul(x, self.W))
        weights = tf.nn.softmax(scores, axis=1)
        # Weighted sum of the steps collapses axis 1
        return tf.reduce_sum(x * weights, axis=1)

# Function to build the LSTM model with attention
def build_model(input_shape, num_classes):
    """Build and compile the BiLSTM + temporal-attention classifier.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, categorical cross-entropy
        loss, accuracy metric).
    """
    sequence_input = Input(shape=input_shape)

    # Two stacked bidirectional LSTMs that keep the full sequence; dropout is
    # applied to their inputs to reduce overfitting.
    first_encoder = Bidirectional(LSTM(128, return_sequences=True, dropout=0.3))
    encoded = first_encoder(sequence_input)
    second_encoder = Bidirectional(LSTM(64, return_sequences=True, dropout=0.2))
    encoded = second_encoder(encoded)

    # Collapse the sequence into one vector with the attention layer
    pooled = TemporalAttention()(encoded)

    # Class probabilities
    class_probs = Dense(num_classes, activation='softmax')(pooled)

    model = Model(sequence_input, class_probs)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

5. Training and Evaluation

In [None]:
# Function to train and evaluate the model
def train_model(X_train, y_train, X_test, y_test, epochs=100, batch_size=32,
                checkpoint_path='best_model2.keras'):
    """Train the attention model and checkpoint the best epoch to disk.

    Args:
        X_train, y_train: Training samples and one-hot labels.
        X_test, y_test: Data passed to Keras as ``validation_data``; it also
            drives the "best model" selection of the checkpoint callback.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        checkpoint_path: File that receives the best model seen so far.

    Returns:
        The model as it is after the final epoch. The best checkpoint is the
        file at ``checkpoint_path``, which may differ from the returned model.
    """
    # Take the class count from the one-hot labels instead of hard-coding it,
    # so the output layer always matches the data.
    num_classes = y_train.shape[-1]
    model = build_model(X_train.shape[1:], num_classes)

    # Only checkpointing is active; early stopping is not enabled.
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)
    ]

    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=epochs,
              batch_size=batch_size,
              verbose=1,  # show progress
              callbacks=callbacks)
    return model

In [None]:
# Label mapping: full sign text -> class index used for the one-hot training labels.
# NOTE(review): any index -> text lookup used at inference time (see
# get_arabic_label) must list the labels in exactly this index order.
label_mapping = {
    'اسمك ايه ؟': 0,
    'اشاره': 1,
    'الحمدلله': 2,
    'السلام عليكم': 3,
    'اللغه العربيه': 4,
    'ان شاء الله': 5,
    'انا': 6,
    'انت': 7,
    'ايه ؟': 8,
    'برنامج': 9,
    'تخرج': 10,
    'جميل': 11,
    'دكتور': 12,
    'شكرا': 13,
    'الصم': 14,
    'طالب': 15,
    'عامل ايه ؟': 16,
    'فكرة': 17,
    'في': 18,
    'كلية حاسبات و معلومات': 19,
    'مترجم': 20,
    'مجتمع': 21,
    'مساعده': 22,
    'مشروع': 23,
    'ناجح': 24,
    'هدف': 25,
    'وعليكم السلام': 26,
    'و': 27,
}

# Local Windows path of the processed arrays; this replaces the Drive path that
# earlier cells assigned to the same variable.
output_numpy_dir = r"C:\Users\DELL\Desktop\ArSL_Model2\processed_npy_arrays"  # Processed numpy data

# Load data and train
X_train, X_test, y_train, y_test = load_data(output_numpy_dir, label_mapping)
model = train_model(X_train, y_train, X_test, y_test)

# Final evaluation.
# NOTE(review): X_test/y_test are also passed to train_model as validation data,
# so this score is not measured on data that was unseen during model selection.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Final Model Accuracy: {accuracy*100:.2f}%")  # recorded run: accuracy 99.02%, execution time 31m 30.6s

---------------------------------------------------END OF MODEL TRAINING----------------------------------------------------

6. Arabic Support

In [None]:
from PIL import Image, ImageFont, ImageDraw
import numpy as np
import cv2
from arabic_reshaper import reshape
from bidi.algorithm import get_display

# Fonts already loaded by display_arabic_text, keyed by (font_path, font_size),
# so the TTF file is read from disk once instead of once per video frame.
_ARABIC_FONT_CACHE = {}


def display_arabic_text(frame, text,
                        font_path=r"C:\Users\DELL\Desktop\ArSL_Model2\font\static\NotoSansArabic_Condensed-Bold.ttf",
                        font_size=30, position=(50, 100), color=(0, 255, 0)):
    """Draw Arabic ``text`` onto an OpenCV BGR ``frame`` in place.

    OpenCV cannot shape Arabic script, so the frame is converted to a PIL
    image, the text is reshaped and reordered for right-to-left display, drawn
    with a TrueType font, and the result is copied back into ``frame``.

    Args:
        frame: BGR image array; modified in place.
        text: Arabic text to render.
        font_path: Path to a TrueType font with Arabic glyphs.
        font_size: Font size passed to ``ImageFont.truetype``.
        position: ``(x, y)`` position of the text.
        color: Fill colour as an RGB tuple (the drawing happens on the RGB image).

    Returns:
        None.
    """
    # Convert OpenCV frame to PIL Image
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)

    # Load the font once per (path, size) and reuse it afterwards
    cache_key = (font_path, font_size)
    font = _ARABIC_FONT_CACHE.get(cache_key)
    if font is None:
        font = ImageFont.truetype(font_path, font_size)
        _ARABIC_FONT_CACHE[cache_key] = font

    # Join the letters into their contextual forms, then apply the bidi algorithm
    bidi_text = get_display(reshape(text))

    draw.text(position, bidi_text, font=font, fill=color)

    # Convert back to OpenCV format, writing into the caller's array
    frame[:] = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

In [None]:
# Function to get the Arabic label from the index
def get_arabic_label(index):
    """Return the Arabic label text for a predicted class index.

    The list order must be identical to the class indices of the
    ``label_mapping`` used for training (e.g. index 4 is "اللغه العربيه" and
    index 14 is "الصم"); otherwise predictions are shown with the wrong word.

    Args:
        index: Class index (0-27), e.g. the argmax of the model output.

    Returns:
        The label string at that index.
    """
    arabic_labels = [
        "اسمك ايه ؟", "اشاره", "الحمدلله", "السلام عليكم",
        "اللغه العربيه", "ان شاء الله", "انا", "انت",
        "ايه ؟", "برنامج", "تخرج", "جميل",
        "دكتور", "شكرا", "الصم", "طالب",
        "عامل ايه ؟", "فكرة", "في", "كلية حاسبات و معلومات",
        "مترجم", "مجتمع", "مساعده", "مشروع",
        "ناجح", "هدف", "وعليكم السلام", "و",
    ]
    return arabic_labels[index]

In [None]:
# Create the MediaPipe hand and pose models once, at module level, so that
# extract_landmarks_single can reuse them for every frame.
# NOTE: these names hold model *instances*; run_processing() uses the same
# names locally for the solution modules.
mp_hands = mp.solutions.hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)
mp_pose = mp.solutions.pose.Pose(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.7
)

# Function to extract landmarks from a single frame
def extract_landmarks_single(frame):
    """Extract the pose and hand landmarks of one BGR frame.

    Uses the module-level ``mp_hands`` and ``mp_pose`` models. The result is
    laid out as 33 pose landmarks, then 21 left-hand and 21 right-hand
    landmarks, each as (x, y, z); parts that were not detected are zeros.

    Args:
        frame: Image in OpenCV BGR order.

    Returns:
        A flat array of 225 values, or ``None`` when neither hand has any
        non-zero landmark.
    """
    # MediaPipe expects RGB input
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    hand_results = mp_hands.process(rgb)
    pose_results = mp_pose.process(rgb)

    def as_xyz(landmark_list):
        # One [x, y, z] triple per landmark
        return [[lmk.x, lmk.y, lmk.z] for lmk in landmark_list.landmark]

    # Pose landmarks, or zeros when no pose was found
    if pose_results.pose_landmarks:
        pose_data = as_xyz(pose_results.pose_landmarks)
    else:
        pose_data = [[0, 0, 0]] * 33

    # Hands default to zeros and are overwritten per detected hand
    left_hand = [[0, 0, 0]] * 21
    right_hand = [[0, 0, 0]] * 21
    if hand_results.multi_hand_landmarks:
        detections = zip(hand_results.multi_hand_landmarks,
                         hand_results.multi_handedness)
        for hand, handedness in detections:
            coords = as_xyz(hand)
            if handedness.classification[0].label == "Left":
                left_hand = coords
            else:
                right_hand = coords

    # Without any hand data the frame is not usable
    if not (np.any(left_hand) or np.any(right_hand)):
        return None
    return np.array(pose_data + left_hand + right_hand).flatten()

7. Real-Time Translation

In [None]:
# Function for real-time translation
def real_time_translation(model, seq_length=30):
    """Translate signs from the default camera in real time until 'q' is pressed.

    Landmarks of frames in which a hand is detected are collected in a sliding
    window of ``seq_length`` frames. Once the window is full, the model
    predicts on every new frame and the predicted Arabic word is drawn on it.

    Args:
        model: Trained Keras model taking input of shape
            ``(1, seq_length, features)``.
        seq_length: Number of most recent hand-containing frames used per
            prediction.

    Returns:
        None.
    """
    from collections import deque

    cap = cv2.VideoCapture(0)  # Open default camera
    # Sliding window: deque(maxlen=...) discards the oldest frame automatically
    buffer = deque(maxlen=seq_length)

    try:
        while cap.isOpened():
            ret, frame = cap.read()   # Read frame from the camera
            if not ret:
                break   # No frame could be read

            # Process the frame to get hand and pose landmarks
            processed_frame = extract_landmarks_single(frame)

            if processed_frame is None:
                # Ask the user to show their hands when none are detected
                cv2.putText(frame, "Show Hands", (50,50),
                          cv2.FONT_HERSHEY_SIMPLEX, 1,
                          (0,0,255), 2)
            else:  # Has hands
                buffer.append(processed_frame)

                if len(buffer) == seq_length:
                    # verbose=0: no progress bar for each single-sample prediction
                    prediction = model.predict(np.array([list(buffer)]), verbose=0)
                    arabic_word = get_arabic_label(np.argmax(prediction))  # Predicted word
                    display_arabic_text(frame, arabic_word)  # Draw it on the frame

            # Display the frame
            cv2.imshow('Translation', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):  # Exit if 'q' is pressed
                break
    finally:
        # Always free the camera and close the window, even after an error
        cap.release()
        cv2.destroyAllWindows()

8. Load Model

In [None]:
# Function to load the model
def load_model(path):
    """Load a saved Keras model that contains the custom ``TemporalAttention`` layer.

    Args:
        path: Path of the saved model file.

    Returns:
        The model returned by ``tf.keras.models.load_model``.
    """
    # Keras needs the class to rebuild the custom layer from the saved file
    custom_layers = {'TemporalAttention': TemporalAttention}
    return tf.keras.models.load_model(path, custom_objects=custom_layers)

In [None]:
# Load the checkpoint written by the ModelCheckpoint callback during training
# and start the webcam translation loop (press 'q' in the window to stop).
model = load_model('best_model2.keras')
real_time_translation(model)