In [None]:
# Required imports
import functools
import os
from collections import Counter, deque

import cv2
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from arabic_reshaper import reshape
from bidi.algorithm import get_display
from PIL import Image, ImageFont, ImageDraw
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import TimeDistributed, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tqdm.notebook import tqdm

In [None]:

# Sign name -> class id. The names are listed in class-id order, so the
# position of a name in the list below is the integer label used for training.
label_mapping = {
    sign_name: class_id
    for class_id, sign_name in enumerate([
        'اسمك ايه ؟',
        'اشاره',
        'الحمدلله',
        'السلام عليكم',
        'الصم',
        'اللغه العربيه',
        'ان شاء الله',
        'انا',
        'انت',
        'ايه ؟',
        'برنامج',
        'تخرج',
        'جميل',
        'دكتور',
        'شكرا',
        'طالب',
        'عامل ايه ؟',
        'فكرة',
        'في',
        'كلية حاسبات و معلومات',
        'مترجم',
        'مجتمع',
        'مساعده',
        'مشروع',
        'ناجح',
        'هدف',
        'وعليكم السلام',
        'و',
    ])
}

# Class id -> sign name, used to turn a predicted index back into text
idx_to_label = dict(zip(label_mapping.values(), label_mapping.keys()))



In [None]:
# Configuration
# Root folder of the original videos. The default is the original local path;
# set the SIGNCOMM_VIDEO_DIR environment variable to point the notebook at a
# different location without editing this cell.
INPUT_VIDEO_DIR = os.environ.get("SIGNCOMM_VIDEO_DIR", "F:/SignComm/model test/V1/Videos")
FRAME_HEIGHT = 128  # Height in pixels that every frame is resized to
FRAME_WIDTH = 128  # Width in pixels that every frame is resized to
NUM_FRAMES = 30  # Number of frames to extract from each video
# Width of the model's softmax output. load_dataset one-hot encodes labels with
# len(label_mapping) columns, so this value has to stay equal to that length.
NUM_CLASSES = 28

In [None]:
# Function to extract frames from a video
def extract_frames(video_path, num_frames=30, target_size=(128, 128)):
    """Sample a fixed number of RGB frames from a video file.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        num_frames: Number of frames to return.
        target_size: ``(height, width)`` of every returned frame.

    Returns:
        ``np.ndarray`` of shape ``(num_frames, height, width, 3)``, or ``None``
        when OpenCV reports a non-positive frame count for the video.
    """
    height, width = target_size
    cap = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture is released on every exit path,
    # including the early return below and any exception while decoding.
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # A non-positive count means the file could not be read as a video
        if total_frames <= 0:
            print(f"Error reading video: {video_path}")
            return None

        if total_frames >= num_frames:
            # Enough material: take evenly spaced frames over the whole clip
            indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        else:
            # Short clip: use every frame, then repeat the last one as padding
            indices = list(range(total_frames)) + [total_frames - 1] * (num_frames - total_frames)

        frames = []
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                # cv2.resize expects (width, height); the result has shape
                # (height, width, 3), matching the blank fallback frame below.
                frame = cv2.resize(frame, (width, height))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
            else:
                print(f"Error reading frame {idx} from {video_path}")
                # Use previous frame if available, or a blank frame
                if frames:
                    frames.append(frames[-1])
                else:
                    frames.append(np.zeros((height, width, 3), dtype=np.uint8))

        return np.array(frames)
    finally:
        cap.release()


In [None]:
# Load and preprocess the dataset
def load_dataset(video_dir, label_mapping, num_frames=30, target_size=(128, 128)):
    """Read every labelled video under ``video_dir`` into arrays.

    The directory is walked as ``video_dir/<sign name>/<signer>/<video file>``.
    Sign folders whose name is not a key of ``label_mapping`` are skipped with
    a warning, and only ``.mp4``/``.avi``/``.mov`` files are considered.

    Args:
        video_dir: Root folder of the dataset.
        label_mapping: Dict mapping sign folder name to integer class id.
        num_frames: Frames sampled per video (forwarded to ``extract_frames``).
        target_size: Frame size forwarded to ``extract_frames``.

    Returns:
        ``(X, y)`` where ``X`` is a float32 array of the stacked frame arrays
        divided by 255 and ``y`` is the one-hot label matrix with
        ``len(label_mapping)`` columns. Videos for which ``extract_frames``
        returns ``None`` are left out.
    """
    video_paths = []
    labels = []

    # os.listdir returns entries in arbitrary order. Sorting every level makes
    # the sample order deterministic, so a seeded train/test split downstream
    # selects the same videos on every machine.
    for sign_name in sorted(os.listdir(video_dir)):
        sign_dir = os.path.join(video_dir, sign_name)
        if not os.path.isdir(sign_dir):
            continue

        # Check if the sign name is in our label mapping
        if sign_name not in label_mapping:
            print(f"Warning: Sign '{sign_name}' not found in label mapping, skipping...")
            continue

        label_idx = label_mapping[sign_name]

        # One sub-folder per signer, each holding that signer's videos
        for signer_name in sorted(os.listdir(sign_dir)):
            signer_dir = os.path.join(sign_dir, signer_name)
            if not os.path.isdir(signer_dir):
                continue

            for video_file in sorted(os.listdir(signer_dir)):
                if video_file.lower().endswith(('.mp4', '.avi', '.mov')):
                    video_paths.append(os.path.join(signer_dir, video_file))
                    labels.append(label_idx)

    # Process videos and extract frames
    print(f"Processing {len(video_paths)} videos...")

    X = []
    y = []
    for video_path, label in tqdm(list(zip(video_paths, labels))):
        frames = extract_frames(video_path, num_frames, target_size)
        if frames is not None:
            X.append(frames)
            y.append(label)

    # Convert once to float32 and scale in place: avoids allocating a second
    # full-size copy of the dataset just for the division.
    X = np.array(X, dtype=np.float32)
    X /= 255.0
    y = to_categorical(y, num_classes=len(label_mapping))

    return X, y


In [None]:
# Load dataset and split into train/test sets
def prepare_data():
    """Load the videos under ``INPUT_VIDEO_DIR`` and split them 80/20.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` obtained from
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    print("Loading and preprocessing dataset...")

    frame_size = (FRAME_HEIGHT, FRAME_WIDTH)
    samples, targets = load_dataset(INPUT_VIDEO_DIR, label_mapping, NUM_FRAMES, frame_size)

    # Fixed seed so the same samples land in the test set on every run
    X_train, X_test, y_train, y_test = train_test_split(
        samples,
        targets,
        test_size=0.2,
        random_state=42,
    )

    print(f"Dataset loaded: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples")
    return X_train, X_test, y_train, y_test


In [None]:

# CNN+LSTM Model
def build_cnn_lstm_model(input_shape, num_classes):
    """Build and compile the MobileNetV2 + LSTM video classifier.

    Each frame is passed through an ImageNet-pretrained MobileNetV2 (global
    average pooled), the per-frame feature vectors are fed to two stacked
    LSTMs, and a small dense head produces the class probabilities.

    Args:
        input_shape: Sequence whose first three entries are
            ``(num_frames, height, width)``. Frames are always treated as
            3-channel images.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam, lr=1e-4, categorical
        cross-entropy, accuracy metric). It takes pixel values in ``[0, 1]``,
        which is what the rest of this notebook produces by dividing by 255.
    """
    num_frames, height, width = input_shape[0], input_shape[1], input_shape[2]

    # Base CNN model for feature extraction (MobileNetV2 for efficiency)
    base_model = MobileNetV2(
        input_shape=(height, width, 3),
        include_top=False,
        weights='imagenet',
        pooling='avg'
    )

    # Freeze everything except the last 20 layers to limit overfitting
    for layer in base_model.layers[:-20]:
        layer.trainable = False

    model = models.Sequential([
        # Explicit Input layer instead of the deprecated `input_shape=` kwarg
        layers.Input(shape=(num_frames, height, width, 3)),

        # The ImageNet MobileNetV2 weights were trained on pixels scaled to
        # [-1, 1] (see mobilenet_v2.preprocess_input). Callers feed [0, 1], so
        # map x -> 2x - 1 inside the model; training and inference then share
        # the exact same preprocessing automatically.
        layers.Rescaling(scale=2.0, offset=-1.0),

        # The TimeDistributed wrapper applies the CNN to each frame
        TimeDistributed(base_model),

        # LSTM layers for temporal features
        LSTM(256, return_sequences=True, dropout=0.3),
        LSTM(128, dropout=0.3),

        # Classification layers
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.0001),  # Lower learning rate for fine-tuning
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model



In [None]:
# Function to train the model
def train_model():
    """Train the CNN+LSTM classifier, plot its learning curves and evaluate it.

    The data comes from ``prepare_data()``. The last 20% of the training
    samples are held out as a validation set for early stopping and checkpoint
    selection; the test set is used only for the final evaluation, so the
    reported test accuracy is not biased by model selection.

    The checkpoint with the best validation accuracy is written to
    ``best_cnn_video_model.keras``.

    Returns:
        The trained model, with the weights of the epoch that had the lowest
        validation loss (``EarlyStopping(restore_best_weights=True)``).

    Raises:
        ValueError: If the one-hot label width differs from ``NUM_CLASSES``.
    """
    # Load and prepare data
    X_train, X_test, y_train, y_test = prepare_data()

    # Fail early with a clear message instead of a shape error inside fit()
    if y_train.shape[1] != NUM_CLASSES:
        raise ValueError(
            f"NUM_CLASSES is {NUM_CLASSES} but the labels have {y_train.shape[1]} classes"
        )

    # Create the model
    input_shape = (NUM_FRAMES, FRAME_HEIGHT, FRAME_WIDTH, 3)
    model = build_cnn_lstm_model(input_shape, NUM_CLASSES)
    model.summary()

    # Define callbacks
    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        ModelCheckpoint(
            'best_cnn_video_model.keras',
            monitor='val_accuracy',
            save_best_only=True,
            verbose=1
        )
    ]

    # Train the model. validation_split takes the *last* 20% of the arrays
    # before shuffling; prepare_data() already shuffled them, so this is a
    # random hold-out that keeps the test set untouched until evaluation.
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=8,  # Small batch size due to large model
        validation_split=0.2,
        callbacks=callbacks
    )

    # Plot training history: accuracy on the left, loss on the right
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (metric, pretty_name) in zip(axes, [('accuracy', 'Accuracy'), ('loss', 'Loss')]):
        ax.plot(history.history[metric], label=f'Training {pretty_name}')
        ax.plot(history.history[f'val_{metric}'], label=f'Validation {pretty_name}')
        ax.set_title(f'Model {pretty_name}')
        ax.set_ylabel(pretty_name)
        ax.set_xlabel('Epoch')
        ax.legend()

    fig.tight_layout()
    plt.show()

    # Evaluate the model on data it has never influenced
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {accuracy*100:.2f}%")

    return model

In [None]:
# Function to display Arabic text on frame
@functools.lru_cache(maxsize=8)
def _load_font(font_path, font_size):
    """Load a TrueType font once and reuse the object on later calls."""
    return ImageFont.truetype(font_path, font_size)


def display_arabic_text(
    frame,
    text,
    font_path=r"F:\SignComm\model test\V1\font\NotoSansArabic-VariableFont_wdth,wght.ttf",
    font_size=30,
    position=(50, 100),
    color=(0, 255, 0),
):
    """Draw Arabic text onto an OpenCV frame and return the new frame.

    OpenCV's own text drawing is bypassed: the frame is converted to a PIL
    image, the text is reshaped and reordered for right-to-left display, drawn
    with a TrueType font, and the result is converted back.

    Args:
        frame: BGR image array (it is converted with ``COLOR_BGR2RGB``).
        text: Text to draw.
        font_path: TrueType font file to use.
        font_size: Font size passed to ``ImageFont.truetype``.
        position: ``(x, y)`` passed to ``ImageDraw.text``.
        color: Fill colour, applied to the RGB PIL image.

    Returns:
        A new BGR image array with the text drawn on it.
    """
    # Convert OpenCV frame to PIL Image
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)

    # Cached: this function runs once per video frame, and re-reading the font
    # file from disk every time is wasted work.
    font = _load_font(font_path, font_size)

    # Join the letters into their contextual forms, then apply the bidi
    # algorithm so the string renders right-to-left
    bidi_text = get_display(reshape(text))

    draw.text(position, bidi_text, font=font, fill=color)

    # Convert back to OpenCV format
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

In [None]:
# Real-time inference function
def _stable_prediction(history, min_votes=2, min_fraction=0.6):
    """Return the most frequent class in ``history`` if it is frequent enough, else None."""
    winner, votes = Counter(history).most_common(1)[0]
    if votes >= max(min_votes, len(history) * min_fraction):
        return winner
    return None


def real_time_inference():
    """Classify signs from the default webcam until 'q' is pressed.

    Loads ``best_cnn_video_model.keras``, keeps a sliding window of the last
    ``NUM_FRAMES`` preprocessed frames and predicts on it once it is full.
    Predictions above the confidence threshold go into a short history; the
    displayed label changes only when one class dominates that history, which
    keeps the overlay from flickering.

    The camera and all OpenCV windows are released on exit, including when an
    exception is raised inside the loop.
    """
    # Load the saved model
    model = tf.keras.models.load_model('best_cnn_video_model.keras')

    # Initialize video capture
    cap = cv2.VideoCapture(0)

    # Frame buffer and parameters; deque(maxlen=...) drops the oldest frame
    # automatically once the window is full
    buffer_size = NUM_FRAMES
    frame_interval = 1  # Capture every N frames
    frame_buffer = deque(maxlen=buffer_size)
    frames_seen = 0

    # Inference parameters
    prediction_window = 5
    prediction_history = deque(maxlen=prediction_window)
    current_prediction = None
    confidence_threshold = 0.7

    try:
        if not cap.isOpened():
            print("Could not open the webcam (device 0).")
            return

        print("Starting real-time inference. Press 'q' to quit.")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            # Draw overlays on a copy so the model never sees them
            display_frame = frame.copy()

            # Process frame for the buffer (every N frames). This counter must
            # not be reused for anything else, otherwise sampling breaks when
            # frame_interval > 1.
            frames_seen += 1
            if frames_seen % frame_interval == 0:
                # Same preprocessing as training: resize, RGB, scale to [0, 1]
                processed_frame = cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT))
                processed_frame = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)
                processed_frame = processed_frame.astype(np.float32) / 255.0
                frame_buffer.append(processed_frame)

                # Make prediction when buffer is full
                if len(frame_buffer) == buffer_size:
                    # Input batch of shape [1, num_frames, height, width, channels]
                    input_data = np.array(list(frame_buffer))[np.newaxis, ...]

                    predictions = model.predict(input_data, verbose=0)
                    pred_idx = int(np.argmax(predictions[0]))
                    confidence = predictions[0][pred_idx]

                    # Display confidence
                    confidence_text = f"Confidence: {confidence*100:.1f}%"
                    cv2.putText(
                        display_frame, confidence_text, (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2
                    )

                    # Only confident predictions take part in the vote
                    if confidence > confidence_threshold:
                        prediction_history.append(pred_idx)

                        # Only update displayed prediction when we have enough history
                        if len(prediction_history) >= 3:
                            stable = _stable_prediction(prediction_history)
                            if stable is not None:
                                current_prediction = stable

            # Display the current prediction. It is drawn on every frame so the
            # label does not flicker on frames skipped by frame_interval; it is
            # only ever set once the buffer has filled.
            if current_prediction is not None:
                arabic_text = idx_to_label[current_prediction]
                display_frame = display_arabic_text(display_frame, arabic_text)

            # Display frame count indicator
            buffer_status = f"Frames: {len(frame_buffer)}/{buffer_size}"
            cv2.putText(
                display_frame, buffer_status, (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2
            )

            # Show the frame
            cv2.imshow('Real-time Sign Language Recognition', display_frame)

            # Break on 'q' press
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release resources even if the loop raised
        cap.release()
        cv2.destroyAllWindows()


In [None]:
# Main execution point
if __name__ == "__main__":
    # Training entry point -- remove the leading '#' on the next line to run it
    # model = train_model()

    # Webcam inference entry point -- remove the leading '#' on the next line to run it
    # real_time_inference()

    # Nothing is enabled by default, so just tell the user what to do
    usage_hint = "Select an operation by uncommenting the appropriate line in the code."
    print(usage_hint)