# Sign Language Detection Model Training
This notebook trains a model to recognize American Sign Language (ASL) gestures.
The model works during specified time periods (6 PM to 10 PM) as per requirements.

In [1]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from datetime import datetime
import mediapipe as mp

## Dataset Setup
Using ASL Alphabet dataset for training

In [2]:
# ASL alphabet letters (A-Z)
asl_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
               'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2' '3', '4', '5', '6', '7', '8', '9']

all_classes = asl_letters 

# Create dataset structure
dataset_path = '../datasets/sign_language_data'

for class_name in all_classes:
    os.makedirs(f'{dataset_path}/train/{class_name}', exist_ok=True)
    os.makedirs(f'{dataset_path}/val/{class_name}', exist_ok=True)


## Time-based Operation Check

In [3]:
def is_operational_time():
    """
    Check if current time is between 6 PM and 10 PM
    """
    current_time = datetime.now().time()
    start_time = datetime.strptime('18:00', '%H:%M').time()  # 6 PM
    end_time = datetime.strptime('22:00', '%H:%M').time()    # 10 PM
    
    return start_time <= current_time <= end_time

print(f'Current operational status: {is_operational_time()}')

Current operational status: False


## Data Preprocessing with Hand Landmarks

In [4]:
# Initialize MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)

# Image parameters
IMG_WIDTH, IMG_HEIGHT = 224, 224
BATCH_SIZE = 32
EPOCHS = 25

def extract_hand_landmarks(image):
    """
    Extract hand landmarks from image
    """
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_image)
    
    landmarks = []
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for landmark in hand_landmarks.landmark:
                landmarks.extend([landmark.x, landmark.y, landmark.z])
    
    # Pad with zeros if no hands detected
    while len(landmarks) < 63:  # 21 landmarks * 3 coordinates
        landmarks.append(0.0)
    
    return np.array(landmarks[:63])  # Take first 63 values

# Data augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    validation_split=0.2
)

val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

## Model Architecture
Using MobileNetV2 for efficient real-time processing

In [None]:
import tensorflow as tf

# Image branch
image_input = tf.keras.Input(shape=(224, 224, 3), name="images")
x1 = tf.keras.layers.Conv2D(32, (3, 3), activation="relu")(image_input)
x1 = tf.keras.layers.MaxPooling2D()(x1)
x1 = tf.keras.layers.Flatten()(x1)
x1 = tf.keras.layers.Dense(128, activation="relu")(x1)

# Landmark branch
landmark_input = tf.keras.Input(shape=(21, 2), name="landmarks")
x2 = tf.keras.layers.Dense(64, activation="relu")(landmark_input)

# collapse to 2D
x2 = tf.keras.layers.GlobalAveragePooling1D()(x2)   # (batch, 64)
# alternative: x2 = tf.keras.layers.Flatten()(x2)   # (batch, 21*64)

# Merge
merged = tf.keras.layers.Concatenate()([x1, x2])
x = tf.keras.layers.Dense(128, activation="relu")(merged)
output = tf.keras.layers.Dense(len(set(labels)), activation="softmax")(x)

# Build model
model = tf.keras.Model(inputs=[image_input, landmark_input], outputs=output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 images (InputLayer)         [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv2d_3 (Conv2D)           (None, 222, 222, 32)         896       ['images[0][0]']              
                                                                                                  
 max_pooling2d_3 (MaxPoolin  (None, 111, 111, 32)         0         ['conv2d_3[0][0]']            
 g2D)                                                                                             
                                                                                                  
 landmarks (InputLayer)      [(None, 21, 2)]              0         []                      

## Custom Data Generator for Multi-input Model

In [17]:
class SignLanguageDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, image_paths, labels, batch_size=32, img_size=(224, 224)):
        self.image_paths = image_paths
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size

        # Create label mapping
        self.all_classes = sorted(set(labels))
        self.label_to_index = {label: idx for idx, label in enumerate(self.all_classes)}

    def __len__(self):
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        batch_indexes = self.image_paths[index * self.batch_size:(index + 1) * self.batch_size]
        batch_labels = self.labels[index * self.batch_size:(index + 1) * self.batch_size]

        return self._generate_data(batch_indexes, batch_labels)

    def _generate_data(self, batch_paths, batch_labels):
        # Allocate memory for this batch
        X_images = np.zeros((len(batch_paths), self.img_size[0], self.img_size[1], 3), dtype=np.float32)
        X_landmarks = np.zeros((len(batch_paths), 21, 2), dtype=np.float32)  # <-- FIXED
        y = np.zeros((len(batch_paths), len(self.all_classes)), dtype=np.float32)

        for i, (path, label) in enumerate(zip(batch_paths, batch_labels)):
            # Load image
            img = cv2.imread(path)
            img = cv2.resize(img, self.img_size)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            X_images[i] = img_rgb / 255.0

            # Extract landmarks
            landmarks = extract_hand_landmarks(img_rgb)
            if landmarks is not None:
                landmarks = np.array(landmarks).reshape(-1, 3)[:, :2]  # keep x,y only
            else:
                landmarks = np.zeros((21, 2))

            X_landmarks[i] = landmarks

            # One-hot encode label
            label_index = self.label_to_index[label]
            y[i] = tf.keras.utils.to_categorical(label_index, num_classes=len(self.all_classes))

        return [X_images, X_landmarks], y


## Real-time Prediction Function

In [18]:
def predict_sign_language(frame, model):
    """
    Predict sign language from video frame
    Only works during operational hours (6 PM - 10 PM)
    """
    if not is_operational_time():
        return {'prediction': 'Model not operational', 'confidence': 0.0, 'operational': False}
    
    # Preprocess image
    img = cv2.resize(frame, (IMG_HEIGHT, IMG_WIDTH))
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_normalized = np.expand_dims(img_rgb / 255.0, axis=0)
    
    # Extract landmarks
    landmarks = extract_hand_landmarks(frame)
    landmarks_batch = np.expand_dims(landmarks, axis=0)
    
    # Predict
    predictions = model.predict([img_normalized, landmarks_batch])
    predicted_class = np.argmax(predictions[0])
    confidence = float(np.max(predictions[0]))
    
    return {
        'prediction': all_classes[predicted_class],
        'confidence': confidence,
        'operational': True
    }

def draw_hand_landmarks(image, results):
    """
    Draw hand landmarks on image
    """
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    return image

print('Prediction functions created.')

Prediction functions created.


## Training

In [19]:
import os
import glob

# Path to your dataset root folder
train_dir = r"C:\Users\sarva\Emotion_detection-main\datasets\sign_language_data\train"

paths = []
labels = []

# Loop through each subfolder (each class)
for folder in os.listdir(train_dir):
    folder_path = os.path.join(train_dir, folder)
    if os.path.isdir(folder_path):  
        for file in glob.glob(os.path.join(folder_path, "*.*")):  
            paths.append(file)
            labels.append(folder)  # label is the folder name

print("Total images:", len(paths))
print("Unique labels:", sorted(set(labels)))
print("Sample path:", paths[0] if paths else " No images found")
print("Sample label:", labels[0] if labels else " No labels found")


Total images: 2515
Unique labels: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Sample path: C:\Users\sarva\Emotion_detection-main\datasets\sign_language_data\train\0\hand1_0_bot_seg_1_cropped.jpeg
Sample label: 0


In [20]:
import cv2
import numpy as np

# Pick one image path from your dataset
sample_img_path = paths[0]

# Load image
img = cv2.imread(sample_img_path)

# Extract landmarks
landmarks = extract_hand_landmarks(img)

if landmarks is not None:
    landmarks = np.array(landmarks).reshape(21, 3)[:, :2]
else:
    landmarks = np.zeros((21, 2))

print("Final landmarks shape:", landmarks.shape)
print("First 5 points:\n", landmarks[:5])


Final landmarks shape: (21, 2)
First 5 points:
 [[0.30983633 0.33478421]
 [0.34160697 0.32744119]
 [0.38675517 0.29366809]
 [0.4496811  0.27904955]
 [0.50699651 0.28079921]]


In [21]:
# Pick one test image
img_path = paths[0]
img = cv2.imread(img_path)
img = cv2.resize(img, (224, 224))
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Extract landmarks
landmarks = extract_hand_landmarks(img_rgb)
if landmarks is not None:
    landmarks = np.array(landmarks).reshape(-1, 3)[:, :2]  # x,y only
else:
    landmarks = np.zeros((21, 2))

# Allocate a placeholder for testing
X_landmarks = np.zeros((1, 21, 2))  
X_landmarks[0] = landmarks

print("Final X_landmarks shape:", X_landmarks.shape)
print("First 5 points:\n", X_landmarks[0][:5])


Final X_landmarks shape: (1, 21, 2)
First 5 points:
 [[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [24]:
from sklearn.model_selection import train_test_split

# Split your dataset
train_paths, val_paths, train_labels, val_labels = train_test_split(
    paths, labels, test_size=0.2, stratify=labels, random_state=42
)

# Create generators
train_generator = SignLanguageDataGenerator(train_paths, train_labels, batch_size=32)
val_generator = SignLanguageDataGenerator(val_paths, val_labels, batch_size=32)

callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3),
    tf.keras.callbacks.ModelCheckpoint('sign_language_model.h5', save_best_only=True)
]

# Train with validation
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=25,
    callbacks=callbacks
)

Epoch 1/25

  saving_api.save_model(


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


## Save Model

In [None]:
model.save('sign_language_detection_model_finals.h5')
print('Model saved successfully!')

Model saved successfully!
