DATA PRE-PROCESSING

In [1]:
import cv2
import numpy as np
import os

data_dir = "C:/Users/sachi/Documents/aisign/venv/dataset"
image_size = (64, 64)
x, y = [], []

for label, sign in enumerate(os.listdir(data_dir)):
    for img_name in os.listdir(os.path.join(data_dir, sign)):
        img_path = os.path.join(data_dir, sign, img_name)
        img = cv2.imread(img_path)
        img = cv2.resize(img, image_size)
        x.append(img)
        y.append(label)

x = np.array(x) / 255.0  # Normalize
y = np.array(y)


HAND LANDMARKS

In [5]:
import mediapipe as mp

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)

def extract_hand_landmarks(image):
    results = hands.process(image)
    keypoints = []
    if results.multi_hand_landmarks:
        for hand in results.multi_hand_landmarks:
            for point in hand.landmark:
                keypoints.extend(point.x)
                keypoints.extend(point.y)
    
    return keypoints


In [1]:
import tensorflow as tf
print(tf.__version__)


2.19.0


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Conv2D(32, (3,3), activation='relu', input_shape=(64, 64, 3)),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(29, activation='softmax')  
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x, y, epochs=10, validation_split=0.2)
model.save("ai_sign_model.h5")

Epoch 1/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 30ms/step - accuracy: 0.6329 - loss: 1.2179 - val_accuracy: 0.0270 - val_loss: 29.7954
Epoch 2/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 27ms/step - accuracy: 0.9823 - loss: 0.0571 - val_accuracy: 0.0322 - val_loss: 40.8878
Epoch 3/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 30ms/step - accuracy: 0.9917 - loss: 0.0293 - val_accuracy: 0.0324 - val_loss: 37.5418
Epoch 4/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 30ms/step - accuracy: 0.9932 - loss: 0.0229 - val_accuracy: 0.0434 - val_loss: 38.3234
Epoch 5/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 30ms/step - accuracy: 0.9950 - loss: 0.0190 - val_accuracy: 0.0337 - val_loss: 39.1705
Epoch 6/10
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 31ms/step - accuracy: 0.9949 - loss: 0.0182 - val_accuracy: 0.0413 - val_loss: 46.360



In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Split the data into training and validation sets
X_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding since we're using categorical_crossentropy
y_train = to_categorical(y_train, num_classes=29)
y_val = to_categorical(y_val, num_classes=29)

# Reshape the input data for LSTM (samples, time steps, features)
# Assuming each image is treated as a sequence of rows
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], -1)
x_val = x_val.reshape(x_val.shape[0], x_val.shape[1], -1)

model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),  
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(29, activation='softmax')  
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(x_val, y_val))
model.save("ai_sign_lstm_model.h5")

  super().__init__(**kwargs)


Epoch 1/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 41ms/step - accuracy: 0.0435 - loss: 478171.5625 - val_accuracy: 0.0326 - val_loss: 3.3685
Epoch 2/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 39ms/step - accuracy: 0.0340 - loss: 3.3678 - val_accuracy: 0.0328 - val_loss: 3.3680
Epoch 3/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 38ms/step - accuracy: 0.0334 - loss: 3.3676 - val_accuracy: 0.0326 - val_loss: 3.3680
Epoch 4/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 38ms/step - accuracy: 0.0353 - loss: 3.3675 - val_accuracy: 0.0346 - val_loss: 3.3680
Epoch 5/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 38ms/step - accuracy: 0.0322 - loss: 3.3676 - val_accuracy: 0.0339 - val_loss: 3.3680
Epoch 6/50
[1m2175/2175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 38ms/step - accuracy: 0.0354 - loss: 3.3675 - val_accuracy: 0.0333 - val_loss: 3.3680



In [6]:
import tensorflow as tf

model = tf.keras.models.load_model("ai_sign_model.h5")



In [7]:
print(model.input_shape)

(None, 64, 64, 3)


In [10]:
import os
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.7)
mp_draw = mp.solutions.drawing_utils

# Load the trained model
model = tf.keras.models.load_model("ai_sign_lstm_model.h5")  # Ensure model file exists

# Define class labels (Modify based on your dataset)
CLASS_LABELS = {
    0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G", 7: "H", 8: "I",
    9: "J", 10: "K", 11: "L", 12: "M", 13: "N", 14: "O", 15: "P", 16: "Q",
    17: "R", 18: "S", 19: "T", 20: "U", 21: "V", 22: "W", 23: "X", 24: "Y",
    25: "Z", 26: "Nothing", 27: "Space", 28: "Delete"
}

def extract_landmarks(image):
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    
    keypoints = []
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for point in hand_landmarks.landmark:
                keypoints.extend([point.x, point.y, point.z])

    while len(keypoints) < 192:  # Ensure correct shape
        keypoints.append(0.0)

    if keypoints:
        print(f"✅ Landmarks extracted for image!")  # Debug message

    return keypoints if keypoints else None


# Path to test dataset folder
TEST_FOLDER = "C:/Users/sachi/Documents/aisign/venv/test_dataset"

# Variables for evaluation
correct_predictions = 0
total_samples = 0
frame_sequence = []  # Store last 64 frames

# Loop through test images
for filename in os.listdir(TEST_FOLDER):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        total_samples += 1
        img_path = os.path.join(TEST_FOLDER, filename)

        # Load image
        image = cv2.imread(img_path)

        # Extract hand landmarks
        landmarks = extract_landmarks(image)

        if landmarks:
            frame_sequence.append(landmarks)

            # Maintain only last 64 frames
            if len(frame_sequence) > 64:
                frame_sequence.pop(0)

            if len(frame_sequence) == 64:  # Predict only if 64 frames are available
                input_data = np.array(frame_sequence, dtype=np.float32).reshape(1, 64, 192)
                prediction = model.predict(input_data)
                predicted_label = np.argmax(prediction)

                # Extract actual label from filename (e.g., "A_1.jpg" → "A")
                actual_label = filename[0].upper()
                actual_label_index = [k for k, v in CLASS_LABELS.items() if v == actual_label]

                if actual_label_index and actual_label_index[0] == predicted_label:
                    correct_predictions += 1

# Calculate accuracy
accuracy = (correct_predictions / total_samples) * 100 if total_samples > 0 else 0
print(f"✅ Model Accuracy on Test Dataset: {accuracy:.2f}%")




✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Landmarks extracted for image!
✅ Model Accuracy on Test Dataset: 0.00%


In [1]:
import mediapipe as mp
import cv2
import numpy as np
import tensorflow as tf

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
mp_draw = mp.solutions.drawing_utils

# Load the trained model
model = tf.keras.models.load_model("ai_sign_model.h5")  # Use the CNN model instead of LSTM

# Define class labels
CLASS_LABELS = {
    0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G", 7: "H", 8: "I",
    9: "J", 10: "K", 11: "L", 12: "M", 13: "N", 14: "O", 15: "P", 16: "Q",
    17: "R", 18: "S", 19: "T", 20: "U", 21: "V", 22: "W", 23: "X", 24: "Y",
    25: "Z", 26: "Nothing", 27: "Space", 28: "Delete"
}

# Function to preprocess image
def preprocess_image(image):
    # Resize image to match model input size
    image = cv2.resize(image, (64, 64))
    # Normalize pixel values
    image = image / 255.0
    # Add batch dimension
    image = np.expand_dims(image, axis=0)
    return image

# Start Video Capture
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Process frame for prediction
    processed_frame = preprocess_image(frame)
    
    # Make prediction
    prediction = model.predict(processed_frame)
    gesture_index = np.argmax(prediction)
    gesture = CLASS_LABELS.get(gesture_index, "Unknown")

    # Draw hand landmarks
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    # Display prediction on video feed
    cv2.putText(frame, f"Gesture: {gesture}", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    # Show video feed
    cv2.imshow("Sign Language Recognition", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

KeyboardInterrupt: 