# 1. Install & Import Libraries

In [1]:
import cv2
import numpy as np
import os
import mediapipe as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard

# 2. MediaPipe Setup

In [5]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [6]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [7]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh   = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh   = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# 3. Load ASL Dataset

In [3]:
DATASET_PATH = "asl_dataset"
actions = sorted(os.listdir(DATASET_PATH))  # sorted for consistency
print("Classes:")
for act in actions:
    print("-", act)

Classes:
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- a
- b
- c
- d
- e
- f
- g
- h
- i
- j
- k
- l
- m
- n
- o
- p
- q
- r
- s
- t
- u
- v
- w
- x
- y
- z


In [4]:
sequence_length = 30   # number of frames per sample

# 4. Build Features & Labels

In [9]:
sequences, labels = [], []

with mp_holistic.Holistic(min_detection_confidence=0.5,
                          min_tracking_confidence=0.5) as holistic:
    for idx, action in enumerate(actions):
        folder = os.path.join(DATASET_PATH, action)
        for img_file in os.listdir(folder):
            img_path = os.path.join(folder, img_file)
            image = cv2.imread(img_path)
            if image is None:
                continue  # skip corrupted images

            image, results = mediapipe_detection(image, holistic)
            keypoints = extract_keypoints(results)

            sequences.append(keypoints)
            labels.append(idx)

I0000 00:00:1755316606.826939 2493737 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1755316606.945103 2530105 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755316606.963749 2530106 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755316606.965642 2530109 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755316606.965658 2530103 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755316606.965684 2530108 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

In [10]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)

print("Feature Shape:", X.shape)   # (num_images, 1662)
print("Label Shape:", y.shape)     # (num_images, num_classes)

Feature Shape: (2515, 1662)
Label Shape: (2515, 36)


In [12]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

# 5. Train/Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 6. Build Model

In [14]:
model = Sequential([
    Dense(512, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(len(actions), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [16]:
model.summary()

In [18]:
history = model.fit(X_train, y_train, 
          validation_data=(X_test, y_test),
          epochs=300, batch_size=32)

Epoch 1/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4955 - loss: 1.7801 - val_accuracy: 0.4453 - val_loss: 1.9871
Epoch 2/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4965 - loss: 1.7724 - val_accuracy: 0.4592 - val_loss: 1.9518
Epoch 3/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4826 - loss: 1.7642 - val_accuracy: 0.4553 - val_loss: 1.9430
Epoch 4/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5000 - loss: 1.7431 - val_accuracy: 0.4513 - val_loss: 1.9406
Epoch 5/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4871 - loss: 1.7446 - val_accuracy: 0.4732 - val_loss: 1.9346
Epoch 6/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4995 - loss: 1.7313 - val_accuracy: 0.4592 - val_loss: 1.9390
Epoch 7/300
[1m63/63[0m [32m━━━

# Trying CNN

In [19]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMG_SIZE = 64  # resize images to 64x64
BATCH_SIZE = 32

datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1
)

train_gen = datagen.flow_from_directory(
    DATASET_PATH,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'
)

val_gen = datagen.flow_from_directory(
    DATASET_PATH,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation'
)


Found 2012 images belonging to 36 classes.
Found 503 images belonging to 36 classes.


In [20]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(train_gen.num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [22]:
history = model.fit(train_gen, validation_data=val_gen, epochs=25)

  self._warn_if_super_not_called()


Epoch 1/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 113ms/step - accuracy: 0.1625 - loss: 3.0912 - val_accuracy: 0.4573 - val_loss: 1.8255
Epoch 2/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 128ms/step - accuracy: 0.4478 - loss: 1.7519 - val_accuracy: 0.6720 - val_loss: 1.1170
Epoch 3/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 105ms/step - accuracy: 0.5895 - loss: 1.2557 - val_accuracy: 0.7296 - val_loss: 0.8579
Epoch 4/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 106ms/step - accuracy: 0.6844 - loss: 0.9573 - val_accuracy: 0.8151 - val_loss: 0.5672
Epoch 5/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 129ms/step - accuracy: 0.7286 - loss: 0.8128 - val_accuracy: 0.7873 - val_loss: 0.5758
Epoch 6/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 132ms/step - accuracy: 0.7604 - loss: 0.7065 - val_accuracy: 0.7813 - val_loss: 0.5963
Epoch 7/25
[1m63/63[0m [3

In [23]:
model.save('asl_cnn_model.h5')



In [26]:
import json

with open("class_indices.json", "w") as f:
    json.dump(train_gen.class_indices, f)

# Real Time Testing

In [31]:
import os

# Path to your dataset root (same one used for training)
DATASET_PATH = "asl_dataset"

# Get folder names (classes), sorted for consistency
actions = sorted(os.listdir(DATASET_PATH))

# Build mapping: {class_index: class_name}
classes = {idx: action for idx, action in enumerate(actions)}

print(classes)  # e.g. {0: '0', 1: '1', ..., 10: 'a', 11: 'b', ...}

{0: '.DS_Store', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: 'a', 12: 'b', 13: 'c', 14: 'd', 15: 'e', 16: 'f', 17: 'g', 18: 'h', 19: 'i', 20: 'j', 21: 'k', 22: 'l', 23: 'm', 24: 'n', 25: 'o', 26: 'p', 27: 'q', 28: 'r', 29: 's', 30: 't', 31: 'u', 32: 'v', 33: 'w', 34: 'x', 35: 'y', 36: 'z'}


In [32]:
import mediapipe as mp

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(min_detection_confidence=0.7)

cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Get bounding box around hand
            h, w, _ = frame.shape
            xs = [lm.x for lm in hand_landmarks.landmark]
            ys = [lm.y for lm in hand_landmarks.landmark]
            x_min, x_max = int(min(xs) * w), int(max(xs) * w)
            y_min, y_max = int(min(ys) * h), int(max(ys) * h)

            # Crop hand region
            hand_img = frame[y_min:y_max, x_min:x_max]
            if hand_img.size == 0:  # avoid empty crop
                continue

            hand_img = cv2.resize(hand_img, (IMG_SIZE, IMG_SIZE))
            hand_img = hand_img.astype("float32") / 255.0
            hand_img = np.expand_dims(hand_img, axis=0)

            # Predict
            preds = model.predict(hand_img, verbose=0)
            pred_class = np.argmax(preds)
            pred_label = classes[pred_class]
            confidence = np.max(preds)

            # Draw results
            cv2.putText(frame, f"{pred_label} ({confidence:.2f})",
                        (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 255, 0), 2)
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    cv2.imshow("ASL Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1755321331.118220 2493737 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1755321331.136734 2616185 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1755321331.150447 2616185 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
