In [1]:
import cv2
import numpy as np
import os
import time
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model

# ----------- Load Training Data from Folder Structure -----------
def load_data_from_folder(base_path):
    X, y = [], []
    max_per_class = 350  # Increase for better performance
    for label in sorted(os.listdir(base_path)):
        label_folder = os.path.join(base_path, label)
        if not os.path.isdir(label_folder):
            continue
        print(f"Loading images from: {label_folder}")
        img_count = 0
        for img_file in os.listdir(label_folder):
            if img_count >= max_per_class:
                break
            img_path = os.path.join(label_folder, img_file)
            img = cv2.imread(img_path)
            if img is not None:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                img = cv2.resize(img, (75, 75))
                img = np.stack((img,) * 3, axis=-1)  # Convert to 3 channels
                X.append(img)
                y.append(label)
                img_count += 1
    return np.array(X), np.array(y)

# ----------- Load Flat Test Images from Filenames -----------
def load_test_images_from_single_folder(base_path):
    X, y = [], []
    for filename in sorted(os.listdir(base_path)):
        if filename.lower().endswith(('.jpg', '.png')):
            label = filename[0].upper()
            if label not in list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                continue
            img_path = os.path.join(base_path, filename)
            img = cv2.imread(img_path)
            if img is not None:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                img = cv2.resize(img, (75, 75))
                img = np.stack((img,) * 3, axis=-1)
                X.append(img)
                y.append(label)
    print(f"✅ Loaded {len(X)} test images from flat folder.")
    return np.array(X), np.array(y)

# ----------- Paths to Dataset -----------
train_path = r"C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train"
test_path = r"C:\Users\asusn\Desktop\ASL\asl_alphabet_test\asl_alphabet_test"

print("📦 Loading training data...")
X_train, y_train = load_data_from_folder(train_path)
print("📦 Loading testing data...")
X_test, y_test = load_test_images_from_single_folder(test_path)

if X_test.size == 0 or y_test.size == 0:
    print("❌ No test data loaded. Please check test folder.")
    exit()

# ----------- Normalize and Encode -----------
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Save the LabelEncoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)
print("✅ Label encoder saved as label_encoder.pkl")

# Flatten for traditional ML models
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# ----------- CNN using MobileNetV2 -----------
def get_mobilenet_model():
    base_model = MobileNetV2(include_top=False, weights='imagenet', input_shape=(75, 75, 3))
    base_model.trainable = False  # Freeze for speed
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(64, activation='relu')(x)
    output = Dense(len(np.unique(y_train_encoded)), activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

print("🧠 Training MobileNetV2 CNN model...")
cnn_model = get_mobilenet_model()
cnn_model.fit(X_train, y_train_encoded,
              epochs=7, validation_data=(X_test, y_test_encoded), verbose=1)

# Save the trained CNN model
cnn_model.save("asl_mobilenetv2.h5")
print("✅ CNN model saved as asl_mobilenetv2.h5")

# Predict with CNN
cnn_pred = np.argmax(cnn_model.predict(X_test, verbose=0), axis=1)

# ----------- Random Forest -----------
print("🌲 Training Random Forest (50 trees)...")
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train_flat, y_train_encoded)
rf_pred = rf.predict(X_test_flat)

# ----------- XGBoost -----------
print("🚀 Training XGBoost (50 trees)...")
xgb = XGBClassifier(n_estimators=50, eval_metric='mlogloss', use_label_encoder=False)
xgb.fit(X_train_flat, y_train_encoded)
xgb_pred = xgb.predict(X_test_flat)

# ----------- Ensemble Prediction -----------
ensemble_preds = np.array([cnn_pred, rf_pred, xgb_pred])
final_preds = mode(ensemble_preds, axis=0).mode.flatten()

# ----------- Evaluation -----------
print("\n✅ Ensemble Accuracy:", accuracy_score(y_test_encoded, final_preds))
print("📊 Classification Report:\n", classification_report(y_test_encoded, final_preds))



📦 Loading training data...
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\A
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\B
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\C
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\D
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\E
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\F
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\G
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\H
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\I
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\J
Loading images from: C:\Users\asusn\Desktop\ASL\asl_alphabet_train\asl_alphabet_train\K
Loadi



✅ CNN model saved as asl_mobilenetv2.h5
🌲 Training Random Forest (50 trees)...
🚀 Training XGBoost (50 trees)...

✅ Ensemble Accuracy: 0.9285714285714286
📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      0.50      0.67         2
          

In [2]:
import cv2
import time
import numpy as np
from scipy.stats import mode
import mediapipe as mp

# Assuming the models and label encoder are already loaded
# cnn_model, rf, xgb, and le should be defined before running this code

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7
)

print("\n📷 Starting Webcam... Press 'q' to quit.\n")
cap = cv2.VideoCapture(0)

buffer_text = ""
last_prediction_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            h, w, _ = frame.shape
            x_coords = [lm.x * w for lm in hand_landmarks.landmark]
            y_coords = [lm.y * h for lm in hand_landmarks.landmark]
            xmin, xmax = int(min(x_coords)), int(max(x_coords))
            ymin, ymax = int(min(y_coords)), int(max(y_coords))

            pad = 20
            xmin = max(xmin - pad, 0)
            ymin = max(ymin - pad, 0)
            xmax = min(xmax + pad, w)
            ymax = min(ymax + pad, h)

            roi = frame[ymin:ymax, xmin:xmax]
            if roi.size == 0:
                continue

            # Preprocess ROI
            img = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(img, (75, 75))
            img = np.stack((img,) * 3, axis=-1)  # Convert to 3 channels
            img_norm = img.astype('float32') / 255.0

            img_cnn = np.expand_dims(img_norm, axis=0)           # For CNN
            img_flat = img_norm.reshape(1, -1)                    # For RF/XGB

            # Model predictions
            pred_cnn = np.argmax(cnn_model.predict(img_cnn, verbose=0))
            pred_rf = rf.predict(img_flat)[0]
            pred_xgb = xgb.predict(img_flat)[0]

            # Majority vote
            predictions = np.array([pred_cnn, pred_rf, pred_xgb])
            majority_vote = np.atleast_1d(mode(predictions, axis=0).mode)[0]
            predicted_letter = le.inverse_transform([majority_vote])[0]

            # Update buffer every 1 second
            current_time = time.time()
            if current_time - last_prediction_time > 1.0:
                buffer_text += predicted_letter
                last_prediction_time = current_time

            # Display prediction box
            cv2.rectangle(frame, (xmin, ymin - 40), (xmin + 60, ymin - 10), (0, 128, 255), -1)
            cv2.putText(frame, predicted_letter, (xmin + 5, ymin - 15),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

    # Display buffer text
    cv2.putText(frame, f'Typed: {buffer_text}', (10, 450),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    cv2.imshow("ASL Sign Recognition with Hand Tracking", frame)

    key = cv2.waitKey(1)
    if key & 0xFF == ord('q'):
        break
    elif key & 0xFF == ord('c'):
        buffer_text = ""  # Clear buffer

cap.release()
cv2.destroyAllWindows()


📷 Starting Webcam... Press 'q' to quit.

