In [7]:
import cv2
import numpy as np
from PIL import Image
import torch
from ultralytics import YOLO
from tensorflow.keras.models import load_model
from transformers import AutoImageProcessor, SiglipForImageClassification
import mediapipe as mp

class ModelSelector:
    def __init__(self, model_type):
        self.model_type = model_type.lower()

        if self.model_type == "emotion":
            self.model = YOLO("best.pt")  # YOLOv8 Emotion
            self.predict_func = self._predict_emotion
        elif self.model_type == "gender":
            self.gender_net = cv2.dnn.readNetFromCaffe("deploy_gender.prototxt", "gender_net.caffemodel")
            self.age_net = cv2.dnn.readNetFromCaffe("deploy_age.prototxt", "age_net.caffemodel")
            self.predict_func = self._predict_demographics
        elif self.model_type == "sign":
            self.model = SiglipForImageClassification.from_pretrained("sign_model", trust_remote_code=True)
            self.processor = AutoImageProcessor.from_pretrained("sign_model",use_fast = False)
            self.model.eval()

            # Setup MediaPipe
            self.mp_hands = mp.solutions.hands
            self.hands = self.mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
            self.mp_draw = mp.solutions.drawing_utils
            self.predict_func = self._predict_sign
        else:
            raise ValueError("Unsupported model type. Choose from 'emotion', 'gender', or 'sign'.")

    def predict_image(self, image_path):
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError("Could not read image.")
        return self.predict_func(img)

    def predict_webcam(self):
        cap = cv2.VideoCapture(0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            result = self.predict_func(frame)
            frame = result if isinstance(result, np.ndarray) else self._overlay_text(frame, result)
            cv2.imshow("Webcam Inference", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        cap.release()
        cv2.destroyAllWindows()

    def _predict_emotion(self, frame):
        results = self.model(frame)
        return results[0].plot()  # Return frame with bounding boxes

    def _predict_demographics(self, frame):
        GENDER_LIST = ['Male', 'Female']
        AGE_LIST = ['(0-2)', '(4-6)', '(8-12)', '(15-20)', '(25-32)', '(38-43)', '(48-53)', '(60-100)']

        # Face detection (simple center crop fallback if no face detector)
        h, w, _ = frame.shape
        face_img = cv2.resize(frame, (227, 227))
        blob = cv2.dnn.blobFromImage(face_img, 1.0, (227, 227), (78.4263377603, 87.7689143744, 114.895847746), swapRB=False)

        self.gender_net.setInput(blob)
        gender_preds = self.gender_net.forward()
        gender = GENDER_LIST[gender_preds[0].argmax()]
        gender_conf = gender_preds[0].max()
    
        self.age_net.setInput(blob)
        age_preds = self.age_net.forward()
        age = AGE_LIST[age_preds[0].argmax()]
        age_conf = age_preds[0].max()
    
        return f"{gender} ({gender_conf*100:.1f}%) | Age: {age} ({age_conf*100:.1f}%)"

    def _predict_sign(self, frame):
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = self.hands.process(img_rgb)

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                h, w, _ = frame.shape
                x_coords = [lm.x * w for lm in hand_landmarks.landmark]
                y_coords = [lm.y * h for lm in hand_landmarks.landmark]
                x1, y1, x2, y2 = int(min(x_coords)), int(min(y_coords)), int(max(x_coords)), int(max(y_coords))

                padding = 20
                x1, y1 = max(0, x1 - padding), max(0, y1 - padding)
                x2, y2 = min(w, x2 + padding), min(h, y2 + padding)

                hand_roi = frame[y1:y2, x1:x2]
                if hand_roi.size == 0:
                    continue

                # Enhance brightness and contrast
                hsv = cv2.cvtColor(hand_roi, cv2.COLOR_BGR2HSV)
                hsv[..., 2] = cv2.equalizeHist(hsv[..., 2])
                enhanced = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
                final = cv2.convertScaleAbs(enhanced, alpha=1.2, beta=15)

                image_pil = Image.fromarray(cv2.cvtColor(final, cv2.COLOR_BGR2RGB)).convert("RGB")
                inputs = self.processor(images=image_pil, return_tensors="pt")

                with torch.no_grad():
                    logits = self.model(**inputs).logits
                    probs = torch.nn.functional.softmax(logits, dim=1).squeeze()
                    pred_index = torch.argmax(probs).item()
                    confidence = probs[pred_index].item()
                    pred_label = chr(65 + pred_index)

                # Draw prediction and landmarks
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f'{pred_label} ({confidence:.2f})', (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                self.mp_draw.draw_landmarks(frame, hand_landmarks, self.mp_hands.HAND_CONNECTIONS)
        return frame

    def _overlay_text(self, frame, text):
        cv2.putText(frame, text, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1,
                    (0, 255, 0), 2, cv2.LINE_AA)
        return frame

In [10]:
class RAGRouter:
    def __init__(self):
        self.sign_selector = ModelSelector("sign")
        self.gender_selector = ModelSelector("gender")
        self.emotion_selector = ModelSelector("emotion")

        # Mediapipe Face detection
        self.mp_face = mp.solutions.face_detection
        self.face_det = self.mp_face.FaceDetection(min_detection_confidence=0.6)

    def predict_from_frame(self, frame):
        h, w, _ = frame.shape
        annotated_frame = frame.copy()
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # ---- 1. Face Detection for gender & emotion ----
        face_results = self.face_det.process(rgb)
        if face_results.detections:
            for detection in face_results.detections:
                box = detection.location_data.relative_bounding_box
                x1 = int(box.xmin * w)
                y1 = int(box.ymin * h)
                x2 = int((box.xmin + box.width) * w)
                y2 = int((box.ymin + box.height) * h)

                face_crop = frame[y1:y2, x1:x2]
                if x2 <= x1 or y2 <= y1 or face_crop.size == 0:
                    continue

                # Gender prediction
                gender = self.gender_selector._predict_demographics(face_crop)

                # Emotion prediction
                emo_results = self.emotion_selector.model(face_crop)
                preds = emo_results[0]
                if preds.boxes:
                    class_id = int(preds.boxes.cls[0])
                    label = preds.names[int(class_id)]
                    cv2.putText(annotated_frame, f"Emotion: {label}", (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX,
                                0.8, (0, 255, 255), 2)

                # Annotate gender
                cv2.putText(annotated_frame, gender, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                            0.6, (255, 0, 0), 2)

        # ---- 2. Hand Detection for sign prediction ----
        annotated_frame = self.sign_selector._predict_sign(annotated_frame)

        return annotated_frame

    def run_webcam(self):
        cap = cv2.VideoCapture(0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            result = self.predict_from_frame(frame)
            cv2.imshow("RAG Inference", result)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        cap.release()
        cv2.destroyAllWindows()

In [11]:
if __name__ == "__main__":
    router = RAGRouter()
    router.run_webcam()


0: 640x640 1 Surprised, 56.7ms
Speed: 3.7ms preprocess, 56.7ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Angry, 1 Surprised, 59.3ms
Speed: 3.8ms preprocess, 59.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Angry, 57.5ms
Speed: 3.4ms preprocess, 57.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Angry, 1 Surprised, 57.2ms
Speed: 3.5ms preprocess, 57.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Surprised, 61.6ms
Speed: 4.2ms preprocess, 61.6ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Surprised, 54.0ms
Speed: 3.9ms preprocess, 54.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Angry, 1 Sad, 57.0ms
Speed: 3.6ms preprocess, 57.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 Angry, 1 Surprised, 53.2ms
Speed: 3.4ms preprocess, 53.2ms inferen