In [2]:
import cv2
import os
import torch
import joblib
import numpy as np
from ultralytics import YOLO
from torchvision import models, transforms
from collections import deque
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Dataset layout: <split>/<emotion_name>/<image files>
TRAIN_PATH = "data/train"
TEST_PATH  = "data/test"

# Each class is capped at BALANCE_FACTOR * (size of the smallest class).
BALANCE_FACTOR = 2

# The per-class subsampling draws from NumPy's global RNG; seed it so that a
# Restart & Run All reproduces the same subset and the same reported metrics.
SEED = 42
np.random.seed(SEED)

# Emotion name -> integer label used as the SVM target.
emotion_to_idx = {
    "angry": 0,
    "disgust": 1,
    "fear": 2,
    "happy": 3,
    "sad": 4,
    "surprise": 5,
    "neutral": 6,
}

# Inverse mapping: integer label -> emotion name.
idx_to_emotion = {v: k for k, v in emotion_to_idx.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


# ImageNet-pretrained MobileNetV2 used as a frozen feature extractor: the
# classification head is replaced by Identity so the forward pass returns the
# features that would have been fed to the classifier.
mobilenet = models.mobilenet_v2(
    weights="MobileNet_V2_Weights.IMAGENET1K_V1"
).to(device)
mobilenet.classifier = torch.nn.Identity()
mobilenet.eval()

# Preprocessing applied to every image before it is passed to MobileNetV2
# (resize to 224x224, convert to tensor, normalize per channel).
transform_img = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


def extract_embedding(img):
    """Compute the MobileNetV2 feature vector for one image.

    Args:
        img: Image array in OpenCV channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before preprocessing).

    Returns:
        A flattened NumPy array holding the output of the module-level
        ``mobilenet`` for this single image.
    """
    # The preprocessing pipeline works on PIL images in RGB order.
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    to_pil = transforms.ToPILImage()
    preprocessed = transform_img(to_pil(rgb_pixels))

    # Add the batch dimension and move the input next to the model.
    batch = preprocessed.unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = mobilenet(batch)

    return features.cpu().numpy().flatten()


def get_class_counts(base_path):
    """Count the directory entries found for every emotion under ``base_path``.

    Args:
        base_path: Directory expected to contain one sub-folder per key of
            the module-level ``emotion_to_idx``.

    Returns:
        Dict mapping each emotion name to the number of entries listed in its
        sub-folder, or 0 when that sub-folder does not exist.
    """
    def _entries_in(emotion):
        # Every directory entry is counted; no filtering by file type.
        class_dir = os.path.join(base_path, emotion)
        return len(os.listdir(class_dir)) if os.path.isdir(class_dir) else 0

    return {emotion: _entries_in(emotion) for emotion in emotion_to_idx}


def load_dataset_balanced(base_path, factor=2, seed=None):
    """Load a class-balanced random subset of a dataset as embeddings.

    Every emotion folder under ``base_path`` contributes at most
    ``factor * (size of the smallest non-empty class)`` images, drawn without
    replacement. Each selected image is read with OpenCV and converted into a
    feature vector with ``extract_embedding``; unreadable files are skipped.

    Args:
        base_path: Directory containing one sub-folder per emotion name.
        factor: Multiplier applied to the smallest class size to obtain the
            per-class cap. Fractional values are truncated to an integer cap.
        seed: Optional integer. When given, sampling uses a private
            ``RandomState(seed)`` and is reproducible; when ``None``, NumPy's
            global RNG is used (so a prior ``np.random.seed`` is honoured).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked embeddings and the
        matching integer labels from ``emotion_to_idx``.
    """
    counts = get_class_counts(base_path)
    print("Conteo por clase:", counts)

    # A missing or empty class folder must not drive the minimum to zero,
    # otherwise every class would be capped at 0 samples and the whole
    # dataset would come back empty.
    non_empty_counts = [n for n in counts.values() if n > 0]
    min_count = min(non_empty_counts, default=0)
    print(f"Mínimo encontrado = {min_count}")

    max_per_class = int(min_count * factor)
    print(f"Máximo permitido por clase = {max_per_class}")

    rng = np.random if seed is None else np.random.RandomState(seed)

    X, y = [], []

    for emotion_name, label in emotion_to_idx.items():
        folder = os.path.join(base_path, emotion_name)
        if not os.path.isdir(folder):
            print("Carpeta no encontrada:", folder)
            continue

        # os.listdir order is arbitrary; sort so that a fixed seed always
        # selects the same files.
        files = sorted(os.listdir(folder))

        use_n = min(len(files), max_per_class)
        if use_n == 0:
            continue

        selected_files = rng.choice(files, use_n, replace=False)

        for img_name in selected_files:
            img_path = os.path.join(folder, img_name)
            img = cv2.imread(img_path)

            # cv2.imread returns None for files it cannot decode.
            if img is None:
                continue

            X.append(extract_embedding(img))
            y.append(label)

    return np.array(X), np.array(y)

# Build the balanced training subset as MobileNetV2 embeddings.
X_train, y_train = load_dataset_balanced(TRAIN_PATH, BALANCE_FACTOR)

# NOTE: the test split goes through the same random balanced subsampling, so
# the metrics below are computed on a subset of data/test, not on all of it.
X_test, y_test = load_dataset_balanced(TEST_PATH, BALANCE_FACTOR)

print("\nEntrenamiento de SVM")

# Standardize the embeddings, then fit an RBF SVM. probability=True is needed
# so the saved model exposes predict_proba.
clf = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", C=15, gamma="scale", probability=True)
)

clf.fit(X_train, y_train)

# Persist the whole pipeline (scaler + SVM) so it can be reloaded later.
joblib.dump(clf, "emotion_svm_mobilenet.pkl")
print("\nSe guardo el modelo como emotion_svm_mobilenet.pkl")

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\nPrecisión registrada: {acc*100:.2f}%\n")

print("Reporte por clase:")
# Pass the label list explicitly: without it, classification_report infers
# the classes from y_test/y_pred and raises when one of the emotions is
# absent, because target_names would no longer match in length.
report_labels = sorted(idx_to_emotion)
print(classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=[idx_to_emotion[i] for i in report_labels],
    zero_division=0
))

Device: cuda
Conteo por clase: {'angry': 705, 'disgust': 717, 'fear': 281, 'happy': 4772, 'sad': 1982, 'surprise': 1290, 'neutral': 2524}
Mínimo encontrado = 281
Máximo permitido por clase = 562
Conteo por clase: {'angry': 162, 'disgust': 160, 'fear': 74, 'happy': 1185, 'sad': 478, 'surprise': 329, 'neutral': 680}
Mínimo encontrado = 74
Máximo permitido por clase = 148

Entrenamiento de SVM

Se guardo el modelo como emotion_svm_mobilenet.pkl

Precisión registrada: 51.04%

Reporte por clase:
              precision    recall  f1-score   support

       angry       0.58      0.64      0.61       148
     disgust       0.41      0.39      0.40       148
        fear       0.70      0.47      0.56        74
       happy       0.56      0.53      0.55       148
         sad       0.39      0.39      0.39       148
    surprise       0.61      0.67      0.64       148
     neutral       0.43      0.47      0.45       148

    accuracy                           0.51       962
   macro avg    

In [24]:
# Files consumed by the live demo: face-detector weights and the trained
# scaler + SVM pipeline.
YOLO_MODEL = "yolov8n-face-lindevs.pt"
SVM_MODEL  = "emotion_svm_mobilenet.pkl"

# Emotion name <-> integer label; the order below fixes the label values.
emotion_to_idx = {
    name: label
    for label, name in enumerate(
        ("angry", "disgust", "fear", "happy", "sad", "surprise", "neutral")
    )
}
idx_to_emotion = {label: name for name, label in emotion_to_idx.items()}

# Colour tuple used for the full-frame tint of each emotion (passed as-is to
# the OpenCV drawing calls).
emotion_colors = dict(
    angry=(0, 0, 255),
    disgust=(0, 128, 0),
    fear=(128, 0, 128),
    happy=(0, 255, 255),
    sad=(255, 128, 0),
    surprise=(255, 0, 255),
    neutral=(128, 128, 128),
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Face detector.
yolo = YOLO(YOLO_MODEL)

# MobileNetV2 with its classification head replaced by Identity, so the
# forward pass yields features instead of ImageNet logits.
mobilenet = models.mobilenet_v2(weights="MobileNet_V2_Weights.IMAGENET1K_V1")
mobilenet.classifier = torch.nn.Identity()
mobilenet = mobilenet.to(device).eval()

# Preprocessing applied to each cropped face before feature extraction.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]
transform_face = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files produced by a trusted source.
clf = joblib.load(SVM_MODEL)

# Rolling window holding the most recent per-frame predictions (20 entries).
history = deque(maxlen=20)

def stable_emotion(history, threshold=0.40):
    """Return a smoothed emotion for a window of recent predictions.

    The most frequent value in ``history`` wins when it accounts for at least
    ``threshold`` of the entries; otherwise the most recent entry is returned.

    Args:
        history: Sequence of recent predictions (anything supporting ``len``,
            iteration and ``[-1]``, e.g. a list or ``collections.deque``).
        threshold: Minimum share (0..1) of the window the most common value
            must reach to be reported. Defaults to 0.40.

    Returns:
        The selected entry, or ``None`` when ``history`` is empty.
    """
    # Counter is not imported at file level, so it is imported here.
    from collections import Counter

    # An empty window has no most-common element and would divide by zero.
    if not history:
        return None

    emotion, count = Counter(history).most_common(1)[0]
    if count / len(history) >= threshold:
        return emotion
    return history[-1]


# Live demo: detect faces on the default camera, classify the emotion of each
# face crop, and tint the frame with the colour of the smoothed emotion.
# Press "q" in the window to stop.

# A top prediction of "happy" below this probability is replaced by the
# runner-up class.
HAPPY_MIN_PROB = 0.60

# Built once instead of once per detected face.
to_pil = transforms.ToPILImage()

# predict_proba columns follow clf.classes_, not necessarily 0..6, so a column
# position must be mapped through classes_ to obtain the integer label.
class_labels = clf.classes_
happy_label = emotion_to_idx["happy"]

cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        print("No se pudo abrir la cámara")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = yolo(frame, verbose=False)
        boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)

        # Reported when no face is detected in this frame; with several
        # faces, the last one processed wins.
        emotion_now = "neutral"

        for (x1, y1, x2, y2) in boxes:
            w = x2 - x1
            h = y2 - y1

            # Shrink the detection box (15% per side, 18% top, 10% bottom)
            # before cropping the face region.
            fx1 = int(x1 + w * 0.15)
            fy1 = int(y1 + h * 0.18)
            fx2 = int(x2 - w * 0.15)
            fy2 = int(y2 - h * 0.10)

            face = frame[fy1:fy2, fx1:fx2]
            if face.size == 0:
                continue

            rgb = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
            tensor = transform_face(to_pil(rgb)).unsqueeze(0).to(device)

            with torch.no_grad():
                emb = mobilenet(tensor).cpu().numpy().flatten()

            probs = clf.predict_proba([emb])[0]
            best_pos = int(np.argmax(probs))

            # Low-confidence "happy" falls back to the second most likely class.
            if (
                class_labels[best_pos] == happy_label
                and probs[best_pos] < HAPPY_MIN_PROB
                and len(probs) > 1
            ):
                best_pos = int(np.argsort(probs)[-2])

            pred_idx = int(class_labels[best_pos])
            emotion_now = idx_to_emotion[pred_idx]

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, emotion_now, (x1, y1 - 12),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

        # Temporal smoothing over the recent per-frame predictions.
        history.append(emotion_now)
        emo = stable_emotion(history)

        # Blend a solid colour layer over the frame (25% tint).
        overlay = frame.copy()
        color = emotion_colors[emo]
        cv2.rectangle(overlay, (0, 0), (frame.shape[1], frame.shape[0]), color, -1)
        frame = cv2.addWeighted(overlay, 0.25, frame, 0.75, 0)

        cv2.putText(frame, emo, (20, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.4, (255, 255, 255), 3)

        cv2.imshow("YOLO Face + MobileNet + SVM", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always free the camera and close the window, including when the loop
    # is interrupted or an exception is raised mid-frame.
    cap.release()
    cv2.destroyAllWindows()


Device: cuda
