In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from IPython import display
import mediapipe as mp
import numpy as np
import os



In [2]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")


Using device: cuda


Load Gender Model

In [3]:
# Gender backbone: EfficientNet-B0 created with torchvision's default
# pretrained weights, then placed on the selected device.
gender_weights = models.EfficientNet_B0_Weights.DEFAULT
gender_model = models.efficientnet_b0(weights=gender_weights)
gender_model = gender_model.to(device)


In [4]:
# Mark every parameter of the gender network as trainable.
# The trailing semicolon keeps the notebook from echoing the module repr.
gender_model.requires_grad_(True);

In [5]:
# Replace the stock head with a small MLP that ends in a single output unit.
# The order of this list is significant: each layer's position becomes its
# index inside the Sequential, and therefore its key in the state_dict
# (the Linear layers sit at indices 1, 4 and 7).
gender_head_layers = [
    nn.Dropout(p=0.2, inplace=True),
    nn.Linear(in_features=1280, out_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=64, out_features=1),
]
gender_model.classifier = nn.Sequential(*gender_head_layers).to(device)

Load Age Model

In [None]:
# Age backbone: VGG19 created with torchvision's default pretrained weights.
# The model is not moved to `device` in this cell.
age_weights = models.VGG19_Weights.DEFAULT
age_model = models.vgg19(weights=age_weights)

In [7]:
# Start with the whole convolutional feature extractor frozen ...
age_model.features.requires_grad_(False)

# ... then re-enable gradients for the last 20 entries of `features` only.
# (Slicing a Sequential yields a view over the same sub-modules, so the flags
# are set on the original parameters.)
# The trailing semicolon keeps the notebook from echoing the module repr.
age_model.features[-20:].requires_grad_(True);



In [None]:
# Regression head for the age model.
#
# Five stages with the same layout, followed by a single-output Linear layer:
#   Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear -> BatchNorm1d -> ReLU
# Each tuple is (input width, stage width, dropout probability). The stages are
# built in list order, so the nested Sequential indices (and hence the
# state_dict keys) are classifier.0 ... classifier.4, then classifier.5 for
# the final Linear layer.
age_stage_specs = [
    (512 * 7 * 7, 4096, 0.5),
    (4096, 2048, 0.4),
    (2048, 1024, 0.3),
    (1024, 512, 0.2),
    (512, 256, 0.1),
]

age_model.classifier = nn.Sequential(
    *[
        nn.Sequential(
            nn.Linear(stage_in, stage_width),
            nn.BatchNorm1d(stage_width),
            nn.ReLU(),
            nn.Dropout(stage_dropout),
            nn.Linear(stage_width, stage_width),
            nn.BatchNorm1d(stage_width),
            nn.ReLU(),
        )
        for stage_in, stage_width, stage_dropout in age_stage_specs
    ],
    nn.Linear(256, 1),
).to(device)

Load Emotion Model

In [None]:
# Emotion backbone: VGG19 created with torchvision's default pretrained weights.
# The model is not moved to `device` in this cell.
emotion_weights = models.VGG19_Weights.DEFAULT
emotion_model = models.vgg19(weights=emotion_weights)

In [None]:
# Freeze the whole convolutional feature extractor first ...
emotion_model.features.requires_grad_(False)

# ... then re-enable gradients for the last 20 entries of `features` only.
emotion_model.features[-20:].requires_grad_(True)


# Classification head for the emotion model.
#
# Five stages with the same layout, followed by a 7-way Linear layer:
#   Linear -> BatchNorm1d -> ReLU -> Dropout(0.3) -> Linear -> BatchNorm1d -> ReLU
# Each tuple is (input width, stage width). The stages are built in list order,
# so the nested Sequential indices (and hence the state_dict keys) are
# classifier.0 ... classifier.4, then classifier.5 for the final Linear layer.
emotion_stage_widths = [
    (512 * 7 * 7, 4096),
    (4096, 2048),
    (2048, 1024),
    (1024, 512),
    (512, 256),
]

emotion_model.classifier = nn.Sequential(
    *[
        nn.Sequential(
            nn.Linear(stage_in, stage_width),
            nn.BatchNorm1d(stage_width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(stage_width, stage_width),
            nn.BatchNorm1d(stage_width),
            nn.ReLU(),
        )
        for stage_in, stage_width in emotion_stage_widths
    ],
    nn.Linear(256, 7),  # 7 emotion classes
).to(device)




In [None]:
# Restore the fine-tuned gender weights, then switch to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch
# supports it).
gender_checkpoint_path = "fine_tuned_gender_model.pth"
gender_model.load_state_dict(
    torch.load(gender_checkpoint_path, map_location=device)
)
gender_model.to(device).eval()

In [None]:
# Restore the fine-tuned age weights, move the whole model to the device and
# switch to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch
# supports it).
age_checkpoint_path = "fine_tuned_age_model.pth"
age_model.load_state_dict(
    torch.load(age_checkpoint_path, map_location=device)
)
age_model.to(device).eval()

In [None]:
# Restore the fine-tuned emotion weights, move the whole model to the device
# and switch to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch
# supports it).
emotion_checkpoint_path = "emotion_model_fine_tuned.pth"
emotion_model.load_state_dict(
    torch.load(emotion_checkpoint_path, map_location=device)
)
emotion_model.to(device).eval()

# Data Transformation

In [None]:
# Inference preprocessing for the gender model:
# resize to 224x224, convert to a tensor, then normalise each channel.
gender_infer_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


In [None]:
# Inference preprocessing for the age model:
# convert to greyscale replicated over 3 channels, resize to 224x224,
# convert to a tensor, then normalise each channel.
age_infer_transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)



In [None]:
# Inference preprocessing for the emotion model:
# resize to 224x224, convert to a tensor, then normalise each channel.
emotion_infer_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


# Prediction

In [None]:
def predict_and_display(frame, face_box, gender_model, age_model, emotion_model):
    """Annotate one face in ``frame`` with predicted gender, age and emotion.

    The face region is cropped, run through the three models using the
    module-level ``*_infer_transform`` pipelines and ``device``, and a bounding
    box plus a text label are drawn onto ``frame`` in place.

    Args:
        frame: BGR image array (it is converted with ``cv2.COLOR_BGR2RGB``).
        face_box: ``(x, y, w, h)`` pixel box of the face inside ``frame``.
        gender_model: model returning one logit; sigmoid-rounded 0 maps to
            'Male' and 1 to 'Female'.
        age_model: model returning one scalar, shown as the age in years.
        emotion_model: model returning one score per emotion label (7 labels).

    Returns:
        The same ``frame`` object. It is returned untouched when the box has no
        area inside the frame.
    """
    x, y, w, h = face_box

    # Clip the box to the frame. Negative coordinates would otherwise wrap
    # around in the slice, and an empty crop makes cv2.cvtColor raise.
    frame_h, frame_w = frame.shape[:2]
    x0, y0 = max(0, x), max(0, y)
    x1, y1 = min(frame_w, x + w), min(frame_h, y + h)
    if x1 <= x0 or y1 <= y0:
        return frame

    face = frame[y0:y1, x0:x1]

    # Convert to PIL image for model input
    face_pil = Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))

    # Preprocess face for each model (add a batch dimension of 1)
    gender_tensor = gender_infer_transform(face_pil).unsqueeze(0).to(device)
    age_tensor = age_infer_transform(face_pil).unsqueeze(0).to(device)
    emotion_tensor = emotion_infer_transform(face_pil).unsqueeze(0).to(device)

    # Run predictions without tracking gradients
    with torch.no_grad():
        gender_output = gender_model(gender_tensor)
        age_output = age_model(age_tensor)
        emotion_output = emotion_model(emotion_tensor)

    # Process gender prediction: sigmoid probability rounded to a 0/1 index
    gender_labels = ['Male', 'Female']
    gender_index = int(torch.round(torch.sigmoid(gender_output)).item())
    gender_text = gender_labels[gender_index]

    # Process age prediction: truncate to whole years; a regression output can
    # dip below zero, which is never a meaningful age, so floor it at 0.
    age_text = max(0, int(age_output.item()))

    # Process emotion prediction: index of the highest score
    emotion_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
    emotion_index = torch.argmax(emotion_output, dim=1).item()
    emotion_text = emotion_labels[emotion_index]

    # Draw bounding box and label on frame
    label = f"Gender: {gender_text}, {age_text} years old, Emotion: {emotion_text}"
    cv2.rectangle(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)
    # Keep the text baseline inside the image when the face touches the top edge.
    label_y = max(y0 - 10, 20)
    cv2.putText(frame, label, (x0, label_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

    return frame


def _relative_bbox_to_pixels(bbox, frame_width, frame_height):
    """Convert a relative bounding box to a pixel box clipped to the frame.

    Args:
        bbox: object with ``xmin``, ``ymin``, ``width`` and ``height`` given as
            fractions of the frame size.
        frame_width: frame width in pixels.
        frame_height: frame height in pixels.

    Returns:
        ``(x, y, w, h)`` in pixels, or ``None`` when nothing of the box lies
        inside the frame.
    """
    left = int(bbox.xmin * frame_width)
    top = int(bbox.ymin * frame_height)
    right = left + int(bbox.width * frame_width)
    bottom = top + int(bbox.height * frame_height)

    # Clip both corners; clamping only the origin would shift the box instead
    # of trimming it when the detection starts outside the frame.
    left, top = max(0, left), max(0, top)
    right, bottom = min(right, frame_width), min(bottom, frame_height)
    if right <= left or bottom <= top:
        return None
    return left, top, right - left, bottom - top


def main():
    """Run the live webcam demo until 'q' is pressed or the stream ends.

    Each frame from camera 0 is passed through the MediaPipe face detector;
    every detected face is annotated by ``predict_and_display`` using the
    module-level gender, age and emotion models, and the result is shown in an
    OpenCV window. The camera and the window are always released on exit.
    """
    # Initialize MediaPipe face detector
    mp_face_detection = mp.solutions.face_detection

    # Open webcam
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Could not open webcam.")
        return

    try:
        # Start face detection
        with mp_face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.6) as face_detection:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Convert frame to RGB for MediaPipe
                height, width, _ = frame.shape
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = face_detection.process(frame_rgb)

                # If faces detected, process each
                for detection in results.detections or ():
                    face_box = _relative_bbox_to_pixels(
                        detection.location_data.relative_bounding_box, width, height
                    )
                    if face_box is None:
                        # Detection lies entirely outside the frame; nothing to crop.
                        continue

                    frame = predict_and_display(frame, face_box,
                                                gender_model, age_model, emotion_model)

                # Show the output
                cv2.imshow('Webcam Face Analysis', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Release the camera and close the window even when the loop is left
        # through an exception or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()


# Start the webcam demo only when this code is executed directly.
if __name__ == "__main__":
    main()
