# Started from here!

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

import cv2
import numpy as np
from PIL import Image
import os

import mediapipe as mp

import heapq
import itertools
from sklearn.decomposition import PCA

objc[85365]: Class CaptureDelegate is implemented in both /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/cv2/cv2.abi3.so (0x15c7765a0) and /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/mediapipe/.dylibs/libopencv_videoio.3.4.16.dylib (0x13f4d8860). One of the two will be used. Which one is undefined.
objc[85365]: Class CVWindow is implemented in both /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/cv2/cv2.abi3.so (0x15c7765f0) and /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x12dd54a68). One of the two will be used. Which one is undefined.
objc[85365]: Class CVView is implemented in both /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/cv2/cv2.abi3.so (0x15c776618) and /Users/sertanavdan/miniconda3/envs/torch/lib/python3.10/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x12dd54a90). One of the two will be

In [3]:
# Dataset locations: each split is expected to hold one sub-directory per class.
test_dir = 'data/test/'
train_dir = 'data/train/'

# Only directories count as classes (stray files such as .DS_Store are ignored),
# and the names are sorted so the class order is the same on every run/machine.
classes = sorted(
    entry for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
)
# Fixed: the test classes were previously listed from train_dir.
test_cls = sorted(
    entry for entry in os.listdir(test_dir)
    if os.path.isdir(os.path.join(test_dir, entry))
)
num_classes = len(classes)

# Landmark detection from images and correlation of those landmark positions

In [4]:
def compute_pairwise_distances(landmarks):
    """Return the Euclidean distance between every unordered pair of landmarks.

    Args:
        landmarks: array-like of shape ``(num_landmarks, ...)``; each entry
            along the first axis is one landmark's coordinates.

    Returns:
        1-D float64 array of length ``num_landmarks * (num_landmarks - 1) / 2``
        ordered like ``np.triu_indices(num_landmarks, k=1)``, i.e.
        (0, 1), (0, 2), ..., (1, 2), ...
    """
    points = np.asarray(landmarks, dtype=np.float64)
    num_landmarks = points.shape[0]

    # Only the strict upper triangle is needed, so build exactly those index
    # pairs and compute all distances in one vectorised pass instead of
    # calling np.linalg.norm once per (i, j) in a Python loop.
    rows, cols = np.triu_indices(num_landmarks, k=1)
    squared = np.square(points[rows] - points[cols])
    if squared.ndim > 1:
        # Collapse every coordinate axis; only the pair axis remains.
        squared = squared.sum(axis=tuple(range(1, squared.ndim)))
    return np.sqrt(squared)

In [5]:
mp_face_mesh = mp.solutions.face_mesh
def detect_landmarks(image):
    """Run MediaPipe FaceMesh on one BGR image and build its landmark features.

    Args:
        image: BGR image array as returned by ``cv2.imread``.

    Returns:
        Tuple ``(image_tensor, landmarks)``:
        * ``image_tensor`` -- float32 tensor of the image in BGR order.
        * ``landmarks`` -- float32 tensor of shape ``(1, 109278)`` holding the
          pairwise distances between the 468 (x, y) face landmarks. It stays
          all zeros when no face is detected.

    Raises:
        ValueError: if ``image`` is ``None``, which is what ``cv2.imread``
            returns for a missing or unreadable file.
    """
    if image is None:
        raise ValueError(
            "detect_landmarks received None; cv2.imread could not read the image"
        )

    num_points = 468
    # One distance per unordered landmark pair: 468 * 467 / 2 = 109278.
    feature_len = num_points * (num_points - 1) // 2

    # static_image_mode=True: inputs are independent stills, not video frames,
    # so detection must run on every image (the tracking confidence setting is
    # not used in this mode).
    with mp_face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=1,
        min_detection_confidence=0.5) as face_mesh:

        # Convert the image to RGB format and feed it to the model
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(image)

        # Zero vector is the "no face found" placeholder
        landmarks = torch.zeros(1, feature_len, dtype=torch.float32)

        # If a face is detected, extract its landmarks and save them in the tensor
        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                landmark_coords = np.zeros((num_points, 2))
                for i, landmark in enumerate(face_landmarks.landmark):
                    landmark_coords[i, 0] = landmark.x
                    landmark_coords[i, 1] = landmark.y

                pairwise_distances = compute_pairwise_distances(landmark_coords)
                landmarks[0] = torch.tensor(pairwise_distances, dtype=torch.float32)

        # Convert the RGB image back to BGR format
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        # Return the image and landmarks as separate PyTorch tensors
        return torch.tensor(image, dtype=torch.float32), landmarks

In [6]:
# Smoke test: push one test image through the landmark detector and show the
# shapes of the two tensors it returns.
image, landmarks = detect_landmarks(cv2.imread('data/test/angry/angry_1.jpg'))

print(image.shape, landmarks.shape, sep='\n')

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


torch.Size([48, 48, 3])
torch.Size([1, 109278])


In [7]:
class LandmarkDataset(Dataset):
    """Image-folder dataset that also serves precomputed face-landmark features.

    ``root_dir`` must contain one sub-directory per class. Landmark features
    are computed once, in ``__init__``, by calling ``detect_landmarks`` on
    every readable image and keeping the result in memory; the image itself is
    re-read from disk on each ``__getitem__``.

    Attributes:
        classes: sorted list of class (sub-directory) names.
        class_to_idx: mapping from class name to integer label.
        data: list of ``(img_path, landmarks, label)`` tuples.
    """

    def __init__(self, root_dir, transform=None):
        """Scan ``root_dir`` and precompute landmark features.

        Args:
            root_dir: directory with one sub-directory of images per class.
            transform: optional callable applied to the PIL image in
                ``__getitem__``.
        """
        import warnings  # local import: only needed while scanning

        self.root_dir = root_dir
        self.transform = transform

        # Sorted directory names give a label mapping that is stable across
        # runs and identical for any split with the same class folders.
        # Non-directories (e.g. .DS_Store) are not classes.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {name: i for i, name in enumerate(self.classes)}

        self.data = []
        unreadable = []
        for label in self.classes:
            class_dir = os.path.join(root_dir, label)
            for img_name in sorted(os.listdir(class_dir)):
                img_path = os.path.join(class_dir, img_name)
                image = cv2.imread(img_path)
                if image is None:
                    # Not an image OpenCV can decode; skip it instead of
                    # crashing inside the detector.
                    unreadable.append(img_path)
                    continue
                _, landmarks = detect_landmarks(image)
                self.data.append((img_path, landmarks, self.class_to_idx[label]))

        if unreadable:
            warnings.warn(
                f"Skipped {len(unreadable)} unreadable file(s) under {root_dir}, "
                f"e.g. {unreadable[0]}"
            )

    def __len__(self):
        """Number of usable samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, landmarks, label)`` for sample ``idx``.

        The third element is the integer class label (fixed: it used to be the
        sample index, which the training loop then treated as the target).
        """
        img_path, landmarks, label = self.data[idx]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)

        if self.transform:
            image = self.transform(image)

        return image, landmarks, label

In [None]:
# Size of the pairwise-distance feature vector: one entry per unordered pair of
# landmarks, i.e. 0 + 1 + ... + (n - 1) = n * (n - 1) / 2.
num_landmarks = 468
landmarks_length = sum(range(num_landmarks))

print(
    f'Number of landmarks: {num_landmarks}',
    f'Length of the landmark vector: {landmarks_length}',
    sep='\n',
)

In [None]:
# Build the datasets and data loaders (the training itself happens further down)
transform = transforms.Compose([
    # The CNN's first fully connected layer expects 256 * 4 * 4 features after
    # four 2x2 poolings, which corresponds to 64x64 inputs.
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Map each channel from [0, 1] to [-1, 1].
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = LandmarkDataset(train_dir, transform=transform)
val_dataset = LandmarkDataset(test_dir, transform=transform)

# drop_last=True: the model contains BatchNorm1d layers, which raise in
# training mode on a batch of a single sample, so a trailing incomplete batch
# is dropped. Shuffling means a different remainder is skipped each epoch.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
# PCA over the landmark features of the training set.
# Fixed: PCA used to be fitted on the first sample only (a single 1 x 109278
# row), for which variance-based component selection is meaningless. Stack one
# row per training sample instead.
# NOTE(review): the matrix is (num_samples x 109278) float32 -- check that it
# fits in memory, or fit on a subset.
landmark_matrix = np.concatenate(
    [sample[1].numpy().reshape(1, -1) for sample in train_dataset.data],
    axis=0,
)

pca = PCA(n_components=0.95)
pca.fit(landmark_matrix)
print(pca.n_components_, "\n ------------")
print(pca.explained_variance_ratio_, "\n ------------")
print(pca.singular_values_, "\n ------------")
print(pca.components_, "\n ------------")
print(pca.mean_, "\n ------------")
print(pca.noise_variance_, "\n ------------")
# pca.get_covariance() is intentionally not printed: it materialises a dense
# (109278 x 109278) matrix, far beyond available memory.

In [None]:
class MultiHeadAttention(nn.Module):
    """Attention over the heads of a landmark-augmented feature vector.

    Each sample's feature vector is concatenated with its landmark vector and
    projected to queries, keys and values, which are split into ``num_heads``
    chunks of ``head_dim``. Attention is computed between the heads of the
    same sample (scores have shape ``(N, num_heads, num_heads)``); there is no
    sequence dimension.

    Args:
        d_model: width of the input features and of the output.
        num_heads: number of heads; must divide ``d_model``.
        landmark_dim: width of the landmark vector. Defaults to the
            notebook-level ``landmarks_length``.
    """

    def __init__(self, d_model, num_heads, landmark_dim=None):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        if landmark_dim is None:
            landmark_dim = landmarks_length  # notebook-level constant

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.W_q = nn.Linear(d_model + landmark_dim, d_model, bias=False)
        self.W_k = nn.Linear(d_model + landmark_dim, d_model, bias=False)
        self.W_v = nn.Linear(d_model + landmark_dim, d_model, bias=False)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, x, landmarks):
        """Return attended features of shape ``(N, d_model)``.

        Args:
            x: features of shape ``(N, d_model)``.
            landmarks: landmark vectors of shape ``(N, landmark_dim)``.
        """
        N = x.shape[0]

        # The loaders deliver landmarks on the CPU; align them with the features.
        landmarks = landmarks.to(device=x.device, dtype=x.dtype)

        # Concatenate the features with the landmarks
        x = torch.cat((x, landmarks), dim=1)

        Q = self.W_q(x).view(N, self.num_heads, self.head_dim)
        K = self.W_k(x).view(N, self.num_heads, self.head_dim)
        V = self.W_v(x).view(N, self.num_heads, self.head_dim)

        # Dot products run over head_dim-sized vectors, so scale by
        # sqrt(head_dim) (was sqrt(d_model)).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attention_weights = torch.softmax(scores, dim=-1)

        # (N, num_heads, head_dim)
        out = torch.matmul(attention_weights, V)

        # Concatenate the heads per sample. The previous
        # transpose(0, 1).reshape(N, -1) swapped the batch and head axes before
        # flattening, which mixed features from different samples.
        out = out.reshape(N, self.d_model)

        return self.fc_out(out)


# Notes
1. Parallel branching in the CNN model can be tested!
2. The class balance of the dataset needs to be checked!


In [None]:
class LandmarkAttentionCNN(nn.Module):
    """CNN classifier whose image features are fused with landmark features.

    Four convolutional blocks (32, 64, 128, 256 channels) each halve the
    spatial size; ``fc1`` expects ``256 * 4 * 4`` features, i.e. 64x64 inputs.
    The 128-d image embedding is combined with the landmark vector in a
    ``MultiHeadAttention`` layer, then classified by two fully connected layers.

    Args:
        num_classes: number of output classes (logits).
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Two (3x3 conv -> ReLU -> BatchNorm) stages, 2x2 max-pool, dropout."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(out_channels),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(out_channels),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.2)
        )

    def __init__(self, num_classes):
        super().__init__()

        # Blocks 1-4: same layout, only the channel counts differ.
        self.conv1 = self._conv_block(3, 32)
        self.conv2 = self._conv_block(32, 64)
        self.conv3 = self._conv_block(64, 128)
        self.conv4 = self._conv_block(128, 256)

        # Block-5
        self.fc1 = nn.Sequential(
            nn.Linear(256 * 4 * 4, 128),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(128),
            nn.Dropout(0.5)
        )

        # Attention Layer
        self.attention = MultiHeadAttention(128, 8)

        # Block-6
        self.fc2 = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(64),
            nn.Dropout(0.5)
        )

        # Block-7
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self, x, landmarks):
        """Return class logits of shape ``(N, num_classes)``.

        Args:
            x: image batch of shape ``(N, 3, 64, 64)``.
            landmarks: landmark features with ``N`` rows; any extra singleton
                axes are flattened per sample.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        # Flatten per sample. Unlike view(-1, 256 * 4 * 4), this never changes
        # the batch size: a wrongly sized input now fails loudly in fc1
        # instead of being silently regrouped into a different batch.
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)

        # Pass landmarks as input to the MultiHeadAttention module
        x = self.attention(x, landmarks.reshape(landmarks.shape[0], -1))

        x = F.relu(x)
        x = self.fc2(x)
        class_output = self.fc3(x)
        return class_output

In [None]:
# Number of target classes is fixed by hand here; adjust it for your own data.
num_classes = 7
model = LandmarkAttentionCNN(num_classes=num_classes)

In [None]:
# Rough memory footprint of one input batch (image + landmark vector).
# Images are resized to 64x64 before reaching the model (was 48x48 here, which
# under-estimated the size).
input_size = [(1, 3, 64, 64), (1, landmarks_length)]
batch_size = 4

# elements per sample * batch size * 4 bytes per float32, converted to MiB
total_input_size = sum([np.prod(size) for size in input_size]) * batch_size * 4. / (1024 ** 2.)
print(f"Total input size: {total_input_size:.2f} MB")

In [None]:
# Write the textual layer summary of a fresh 7-class model to model.txt
model = LandmarkAttentionCNN(7)
with open('model.txt', 'w') as model_file:
    model_file.write(f"{model}")

In [None]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_landmarks = 468
landmarks_length = (num_landmarks * (num_landmarks - 1)) // 2

# LandmarkAttentionCNN's constructor takes only `num_classes`; passing
# landmarks_length as a second positional argument raised a TypeError. The
# attention layer picks the landmark width up from `landmarks_length` above.
model = LandmarkAttentionCNN(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate after 5 epochs without improvement of the monitored
# (minimised) metric.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: module called as ``model(data, landmarks)``.
        dataloader: iterable of ``(data, landmarks, targets)`` batches that
            also supports ``len()``.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimiser stepping ``model``'s parameters.
        device: device every batch tensor is moved to.

    Returns:
        ``(mean_loss, accuracy)``: loss averaged over batches and the fraction
        of correctly classified samples.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, landmarks, targets) in enumerate(dataloader):
        data = data.to(device)
        # Fixed: landmarks used to stay on the CPU, which fails as soon as the
        # model lives on another device.
        landmarks = landmarks.to(device)
        targets = targets.to(device)
        # forward
        outputs = model(data, landmarks)
        loss = criterion(outputs, targets)
        # backward
        optimizer.zero_grad()
        loss.backward()
        # gradient descent or adam step
        optimizer.step()
        # update metrics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    return running_loss / len(dataloader), correct / total

In [None]:
def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: module called as ``model(data, landmarks)``.
        dataloader: iterable of ``(data, landmarks, targets)`` batches that
            also supports ``len()``.
        criterion: loss called as ``criterion(outputs, targets)``.
        device: device every batch tensor is moved to.

    Returns:
        ``(mean_loss, accuracy)``: loss averaged over batches and the fraction
        of correctly classified samples.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, landmarks, targets) in enumerate(dataloader):
            data = data.to(device)
            # Fixed: landmarks used to stay on the CPU, which fails as soon as
            # the model lives on another device.
            landmarks = landmarks.to(device)
            targets = targets.to(device)

            # forward
            outputs = model(data, landmarks)
            loss = criterion(outputs, targets)

            # update metrics
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    return running_loss / len(dataloader), correct / total

In [None]:
# Show which device (CUDA GPU or CPU) was selected for training.
print(device)

In [None]:
# Main loop: one training pass and one validation pass per epoch, then let the
# scheduler react to the validation loss and report the epoch's metrics.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(
        model, train_dataloader, criterion, optimizer, device
    )
    val_loss, val_acc = validate_epoch(
        model, val_dataloader, criterion, device
    )
    scheduler.step(val_loss)

    print(
        f"Epoch: {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )