# CS 4501: Computer Vision -- Detecting Human Emotions through Videos
### Akira Durham (zup9su) and Sebastian Borromeo (uwg3xs)

In [1]:
# imports
import numpy as np
from PIL import Image
from torchvision import transforms
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

### Methods

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [2]:
class ProcessData(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = Image.fromarray(self.data[idx])
        label = self.labels[idx]
        data = self.transform(data)

        return data, label

In [3]:
def get_accuracy(logit, target, batch_size):
    """Obtain accuracy for training round"""
    corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
    ret_acc = 100.0 * corrects / batch_size
    return ret_acc.item()

# Data Processing

Data was pulled from https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data, which is FER2013's Kaggle competition, providing separated training and test datasets already.

In [4]:
# Hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001

In [5]:
# load in .csv
input_data = np.loadtxt('fer2013.csv', delimiter=',', skiprows=1, dtype=str)

In [8]:
# extract emotion labels
all_labels = input_data[:, 0].astype(int)

In [9]:
# create np array from stacked pixel vals + ensure sizing
pixel_data = np.array([np.fromstring(row, dtype=np.uint8, sep=' ') for row in input_data[:, 1]])
pixel_data = pixel_data.reshape(-1, 48, 48)

In [10]:
# get indices from input_data
train_indices = np.where(input_data[:, 2] == 'Training')[0]
test_indices = np.where(input_data[:, 2] == 'PublicTest')[0]

In [11]:
# create datasets using custom class
train_set = ProcessData(pixel_data[train_indices], all_labels[train_indices])
test_set = ProcessData(pixel_data[test_indices], all_labels[test_indices])

In [12]:
# create dataloaders for train iterations and batching
train = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test = DataLoader(test_set, batch_size=batch_size, shuffle=False)

# CNN Model Building

In [13]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Flatten(),
            nn.Linear(128*6*6, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 7)
        )

    def forward(self, x):
        return self.model(x)

class CNN2(nn.Module):
    def __init__(self):
        super(CNN2, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),

            nn.Flatten(),
            nn.Linear(128 * 6 * 6, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 7)
        )

    def forward(self, x):
        return self.model(x)

# https://debuggercafe.com/implementing-vgg11-from-scratch-using-pytorch/
class CNN3(nn.Module):
    def __init__(self):
        super(CNN3, self).__init__()
        # should this have batch normalization?
        self.model = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            # nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            # nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            # nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            # nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(4096, 7)
        )

    def forward(self, x):
        return self.model(x)

# Training

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
model.train()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
for epoch in range(num_epochs):
    train_running_loss = 0.0
    train_acc = 0.0

    for images, labels in train:
        images, labels = images.to(device), labels.to(device)
        logits = model(images)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_running_loss += loss.detach().item()
        train_acc += get_accuracy(logits, labels, batch_size)

    print('Epoch: %d | Loss: %.4f | Train Accuracy: %.2f' % (epoch, train_running_loss / len(train), train_acc / len(train)))

Epoch: 0 | Loss: 1.6485 | Train Accuracy: 35.29
Epoch: 1 | Loss: 1.3935 | Train Accuracy: 46.11
Epoch: 2 | Loss: 1.2914 | Train Accuracy: 50.38
Epoch: 3 | Loss: 1.2168 | Train Accuracy: 53.48
Epoch: 4 | Loss: 1.1536 | Train Accuracy: 55.70
Epoch: 5 | Loss: 1.0967 | Train Accuracy: 58.00
Epoch: 6 | Loss: 1.0482 | Train Accuracy: 59.67
Epoch: 7 | Loss: 0.9896 | Train Accuracy: 62.04
Epoch: 8 | Loss: 0.9422 | Train Accuracy: 63.83
Epoch: 9 | Loss: 0.8837 | Train Accuracy: 65.91
Epoch: 10 | Loss: 0.8338 | Train Accuracy: 67.75
Epoch: 11 | Loss: 0.7682 | Train Accuracy: 70.38
Epoch: 12 | Loss: 0.7283 | Train Accuracy: 71.83
Epoch: 13 | Loss: 0.6764 | Train Accuracy: 73.92
Epoch: 14 | Loss: 0.6311 | Train Accuracy: 75.61
Epoch: 15 | Loss: 0.5848 | Train Accuracy: 77.22
Epoch: 16 | Loss: 0.5508 | Train Accuracy: 78.58
Epoch: 17 | Loss: 0.5176 | Train Accuracy: 79.90
Epoch: 18 | Loss: 0.4836 | Train Accuracy: 81.17
Epoch: 19 | Loss: 0.4598 | Train Accuracy: 82.06
Epoch: 20 | Loss: 0.4349 | Tra

In [16]:
torch.save(model.state_dict(), 'model.pth')
# model = CNN()
# model.load_state_dict(torch.load('model.pth', weights_only=True))

# Testing

In [18]:
model.eval()
test_acc = 0.0
output = []

for images, labels in test:
    images, labels = images.to(device), labels.to(device)
    logits = model(images)
    test_acc += get_accuracy(logits, labels, batch_size)
    output.extend(torch.argmax(logits, dim=1).cpu().numpy())

print('Test Accuracy: %.2f' % (test_acc / len(test)))

Test Accuracy: 56.88


# Video Section