# Task 2: Understand body language by gesture recognition with convolutional neural network

## 1. Do a literature search on Convolutional Neural Networks. Learn how to build a convolutional layer in PyTorch.

## 2. Referring to the guide in Task 1, build your own network for gesture classification using convolutional layers. See reference 4 in the manual to learn how to build convolutional layers in PyTorch.

## 3. Analyse and comment on the performance of the model. Compare the fully connected model with the convolutional model and comment on the differences.

In [1]:
import cv2
import numpy as np
import os
import itertools
import torch.utils.data as utils_data
import torch.nn as nn
import torch

In [2]:
# Generate one label file per gesture class. For every class directory
# ./dataset/images/<class> a file ./dataset/labels/<class>.txt is written that
# holds a '#label' header followed by the class index, once per image.
path = './dataset/images'

label_path = './dataset/labels'
os.makedirs(label_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the mapping
# from class directory to class index reproducible across runs and machines.
files = sorted(os.listdir(path))
index = 0
for file in files:
    class_dir = os.path.join(path, file)
    # Only directories are gesture classes; this also skips stray files such
    # as macOS '.DS_Store' in the images root.
    if not os.path.isdir(class_dir):
        continue

    # '.DS_Store' entries are ignored when the images are loaded, so they must
    # not be counted here either, otherwise labels and images get out of step.
    num_images = sum(1 for name in os.listdir(class_dir) if name != '.DS_Store')

    subclass_label_path = os.path.join(label_path, file + '.txt')
    # Write header and all labels in a single pass; the context manager closes
    # the file, so no explicit close() is needed.
    with open(subclass_label_path, 'w') as f:
        f.write('#label\n')
        f.write('{:d}\n'.format(index) * num_images)
    index = index + 1

In [5]:
# Load every image under ./dataset/images into one array `Image` laid out as
# (N, C, H, W), and every label file under ./dataset/labels into a flat array
# `Label`. Image i is paired with label i further down, so both sides are
# traversed in the same (sorted-by-class-name) order.
Image = []
path_images = './dataset/images'
for mainDir, subDir, fileList in os.walk(path_images):
    # Sorting the directory list in place makes os.walk descend in name order.
    subDir.sort()
    for file in sorted(fileList):
        if file != '.DS_Store':
            currentPath = os.path.join(mainDir, file)
            img = cv2.imread(currentPath)
            # cv2.imread signals an unreadable/non-image file by returning
            # None; fail with the offending path instead of inside cv2.resize.
            if img is None:
                raise ValueError('Cannot read image file: {}'.format(currentPath))
            Image.append(cv2.resize(img, (224, 224)))
if not Image:
    raise ValueError('No images found under {}'.format(path_images))
Image = np.array(Image)
# cv2 yields (H, W, C) images; move channels first: (N, H, W, C) -> (N, C, H, W).
Image = np.transpose(Image, (0, 3, 1, 2))
dataset_size, C, H, W = Image.shape

Label = []
path_labels = './dataset/labels'
# Label files are named '<class>.txt'; ordering them by the class name (the
# stem) matches the sorted class-directory order used for the images above.
label_files = [name for name in os.listdir(path_labels) if name.endswith('.txt')]
label_files.sort(key=lambda name: os.path.splitext(name)[0])
for file in label_files:
    # ndmin=1 keeps a single-label file as a 1-element array rather than a
    # 0-d scalar, so every file can be concatenated uniformly.
    Label.append(np.loadtxt(os.path.join(path_labels, file), ndmin=1))
Label = np.concatenate(Label)
if len(Label) != dataset_size:
    raise ValueError('Found {:d} images but {:d} labels'.format(dataset_size, len(Label)))
num_classes = int(np.max(Label)) + 1

In [6]:
class CNNModel(nn.Module):
    """VGG16-style convolutional classifier.

    Thirteen 3x3 conv + batch-norm + ReLU blocks (``layer1`` .. ``layer13``),
    five of which end in a 2x2 max-pool, followed by three fully connected
    stages (``fc``, ``fc1``, ``fc2``). The first linear layer expects
    7 * 7 * 512 features, which is what a 3 x 224 x 224 input produces after
    the five pooling steps (224 -> 112 -> 56 -> 28 -> 14 -> 7).

    Args:
        num_classes: size of the final output layer.
    """

    # One row per conv block, in order layer1..layer13:
    # (in_channels, out_channels, block ends with a 2x2 max-pool)
    _CONV_SPECS = (
        (3, 64, False),     # 224x224x3   -> 224x224x64
        (64, 64, True),     # 224x224x64  -> 112x112x64
        (64, 128, False),   # 112x112x64  -> 112x112x128
        (128, 128, True),   # 112x112x128 -> 56x56x128
        (128, 256, False),  # 56x56x128   -> 56x56x256
        (256, 256, False),  # 56x56x256   -> 56x56x256
        (256, 256, True),   # 56x56x256   -> 28x28x256
        (256, 512, False),  # 28x28x256   -> 28x28x512
        (512, 512, False),  # 28x28x512   -> 28x28x512
        (512, 512, True),   # 28x28x512   -> 14x14x512
        (512, 512, False),  # 14x14x512   -> 14x14x512
        (512, 512, False),  # 14x14x512   -> 14x14x512
        (512, 512, True),   # 14x14x512   -> 7x7x512
    )

    @staticmethod
    def _conv_block(in_channels, out_channels, pooled):
        """Build conv(3x3, pad 1) -> batch-norm -> ReLU [-> max-pool 2x2]."""
        stages = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        if pooled:
            stages.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*stages)

    def __init__(self, num_classes=4):
        super().__init__()
        # Register the blocks under the attribute names layer1..layer13.
        for number, (c_in, c_out, pooled) in enumerate(self._CONV_SPECS, start=1):
            setattr(self, 'layer{:d}'.format(number),
                    self._conv_block(c_in, c_out, pooled))

        # Classifier head: 7*7*512 -> 4096 -> 4096 -> num_classes
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7 * 7 * 512, 4096),
            nn.ReLU(),
        )
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Run ``x`` through layer1..layer13, flatten per sample, apply the head."""
        features = x
        for number in range(1, len(self._CONV_SPECS) + 1):
            features = getattr(self, 'layer{:d}'.format(number))(features)
        # Collapse everything except the batch dimension.
        flat = features.reshape(features.size(0), -1)
        return self.fc2(self.fc1(self.fc(flat)))

In [7]:
# Build the network with one output per class found in the label files, so
# the head matches the dataset instead of relying on the constructor default.
model = CNNModel(num_classes=num_classes)
# NOTE: the model is left on the CPU. Moving it to a GPU also requires moving
# every batch to the same device before the forward pass.
# Adam with L2 regularisation (weight_decay) on all parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
# CrossEntropyLoss takes raw logits and integer class targets.
loss_func = nn.CrossEntropyLoss()

Encapsulate data

In [8]:
# Split sizes: 80% of the samples for training, the remainder for testing.
split_ratio = 0.8
train_size = int(split_ratio * dataset_size)
test_size = dataset_size - train_size

# Pair every image (as a float tensor) with its integer class label, then
# split the pairs at random into the two subsets.
dataset = utils_data.TensorDataset(torch.Tensor(Image), torch.LongTensor(Label))
train_set, test_set = utils_data.random_split(dataset, [train_size, test_size])

# Both loaders serve shuffled mini-batches of 8 samples.
train_loader, test_loader = (
    utils_data.DataLoader(dataset=subset, batch_size=8, shuffle=True)
    for subset in (train_set, test_set)
)

print('Data is ready!')

Data is ready!


In [9]:
# Train for up to 500 epochs. After each epoch the model is evaluated on the
# test split, and the weights are saved whenever the test accuracy is at
# least as good as the best seen so far.
# NOTE(review): checkpoints are selected on the test split itself, so the
# reported best accuracy is an optimistic estimate; a separate validation
# split would give an unbiased figure.
checkpoint_dir = './trained_models'
# torch.save does not create missing directories.
os.makedirs(checkpoint_dir, exist_ok=True)

best_accuracy = 0
for epoch in range(500):
    running_loss = 0.0
    train_acc = 0.0
    # Training mode enables dropout and batch-statistics in BatchNorm; it only
    # needs to be set once per epoch because eval() is called after the loop.
    model.train()
    for batch_image, batch_label in train_loader:
        batch_output = model(batch_image)
        batch_loss = loss_func(batch_output, batch_label)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        # train accuracy: count correct predictions in this batch
        train_predicted = batch_output.argmax(dim=1)
        train_acc += (train_predicted == batch_label).sum().item()

    train_acc /= train_size
    # Mean loss per batch; max() guards against an empty loader.
    running_loss /= max(len(train_loader), 1)

    # ----------test----------
    model.eval()
    test_acc = 0.0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for test_image, test_label in test_loader:
            test_output = model(test_image)
            predicted = test_output.argmax(dim=1)
            test_acc += (predicted == test_label).sum().item()
    test_acc /= test_size

    print('epoch={:d}\ttrain loss={:.6f}\ttrain accuracy={:.3f}\ttest accuracy={:.3f}'.format(
        epoch, running_loss, train_acc, test_acc))

    if test_acc >= best_accuracy:
        torch.save(model.state_dict(), './trained_models/CNN_model_VGG16.pkl')
        best_accuracy = test_acc

epoch=0	train loss=20.668015	train accuracy=0.210	test accuracy=0.500
epoch=1	train loss=1.923880	train accuracy=0.387	test accuracy=0.625
epoch=2	train loss=2.036543	train accuracy=0.323	test accuracy=0.938
epoch=3	train loss=0.945616	train accuracy=0.645	test accuracy=0.938
epoch=4	train loss=1.865938	train accuracy=0.468	test accuracy=0.500
epoch=5	train loss=0.947484	train accuracy=0.548	test accuracy=0.938
epoch=6	train loss=0.926650	train accuracy=0.613	test accuracy=0.625
epoch=7	train loss=1.030805	train accuracy=0.565	test accuracy=0.500
epoch=8	train loss=0.837630	train accuracy=0.694	test accuracy=0.562


KeyboardInterrupt: 