In [1]:
import sys
sys.path.append("..")
from utils.dataset import FerDataset

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision

from PIL import Image

import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
class ResidualUnit(nn.Module):
    
    def __init__(self, depth_in, depth_out):
        super(ResidualUnit, self).__init__()
        self.stride = 1
        
        self.resBlock = nn.Sequential(
            nn.Conv2d(depth_in, depth_out, kernel_size=(3, 3), stride = self.stride, padding = 1),
            nn.BatchNorm2d(depth_out),
            nn.ReLU(),
            nn.Conv2d(depth_out, depth_out, kernel_size=(3, 3), stride = self.stride, padding = 1),
            nn.BatchNorm2d(depth_out)
        )
        
    def forward(self, x):
        #Residual unit
        identity = x
        
        x = self.resBlock(x)
        #Residual unit
        #print("x.shape", x.shape)
        #x = x.view(x.size(0), -1)
        
        x += identity
        x = nn.ReLU(x)
        
        return x

In [18]:
class DownsampleResidualUnit(nn.Module):
    
    def __init__(self, depth_in, depth_out):
        super(DownsampleResidualUnit, self).__init__()
        self.stride = 2
        
        self.resBlock = nn.Sequential(
            nn.Conv2d(depth_in, depth_out, kernel_size=(3, 3), stride = self.stride, padding = 1),
            nn.BatchNorm2d(depth_out),
            nn.ReLU(),
            nn.Conv2d(depth_out, depth_out, kernel_size=(3, 3), stride = 1, padding = 1),
            nn.BatchNorm2d(depth_out)
        )
        
        self.matchDim = nn.Sequential(
            nn.Conv2d(depth_in, depth_out, kernel_size=(1,1), stride=self.stride, padding = 0),
            # this is required to match the dimensions of the identity x with F(x), because
            # in this block the first of the two convolutionl layers performs downsamlpling and therefore
            # changes the dimensions of the activation volume.
            nn.BatchNorm2d(depth_out)
        )
        
        
    def forward(self, x):
        #Downsample unit
        identity = x
        
        x = self.resBlock(x)
        #Downsample unit
        #print("x.shape", x.shape)
        #x = x.view(x.size(0), -1)
        
        
        identity = self.matchDim(identity)
        
        
        x += identity
        x = nn.ReLU(x)
        
        return x

In [4]:
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
# https://arxiv.org/pdf/1512.03385.pdf

In [19]:
class ResNet(nn.Module):
    
    def __init__(self):
        super(ResNet, self).__init__()
        
        self.convnet = nn.Sequential(
            # 224 x 224 x 1
            nn.Conv2d(1, 64, kernel_size=(7, 7), stride = 2, padding = 3),
            # 7 x 7, out_depth=64, stride=2 are given by the paper. The output has to be 112 x 112 x 64.
            # Therefore the padding ist calculated via: P = ((width_out-1) * stride + filter_size - width_in)/2
            # instead of width_in and width_out one can also use height_in and height_out
            # ((112-1)*2+7-224)/2 = P = 2.5, i.e. we choose padding = 3
            
            # 112 x 112 x 64
            
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding = 1),
            # MaxPool with kernel = (3,3) and stride=2 is given by the paper
            # P = ((56-1) * 2 + 3 - 112)/2 = 0.5, i.e. P=1
            
            # 56 x 56 x 64
            
            ResidualUnit(64, 64),
            ResidualUnit(64, 64), 
            ResidualUnit(64, 64), 
            
            DownsampleResidualUnit(64, 128),
            # 28 x 28 x 128
            ResidualUnit(128, 128),
            ResidualUnit(128, 128),
            ResidualUnit(128, 128),
            
            DownsampleResidualUnit(128, 256),
            # 14 x 14 x 256
            ResidualUnit(256, 256),
            ResidualUnit(256, 256),
            ResidualUnit(256, 256),
            ResidualUnit(256, 256),
            ResidualUnit(256, 256),
            
            DownsampleResidualUnit(256, 512),
            # 7 x 7 x 512
            ResidualUnit(512, 512),
            ResidualUnit(512, 512),
            
            #nn.AdaptiveAvgPool2d(1)  
            nn.AvgPool2d(kernel_size = (7,7), stride=1, padding=0)         
            # width_out = (width_in - F + 2P)/S+1
            # = (7 - 7 + 0)/2 + 1 = 1
            # 1 x 1 x 512
        )
        
        
        self.fc = nn.Sequential(
            nn.Linear(512, 10)        
        )
        
        
    def forward(self, x):
        x = self.convnet(x)
        print("x.shape", x.shape)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
        

In [6]:
# i have to check dimension and padding of: the first convolutional layer and the max pooling 

In [21]:
dataset = FerDataset(base_path='../../data',
                     data='ferplus',
                     mode='train',
                     label='ferplus_votes')
dataloader = DataLoader(dataset, batch_size=6, shuffle=True, num_workers=0)
net = ResNet()
log_softmax = nn.LogSoftmax(dim=-1)
criterion = nn.KLDivLoss(size_average=False)
optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)

In [22]:
def resize_img(x_batch, y_batch, size):
    x_batch_resized = torch.zeros((x_batch.shape[0], x_batch.shape[1], size, size))
    for i in range(x_batch.shape[0]):
        image = torchvision.transforms.ToPILImage()(x_batch[i])
        image = torchvision.transforms.functional.resize(image, (size, size))
        x_batch_resized[i] = torchvision.transforms.ToTensor()(image)
        
    return x_batch_resized, y_batch

In [23]:
x_batch, y_batch = next(iter(dataloader))
print(x_batch.shape)
x_batch, y_batch = resize_img(x_batch, y_batch, 224)
print(x_batch.shape)

torch.Size([6, 1, 48, 48])
torch.Size([6, 1, 224, 224])


In [24]:
losses = []
for i in range(1000):
    optimizer.zero_grad()
    logits = net(x_batch)
    log_probs = log_softmax(logits)
    loss = criterion(log_probs, y_batch)
    losses.append(float(loss))
    loss.backward()
    optimizer.step()
    print(i, end='\r')

TypeError: conv2d(): argument 'input' (position 1) must be Tensor, not ReLU