In [1]:
import sys
sys.path.append("..")
from utils.dataset import FerDataset

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision

from PIL import Image

import matplotlib.pyplot as plt
%matplotlib inline


In [17]:
class VGG13(nn.Module):
    """VGG13-style convolutional network that returns raw class logits.

    The feature extractor consists of five stages. Every stage is
    ``[Conv3x3 -> ReLU -> Conv3x3 -> ReLU -> MaxPool2x2]``; the stages use
    64, 128, 256, 512 and 512 filters respectively. The classifier is three
    fully connected (``nn.Linear``) layers with ReLU and dropout in between.

    Args:
        in_channels: Number of channels of the input images. Defaults to 1,
            which is what the original hard-coded first convolution expected.
        num_classes: Number of logits produced by the last linear layer.
            Defaults to 10, the original hard-coded output size.

    Notes:
        * The first linear layer expects 512 input features. In general the
          flattened size is ``height * width * depth`` of the last feature
          map; it equals 512 here only because a 48x48 input is halved by
          each of the five pooling layers (48 -> 24 -> 12 -> 6 -> 3 -> 1),
          leaving a 512 x 1 x 1 map. Larger inputs (e.g. 224x224) produce a
          bigger feature map and will not fit this classifier.
        * No softmax layer is included: the model returns logits, and the
          notebook applies ``nn.LogSoftmax`` separately before the loss.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super(VGG13, self).__init__()

        # Number of filters per stage. Each stage keeps the spatial size in
        # its convolutions (kernel F=3, stride S=1, padding P=(F-1)/2=1) and
        # then halves height and width with a 2x2/stride-2 max pool. Max
        # pooling keeps one value out of every 2x2 window (dropping 75% of
        # the activations) and works on every channel independently, so the
        # depth is unchanged by it.
        stage_widths = (64, 128, 256, 512, 512)

        # NOTE: the original notebook had an nn.Dropout2d(p=0.25) after each
        # pooling layer commented out (the accompanying comment cites the
        # paper's "drop out rate of 25%"). It stays disabled here so the
        # layer indices and behavior match the original. In PyTorch, ``p``
        # is the probability of zeroing, so a 25% drop-out rate is p=0.25.
        layers = []
        previous_width = in_channels
        for width in stage_widths:
            layers.extend([
                nn.Conv2d(previous_width, width, kernel_size=(3, 3), stride=1, padding=1),
                nn.ReLU(),
                nn.Conv2d(width, width, kernel_size=(3, 3), stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=2),
            ])
            previous_width = width
        self.convnet = nn.Sequential(*layers)

        # nn.Linear is a fully connected layer. Two hidden FC layers with a
        # non-linearity in between let the classifier combine the extracted
        # features non-linearly before the final projection to class logits.
        self.fc = nn.Sequential(
            nn.Linear(512, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Tensor of shape ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` containing raw logits.
        """
        x = self.convnet(x)
        # view() reshapes without copying: keep the batch dimension and
        # flatten (channels, height, width) into one feature vector per
        # sample so it can be fed to the linear layers.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
            
            
        
        
        
            
            
            
            

In [18]:
# Training data: the 'train' split of the 'ferplus' data with 'ferplus_votes'
# labels (presumably per-class vote distributions -- confirm in FerDataset).
dataset = FerDataset(base_path='../../data',
                     data='ferplus',
                     mode='train',
                     label='ferplus_votes')
dataloader = DataLoader(dataset, batch_size=24, shuffle=True, num_workers=0)
net = VGG13()
# The network returns raw logits; KLDivLoss expects log-probabilities as its
# input, so a log-softmax over the class dimension is applied before the loss.
log_softmax = nn.LogSoftmax(dim=-1)
# reduction='sum' is the non-deprecated spelling of size_average=False:
# the loss is summed over all elements instead of averaged.
criterion = nn.KLDivLoss(reduction='sum')
optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)

In [19]:

def resize_img(x_batch, y_batch, size):
    """Resize every image of a batch to ``size`` x ``size`` pixels.

    Each sample is converted to a PIL image, resized with torchvision and
    converted back to a tensor, then written into a freshly allocated
    output batch.

    Args:
        x_batch: Image batch indexed as ``(batch, channels, height, width)``.
        y_batch: Labels belonging to ``x_batch``; passed through untouched.
        size: Target height and width in pixels.

    Returns:
        Tuple ``(x_batch_resized, y_batch)`` where ``x_batch_resized`` has
        shape ``(batch, channels, size, size)``.
    """
    to_pil = torchvision.transforms.ToPILImage()
    to_tensor = torchvision.transforms.ToTensor()

    n_images, n_channels = x_batch.shape[0], x_batch.shape[1]
    x_batch_resized = torch.zeros((n_images, n_channels, size, size))

    for idx, sample in enumerate(x_batch):
        pil_image = torchvision.transforms.functional.resize(to_pil(sample), (size, size))
        x_batch_resized[idx] = to_tensor(pil_image)

    return x_batch_resized, y_batch

In [20]:
# Pull one batch from the loader and compare its shape before and after
# resizing to 224x224.
batch_iterator = iter(dataloader)
x_batch, y_batch = next(batch_iterator)
print(x_batch.shape)

resized_img, y = resize_img(x_batch, y_batch, 224)
print(resized_img.shape)

torch.Size([24, 1, 48, 48])
torch.Size([24, 1, 224, 224])


In [21]:

# Run 1000 optimizer steps on the single batch fetched earlier (x_batch,
# y_batch) and record the loss of every step.
losses = []
for step in range(1000):
    # Gradients accumulate across backward() calls, so reset them first.
    optimizer.zero_grad()

    # Forward pass: logits -> log-probabilities -> KL divergence to targets.
    logits = net(x_batch)
    log_probs = log_softmax(logits)
    loss = criterion(log_probs, y_batch)
    losses.append(loss.item())

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Overwrite the same output line with the current step index.
    print(step, end='\r')

10

KeyboardInterrupt: 

In [53]:
from torchvision.models import vgg13

In [54]:
# vgg13 is not called here, so this prints the repr of the function object
# itself rather than a model's layer structure.
print(vgg13)

<function vgg13 at 0x7fbf71547598>
