In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import models, transforms
from tqdm import tqdm
from collections import OrderedDict
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.5.0
Torchvision Version:  0.6.0


In [2]:
#Paths
from pathlib import Path
base_folder = Path('.')
data_folder = base_folder/'til2020'
train_imgs_folder = data_folder/'train'/'train'
train_annotations = data_folder/'train_final.json'
val_imgs_folder = data_folder/'val'/'val'
val_annotations = data_folder/'val_final.json'

train_pickle = data_folder/'train.p'/'train.p'
val_pickle = data_folder/'val.p'/'val.p'

save_model_folder = base_folder/'ckpts'
load_model_folder = data_folder

In [106]:
class Config():
    input_shape = (3,224,224) #hardcoded into pretrained, cannot change
    categories = ['tops', 'trousers', 'outerwear', 'dresses', 'skirts']

    wt_decay = 5e-4
    dims_list = [(7,7),(14,14)] #hardcoded into structure, cannot change
    aspect_ratios = [(1,1), (1,2), (2,1)]
    intermediate_layer = 6 #layer3 output res of pretrained (22nd repeating unit) is 14x14

In [213]:
'''
close adaptation of the one in the example, just different backbone
padding=(1,1) enables padding in both dimensions like 'same'
padding=0 disables padding like 'valid'
purpose of kernel_regularizer is done by the optimizer in pyTorch
'''
class ObjectCNNModel(nn.Module):
    def __init__(self,conf):
        super(ObjectCNNModel,self).__init__()
        self.intermediate_layer = conf.intermediate_layer
        self.dims_list = conf.dims_list
        self.aspect_ratios = conf.aspect_ratios
        self.categories = conf.categories

        #remove last layer, gain access to intermediates
        pretrained_model = torchvision.models.resnext101_32x8d(pretrained=True)
        self.pretrained = list(pretrained_model.children())[:-1]

        from torch.nn import Conv2d,BatchNorm2d,LeakyReLU,ConvTranspose2d
        def add_activation(layers):
            return [l for x in layers for l in (
                x,
                BatchNorm2d(x.out_channels),
                LeakyReLU(0.01),
            )] #[l1,bn,relu,l2,bn,relu,l3,bn,relu...]

        self.upsampler = nn.Sequential(*add_activation([
            Conv2d(2048,512,1,padding=(1,1)),
            Conv2d(512,1024,3,padding=(1,1)),
            Conv2d(1024,512,1,padding=(1,1)),
            Conv2d(512,1024,3,padding=(1,1)),
            Conv2d(1024,512,1,padding=(1,1)),
        ]))
        self.dim_7x7_layer = nn.Sequential(*add_activation([Conv2d(512,2048,3,padding=(1,1))]))
        self.transposer = nn.Sequential(*add_activation([
            Conv2d(512,256,1,padding=(1,1)),
            ConvTranspose2d(256,512,4,stride=(2,2),padding=(3,3))
        ]))
        self.dim_14x14_layer = nn.Sequential(*add_activation([
            Conv2d(1024+512,256,1,padding=(1,1)), #torch.cat((N,512,14,14),(N,1024,14,14))
            Conv2d(256,512,3,padding=(1,1)),
            Conv2d(512,256,1,padding=(1,1)),
            Conv2d(256,512,3,padding=(1,1)),
            Conv2d(512,256,1,padding=(1,1)),
            Conv2d(256,512,3,padding=(1,1)),
        ]))
        self.predictors_7x7 = []
        self.predictors_14x14 = []
        for aspect in self.aspect_ratios:
            #no activation here
            self.predictors_7x7.append([
                Conv2d(2048,1,1), #objectness
                Conv2d(2048,len(self.categories),1), #class
                Conv2d(2048,4,1) #bbox
            ])
            self.predictors_14x14.append([
                Conv2d(512,1,1), #objectness
                Conv2d(512,len(self.categories),1), #class
                Conv2d(512,4,1) #bbox
            ])
    
    def forward(self,x):
        intermediate_output = torch.zeros(x.shape[0],1024,14,14)
        for i,layer in enumerate(self.pretrained):
            x = layer(x)
            if i == self.intermediate_layer: intermediate_output = x
        backbone_output = x
        print(f'Intermediate:{intermediate_output.shape}')
        print(f'Backbone:{backbone_output.shape}')

        upsample = self.upsampler(backbone_output)
        print(f'self.transposer(upsample):{self.transposer(upsample).shape}')
        tens_7x7 = torch.add(self.dim_7x7_layer(upsample),backbone_output)
        x = torch.cat((self.transposer(upsample),intermediate_output),dim=1)
        tens_14x14 = self.dim_14x14_layer(x)

        pred_7x7 = []
        for predictor in self.predictors_7x7:
            pred_7x7.append(torch.cat((
                predictor[0](tens_7x7),
                predictor[1](tens_7x7),
                predictor[2](tens_7x7),
            ),dim=1))
        pred_7x7 = torch.cat(pred_7x7)
        pred_7x7 = torch.reshape(pred_7x7,(7,7,len(self.aspect_ratios),len(self.categories)+5))

        pred_14x14 = []
        for predictor in self.predictors_14x14:
            pred_14x14.append(torch.cat((
                predictor[0](tens_14x14),
                predictor[1](tens_14x14),
                predictor[2](tens_14x14),
            ),dim=1))
        pred_14x14 = torch.cat(pred_14x14)
        print(f'tens_14x14:{tens_14x14.shape}')
        pred_14x14 = torch.reshape(pred_14x14,(14,14,len(self.aspect_ratios),len(self.categories)+5))

        return {'7x7':pred_7x7,'14x14':pred_14x14}

In [214]:
model = ObjectCNNModel(Config)

#backbone = torchvision.models.resnet50(pretrained=True)
backbone = torchvision.models.resnext101_32x8d(pretrained=True) #SOTA


from torchsummaryX import summary
df = summary(backbone,torch.zeros((1,)+(Config.input_shape)))
print(df.loc[df.index.str.contains(pat = 'layer3')])
list(backbone.children())[:][6]

In [215]:
from torchsummaryX import summary
df = summary(model,torch.zeros((1,)+(Config.input_shape)))

Intermediate:torch.Size([1, 1024, 14, 14])
Backbone:torch.Size([1, 2048, 1, 1])
self.transposer(upsample):torch.Size([1, 512, 14, 14])


RuntimeError: Sizes of tensors must match except in dimension 0. Got 512 and 1024 in dimension 1