In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; the dataset archive is read from there in the next cell.
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# The archive must be downloaded beforehand from
# http://image.ntua.gr/iva/datasets/flickr_logos/ and placed in Google Drive.
import tarfile
# The context manager closes the archive handle even if extraction fails part-way.
with tarfile.open('/gdrive/My Drive/flickr_logos_27_dataset.tar.gz') as tf:
    # Extracts into the current working directory.
    tf.extractall()

In [None]:
# Unpack the nested image archive that the previous extraction produced.
# The context manager closes the archive handle even if extraction fails part-way.
with tarfile.open('flickr_logos_27_dataset/flickr_logos_27_dataset_images.tar.gz') as tf:
    # Extracts into the current working directory.
    tf.extractall()

In [None]:
from __future__ import print_function
from __future__ import division
from PIL import Image, ImageDraw
import torchvision.models as models
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import torch.utils.data
# from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import time
import os
import copy
# Record the library versions next to the results so a run can be reproduced later.
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)


# Device selection: use CUDA only when it is both requested and actually present.
USE_GPU = True
device = torch.device('cuda' if (USE_GPU and torch.cuda.is_available()) else 'cpu')
print('using device:', device)

# --- Training configuration ---
# Number of classes in the dataset (a later cell rebinds this to 28 to add background).
num_classes = 27
# Batch size for training (change depending on how much memory you have).
batch_size = 8
# Number of epochs to train for (the training cell rebinds this to 10).
num_epochs = 15
# Feature-extraction flag: False means fine-tune the whole model, True means only
# the reshaped layer params are updated. Not referenced by the visible cells.
feature_extract = True

PyTorch Version:  1.8.0+cu101
Torchvision Version:  0.9.0+cu101
using device: cuda


In [None]:
# Report which GPU the runtime provides. The device-selection cell allows a CPU
# fallback, so only query CUDA when a GPU is actually available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('CUDA is not available; running on CPU')

Tesla T4


In [None]:
# Start from torchvision's Faster R-CNN (ResNet-50 + FPN backbone) with pretrained weights;
# its box predictor is swapped for the logo classes in the next cell.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

In [None]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# Input width of the existing classification layer, reused for the new head.
in_features = model.roi_heads.box_predictor.cls_score.in_features #1024
# 27 logo classes plus label 0 for background; this rebinds the earlier num_classes = 27.
num_classes = 28 #0 for background
# Swap in a fresh predictor head sized for num_classes outputs.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [None]:
# Display the RoI heads to check the replaced predictor (cls_score should have 28 outputs).
model.roi_heads

RoIHeads(
  (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
  (box_head): TwoMLPHead(
    (fc6): Linear(in_features=12544, out_features=1024, bias=True)
    (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNPredictor(
    (cls_score): Linear(in_features=1024, out_features=28, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=112, bias=True)
  )
)

In [None]:
#split images for train and val set
import random

# Fraction of each class's annotation lines that goes to the training split.
TRAIN_FRACTION = 0.8
# Fixed seed so the split files (and everything trained on them) can be regenerated exactly.
SPLIT_SEED = 42

with open("flickr_logos_27_dataset/flickr_logos_27_dataset_training_set_annotation.txt", "r") as f:
    contents = f.read()
lines = contents.split("\n")

# Group annotation lines by class name and give every class a 1-based index in
# order of first appearance (0 stays free for the detector's background label).
group_by_class = {}
class_idx_map = {}
idx = 1
for line in lines:
    if len(line) <= 1:
        # skip blank lines such as the trailing one produced by split("\n")
        continue
    # Each line has 8 space-separated fields; only the class name is needed here.
    # The strict unpack still rejects malformed lines.
    _, class_name, _, _, _, _, _, _ = line.split(' ')
    if class_name not in group_by_class:
        group_by_class[class_name] = []
        class_idx_map[class_name] = idx
        idx += 1
    group_by_class[class_name].append(line)

# A dedicated generator keeps the split reproducible without touching the global
# `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)

# NOTE(review): the split is made per annotation line, so lines belonging to the
# same image can land in both splits — consider splitting per image instead.
lines_train = []
lines_val = []
for class_name, class_lines in group_by_class.items():
    train_split = split_rng.sample(class_lines, int(len(class_lines) * TRAIN_FRACTION))
    train_lookup = set(train_split)
    # Validation gets every distinct line of the class that was not drawn for
    # training; dict.fromkeys de-duplicates while keeping file order, so the
    # result is deterministic (a plain set difference is not).
    val_split = list(dict.fromkeys(entry for entry in class_lines if entry not in train_lookup))
    lines_train.extend(train_split)
    lines_val.extend(val_split)

with open("flickr_logos_27_dataset/flickr_logos_27_dataset_training_set_annotation_trainsplit.txt", 'w') as f:
    f.writelines(entry + '\n' for entry in lines_train)

with open("flickr_logos_27_dataset/flickr_logos_27_dataset_training_set_annotation_valsplit.txt", 'w') as f:
    f.writelines(entry + '\n' for entry in lines_val)

print(class_idx_map)

{'Adidas': 1, 'Apple': 2, 'BMW': 3, 'Citroen': 4, 'Cocacola': 5, 'DHL': 6, 'Fedex': 7, 'Ferrari': 8, 'Ford': 9, 'Google': 10, 'Heineken': 11, 'HP': 12, 'Intel': 13, 'McDonalds': 14, 'Mini': 15, 'Nbc': 16, 'Nike': 17, 'Pepsi': 18, 'Porsche': 19, 'Puma': 20, 'RedBull': 21, 'Sprite': 22, 'Starbucks': 23, 'Texaco': 24, 'Unicef': 25, 'Vodafone': 26, 'Yahoo': 27}


In [None]:
# load logo-dataset via pytorch dataloader
import transforms as T
# Target image side length; in the visible cells only the commented-out
# torchvision transform pipeline refers to it.
input_size = 500
class myOwnDataset(torch.utils.data.Dataset):
    """Single-box detection dataset built from a Flickr Logos 27 split file.

    Every usable line of ``<annotation>flickr_logos_27_dataset_training_set_annotation_<phase>split.txt``
    contributes one sample. Lines whose box has non-positive width or height are
    skipped. Class names are translated to integer labels through the
    module-level ``class_idx_map`` built by the split cell, so that cell must
    have been run first.

    Attributes:
        image_names: image file stems, one entry per kept annotation line.
        annotation_dict: image stem -> target dict with ``boxes`` (1x4 float32,
            ``[x1, y1, x2, y2]``), ``labels`` (int64), ``area`` (float32) and
            ``iscrowd`` (int64, always 0).
        ht_wd: image stem -> ``[height, width]``.
    """

    def __init__(self, root = "flickr_logos_27_dataset/", annotation = "flickr_logos_27_dataset/", phase = 'train', transforms=None):
        """Parse the split file for ``phase`` and cache targets and image sizes.

        Args:
            root: dataset directory (with trailing slash) containing the image folder.
            annotation: directory (with trailing slash) containing the split files.
            phase: split name inserted into the annotation file name ('train' or 'val').
            transforms: optional callable ``(img, target) -> (img, target)``.
        """
        self.root = root
        self.image_dir = self.root + "flickr_logos_27_dataset_images/"
        self.transforms = transforms
        self.ht_wd = {}

        #read image names, class labels and bbox from annotations
        self.image_names = []
        self.annotation_dict = {}
        split_file = annotation+"flickr_logos_27_dataset_training_set_annotation_"+phase+"split.txt"
        with open(split_file, "r") as f:
            lines = f.read().split("\n")

        size_reported = False
        for line in lines:
            fields = line.split(' ')
            if len(fields) <= 1:
                # blank line (e.g. the trailing one)
                continue
            img_name, class_name, _, x1, y1, x2, y2, _ = fields
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            if x2 <= x1 or y2 <= y1:
                # degenerate box: zero or negative width/height
                continue

            image_key = img_name.split('.')[0]
            self.image_names.append(image_key)

            # NOTE(review): targets are keyed by image stem, so when several
            # annotation lines share an image the last line's box is used for
            # every one of those samples — confirm whether boxes should be
            # aggregated per image instead.
            self.annotation_dict[image_key] = {
                'boxes': torch.as_tensor([[x1, y1, x2, y2]], dtype=torch.float32),
                'labels': torch.tensor([class_idx_map[class_name]], dtype=torch.int64),
                'area': torch.tensor([(y2 - y1) * (x2 - x1)], dtype=torch.float32),
                'iscrowd': torch.tensor([0], dtype=torch.int64),
            }

            if image_key in self.ht_wd:
                # size already recorded for an earlier line of the same image
                continue
            # `with` releases the file handle right after the size is read
            with Image.open(os.path.join(self.image_dir, img_name)) as img:
                if not size_reported:
                    print("Image, size ", img.size)
                    size_reported = True
                width, height = img.size
            self.ht_wd[image_key] = [height, width]

    def __getitem__(self, index):
        """Return ``(image, target)`` for sample ``index``.

        The image is loaded as RGB. ``target`` is a fresh copy of the cached
        annotation with ``image_id`` set to ``index``.
        """
        image_key = self.image_names[index]
        img = Image.open(os.path.join(self.image_dir, image_key + ".jpg")).convert("RGB")

        # Hand out clones: the cached tensors are shared by every sample of the
        # same image, and a transform that edits the target in place (e.g. a
        # box flip — TODO confirm against transforms.py) would otherwise
        # corrupt the stored annotation for later reads.
        my_annotation = {key: value.clone() for key, value in self.annotation_dict[image_key].items()}
        my_annotation['image_id'] = torch.tensor([index])
        if self.transforms is not None:
            img, my_annotation = self.transforms(img, my_annotation)

        return img, my_annotation

    def __len__(self):
        """Number of samples (one per kept annotation line)."""
        return len(self.image_names)

    def get_height_and_width(self, idx):
        """Return ``(height, width)`` of the image behind sample ``idx``."""
        img_name = self.image_names[idx]
        ht, wd = self.ht_wd[img_name]
        return ht, wd
        
def get_transform(train='train'):
    """Compose the (image, target) transform pipeline for one phase.

    Tensor conversion is always applied; a horizontal flip with probability
    0.5 is added only when ``train`` equals the string 'train'.
    """
    pipeline = [T.ToTensor()]
    if train == 'train':
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

# def get_transform(phase='train'):
#     custom_transforms = []
# #     custom_transforms.append(torchvision.transforms.ToTensor())
#     if phase == 'train':
#         custom_transforms = [transforms.RandomResizedCrop(input_size),
#                             transforms.RandomHorizontalFlip(),
#                             transforms.ToTensor(),
#                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
#     else:
#         custom_transforms = [transforms.Resize(input_size),
#                             transforms.CenterCrop(input_size),
#                             transforms.ToTensor(),
#                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] 
#     return torchvision.transforms.Compose(custom_transforms)


# Build the training and validation datasets; each constructor parses its split file
my_dataset_train = myOwnDataset(phase='train', transforms=get_transform('train'))
print("Length of training set: ", len(my_dataset_train))

my_dataset_val = myOwnDataset(phase='val', transforms=get_transform('val'))
print("Length of val set: ", len(my_dataset_val))

# Fetch one training sample and show its target dict as a sanity check
image_tensor, example = my_dataset_train[1000]
print(" example ", example)

# Custom collate function for batching
def collate_fn(batch):
    """Transpose a list of samples into per-field tuples.

    A batch ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Batch size
train_batch_size = 4

# One DataLoader per phase, both built with identical settings
data_loader = {
    phase: torch.utils.data.DataLoader(phase_dataset,
                                       batch_size=train_batch_size,
                                       shuffle=True,
                                       num_workers=2,
                                       collate_fn=collate_fn)
    for phase, phase_dataset in (('train', my_dataset_train),
                                 ('val', my_dataset_val))
}


Image, size  (400, 500)
Length of training set:  3616
Image, size  (500, 461)
Length of val set:  915
 example  {'boxes': tensor([[204., 101., 333., 162.]]), 'labels': tensor([7]), 'area': tensor([7869.]), 'iscrowd': tensor([0]), 'image_id': tensor([1000])}


In [None]:
from engine import train_one_epoch, evaluate
import utils



# move model to the right device
model.to(device)

# construct an optimizer over the parameters that still require gradients
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

# let's train it for 10 epochs (this rebinds the num_epochs = 15 set in the config cell)
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader['train'], device, epoch, print_freq=10)
    # update the learning rate once per epoch, after the optimizer updates
    lr_scheduler.step()
    # evaluate on the validation split after each epoch
    evaluate(model, data_loader['val'], device=device)


Epoch: [0]  [  0/904]  eta: 0:21:33  lr: 0.000011  loss: 3.5820 (3.5820)  loss_classifier: 3.4253 (3.4253)  loss_box_reg: 0.1472 (0.1472)  loss_objectness: 0.0067 (0.0067)  loss_rpn_box_reg: 0.0028 (0.0028)  time: 1.4308  data: 0.2298  max mem: 4878
Epoch: [0]  [ 10/904]  eta: 0:17:25  lr: 0.000066  loss: 3.3826 (3.3224)  loss_classifier: 3.2782 (3.2163)  loss_box_reg: 0.0653 (0.0713)  loss_objectness: 0.0261 (0.0281)  loss_rpn_box_reg: 0.0057 (0.0066)  time: 1.1699  data: 0.0313  max mem: 6711
Epoch: [0]  [ 20/904]  eta: 0:18:14  lr: 0.000121  loss: 2.9382 (2.6855)  loss_classifier: 2.7525 (2.5632)  loss_box_reg: 0.0680 (0.0774)  loss_objectness: 0.0292 (0.0373)  loss_rpn_box_reg: 0.0064 (0.0077)  time: 1.2285  data: 0.0117  max mem: 6878
Epoch: [0]  [ 30/904]  eta: 0:17:39  lr: 0.000176  loss: 0.8363 (1.9868)  loss_classifier: 0.7265 (1.8546)  loss_box_reg: 0.0891 (0.0829)  loss_objectness: 0.0434 (0.0413)  loss_rpn_box_reg: 0.0075 (0.0081)  time: 1.2362  data: 0.0114  max mem: 6878
