# Single-shot Detection with sgrvinod and FastAI

This is an interactive notebook where you can change things around, experiment, and single out techniques anywhere in the process.

## The model components

This guide assumes a solid understanding of convolutional concepts and computation. If this assumption does not hold, please feel free to check my other [repos](https://github.com/Sylar257/Image-Captioning-Project) (and [this](https://github.com/Sylar257/Skin-cancer-detection-with-stacking)), which contain more content on these subjects.
Hence, in this notebook, we are going to jump right into our model and explain along the way why we need all these components.<br>
This might also be the place where you can experiment with most of your modifications to the architecture: using more powerful base models, adding regularization layers, etc.

In [1]:
from torch import nn
from utils import *
import torch.nn.functional as F
from math import sqrt
from itertools import product as product
import torchvision
import time
import torch.backends.cudnn as cudnn
import torch.utils.data
import torch
from torch.utils.data import Dataset
import json
import os
from PIL import Image
from utils import transform

In [2]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
# torch device strings are lower-case ("cpu"); "CPU" is rejected by torch.device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class VGGBase(nn.Module):
    """
    VGG-16 base network used for low-level feature extraction.

    The convolutional stack follows VGG-16; the fully connected layers fc6 and fc7 are replaced by the
    convolutions conv6 (dilated) and conv7. Pre-trained torchvision VGG-16 weights are loaded on construction.
    """

    def __init__(self):
        super(VGGBase, self).__init__()

        # Standard convolutional layers in VGG16
        # We have an input size of 300 by 300
        # Size formula used in the comments below: output = (input + 2*padding - kernel) / stride + 1
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)   # stride = 1, output = (300+2-3)/1+1 = 300
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)  # output = 300 as before
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)          # output = (300-2)/2+1 = 150

        self.conv2_1 = nn.Conv2d(64,  128, kernel_size=3, padding=1)# output = (150+2-3)/1+1 = 150
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)# output = (150+2-3)/1+1 = 150
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)          # output = (150-2)/2 +1  = 75

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # round up (not down) so the size stays even
        # output = ceil((75-2)/2)+1 = 38; with floor we would get 37 here, which is an odd number

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1) # output = (38+2-3)/1+1 = 38
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1) # output = (38+2-3)/1+1 = 38
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1) # output = (38+2-3)/1+1 = 38
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)           # output = (38-2)/2 +1  = 19

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1) # output = (19+2-3)/1+1 = 19
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1) # output = (19+2-3)/1+1 = 19
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1) # output = (19+2-3)/1+1 = 19
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)  # size is retained here thanks to padding and a stride of 1
        # output = (19+2-3)/1+1 = 19

        # Here we replace FC6 and FC7 with convolutions, the technique used by sgrvinod (same as the original paper)
        # A dilation of 6 makes the 3x3 kernel cover 3 + 2*(6-1) = 13 positions, hence the padding of 6
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) # output = (19+12-3-2*(6-1))/1+1 = 19

        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)                       # output = (19-1)/1+1 = 19

        # Load pretrained layers
        # NOTE: load_pretrained_layers() matches parameters by position, so the layer definition order above matters
        self.load_pretrained_layers()

    def forward(self, image):
        """
        Forward run with an image input of size 300 by 300
        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: lower-level feature maps conv4_3 (N, 512, 38, 38) and conv7 (N, 1024, 19, 19)
        """
        out = F.relu(self.conv1_1(image))   # (N,64,300,300)
        out = F.relu(self.conv1_2(out))     # (N,64,300,300)
        out = self.pool1(out)               # (N,64,150,150)

        out = F.relu(self.conv2_1(out))  # (N,128,150,150)
        out = F.relu(self.conv2_2(out))  # (N,128,150,150)
        out = self.pool2(out)            # (N,128, 75, 75)

        out = F.relu(self.conv3_1(out))  # (N,256, 75, 75)
        out = F.relu(self.conv3_2(out))  # (N,256, 75, 75)
        out = F.relu(self.conv3_3(out))  # (N,256, 75, 75)
        out = self.pool3(out)            # (N,256, 38, 38), it would have been 37 if not for ceil_mode = True

        out = F.relu(self.conv4_1(out))  # (N, 512, 38, 38)
        out = F.relu(self.conv4_2(out))  # (N, 512, 38, 38)
        out = F.relu(self.conv4_3(out))  # (N, 512, 38, 38)
        # here we keep the conv4_3 activations (taken before pool4) as the first returned feature map
        conv4_3_feats = out              # (N, 512, 38, 38)
        out = self.pool4(out)            # (N, 512, 19, 19)

        out = F.relu(self.conv5_1(out))  # (N, 512, 19, 19)
        out = F.relu(self.conv5_2(out))  # (N, 512, 19, 19)
        out = F.relu(self.conv5_3(out))  # (N, 512, 19, 19)
        out = self.pool5(out)            # (N, 512, 19, 19), pool5 does not reduce dimensions

        out = F.relu(self.conv6(out))    # (N, 1024, 19, 19)

        conv7_feats = F.relu(self.conv7(out))  # (N, 1024, 19, 19)

        # Lower-level feature maps
        return conv4_3_feats, conv7_feats

    def load_pretrained_layers(self):
        """
        Initialise this network with the pre-trained VGG-16 weights from torchvision.

        The convolutional parameters are copied over by position; the fc6 and fc7 weights are reshaped
        into convolution kernels and subsampled to fit conv6 and conv7.
        """
        # Current state of base
        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # Pretrained VGG base
        pretrained_state_dict = torchvision.models.vgg16(pretrained=True).state_dict()
        pretrained_param_names = list(pretrained_state_dict.keys())

        # Transfer conv. parameters from pretrained model to current model
        # The i-th parameter here receives the i-th pretrained parameter, so both key orders must line up
        for i, param in enumerate(param_names[:-4]):  # excluding conv6 and conv7 parameters (weight + bias each)
            state_dict[param] = pretrained_state_dict[pretrained_param_names[i]]

        # Convert fc6, fc7 to convolutional layers, and subsample (by decimation) to sizes of conv6 and conv7
        # NOTE(review): `decimate` is not defined in this part of the file; presumably it comes from `utils`
        # and keeps every m-th element along each dimension (None = keep all) - confirm against utils.py
        # fc6
        conv_fc6_weight = pretrained_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)  # (4096, 512, 7, 7)
        conv_fc6_bias = pretrained_state_dict['classifier.0.bias']  # (4096)
        state_dict['conv6.weight'] = decimate(conv_fc6_weight, m=[4, None, 3, 3])  # (1024, 512, 3, 3)
        state_dict['conv6.bias'] = decimate(conv_fc6_bias, m=[4])  # (1024)
        # fc7
        conv_fc7_weight = pretrained_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)  # (4096, 4096, 1, 1)
        conv_fc7_bias = pretrained_state_dict['classifier.3.bias']  # (4096)
        state_dict['conv7.weight'] = decimate(conv_fc7_weight, m=[4, 4, None, None])  # (1024, 1024, 1, 1)
        state_dict['conv7.bias'] = decimate(conv_fc7_bias, m=[4])  # (1024)

        # Note: an FC layer of size (K) operating on a flattened version (C*H*W) of a 2D image of size (C, H, W)...
        # ...is equivalent to a convolutional layer with kernel size (H, W), input channels C, output channels K...
        # ...operating on the 2D image of size (C, H, W) without padding

        self.load_state_dict(state_dict)

        print("\nLoaded base model with pre-trained weights\n")

In [4]:
class AuxiliaryConvolutions(nn.Module):
    """
    Extra convolutional stages stacked on top of the base model's conv7 output.

    Every stage is a 1x1 convolution that reduces the channel count followed by a 3x3 convolution that
    shrinks the spatial size, yielding progressively smaller feature maps for object detection.
    """

    def __init__(self):
        super(AuxiliaryConvolutions, self).__init__()

        # Stage 8: 19x19 -> 10x10 (stride 2 with padding 1)
        self.conv8_1 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1)
        self.conv8_2 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1)

        # Stage 9: 10x10 -> 5x5, since (10+2-3)/2+1 is rounded down
        self.conv9_1 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1)
        self.conv9_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)

        # Stage 10: 5x5 -> 3x3 (unpadded 3x3 kernel)
        self.conv10_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1)
        self.conv10_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3)

        # Stage 11: 3x3 -> 1x1 (unpadded 3x3 kernel)
        self.conv11_1 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1)
        self.conv11_2 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3)

        self.init_conv2d()

    def init_conv2d(self):
        """
        Give every convolution Xavier-uniform weights and a zero bias.
        """
        for layer in self.children():
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, conv7_feats):
        """
        Run the four auxiliary stages, keeping the output of each stage.

        conv7_feats: (N, 1024, 19, 19)
        return: higher-level feature maps conv8_2, conv9_2, conv10_2, and conv11_2
        """
        reduced = F.relu(self.conv8_1(conv7_feats))       # (N, 256, 19, 19)
        conv8_2_feats = F.relu(self.conv8_2(reduced))     # (N, 512, 10, 10)

        reduced = F.relu(self.conv9_1(conv8_2_feats))     # (N, 128, 10, 10)
        conv9_2_feats = F.relu(self.conv9_2(reduced))     # (N, 256, 5, 5)

        reduced = F.relu(self.conv10_1(conv9_2_feats))    # (N, 128, 5, 5)
        conv10_2_feats = F.relu(self.conv10_2(reduced))   # (N, 256, 3, 3)

        reduced = F.relu(self.conv11_1(conv10_2_feats))   # (N, 128, 3, 3)
        conv11_2_feats = F.relu(self.conv11_2(reduced))   # (N, 256, 1, 1)

        # Higher-level feature maps
        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
    

In [5]:
class PredictionConvolutions(nn.Module):
    """
    Prediction heads that turn the six feature maps into box offsets and class scores.

    Boxes are predicted as offsets (g_{c_x}, g_{c_y}, g_w, g_h) relative to the 8732 default priors;
    see 'cxcy_to_gcxgcy' in utils.py for the encoding definition.

    Class scores hold one score per object class for each of the 8732 bounding boxes.
    A high score for 'background' = no object
    """

    def __init__(self, n_classes):
        super(PredictionConvolutions, self).__init__()

        self.n_classes = n_classes

        # Number of prior boxes considered at every position of each feature map
        priors_per_cell = dict(conv4_3=4, conv7=6, conv8_2=6, conv9_2=6, conv10_2=4, conv11_2=4)

        # Every head is a 3x3 convolution with padding 1, so the spatial size of its feature map is kept
        same_conv = dict(kernel_size=3, padding=1)

        # LOCALIZATION heads: 4 offsets per prior box
        self.loc_conv4_3 = nn.Conv2d(512, 4 * priors_per_cell['conv4_3'], **same_conv)    # 38 x 38
        self.loc_conv7 = nn.Conv2d(1024, 4 * priors_per_cell['conv7'], **same_conv)       # 19 x 19
        self.loc_conv8_2 = nn.Conv2d(512, 4 * priors_per_cell['conv8_2'], **same_conv)    # 10 x 10
        self.loc_conv9_2 = nn.Conv2d(256, 4 * priors_per_cell['conv9_2'], **same_conv)    # 5 x 5
        self.loc_conv10_2 = nn.Conv2d(256, 4 * priors_per_cell['conv10_2'], **same_conv)  # 3 x 3
        self.loc_conv11_2 = nn.Conv2d(256, 4 * priors_per_cell['conv11_2'], **same_conv)  # 1 x 1

        # CLASS heads: n_classes scores per prior box
        self.cl_conv4_3 = nn.Conv2d(512, n_classes * priors_per_cell['conv4_3'], **same_conv)
        self.cl_conv7 = nn.Conv2d(1024, n_classes * priors_per_cell['conv7'], **same_conv)
        self.cl_conv8_2 = nn.Conv2d(512, n_classes * priors_per_cell['conv8_2'], **same_conv)
        self.cl_conv9_2 = nn.Conv2d(256, n_classes * priors_per_cell['conv9_2'], **same_conv)
        self.cl_conv10_2 = nn.Conv2d(256, n_classes * priors_per_cell['conv10_2'], **same_conv)
        self.cl_conv11_2 = nn.Conv2d(256, n_classes * priors_per_cell['conv11_2'], **same_conv)

        self.init_conv2d()

    def init_conv2d(self):
        """
        Give every convolution Xavier-uniform weights and a zero bias.
        """
        for layer in self.children():
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.)

    @staticmethod
    def _per_prior(head_output, batch_size, values_per_prior):
        """
        Reshape a head output (N, priors * values, H, W) into (N, H * W * priors, values).

        The channel axis is moved last first, so that after .view() the rows follow the prior-box order
        (row by row over the feature map, then prior by prior within a position).
        .contiguous() stores the permuted tensor in one chunk of memory, which .view() requires.
        """
        channels_last = head_output.permute(0, 2, 3, 1).contiguous()
        return channels_last.view(batch_size, -1, values_per_prior)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """
        Forward propagation.
        :param conv4_3_feats: conv4_3 feature map, a tensor of dimensions (N, 512, 38, 38)
        :param conv7_feats: conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :param conv8_2_feats: conv8_2 feature map, a tensor of dimensions (N, 512, 10, 10)
        :param conv9_2_feats: conv9_2 feature map, a tensor of dimensions (N, 256, 5, 5)
        :param conv10_2_feats: conv10_2 feature map, a tensor of dimensions (N, 256, 3, 3)
        :param conv11_2_feats: conv11_2 feature map, a tensor of dimensions (N, 256, 1, 1)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        batch_size = conv4_3_feats.size(0)

        # Feature maps and their heads, all listed in the same (prior-box) order
        feature_maps = (conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats)
        loc_heads = (self.loc_conv4_3, self.loc_conv7, self.loc_conv8_2,
                     self.loc_conv9_2, self.loc_conv10_2, self.loc_conv11_2)
        class_heads = (self.cl_conv4_3, self.cl_conv7, self.cl_conv8_2,
                       self.cl_conv9_2, self.cl_conv10_2, self.cl_conv11_2)

        # Box offsets w.r.t. the priors; boxes per map: 5776, 2166, 600, 150, 36, 4
        loc_parts = [self._per_prior(head(fmap), batch_size, 4)
                     for head, fmap in zip(loc_heads, feature_maps)]

        # Class scores for the same boxes, in the same order
        class_parts = [self._per_prior(head(fmap), batch_size, self.n_classes)
                       for head, fmap in zip(class_heads, feature_maps)]

        # A total of 8732 boxes, concatenated in this specific order
        locs = torch.cat(loc_parts, dim=1)              # (N, 8732, 4)
        classes_scores = torch.cat(class_parts, dim=1)  # (N, 8732, n_classes)

        return locs, classes_scores

In [6]:
class SSD300(nn.Module):
    """
    The SSD300 network: a wrapper that encapsulates the base VGG network, the auxiliary convolutions
    and the prediction convolutions, and that owns the 8732 prior boxes.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of classes scored per box; class 0 is treated as background
        """
        super(SSD300, self).__init__()

        self.n_classes = n_classes

        self.base = VGGBase()
        self.aux_convs = AuxiliaryConvolutions()
        self.pred_convs = PredictionConvolutions(n_classes)

        # Since lower level features (conv4_3_feats) have considerably larger scales, we take the L2 norm and rescale
        # Rescale factor is initially set at 20, but is learned for each channel during back-prop
        self.rescale_factor = nn.Parameter(torch.FloatTensor(1, 512, 1, 1))  # there are 512 channels in conv4_3_feats
        nn.init.constant_(self.rescale_factor, 20)

        # The two lines above are a simple example of how to add a learnable parameter to our computation

        # Prior boxes
        self.priors_cxcy = self.create_prior_boxes()  # with shape of (8732, 4)

    def create_prior_boxes(self):
        """
        Create the 8732 prior (default) boxes for the SSD300, as defined in the paper.
        :return: prior boxes in center-size coordinates, a tensor of dimensions (8732, 4)
        """
        fmap_dims = {'conv4_3': 38,
                     'conv7': 19,
                     'conv8_2': 10,
                     'conv9_2': 5,
                     'conv10_2': 3,
                     'conv11_2': 1}

        obj_scales = {'conv4_3': 0.1,
                      'conv7': 0.2,
                      'conv8_2': 0.375,
                      'conv9_2': 0.55,
                      'conv10_2': 0.725,
                      'conv11_2': 0.9}

        aspect_ratios = {'conv4_3': [1., 2., 0.5],
                         'conv7': [1., 2., 3., 0.5, .333],
                         'conv8_2': [1., 2., 3., 0.5, .333],
                         'conv9_2': [1., 2., 3., 0.5, .333],
                         'conv10_2': [1., 2., 0.5],
                         'conv11_2': [1., 2., 0.5]}

        fmaps = list(fmap_dims.keys())

        prior_boxes = []

        # The loop order (feature map -> row -> column -> ratio) defines the prior order that the
        # prediction convolutions' outputs have to match
        for k, fmap in enumerate(fmaps):
            for i in range(fmap_dims[fmap]):
                for j in range(fmap_dims[fmap]):
                    # Center of cell (i, j) as a fraction of the image size
                    cx = (j + 0.5) / fmap_dims[fmap]
                    cy = (i + 0.5) / fmap_dims[fmap]

                    for ratio in aspect_ratios[fmap]:
                        prior_boxes.append([cx, cy, obj_scales[fmap] * sqrt(ratio), obj_scales[fmap] / sqrt(ratio)])

                        # For an aspect ratio of 1, use an additional prior whose scale is the geometric mean of the
                        # scale of the current feature map and the scale of the next feature map
                        if ratio == 1.:
                            try:
                                additional_scale = sqrt(obj_scales[fmap] * obj_scales[fmaps[k + 1]])
                            # For the last feature map, there is no "next" feature map
                            except IndexError:
                                additional_scale = 1.
                            prior_boxes.append([cx, cy, additional_scale, additional_scale])

        prior_boxes = torch.FloatTensor(prior_boxes).to(device)  # (8732, 4)
        prior_boxes.clamp_(0, 1)  # (8732, 4)

        return prior_boxes

    def forward(self, image):
        """
        Forward propagation

        image: (N, 3, 300, 300)
        return:  8732 locations and class scores (i.e.  w.r.t each prior box) for the given image(s)
        """

        # Run VGG base network convolutions (lower level feature map generators, up to conv7)
        conv4_3_feats, conv7_feats = self.base(image)   # (N, 512, 38, 38),  (N, 1024, 19, 19)

        # Rescale conv4_3 after L2 norm using our learnable parameter
        # NOTE(review): a position whose 512 activations are all zero has norm 0 and would produce NaN here
        norm = conv4_3_feats.pow(2).sum(dim=1, keepdim=True).sqrt()  # (N, 1, 38, 38)
        conv4_3_feats = conv4_3_feats / norm                         # (N, 512, 38, 38) this step is done by broadcasting
        conv4_3_feats = conv4_3_feats * self.rescale_factor          # (N, 512, 38, 38)

        # Run auxiliary convolutions (higher level feature map extraction)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.aux_convs(conv7_feats)
        # (N, 512, 10, 10), (N, 256, 5, 5), (N, 256, 3, 3), (N, 256, 1, 1)

        # Run prediction convolutions (predict offset w.r.t. priors and classes in each resulting location)
        locs, classes_scores = self.pred_convs(conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats,
                                               conv10_2_feats, conv11_2_feats)
        # (N, 8732, 4), (N, 8732, n_classes)

        return locs, classes_scores

    def detect_objects(self, predicted_locs, predicted_scores, min_score, max_overlap, top_k):
        """
        Decipher the 8732 locations and class scores (output of our forward pass) to detect objects.

        For each class, perform Non-Maximum Suppression (NMS) on boxes that are above a minimum score

        predicted_locs: predicted locations w.r.t the 8732 prior boxes, a tensor of (N, 8732, 4)
        predicted_scores: predicted class scores for each of the predicted locations, a tensor of (N, 8732, n_classes)
        min_score: the minimum score for a box to be considered a match for a CERTAIN CLASS
        max_overlap: the maximum overlap that we allow. For any pair of boxes with higher overlap, the one with the lower class score is suppressed
        top_k: if there are a lot of resulting detections across all classes, keep only the top_k

        return: detections (boxes, labels, and scores), lists of length batch_size N
        """
        batch_size = predicted_locs.size(0)  # N
        n_priors = self.priors_cxcy.size(0)  # 8732
        predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

        # Lists to store final predicted boxes, labels, and scores for all images
        all_images_boxes = list()
        all_images_labels = list()
        all_images_scores = list()

        assert n_priors == predicted_scores.size(1) == predicted_locs.size(1)

        for i in range(batch_size):
            # Convert deviations from the prior boxes to (c_x, c_y, w, h), and then
            # from center-size coordinates to boundary coordinates (x_min, y_min, x_max, y_max)
            decoded_locs = cxcy_to_xy(gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))  # (8732, 4)

            # Lists to store boxes, labels and scores for this image
            image_boxes = list()
            image_labels = list()
            image_scores = list()

            # Operations for each class. Class 0 is not included here because it denotes background (negative)
            for c in range(1, self.n_classes):
                # Keep only predicted boxes and scores where scores for this class are above min_score
                class_scores = predicted_scores[i][:, c]  # (8732)
                score_above_min_score = class_scores > min_score  # mask tensor, for indexing
                n_above_min_score = score_above_min_score.sum().item()
                if n_above_min_score == 0:
                    continue
                # Here, we retain the scores & locs of the boxes with a score higher than the threshold
                class_scores = class_scores[score_above_min_score]  # (n_qualified), n_qualified <= 8732
                class_decoded_locs = decoded_locs[score_above_min_score]  # (n_qualified, 4)

                # Sort predicted boxes and scores by scores
                class_scores, sort_ind = class_scores.sort(dim=0, descending=True)  # (n_qualified), (n_qualified)
                class_decoded_locs = class_decoded_locs[sort_ind]  # (n_qualified, 4)

                # Find the overlap between predicted boxes
                overlap = find_jaccard_overlap(class_decoded_locs, class_decoded_locs)  # (n_qualified, n_qualified)

                # Non-Maximum Suppression (NMS)

                # A boolean tensor to keep track of which predicted boxes to suppress
                # True implies suppress, False implies don't suppress
                suppress = torch.zeros(n_above_min_score, dtype=torch.bool, device=overlap.device)  # (n_qualified)

                # Consider each box in order of decreasing scores
                for box in range(class_decoded_locs.size(0)):
                    # If this box is already marked for suppression
                    if suppress[box]:
                        continue

                    # Suppress boxes whose overlaps (with this box) are greater than maximum overlap
                    # The OR keeps boxes that were suppressed earlier marked as suppressed
                    suppress = suppress | (overlap[box] > max_overlap)

                    # Don't suppress this box, even though it has an overlap of 1 with itself
                    suppress[box] = False

                # Store only unsuppressed boxes for this class
                keep = ~suppress
                image_boxes.append(class_decoded_locs[keep])
                image_labels.append(torch.LongTensor(keep.sum().item() * [c]).to(device))
                image_scores.append(class_scores[keep])

            # If no object in any class is found, store a placeholder for 'background'
            if len(image_boxes) == 0:
                image_boxes.append(torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
                image_labels.append(torch.LongTensor([0]).to(device))
                image_scores.append(torch.FloatTensor([0.]).to(device))

            # Concatenate into single tensors
            image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
            image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
            image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
            n_objects = image_scores.size(0)

            # Keep only the top k objects
            if n_objects > top_k:
                image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
                image_scores = image_scores[:top_k]  # (top_k)
                image_boxes = image_boxes[sort_ind][:top_k]  # (top_k, 4)
                image_labels = image_labels[sort_ind][:top_k]  # (top_k)

            # Append to lists that store predicted boxes and scores for all images
            all_images_boxes.append(image_boxes)
            all_images_labels.append(image_labels)
            all_images_scores.append(image_scores)

        return all_images_boxes, all_images_labels, all_images_scores  # lists of length batch_size

In [7]:
class MultiBoxLoss(nn.Module):
    """
    The Multibox loss function for SSD300 architecture, which is a combination of:

    1) a localization loss for the predicted locations of the boxes, and
    2) a confidence loss for the predicted class scores.
    """

    def __init__(self, priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1.):
        """
        priors_cxcy: priors' (c_x, c_y, w, h)
        threshold: priors overlapping less than 'threshold' with every object are set to class background
        neg_pos_ratio: number of hard negatives kept per positive prior (hard negative mining, see forward())
        alpha: weight of the localization loss relative to the confidence loss
        """

        super(MultiBoxLoss, self).__init__()
        self.priors_cxcy = priors_cxcy
        self.priors_xy = cxcy_to_xy(priors_cxcy)
        self.threshold = threshold
        self.neg_pos_ratio = neg_pos_ratio
        self.alpha = alpha

        # The two loss functions for localization and classification
        # NOTE(review): despite the attribute name this is a plain L1 loss, not nn.SmoothL1Loss;
        # left unchanged because switching would alter training behaviour - confirm the intent
        self.smooth_l1 = nn.L1Loss()
        # Per-element losses are needed for hard negative mining, hence no reduction
        # ('reduction' replaces the deprecated 'reduce=False' argument)
        self.cross_entropy = nn.CrossEntropyLoss(reduction='none')

    def forward(self, predicted_locs, predicted_scores, boxes, labels):
        """
        Forward propagation.

        predicted_locs:   predicted locations/boxes w.r.t the 8732 priors, (N, 8732, 4)
        predicted_scores: predicted class scores for each of the encoded locations, (N, 8732, n_classes)
        boxes:            ground truth boxes,  a list of N tensors
        labels:           ground truth labels, a list of N tensors
        return:           the multibox loss, a scalar

        NOTE(review): every image is presumably expected to contain at least one ground-truth box;
        the per-prior max over an empty overlap matrix is not handled here - confirm against the data.
        """

        batch_size = predicted_locs.size(0)
        n_priors = self.priors_cxcy.size(0)
        n_classes = predicted_scores.size(2)

        assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

        true_locs = torch.zeros((batch_size, n_priors, 4), dtype=torch.float).to(device)  # (N, 8732, 4)
        true_classes = torch.zeros((batch_size, n_priors), dtype=torch.long).to(device)   # (N, 8732)

        # For each image in the minibatch
        for i in range(batch_size):
            n_objects = boxes[i].size(0)  # the number of objects in the given image

            overlap = find_jaccard_overlap(boxes[i], self.priors_xy)  # (n_objects, 8732)

            # For each prior, find the object that has the maximum overlap
            overlap_for_each_prior, object_for_each_prior = overlap.max(dim=0)  # (8732), (8732)

            # We don't want a situation where an object is not represented in our positive (non-background) priors:
            # 1. An object might not be the best object for any prior, and is therefore not in object_for_each_prior
            # 2. All priors with the object may be assigned as background based on the threshold (0.5 by default)

            # To remedy this, first find the prior that has the maximum overlap for each object.
            _, prior_for_each_object = overlap.max(dim=1)  # (n_objects)

            # Then, assign each object to the corresponding maximum-overlap prior. (This fixes 1.)
            object_for_each_prior[prior_for_each_object] = torch.LongTensor(range(n_objects)).to(device)

            # To ensure these priors qualify, artificially give them an overlap of greater than 0.5. (This fixes 2.)
            overlap_for_each_prior[prior_for_each_object] = 1.

            # Labels for each prior
            label_for_each_prior = labels[i][object_for_each_prior]  # (8732)
            # Set priors whose overlaps with objects are less than the threshold to be background (no object)
            label_for_each_prior[overlap_for_each_prior < self.threshold] = 0  # (8732)

            # Store
            true_classes[i] = label_for_each_prior

            # Encode center-size object coordinates into the form we regressed predicted boxes to
            true_locs[i] = cxcy_to_gcxgcy(xy_to_cxcy(boxes[i][object_for_each_prior]), self.priors_cxcy)  # (8732, 4)

        # Identify priors that are positive (object/non-background)
        positive_priors = true_classes != 0  # (N, 8732)

        # LOCALIZATION LOSS

        # Localization loss is computed only over positive (non-background) priors
        loc_loss = self.smooth_l1(predicted_locs[positive_priors], true_locs[positive_priors])  # (), scalar

        # Note: indexing with a mask tensor flattens the tensor when indexing is across multiple dimensions (N & 8732)
        # So, if predicted_locs has the shape (N, 8732, 4), predicted_locs[positive_priors] will have (total positives, 4)

        # CONFIDENCE LOSS

        # Confidence loss is computed over positive priors and the most difficult (hardest) negative priors in each image
        # That is, FOR EACH IMAGE,
        # we will take the hardest (neg_pos_ratio * n_positives) negative priors, i.e where there is maximum loss
        # This is called Hard Negative Mining - it concentrates on hardest negatives in each image, and also minimizes pos/neg imbalance

        # Number of positive and hard-negative priors per image
        n_positives = positive_priors.sum(dim=1)  # (N)
        n_hard_negatives = self.neg_pos_ratio * n_positives  # (N)

        # First, find the loss for all priors
        conf_loss_all = self.cross_entropy(predicted_scores.view(-1, n_classes), true_classes.view(-1))  # (N * 8732)
        conf_loss_all = conf_loss_all.view(batch_size, n_priors)  # (N, 8732)

        # We already know which priors are positive
        conf_loss_pos = conf_loss_all[positive_priors]  # (sum(n_positives))

        # Next, find which priors are hard-negative
        # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
        conf_loss_neg = conf_loss_all.clone()  # (N, 8732)
        conf_loss_neg[positive_priors] = 0.  # (N, 8732), positive priors are ignored (never in top n_hard_negatives)
        conf_loss_neg, _ = conf_loss_neg.sort(dim=1, descending=True)  # (N, 8732), sorted by decreasing hardness
        hardness_ranks = torch.LongTensor(range(n_priors)).unsqueeze(0).expand_as(conf_loss_neg).to(device)  # (N, 8732)
        hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)
        conf_loss_hard_neg = conf_loss_neg[hard_negatives]  # (sum(n_hard_negatives))

        # As in the paper, averaged over positive priors only, although computed over both positive and hard-negative priors
        conf_loss = (conf_loss_hard_neg.sum() + conf_loss_pos.sum()) / n_positives.sum().float()  # (), scalar

        # TOTAL LOSS

        return conf_loss + self.alpha * loc_loss

In [8]:
class PascalVOCDataset(Dataset):
    """
    A PyTorch Dataset class to be used with a DataLoader later.

    The image paths and per-image object annotations are read from two JSON files,
    '<SPLIT>_images.json' and '<SPLIT>_objects.json', located in `data_folder`.
    """

    def __init__(self, data_folder, split, keep_difficult=False):
        """
        :param data_folder: folder where the JSON data files are stored
        :param split: must be either 'TRAIN' or 'TEST' (case-insensitive, it is upper-cased here)
        :param keep_difficult: keep or discard objects flagged as difficult (a property that comes with the dataset)
        """
        self.split = split.upper()

        assert self.split in {'TRAIN', 'TEST'}, "split must be 'TRAIN' or 'TEST'"

        self.data_folder = data_folder
        self.keep_difficult = keep_difficult

        # Entries of the images file are passed to Image.open() in __getitem__
        with open(os.path.join(data_folder, self.split + '_images.json'), 'r') as j:
            self.images = json.load(j)
        # Entries of the objects file are dicts with 'boxes', 'labels' and 'difficulties' keys
        with open(os.path.join(data_folder, self.split + '_objects.json'), 'r') as j:
            self.objects = json.load(j)

        # The two files are indexed in parallel, so they must describe the same number of images
        assert len(self.images) == len(self.objects), "images and objects files differ in length"

    def __getitem__(self, i):
        """
        Load the i-th image and its annotations, optionally drop difficult objects, then apply `transform`.

        :param i: index of the sample
        :return: the 4-tuple produced by `transform(image, boxes, labels, difficulties, split)`
        """
        # Read image
        image = Image.open(self.images[i], mode='r')
        image = image.convert('RGB')

        # Read objects in this image (bounding boxes, labels, difficulties)
        objects = self.objects[i]
        boxes = torch.FloatTensor(objects['boxes'])  # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])  # (n_objects)
        difficulties = torch.ByteTensor(objects['difficulties'])  # (n_objects)

        # Discard difficult objects, if specified
        if not self.keep_difficult:
            # Explicit boolean mask: indexing with a uint8 tensor (the former `1 - difficulties`)
            # is deprecated in PyTorch. Assumes the flags are 0/1, as they are used here.
            keep = difficulties == 0
            boxes = boxes[keep]
            labels = labels[keep]
            difficulties = difficulties[keep]

        # Apply transformations (the behaviour of `transform` depends on the split)
        image, boxes, labels, difficulties = transform(image, boxes, labels, difficulties, self.split)

        return image, boxes, labels, difficulties

    def __len__(self):
        """Number of images in this split."""
        return len(self.images)

    def collate_fn(self, batch):
        """
        Since each image may have a different number of objects, we need a collate function
        (to be passed to the DataLoader).

        Images are stacked into a single tensor; boxes, labels and difficulties have varying
        sizes per image, so they are returned as lists.

        :param batch: an iterable of N tuples from __getitem__()
        :return: a tensor of images, and lists of varying-size tensors of bounding boxes, labels and difficulties
        """
        images = list()
        boxes = list()
        labels = list()
        difficulties = list()

        for sample in batch:
            images.append(sample[0])
            boxes.append(sample[1])
            labels.append(sample[2])
            difficulties.append(sample[3])

        # All images must share one shape for stacking
        # (presumably (3, 300, 300) after `transform` -- confirm against utils.transform)
        images = torch.stack(images, dim=0)

        return images, boxes, labels, difficulties  # tensor (N, ...), 3 lists of N tensors each

In [9]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    One epoch's training.

    Besides its arguments, this function reads the notebook-level names `device`,
    `grad_clip` and `print_freq`, and uses the helpers `AverageMeter` and
    `clip_gradient` (not defined in this cell; presumably provided by `utils`).

    :param train_loader: DataLoader for training data; yields (images, boxes, labels, difficulties)
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number (only used in the status printout)
    """
    model.train()  # training mode enables dropout

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    start = time.time()

    # Batches (the difficulty flags are not needed for training)
    for i, (images, boxes, labels, _) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to default device
        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        boxes = [b.to(device) for b in boxes]
        labels = [l.to(device) for l in labels]

        # Forward prop.
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients, if necessary
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update model
        optimizer.step()

        # loss.item() detaches the value, so the meter does not keep the graph alive
        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_loader),
                                                                  batch_time=batch_time,
                                                                  data_time=data_time, loss=losses))
    # No explicit `del` of the per-batch tensors here: they are locals and are released when the
    # function returns, and deleting them unconditionally raised UnboundLocalError on an empty loader.


def validate(val_loader, model, criterion):
    """
    Run one full pass over the validation data and report the mean loss.

    Reads the notebook-level names `device` and `print_freq`, and uses the
    `AverageMeter` helper (not defined in this cell).

    :param val_loader: DataLoader for validation data
    :param model: model
    :param criterion: MultiBox loss
    :return: average validation loss
    """
    model.eval()  # evaluation mode switches dropout off

    batch_time = AverageMeter()
    losses = AverageMeter()

    tick = time.time()

    # Gradients are never needed here; disabling them also keeps memory usage down
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            images, boxes, labels, _ = batch  # difficulty flags are not used for the loss

            # Everything the loss needs goes onto the default device
            images = images.to(device)  # (N, 3, 300, 300)
            boxes = [box.to(device) for box in boxes]
            labels = [label.to(device) for label in labels]

            # Forward pass followed by the loss for this batch
            predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)
            batch_loss = criterion(predicted_locs, predicted_scores, boxes, labels)

            # Weight the running loss by the number of images in the batch
            losses.update(batch_loss.item(), images.size(0))
            batch_time.update(time.time() - tick)

            tick = time.time()

            # Only every print_freq-th batch reports its status
            if step % print_freq != 0:
                continue

            status = ('[{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t')
            print(status.format(step, len(val_loader), batch_time=batch_time, loss=losses))

    print('\n * LOSS - {loss.avg:.3f}\n'.format(loss=losses))

    return losses.avg

In [10]:
# ---- Data ----
# Folder that holds the JSON data files
data_folder = './'
# Forwarded to PascalVOCDataset: True keeps objects flagged as difficult
keep_difficult = True
# One class per entry of label_map
n_classes = len(label_map)

# ---- Optimisation hyper-parameters ----
# Base learning rate
lr = 1e-3
momentum = 0.9
weight_decay = 5e-4
# Gradient clipping value; None disables clipping (consider it when using a high learning rate)
grad_clip = None

# ---- Schedule and bookkeeping ----
# Path to a model checkpoint to resume from, or None to start from scratch
checkpoint = None
batch_size = 8
# First epoch to run
start_epoch = 0
# Total number of training epochs to run (no early stopping)
epochs = 200
# Number of epochs since the validation loss last improved
epochs_since_improvement = 0
# Start from a high loss so the first validation counts as an improvement
best_loss = 100.

# ---- Runtime ----
# Number of worker processes for the DataLoaders
workers = 4
# Print training / validation status every this many batches
print_freq = 200

cudnn.benchmark = True

In [None]:
# Training driver: build (or restore) the model and optimizer, create the data loaders, then
# alternate one training epoch and one validation pass, saving a checkpoint after every epoch.
# (A module-level `global` statement is a no-op, so none is needed in this cell.)

# Initialize model or load checkpoint
if checkpoint is None:
    model = SSD300(n_classes)
    # Initialize the optimizer, with twice the default learning rate for biases
    biases = list()
    not_biases = list()
    for param_name, param in model.named_parameters():
        if param.requires_grad:
            if param_name.endswith('.bias'):
                biases.append(param)
            else:
                not_biases.append(param)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                lr=lr, momentum=momentum, weight_decay=weight_decay)
else:
    # Keep `checkpoint` bound to the path and give the loaded state its own name, so that this
    # cell can be re-run without calling torch.load() on an already-loaded dict.
    # NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
    # map_location lets a checkpoint saved on one device be restored on the current one.
    checkpoint_state = torch.load(checkpoint, map_location=device)
    start_epoch = checkpoint_state['epoch'] + 1  # resume with the epoch after the saved one
    epochs_since_improvement = checkpoint_state['epochs_since_improvement']
    best_loss = checkpoint_state['best_loss']
    print('\nLoaded checkpoint from epoch %d. Best loss so far is %.3f.\n' % (checkpoint_state['epoch'], best_loss))
    model = checkpoint_state['model']
    optimizer = checkpoint_state['optimizer']

# Move to default device
model = model.to(device)
criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

# Custom dataloaders; the dataset's own collate function handles the varying number of objects per image
train_dataset = PascalVOCDataset(data_folder, 'train', keep_difficult)
val_dataset = PascalVOCDataset(data_folder, 'test', keep_difficult)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                           collate_fn=train_dataset.collate_fn, num_workers=workers,
                                           pin_memory=True)
# The validation loss is an average over the whole set, so shuffling buys nothing;
# a fixed order keeps the per-batch status lines comparable between epochs.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                         collate_fn=val_dataset.collate_fn, num_workers=workers,
                                         pin_memory=True)

# Epochs
for epoch in range(start_epoch, epochs):
    # One epoch's training
    train(train_loader=train_loader,
          model=model,
          criterion=criterion,
          optimizer=optimizer,
          epoch=epoch)

    # One epoch's validation
    val_loss = validate(val_loader=val_loader,
                        model=model,
                        criterion=criterion)

    # Did validation loss improve?
    is_best = val_loss < best_loss
    best_loss = min(val_loss, best_loss)

    if not is_best:
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))

    else:
        epochs_since_improvement = 0

    # Save checkpoint
    save_checkpoint(epoch, epochs_since_improvement, model, optimizer, val_loss, best_loss, is_best)


Loaded base model with pre-trained weights





Epoch: [0][0/2069]	Batch Time 3.437 (3.437)	Data Time 0.380 (0.380)	Loss 23.7883 (23.7883)	
Epoch: [0][200/2069]	Batch Time 0.220 (0.236)	Data Time 0.000 (0.012)	Loss 9.0279 (11.7826)	
Epoch: [0][400/2069]	Batch Time 0.219 (0.225)	Data Time 0.000 (0.010)	Loss 6.2759 (9.2286)	
Epoch: [0][600/2069]	Batch Time 0.197 (0.221)	Data Time 0.000 (0.008)	Loss 6.0422 (8.1862)	
Epoch: [0][800/2069]	Batch Time 0.184 (0.217)	Data Time 0.000 (0.007)	Loss 6.2347 (7.6520)	
Epoch: [0][1000/2069]	Batch Time 0.219 (0.216)	Data Time 0.000 (0.007)	Loss 5.9983 (7.3166)	
Epoch: [0][1200/2069]	Batch Time 0.221 (0.216)	Data Time 0.001 (0.007)	Loss 5.8123 (7.0860)	
Epoch: [0][1400/2069]	Batch Time 0.198 (0.215)	Data Time 0.000 (0.007)	Loss 5.8464 (6.8994)	
Epoch: [0][1600/2069]	Batch Time 0.254 (0.214)	Data Time 0.000 (0.007)	Loss 5.8728 (6.7563)	
Epoch: [0][1800/2069]	Batch Time 0.210 (0.214)	Data Time 0.000 (0.006)	Loss 5.7810 (6.6353)	
Epoch: [0][2000/2069]	Batch Time 0.213 (0.213)	Data Time 0.011 (0.006)	Los

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Epoch: [1][0/2069]	Batch Time 0.621 (0.621)	Data Time 0.423 (0.423)	Loss 5.6997 (5.6997)	
Epoch: [1][200/2069]	Batch Time 0.215 (0.212)	Data Time 0.000 (0.008)	Loss 5.0802 (5.3863)	
Epoch: [1][400/2069]	Batch Time 0.196 (0.211)	Data Time 0.000 (0.007)	Loss 4.9441 (5.3654)	
Epoch: [1][600/2069]	Batch Time 0.254 (0.210)	Data Time 0.000 (0.005)	Loss 4.9014 (5.3048)	
Epoch: [1][800/2069]	Batch Time 0.190 (0.210)	Data Time 0.000 (0.006)	Loss 4.9474 (5.2668)	
Epoch: [1][1000/2069]	Batch Time 0.200 (0.210)	Data Time 0.000 (0.005)	Loss 4.5186 (5.2210)	
Epoch: [1][1200/2069]	Batch Time 0.187 (0.210)	Data Time 0.000 (0.005)	Loss 5.0104 (5.1703)	
Epoch: [1][1400/2069]	Batch Time 0.276 (0.209)	Data Time 0.000 (0.005)	Loss 4.9228 (5.1284)	
Epoch: [1][1600/2069]	Batch Time 0.190 (0.209)	Data Time 0.000 (0.005)	Loss 4.3487 (5.0928)	
Epoch: [1][1800/2069]	Batch Time 0.205 (0.209)	Data Time 0.001 (0.005)	Loss 3.9060 (5.0499)	
Epoch: [1][2000/2069]	Batch Time 0.194 (0.209)	Data Time 0.000 (0.005)	Loss 4

Epoch: [7][1400/2069]	Batch Time 0.252 (0.211)	Data Time 0.000 (0.007)	Loss 3.1319 (3.4572)	
Epoch: [7][1600/2069]	Batch Time 0.224 (0.211)	Data Time 0.001 (0.006)	Loss 4.3825 (3.4573)	
Epoch: [7][1800/2069]	Batch Time 0.216 (0.210)	Data Time 0.000 (0.006)	Loss 4.2022 (3.4544)	
Epoch: [7][2000/2069]	Batch Time 0.264 (0.210)	Data Time 0.008 (0.006)	Loss 3.7908 (3.4547)	
[0/619]	Batch Time 0.351 (0.351)	Loss 3.4906 (3.4906)	
[200/619]	Batch Time 0.075 (0.099)	Loss 3.7280 (3.3473)	
[400/619]	Batch Time 0.093 (0.098)	Loss 3.8881 (3.3653)	
[600/619]	Batch Time 0.144 (0.099)	Loss 3.1655 (3.3830)	

 * LOSS - 3.380

Epoch: [8][0/2069]	Batch Time 1.035 (1.035)	Data Time 0.808 (0.808)	Loss 3.8596 (3.8596)	
Epoch: [8][200/2069]	Batch Time 0.271 (0.214)	Data Time 0.000 (0.010)	Loss 3.2532 (3.4160)	
Epoch: [8][400/2069]	Batch Time 0.189 (0.213)	Data Time 0.001 (0.009)	Loss 2.7862 (3.3991)	
Epoch: [8][600/2069]	Batch Time 0.217 (0.211)	Data Time 0.000 (0.007)	Loss 3.6565 (3.4027)	
Epoch: [8][800/206


 * LOSS - 3.182

Epoch: [14][0/2069]	Batch Time 0.778 (0.778)	Data Time 0.551 (0.551)	Loss 3.4096 (3.4096)	
Epoch: [14][200/2069]	Batch Time 0.246 (0.217)	Data Time 0.019 (0.012)	Loss 3.2857 (3.0852)	
Epoch: [14][400/2069]	Batch Time 0.185 (0.213)	Data Time 0.000 (0.008)	Loss 3.2123 (3.1453)	
Epoch: [14][600/2069]	Batch Time 0.223 (0.211)	Data Time 0.000 (0.007)	Loss 3.2412 (3.1268)	
Epoch: [14][800/2069]	Batch Time 0.237 (0.212)	Data Time 0.000 (0.007)	Loss 3.4148 (3.1205)	
Epoch: [14][1000/2069]	Batch Time 0.198 (0.211)	Data Time 0.001 (0.006)	Loss 2.3915 (3.1080)	
Epoch: [14][1200/2069]	Batch Time 0.218 (0.210)	Data Time 0.006 (0.005)	Loss 2.6437 (3.1086)	
Epoch: [14][1400/2069]	Batch Time 0.206 (0.210)	Data Time 0.000 (0.006)	Loss 3.0535 (3.0955)	
Epoch: [14][1600/2069]	Batch Time 0.190 (0.210)	Data Time 0.000 (0.005)	Loss 3.9938 (3.0927)	
Epoch: [14][1800/2069]	Batch Time 0.205 (0.209)	Data Time 0.000 (0.005)	Loss 2.7291 (3.0953)	
Epoch: [14][2000/2069]	Batch Time 0.245 (0.209)	D

Epoch: [20][1000/2069]	Batch Time 0.205 (0.210)	Data Time 0.000 (0.005)	Loss 2.5871 (2.9744)	
Epoch: [20][1200/2069]	Batch Time 0.186 (0.210)	Data Time 0.000 (0.005)	Loss 3.3029 (2.9687)	
Epoch: [20][1400/2069]	Batch Time 0.227 (0.210)	Data Time 0.008 (0.005)	Loss 3.3744 (2.9691)	
Epoch: [20][1600/2069]	Batch Time 0.281 (0.210)	Data Time 0.024 (0.005)	Loss 2.3116 (2.9741)	
Epoch: [20][1800/2069]	Batch Time 0.208 (0.209)	Data Time 0.000 (0.005)	Loss 2.7886 (2.9719)	
Epoch: [20][2000/2069]	Batch Time 0.186 (0.209)	Data Time 0.001 (0.005)	Loss 2.7809 (2.9746)	
[0/619]	Batch Time 0.311 (0.311)	Loss 3.3781 (3.3781)	
[200/619]	Batch Time 0.096 (0.101)	Loss 2.4129 (3.0226)	
[400/619]	Batch Time 0.077 (0.099)	Loss 2.9692 (3.0204)	
[600/619]	Batch Time 0.086 (0.099)	Loss 3.2242 (3.0301)	

 * LOSS - 3.033

Epoch: [21][0/2069]	Batch Time 0.600 (0.600)	Data Time 0.369 (0.369)	Loss 1.6461 (1.6461)	
Epoch: [21][200/2069]	Batch Time 0.183 (0.212)	Data Time 0.000 (0.010)	Loss 2.8153 (2.9438)	
Epoch: [

Epoch: [26][2000/2069]	Batch Time 0.216 (0.208)	Data Time 0.000 (0.004)	Loss 2.7130 (2.8809)	
[0/619]	Batch Time 0.361 (0.361)	Loss 2.7170 (2.7170)	
[200/619]	Batch Time 0.116 (0.100)	Loss 2.8028 (2.9630)	
[400/619]	Batch Time 0.095 (0.098)	Loss 3.4334 (2.9935)	
[600/619]	Batch Time 0.102 (0.098)	Loss 3.0884 (3.0047)	

 * LOSS - 3.006


Epochs since last improvement: 3

Epoch: [27][0/2069]	Batch Time 0.734 (0.734)	Data Time 0.486 (0.486)	Loss 3.2877 (3.2877)	
Epoch: [27][200/2069]	Batch Time 0.204 (0.215)	Data Time 0.000 (0.009)	Loss 3.0807 (2.8519)	
Epoch: [27][400/2069]	Batch Time 0.194 (0.212)	Data Time 0.000 (0.008)	Loss 2.8100 (2.8509)	
Epoch: [27][600/2069]	Batch Time 0.184 (0.211)	Data Time 0.000 (0.007)	Loss 2.7613 (2.8515)	
Epoch: [27][800/2069]	Batch Time 0.186 (0.211)	Data Time 0.001 (0.007)	Loss 2.3606 (2.8407)	
Epoch: [27][1000/2069]	Batch Time 0.182 (0.211)	Data Time 0.000 (0.006)	Loss 2.2900 (2.8512)	
Epoch: [27][1200/2069]	Batch Time 0.189 (0.211)	Data Time 0.000 (0.006

Epoch: [33][200/2069]	Batch Time 0.214 (0.215)	Data Time 0.000 (0.011)	Loss 2.4178 (2.7505)	
Epoch: [33][400/2069]	Batch Time 0.186 (0.212)	Data Time 0.000 (0.008)	Loss 2.0415 (2.7756)	
Epoch: [33][600/2069]	Batch Time 0.182 (0.211)	Data Time 0.000 (0.007)	Loss 2.8800 (2.7748)	
Epoch: [33][800/2069]	Batch Time 0.200 (0.210)	Data Time 0.000 (0.006)	Loss 2.7349 (2.7856)	
Epoch: [33][1000/2069]	Batch Time 0.189 (0.210)	Data Time 0.005 (0.005)	Loss 2.4286 (2.7842)	
Epoch: [33][1200/2069]	Batch Time 0.215 (0.210)	Data Time 0.000 (0.005)	Loss 2.8325 (2.7786)	
Epoch: [33][1400/2069]	Batch Time 0.204 (0.209)	Data Time 0.000 (0.005)	Loss 3.1938 (2.7809)	
Epoch: [33][1600/2069]	Batch Time 0.265 (0.209)	Data Time 0.080 (0.005)	Loss 2.3658 (2.7891)	
Epoch: [33][1800/2069]	Batch Time 0.198 (0.209)	Data Time 0.000 (0.005)	Loss 3.4780 (2.7940)	
Epoch: [33][2000/2069]	Batch Time 0.195 (0.209)	Data Time 0.009 (0.005)	Loss 2.4050 (2.7943)	
[0/619]	Batch Time 0.353 (0.353)	Loss 2.6841 (2.6841)	
[200/619]

Epoch: [39][1400/2069]	Batch Time 0.225 (0.210)	Data Time 0.000 (0.005)	Loss 2.4483 (2.7125)	
Epoch: [39][1600/2069]	Batch Time 0.204 (0.210)	Data Time 0.000 (0.005)	Loss 2.9092 (2.7210)	
Epoch: [39][1800/2069]	Batch Time 0.247 (0.210)	Data Time 0.024 (0.005)	Loss 2.8761 (2.7237)	
Epoch: [39][2000/2069]	Batch Time 0.213 (0.210)	Data Time 0.005 (0.005)	Loss 2.3563 (2.7227)	
[0/619]	Batch Time 0.322 (0.322)	Loss 2.5023 (2.5023)	
[200/619]	Batch Time 0.078 (0.098)	Loss 2.9264 (2.8647)	
[400/619]	Batch Time 0.096 (0.098)	Loss 3.2868 (2.8845)	
[600/619]	Batch Time 0.101 (0.098)	Loss 2.2081 (2.8974)	

 * LOSS - 2.898


Epochs since last improvement: 1

Epoch: [40][0/2069]	Batch Time 0.844 (0.844)	Data Time 0.651 (0.651)	Loss 2.9060 (2.9060)	
Epoch: [40][200/2069]	Batch Time 0.182 (0.218)	Data Time 0.000 (0.013)	Loss 3.1590 (2.7281)	
Epoch: [40][400/2069]	Batch Time 0.216 (0.214)	Data Time 0.009 (0.010)	Loss 3.0242 (2.7217)	
Epoch: [40][600/2069]	Batch Time 0.179 (0.212)	Data Time 0.000 (0.00

[200/619]	Batch Time 0.096 (0.101)	Loss 3.2928 (2.8963)	
[400/619]	Batch Time 0.087 (0.098)	Loss 2.7248 (2.9496)	
[600/619]	Batch Time 0.074 (0.098)	Loss 3.0967 (2.9342)	

 * LOSS - 2.937


Epochs since last improvement: 4

Epoch: [46][0/2069]	Batch Time 0.680 (0.680)	Data Time 0.443 (0.443)	Loss 2.4602 (2.4602)	
Epoch: [46][200/2069]	Batch Time 0.214 (0.215)	Data Time 0.000 (0.010)	Loss 2.0735 (2.7094)	
Epoch: [46][400/2069]	Batch Time 0.224 (0.214)	Data Time 0.008 (0.008)	Loss 2.9422 (2.6941)	
Epoch: [46][600/2069]	Batch Time 0.214 (0.212)	Data Time 0.000 (0.006)	Loss 2.3329 (2.6848)	
Epoch: [46][800/2069]	Batch Time 0.223 (0.212)	Data Time 0.009 (0.007)	Loss 2.4883 (2.7016)	
Epoch: [46][1000/2069]	Batch Time 0.187 (0.211)	Data Time 0.000 (0.006)	Loss 2.3434 (2.6867)	
Epoch: [46][1200/2069]	Batch Time 0.195 (0.211)	Data Time 0.002 (0.006)	Loss 3.1697 (2.6755)	
Epoch: [46][1400/2069]	Batch Time 0.197 (0.211)	Data Time 0.000 (0.007)	Loss 2.4696 (2.6719)	
Epoch: [46][1600/2069]	Batch Ti

Epoch: [52][400/2069]	Batch Time 0.189 (0.211)	Data Time 0.000 (0.007)	Loss 2.4949 (2.5774)	
Epoch: [52][600/2069]	Batch Time 0.210 (0.210)	Data Time 0.000 (0.006)	Loss 2.6079 (2.5818)	
Epoch: [52][800/2069]	Batch Time 0.302 (0.210)	Data Time 0.112 (0.006)	Loss 2.1611 (2.5942)	
Epoch: [52][1000/2069]	Batch Time 0.184 (0.210)	Data Time 0.000 (0.005)	Loss 3.3192 (2.6006)	
Epoch: [52][1200/2069]	Batch Time 0.216 (0.210)	Data Time 0.000 (0.005)	Loss 3.1524 (2.6137)	
Epoch: [52][1400/2069]	Batch Time 0.197 (0.210)	Data Time 0.001 (0.005)	Loss 2.3263 (2.6238)	
Epoch: [52][1600/2069]	Batch Time 0.199 (0.210)	Data Time 0.000 (0.005)	Loss 2.5147 (2.6266)	
Epoch: [52][1800/2069]	Batch Time 0.188 (0.210)	Data Time 0.000 (0.005)	Loss 2.1082 (2.6296)	
Epoch: [52][2000/2069]	Batch Time 0.189 (0.209)	Data Time 0.002 (0.005)	Loss 2.4513 (2.6328)	
[0/619]	Batch Time 0.313 (0.313)	Loss 3.2043 (3.2043)	
[200/619]	Batch Time 0.075 (0.102)	Loss 2.8111 (2.8982)	
[400/619]	Batch Time 0.141 (0.098)	Loss 2.500

Epoch: [58][1400/2069]	Batch Time 0.189 (0.211)	Data Time 0.000 (0.006)	Loss 2.1870 (2.5934)	
Epoch: [58][1600/2069]	Batch Time 0.182 (0.210)	Data Time 0.000 (0.006)	Loss 2.3255 (2.5937)	
Epoch: [58][1800/2069]	Batch Time 0.220 (0.210)	Data Time 0.000 (0.005)	Loss 3.6741 (2.5967)	
Epoch: [58][2000/2069]	Batch Time 0.182 (0.210)	Data Time 0.000 (0.005)	Loss 3.1867 (2.5976)	
[0/619]	Batch Time 0.269 (0.269)	Loss 3.0573 (3.0573)	
[200/619]	Batch Time 0.074 (0.101)	Loss 2.1786 (2.8539)	
[400/619]	Batch Time 0.082 (0.099)	Loss 2.7116 (2.8180)	
[600/619]	Batch Time 0.099 (0.098)	Loss 2.7779 (2.8251)	

 * LOSS - 2.828


Epochs since last improvement: 1

Epoch: [59][0/2069]	Batch Time 0.780 (0.780)	Data Time 0.562 (0.562)	Loss 2.4404 (2.4404)	
Epoch: [59][200/2069]	Batch Time 0.236 (0.214)	Data Time 0.008 (0.009)	Loss 2.0565 (2.5620)	
Epoch: [59][400/2069]	Batch Time 0.405 (0.211)	Data Time 0.184 (0.007)	Loss 2.6324 (2.5821)	
Epoch: [59][600/2069]	Batch Time 0.210 (0.211)	Data Time 0.000 (0.00

[200/619]	Batch Time 0.099 (0.098)	Loss 2.7512 (2.7727)	
[400/619]	Batch Time 0.103 (0.097)	Loss 2.9948 (2.8012)	
[600/619]	Batch Time 0.146 (0.098)	Loss 2.2112 (2.8114)	

 * LOSS - 2.810


Epochs since last improvement: 2

Epoch: [65][0/2069]	Batch Time 0.589 (0.589)	Data Time 0.386 (0.386)	Loss 2.9843 (2.9843)	
Epoch: [65][200/2069]	Batch Time 0.182 (0.209)	Data Time 0.000 (0.006)	Loss 2.4872 (2.5428)	
Epoch: [65][400/2069]	Batch Time 0.186 (0.209)	Data Time 0.000 (0.006)	Loss 2.8068 (2.5444)	
Epoch: [65][600/2069]	Batch Time 0.204 (0.209)	Data Time 0.002 (0.006)	Loss 2.6138 (2.5690)	
Epoch: [65][800/2069]	Batch Time 0.237 (0.211)	Data Time 0.000 (0.006)	Loss 1.9159 (2.5731)	
Epoch: [65][1000/2069]	Batch Time 0.209 (0.211)	Data Time 0.008 (0.006)	Loss 3.1545 (2.5759)	
Epoch: [65][1200/2069]	Batch Time 0.226 (0.210)	Data Time 0.010 (0.006)	Loss 2.7582 (2.5755)	
Epoch: [65][1400/2069]	Batch Time 0.228 (0.210)	Data Time 0.008 (0.006)	Loss 2.6333 (2.5703)	
Epoch: [65][1600/2069]	Batch Ti

Epoch: [71][400/2069]	Batch Time 0.200 (0.209)	Data Time 0.001 (0.006)	Loss 2.2253 (2.5315)	
Epoch: [71][600/2069]	Batch Time 0.217 (0.208)	Data Time 0.000 (0.005)	Loss 2.1230 (2.5516)	
Epoch: [71][800/2069]	Batch Time 0.235 (0.209)	Data Time 0.002 (0.006)	Loss 2.8402 (2.5422)	
Epoch: [71][1000/2069]	Batch Time 0.216 (0.209)	Data Time 0.000 (0.005)	Loss 2.9080 (2.5333)	
Epoch: [71][1200/2069]	Batch Time 0.200 (0.209)	Data Time 0.000 (0.005)	Loss 2.1748 (2.5346)	
Epoch: [71][1400/2069]	Batch Time 0.198 (0.209)	Data Time 0.000 (0.005)	Loss 2.6428 (2.5385)	
Epoch: [71][1600/2069]	Batch Time 0.181 (0.209)	Data Time 0.000 (0.005)	Loss 3.0054 (2.5277)	
Epoch: [71][1800/2069]	Batch Time 0.193 (0.209)	Data Time 0.000 (0.005)	Loss 2.6892 (2.5329)	
Epoch: [71][2000/2069]	Batch Time 0.182 (0.209)	Data Time 0.000 (0.005)	Loss 2.4276 (2.5258)	
[0/619]	Batch Time 0.295 (0.295)	Loss 2.6930 (2.6930)	
[200/619]	Batch Time 0.080 (0.101)	Loss 2.4531 (2.9456)	
[400/619]	Batch Time 0.095 (0.100)	Loss 2.283

Epoch: [77][1400/2069]	Batch Time 0.189 (0.210)	Data Time 0.000 (0.005)	Loss 2.1834 (2.5008)	
Epoch: [77][1600/2069]	Batch Time 0.225 (0.210)	Data Time 0.000 (0.005)	Loss 2.3152 (2.5109)	
Epoch: [77][1800/2069]	Batch Time 0.206 (0.210)	Data Time 0.001 (0.005)	Loss 2.1944 (2.5087)	
Epoch: [77][2000/2069]	Batch Time 0.208 (0.210)	Data Time 0.000 (0.005)	Loss 1.3340 (2.5037)	
[0/619]	Batch Time 0.287 (0.287)	Loss 2.8778 (2.8778)	
[200/619]	Batch Time 0.100 (0.100)	Loss 2.7999 (2.7606)	
[400/619]	Batch Time 0.110 (0.098)	Loss 2.4011 (2.7825)	
[600/619]	Batch Time 0.107 (0.097)	Loss 2.5640 (2.7753)	

 * LOSS - 2.780


Epochs since last improvement: 1

Epoch: [78][0/2069]	Batch Time 0.723 (0.723)	Data Time 0.470 (0.470)	Loss 2.4627 (2.4627)	
Epoch: [78][200/2069]	Batch Time 0.281 (0.217)	Data Time 0.000 (0.011)	Loss 2.0910 (2.4531)	
Epoch: [78][400/2069]	Batch Time 0.386 (0.215)	Data Time 0.172 (0.010)	Loss 2.6476 (2.4511)	
Epoch: [78][600/2069]	Batch Time 0.191 (0.214)	Data Time 0.000 (0.00

[200/619]	Batch Time 0.090 (0.099)	Loss 2.1483 (2.7977)	
[400/619]	Batch Time 0.077 (0.098)	Loss 3.0048 (2.7711)	
[600/619]	Batch Time 0.085 (0.098)	Loss 3.0244 (2.7849)	

 * LOSS - 2.784


Epochs since last improvement: 2

Epoch: [84][0/2069]	Batch Time 0.894 (0.894)	Data Time 0.676 (0.676)	Loss 2.1791 (2.1791)	
Epoch: [84][200/2069]	Batch Time 0.204 (0.218)	Data Time 0.000 (0.014)	Loss 2.2233 (2.4394)	
Epoch: [84][400/2069]	Batch Time 0.242 (0.214)	Data Time 0.024 (0.009)	Loss 2.3191 (2.4762)	
Epoch: [84][600/2069]	Batch Time 0.182 (0.214)	Data Time 0.000 (0.009)	Loss 2.3649 (2.4856)	
Epoch: [84][800/2069]	Batch Time 0.188 (0.212)	Data Time 0.000 (0.007)	Loss 2.4443 (2.4843)	
Epoch: [84][1000/2069]	Batch Time 0.197 (0.211)	Data Time 0.000 (0.007)	Loss 2.4328 (2.4960)	
Epoch: [84][1200/2069]	Batch Time 0.208 (0.211)	Data Time 0.005 (0.006)	Loss 2.1522 (2.4938)	
Epoch: [84][1400/2069]	Batch Time 0.213 (0.211)	Data Time 0.001 (0.006)	Loss 2.2677 (2.4964)	
Epoch: [84][1600/2069]	Batch Ti

Epoch: [90][600/2069]	Batch Time 0.188 (0.211)	Data Time 0.001 (0.006)	Loss 2.6608 (2.4529)	
Epoch: [90][800/2069]	Batch Time 0.192 (0.211)	Data Time 0.000 (0.005)	Loss 1.9819 (2.4672)	
Epoch: [90][1000/2069]	Batch Time 0.237 (0.211)	Data Time 0.000 (0.005)	Loss 2.5754 (2.4670)	
Epoch: [90][1200/2069]	Batch Time 0.190 (0.211)	Data Time 0.000 (0.005)	Loss 2.9493 (2.4688)	
Epoch: [90][1400/2069]	Batch Time 0.197 (0.211)	Data Time 0.000 (0.006)	Loss 2.2245 (2.4559)	
Epoch: [90][1600/2069]	Batch Time 0.191 (0.211)	Data Time 0.000 (0.006)	Loss 2.7260 (2.4608)	
Epoch: [90][1800/2069]	Batch Time 0.207 (0.211)	Data Time 0.000 (0.006)	Loss 2.1976 (2.4659)	
Epoch: [90][2000/2069]	Batch Time 0.236 (0.211)	Data Time 0.000 (0.006)	Loss 2.5628 (2.4686)	
[0/619]	Batch Time 0.293 (0.293)	Loss 2.5316 (2.5316)	
[200/619]	Batch Time 0.107 (0.099)	Loss 2.6031 (2.7493)	
[400/619]	Batch Time 0.099 (0.099)	Loss 2.8701 (2.7401)	
[600/619]	Batch Time 0.113 (0.098)	Loss 3.6434 (2.7456)	

 * LOSS - 2.748


Epoch

Epoch: [96][1400/2069]	Batch Time 0.206 (0.210)	Data Time 0.000 (0.006)	Loss 2.1232 (2.4342)	
Epoch: [96][1600/2069]	Batch Time 0.243 (0.210)	Data Time 0.000 (0.005)	Loss 2.8255 (2.4333)	
Epoch: [96][1800/2069]	Batch Time 0.182 (0.210)	Data Time 0.000 (0.006)	Loss 3.1054 (2.4393)	
Epoch: [96][2000/2069]	Batch Time 0.189 (0.210)	Data Time 0.000 (0.005)	Loss 2.9517 (2.4413)	
[0/619]	Batch Time 0.322 (0.322)	Loss 2.3908 (2.3908)	
[200/619]	Batch Time 0.091 (0.096)	Loss 2.0767 (2.7073)	
[400/619]	Batch Time 0.100 (0.096)	Loss 2.8907 (2.7291)	
[600/619]	Batch Time 0.104 (0.097)	Loss 2.1967 (2.7194)	

 * LOSS - 2.716

Epoch: [97][0/2069]	Batch Time 0.820 (0.820)	Data Time 0.605 (0.605)	Loss 2.3293 (2.3293)	
Epoch: [97][200/2069]	Batch Time 0.218 (0.214)	Data Time 0.000 (0.007)	Loss 2.4345 (2.4531)	
Epoch: [97][400/2069]	Batch Time 0.203 (0.212)	Data Time 0.011 (0.006)	Loss 2.1365 (2.4472)	
Epoch: [97][600/2069]	Batch Time 0.243 (0.213)	Data Time 0.000 (0.007)	Loss 2.4603 (2.4561)	
Epoch: [97

[200/619]	Batch Time 0.087 (0.099)	Loss 2.3098 (2.7550)	
[400/619]	Batch Time 0.076 (0.098)	Loss 2.8834 (2.7542)	
[600/619]	Batch Time 0.110 (0.099)	Loss 2.5485 (2.7492)	

 * LOSS - 2.751


Epochs since last improvement: 1

Epoch: [103][0/2069]	Batch Time 0.953 (0.953)	Data Time 0.724 (0.724)	Loss 2.2662 (2.2662)	
Epoch: [103][200/2069]	Batch Time 0.182 (0.210)	Data Time 0.000 (0.009)	Loss 2.1555 (2.4038)	
Epoch: [103][400/2069]	Batch Time 0.185 (0.209)	Data Time 0.000 (0.007)	Loss 2.0331 (2.4255)	
Epoch: [103][600/2069]	Batch Time 0.182 (0.209)	Data Time 0.000 (0.005)	Loss 2.0050 (2.4301)	
Epoch: [103][800/2069]	Batch Time 0.204 (0.209)	Data Time 0.000 (0.005)	Loss 1.9994 (2.4327)	
Epoch: [103][1000/2069]	Batch Time 0.188 (0.209)	Data Time 0.000 (0.005)	Loss 2.2845 (2.4420)	
Epoch: [103][1200/2069]	Batch Time 0.189 (0.210)	Data Time 0.000 (0.005)	Loss 2.1691 (2.4386)	
Epoch: [103][1400/2069]	Batch Time 0.205 (0.210)	Data Time 0.000 (0.006)	Loss 2.2712 (2.4430)	
Epoch: [103][1600/2069]

Epoch: [109][200/2069]	Batch Time 0.221 (0.216)	Data Time 0.008 (0.010)	Loss 2.7426 (2.4999)	
Epoch: [109][400/2069]	Batch Time 0.190 (0.213)	Data Time 0.000 (0.008)	Loss 2.5648 (2.4893)	
Epoch: [109][600/2069]	Batch Time 0.243 (0.211)	Data Time 0.008 (0.006)	Loss 2.0880 (2.4769)	
Epoch: [109][800/2069]	Batch Time 0.241 (0.211)	Data Time 0.007 (0.006)	Loss 2.0967 (2.4548)	
Epoch: [109][1000/2069]	Batch Time 0.239 (0.210)	Data Time 0.000 (0.005)	Loss 1.7241 (2.4481)	
Epoch: [109][1200/2069]	Batch Time 0.234 (0.209)	Data Time 0.001 (0.005)	Loss 3.1251 (2.4433)	
Epoch: [109][1400/2069]	Batch Time 0.185 (0.209)	Data Time 0.000 (0.005)	Loss 1.8289 (2.4369)	
Epoch: [109][1600/2069]	Batch Time 0.200 (0.209)	Data Time 0.000 (0.005)	Loss 2.4959 (2.4432)	
Epoch: [109][1800/2069]	Batch Time 0.233 (0.210)	Data Time 0.000 (0.005)	Loss 1.5150 (2.4386)	
Epoch: [109][2000/2069]	Batch Time 0.193 (0.209)	Data Time 0.000 (0.005)	Loss 2.7162 (2.4392)	
[0/619]	Batch Time 0.332 (0.332)	Loss 2.6451 (2.6451)	

Epoch: [115][1000/2069]	Batch Time 0.231 (0.211)	Data Time 0.000 (0.007)	Loss 2.8993 (2.3951)	
Epoch: [115][1200/2069]	Batch Time 0.252 (0.211)	Data Time 0.000 (0.006)	Loss 2.5391 (2.3977)	
Epoch: [115][1400/2069]	Batch Time 0.257 (0.212)	Data Time 0.014 (0.006)	Loss 2.6110 (2.4005)	
Epoch: [115][1600/2069]	Batch Time 0.195 (0.212)	Data Time 0.010 (0.006)	Loss 2.2962 (2.3931)	
Epoch: [115][1800/2069]	Batch Time 0.195 (0.211)	Data Time 0.000 (0.006)	Loss 2.1462 (2.3967)	
Epoch: [115][2000/2069]	Batch Time 0.236 (0.211)	Data Time 0.008 (0.005)	Loss 2.9873 (2.4002)	
[0/619]	Batch Time 0.307 (0.307)	Loss 3.0386 (3.0386)	
[200/619]	Batch Time 0.099 (0.099)	Loss 2.5344 (2.7468)	
[400/619]	Batch Time 0.073 (0.100)	Loss 2.8858 (2.7359)	
[600/619]	Batch Time 0.097 (0.099)	Loss 2.9521 (2.7524)	

 * LOSS - 2.757


Epochs since last improvement: 14

Epoch: [116][0/2069]	Batch Time 0.721 (0.721)	Data Time 0.490 (0.490)	Loss 2.9905 (2.9905)	
Epoch: [116][200/2069]	Batch Time 0.194 (0.214)	Data Time 

Epoch: [121][1600/2069]	Batch Time 0.217 (0.211)	Data Time 0.000 (0.007)	Loss 2.6311 (2.4008)	
Epoch: [121][1800/2069]	Batch Time 0.378 (0.211)	Data Time 0.185 (0.007)	Loss 2.6770 (2.4028)	
Epoch: [121][2000/2069]	Batch Time 0.197 (0.211)	Data Time 0.001 (0.007)	Loss 2.1012 (2.4019)	
[0/619]	Batch Time 0.306 (0.306)	Loss 2.2745 (2.2745)	
[200/619]	Batch Time 0.107 (0.095)	Loss 3.1160 (2.7534)	
[400/619]	Batch Time 0.081 (0.097)	Loss 2.2363 (2.7444)	
[600/619]	Batch Time 0.096 (0.097)	Loss 2.3838 (2.7570)	

 * LOSS - 2.763


Epochs since last improvement: 20

Epoch: [122][0/2069]	Batch Time 1.045 (1.045)	Data Time 0.827 (0.827)	Loss 2.5297 (2.5297)	
Epoch: [122][200/2069]	Batch Time 0.193 (0.213)	Data Time 0.000 (0.012)	Loss 2.4650 (2.3588)	
Epoch: [122][400/2069]	Batch Time 0.197 (0.211)	Data Time 0.000 (0.008)	Loss 2.6916 (2.3536)	
Epoch: [122][600/2069]	Batch Time 0.208 (0.210)	Data Time 0.000 (0.007)	Loss 2.5262 (2.3585)	
Epoch: [122][800/2069]	Batch Time 0.213 (0.211)	Data Time 0.0

[200/619]	Batch Time 0.099 (0.098)	Loss 3.3270 (2.7823)	
[400/619]	Batch Time 0.082 (0.098)	Loss 2.1543 (2.7462)	
[600/619]	Batch Time 0.120 (0.098)	Loss 2.3994 (2.7426)	

 * LOSS - 2.744


Epochs since last improvement: 2

Epoch: [128][0/2069]	Batch Time 0.695 (0.695)	Data Time 0.504 (0.504)	Loss 2.0926 (2.0926)	
Epoch: [128][200/2069]	Batch Time 0.198 (0.217)	Data Time 0.004 (0.012)	Loss 3.3527 (2.3582)	
Epoch: [128][400/2069]	Batch Time 0.188 (0.214)	Data Time 0.000 (0.009)	Loss 2.0182 (2.3544)	
Epoch: [128][600/2069]	Batch Time 0.208 (0.214)	Data Time 0.000 (0.009)	Loss 2.3359 (2.3648)	
Epoch: [128][800/2069]	Batch Time 0.205 (0.213)	Data Time 0.000 (0.008)	Loss 1.6938 (2.3811)	
Epoch: [128][1000/2069]	Batch Time 0.186 (0.213)	Data Time 0.000 (0.007)	Loss 2.5297 (2.3825)	
Epoch: [128][1200/2069]	Batch Time 0.211 (0.212)	Data Time 0.000 (0.007)	Loss 2.3603 (2.3924)	
Epoch: [128][1400/2069]	Batch Time 0.195 (0.212)	Data Time 0.000 (0.006)	Loss 1.8494 (2.3927)	
Epoch: [128][1600/2069]

Epoch: [134][400/2069]	Batch Time 0.340 (0.212)	Data Time 0.118 (0.008)	Loss 1.9245 (2.3971)	
Epoch: [134][600/2069]	Batch Time 0.226 (0.211)	Data Time 0.008 (0.007)	Loss 2.0116 (2.3845)	
Epoch: [134][800/2069]	Batch Time 0.185 (0.210)	Data Time 0.000 (0.006)	Loss 2.3829 (2.3632)	
Epoch: [134][1000/2069]	Batch Time 0.263 (0.211)	Data Time 0.000 (0.006)	Loss 2.1292 (2.3644)	
Epoch: [134][1200/2069]	Batch Time 0.210 (0.210)	Data Time 0.008 (0.006)	Loss 2.4955 (2.3638)	
Epoch: [134][1400/2069]	Batch Time 0.193 (0.210)	Data Time 0.000 (0.006)	Loss 2.3252 (2.3602)	
Epoch: [134][1600/2069]	Batch Time 0.247 (0.210)	Data Time 0.000 (0.005)	Loss 3.2101 (2.3599)	
Epoch: [134][1800/2069]	Batch Time 0.184 (0.210)	Data Time 0.000 (0.005)	Loss 2.1771 (2.3571)	
Epoch: [134][2000/2069]	Batch Time 0.189 (0.210)	Data Time 0.004 (0.005)	Loss 2.1386 (2.3583)	
[0/619]	Batch Time 0.302 (0.302)	Loss 3.2685 (3.2685)	
[200/619]	Batch Time 0.085 (0.101)	Loss 2.5248 (2.7479)	
[400/619]	Batch Time 0.079 (0.098)	L