In [19]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
import os
import random
import h5py
import numpy as np
import torch
from scipy import ndimage
from scipy.ndimage.interpolation import zoom
from torch.utils.data import Dataset


def random_rot_flip(image, label):
    k = np.random.randint(0, 4)
    image = np.rot90(image, k)
    label = np.rot90(label, k)
    axis = np.random.randint(0, 2)
    image = np.flip(image, axis=axis).copy()
    label = np.flip(label, axis=axis).copy()
    return image, label


def random_rotate(image, label):
    angle = np.random.randint(-20, 20)
    image = ndimage.rotate(image, angle, order=0, reshape=False)
    label = ndimage.rotate(label, angle, order=0, reshape=False)
    return image, label


class RandomGenerator(object):
    def __init__(self, output_size):
        self.output_size = output_size

    def __call__(self, sample):
        image, label = sample['image'], sample['label']

        if random.random() > 0.5:
            image, label = random_rot_flip(image, label)
        elif random.random() > 0.5:
            image, label = random_rotate(image, label)
        x, y = image.shape
        if x != self.output_size[0] or y != self.output_size[1]:
            image = zoom(image, (self.output_size[0] / x, self.output_size[1] / y), order=3)  # why not 3?
            label = zoom(label, (self.output_size[0] / x, self.output_size[1] / y), order=0)
        image = torch.from_numpy(image.astype(np.float32)).unsqueeze(0)
        label = torch.from_numpy(label.astype(np.float32))
        sample = {'image': image, 'label': label.long()}
        return sample


class Synapse_dataset(Dataset):
    def __init__(self, base_dir, list_dir, split, transform=None):
        self.transform = transform  # using transform in torch!
        self.split = split
        self.sample_list = open(os.path.join(list_dir, self.split+'.txt')).readlines()
        self.data_dir = base_dir

    def __len__(self):
        return len(self.sample_list)

    def __getitem__(self, idx):
        if self.split == "train":
            slice_name = self.sample_list[idx].strip('\n')
            data_path = os.path.join(self.data_dir, slice_name+'.npz')
            data = np.load(data_path)
            image, label = data['image'], data['label']
        else:
            vol_name = self.sample_list[idx].strip('\n')
            filepath = self.data_dir + "/{}.npy.h5".format(vol_name)
            data = h5py.File(filepath)
            image, label = data['image'][:], data['label'][:]

        sample = {'image': image, 'label': label}
        if self.transform:
            sample = self.transform(sample)
        sample['case_name'] = self.sample_list[idx].strip('\n')
        return sample

In [22]:
from os.path import join
from os.path import normpath
import platform

def pjoin(path, *paths):
    p = join(path, *paths)
    if platform.system() == "Windows":
        return normpath(p).replace('\\','/')
    else:
        return p

In [23]:
!pip install ml_collections
!pip install tensorboardX
!pip install MedPy



In [24]:
!pip install torchinfo



In [25]:
import ml_collections
import torch.nn as nn


def get_b16_config():
    """Returns the ViT-B/16 configuration."""
    config = ml_collections.ConfigDict()
    config.patches = ml_collections.ConfigDict({'size': (16, 16)})
    config.hidden_size = 768
    config.transformer = ml_collections.ConfigDict()
    config.transformer.mlp_dim = 3072
    config.transformer.num_heads = 12
    config.transformer.num_layers = 12
    config.transformer.attention_dropout_rate = 0.0
    config.transformer.dropout_rate = 0.1

    config.classifier = 'seg'
    config.representation_size = None
    config.resnet_pretrained_path = None
    config.pretrained_path = '/content/gdrive/MyDrive/Synapse/imagenet21k+imagenet2012_ViT-B_16.npz'
    config.patch_size = 16

    config.decoder_channels = (256, 128, 64, 16)
    config.n_classes = 2
    config.activation = 'softmax'
    return config

def get_r50_b16_config():
    """Returns the Resnet50 + ViT-B/16 configuration."""
    config = get_b16_config()
    config.patches.grid = (8, 8)
    config.resnet = ml_collections.ConfigDict()
    config.resnet.num_layers = (3, 4, 9)
    config.resnet.width_factor = 1

    config.classifier = 'seg'
    config.pretrained_path = '/content/gdrive/MyDrive/Synapse/imagenet21k+imagenet2012_R50+ViT-B_16.npz'
    config.decoder_channels = (256, 128, 64, 16)
    config.skip_channels = [512, 256, 64, 16]
    config.n_classes = 2
    config.n_skip = 3
    config.activation = 'softmax'

    return config




In [26]:
class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class ChannelAttention(nn.Module):
    def __init__(self, input_channels, reduction_ratio=16):
        super(ChannelAttention, self).__init__()
        self.input_channels = input_channels
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        #  https://github.com/luuuyi/CBAM.PyTorch/blob/master/model/resnet_cbam.py
        #  uses Convolutions instead of Linear
        self.MLP = nn.Sequential(
            Flatten(),
            nn.Linear(input_channels, input_channels // reduction_ratio),
            nn.ReLU(),
            nn.Linear(input_channels // reduction_ratio, input_channels)
        )

    def forward(self, x):
        # Take the input and apply average and max pooling
        avg_values = self.avg_pool(x)
        max_values = self.max_pool(x)
        out = self.MLP(avg_values) + self.MLP(max_values)
        scale = x * torch.sigmoid(out).unsqueeze(2).unsqueeze(3).expand_as(x)
        return scale


class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        padding = 3 if kernel_size == 7 else 1
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel_size, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(1)

    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        out = torch.cat([avg_out, max_out], dim=1)
        out = self.conv(out)
        out = self.bn(out)
        scale = x * torch.sigmoid(out)
        return scale


class CBAM(nn.Module):
    def __init__(self, input_channels, reduction_ratio=16, kernel_size=7):
        super(CBAM, self).__init__()
        self.channel_att = ChannelAttention(input_channels, reduction_ratio=reduction_ratio)
        self.spatial_att = SpatialAttention(kernel_size=kernel_size)

    def forward(self, x):
        out = self.channel_att(x)
        out = self.spatial_att(out)
        return out

In [27]:
from typing import Counter
import math

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from os.path import join
from os.path import normpath
import platform

def pjoin(path, *paths):
    p = join(path, *paths)
    if platform.system() == "Windows":
        return normpath(p).replace('\\','/')
    else:
        return p

def np2th(weights, conv=False):
    """Possibly convert HWIO to OIHW."""
    if conv:
        weights = weights.transpose([3, 2, 0, 1])
    return torch.from_numpy(weights)


class StdConv2d(nn.Conv2d):

    def forward(self, x):
        w = self.weight
        v, m = torch.var_mean(w, dim=[1, 2, 3], keepdim=True, unbiased=False)
        w = (w - m) / torch.sqrt(v + 1e-5)
        return F.conv2d(x, w, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)


def conv3x3(cin, cout, stride=1, groups=1, bias=False):
    return StdConv2d(cin, cout, kernel_size=3, stride=stride,
                     padding=1, bias=bias, groups=groups)


def conv1x1(cin, cout, stride=1, bias=False):
    return StdConv2d(cin, cout, kernel_size=1, stride=stride,
                     padding=0, bias=bias)


class PreActBottleneck(nn.Module):
    """Pre-activation (v2) bottleneck block.
    """

    def __init__(self, cin, cout=None, cmid=None, stride=1):
        super().__init__()
        cout = cout or cin
        cmid = cmid or cout//4

        self.gn1 = nn.GroupNorm(32, cmid, eps=1e-6)
        self.conv1 = conv1x1(cin, cmid, bias=False)
        self.CBAM9 = CBAM(cmid)

        self.gn2 = nn.GroupNorm(32, cmid, eps=1e-6)
        self.conv2 = conv3x3(cmid, cmid, stride, bias=False)  # Original code has it on conv1!!
        self.CBAM6 = CBAM(cmid)

        self.gn3 = nn.GroupNorm(32, cout, eps=1e-6)
        self.conv3 = conv1x1(cmid, cout, bias=False)
        self.CBAM7 = CBAM(cout)

        self.CBAM8 = CBAM(cout)
        self.relu = nn.ReLU(inplace=True)

        if (stride != 1 or cin != cout):
            # Projection also with pre-activation according to paper.
            self.downsample = conv1x1(cin, cout, stride, bias=False)
            self.gn_proj = nn.GroupNorm(cout, cout)

    def forward(self, x):

        # Residual branch
        residual = x
        if hasattr(self, 'downsample'):
            residual = self.downsample(x)
            residual = self.gn_proj(residual)

        # Unit's branch
        # y=self.CBAM9(x)
        y = self.relu(self.gn1(self.conv1(x)))
        # print('y1', y.shape)
        y = self.CBAM9(y)

        y = self.relu(self.gn2(self.conv2(y)))
        # print("y2",y.shape)
        y = self.CBAM6(y)

        y = self.gn3(self.conv3(y))
        y = self.CBAM7(y)
        # print("y3",y.shape)

        y = self.relu(residual + y)
        y = self.CBAM8(y)
        # print("y4",y.shape)
        return y

    def load_from(self, weights, n_block, n_unit):
        conv1_weight = np2th(weights[pjoin(n_block, n_unit, "conv1/kernel")], conv=True)
        conv2_weight = np2th(weights[pjoin(n_block, n_unit, "conv2/kernel")], conv=True)
        conv3_weight = np2th(weights[pjoin(n_block, n_unit, "conv3/kernel")], conv=True)

        gn1_weight = np2th(weights[pjoin(n_block, n_unit, "gn1/scale")])
        gn1_bias = np2th(weights[pjoin(n_block, n_unit, "gn1/bias")])

        gn2_weight = np2th(weights[pjoin(n_block, n_unit, "gn2/scale")])
        gn2_bias = np2th(weights[pjoin(n_block, n_unit, "gn2/bias")])

        gn3_weight = np2th(weights[pjoin(n_block, n_unit, "gn3/scale")])
        gn3_bias = np2th(weights[pjoin(n_block, n_unit, "gn3/bias")])

        self.conv1.weight.copy_(conv1_weight)
        self.conv2.weight.copy_(conv2_weight)
        self.conv3.weight.copy_(conv3_weight)

        self.gn1.weight.copy_(gn1_weight.view(-1))
        self.gn1.bias.copy_(gn1_bias.view(-1))

        self.gn2.weight.copy_(gn2_weight.view(-1))
        self.gn2.bias.copy_(gn2_bias.view(-1))

        self.gn3.weight.copy_(gn3_weight.view(-1))
        self.gn3.bias.copy_(gn3_bias.view(-1))

        if hasattr(self, 'downsample'):
            proj_conv_weight = np2th(weights[pjoin(n_block, n_unit, "conv_proj/kernel")], conv=True)
            proj_gn_weight = np2th(weights[pjoin(n_block, n_unit, "gn_proj/scale")])
            proj_gn_bias = np2th(weights[pjoin(n_block, n_unit, "gn_proj/bias")])

            self.downsample.weight.copy_(proj_conv_weight)
            self.gn_proj.weight.copy_(proj_gn_weight.view(-1))
            self.gn_proj.bias.copy_(proj_gn_bias.view(-1))

class ResNetV2(nn.Module):
    """Implementation of Pre-activation (v2) ResNet mode."""

    def __init__(self, block_units, width_factor):
        super().__init__()
        width = int(64 * width_factor)
        self.width = width

        self.root = nn.Sequential(OrderedDict([
            ('conv', StdConv2d(3, width, kernel_size=7, stride=2, bias=False, padding=3)),
            ('gn', nn.GroupNorm(32, width, eps=1e-6)),
            ('relu', nn.ReLU(inplace=True)),
            # ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=0))
        ]))

        self.body = nn.Sequential(OrderedDict([
            ('block1/', nn.Sequential(OrderedDict(
                [('unit1/', PreActBottleneck(cin=width, cout=width*4, cmid=width))] +
                [(f'unit{i:d}/', PreActBottleneck(cin=width*4, cout=width*4, cmid=width)) for i in range(2, block_units[0] + 1)],
                ))),
            ('block2/', nn.Sequential(OrderedDict(
                [('unit1/', PreActBottleneck(cin=width*4, cout=width*8, cmid=width*2, stride=2))] +
                [(f'unit{i:d}/', PreActBottleneck(cin=width*8, cout=width*8, cmid=width*2)) for i in range(2, block_units[1] + 1)],
                ))),
            ('block3/', nn.Sequential(OrderedDict(
                [('unit1/', PreActBottleneck(cin=width*8, cout=width*16, cmid=width*4, stride=2))] +
                [(f'unit{i:d}/', PreActBottleneck(cin=width*16, cout=width*16, cmid=width*4)) for i in range(2, block_units[2] + 1)],
                ))),
        ]))

    def forward(self, x):
        features = []
        b, c, in_size, _ = x.size()
        x = self.root(x)
        features.append(x)
        x = nn.MaxPool2d(kernel_size=3, stride=2, padding=0)(x)
        for i in range(len(self.body)-1):
            x = self.body[i](x)
            right_size = int(in_size / 4 / (i+1))
            if x.size()[2] != right_size:
                pad = right_size - x.size()[2]
                assert pad < 3 and pad > 0, "x {} should {}".format(x.size(), right_size)
                feat = torch.zeros((b, x.size()[1], right_size, right_size), device=x.device)
                feat[:, :, 0:x.size()[2], 0:x.size()[3]] = x[:]
            else:
                feat = x
            features.append(feat)
        x = self.body[-1](x)
        return x, features[::-1]

In [28]:
config = get_r50_b16_config()
resnet = ResNetV2(block_units=config.resnet.num_layers, width_factor=config.resnet.width_factor)

out = resnet(torch.zeros(1,3,224,224))
print(out[0].shape, len(out[1]))
print([y.shape for y in out[1]])

torch.Size([1, 1024, 14, 14]) 3
[torch.Size([1, 512, 28, 28]), torch.Size([1, 256, 56, 56]), torch.Size([1, 64, 112, 112])]


In [29]:
# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import logging
import math


import torch
import torch.nn as nn
import numpy as np

from torch.nn import CrossEntropyLoss, Dropout, Softmax, Linear, Conv2d, LayerNorm
from torch.nn.modules.utils import _pair
from scipy import ndimage



logger = logging.getLogger(__name__)


ATTENTION_Q = "MultiHeadDotProductAttention_1/query/"
ATTENTION_K = "MultiHeadDotProductAttention_1/key/"
ATTENTION_V = "MultiHeadDotProductAttention_1/value/"
ATTENTION_OUT = "MultiHeadDotProductAttention_1/out/"
FC_0 = "MlpBlock_3/Dense_0/"
FC_1 = "MlpBlock_3/Dense_1/"
ATTENTION_NORM = "LayerNorm_0/"
MLP_NORM = "LayerNorm_2/"


def np2th(weights, conv=False):
    """Possibly convert HWIO to OIHW."""
    if conv:
        weights = weights.transpose([3, 2, 0, 1])
    return torch.from_numpy(weights)


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish}


class Attention(nn.Module):
    def __init__(self, config, vis):
        super(Attention, self).__init__()
        self.vis = vis
        self.num_attention_heads = config.transformer["num_heads"]
        self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = Linear(config.hidden_size, self.all_head_size)
        self.key = Linear(config.hidden_size, self.all_head_size)
        self.value = Linear(config.hidden_size, self.all_head_size)

        self.out = Linear(config.hidden_size, config.hidden_size)
        self.attn_dropout = Dropout(config.transformer["attention_dropout_rate"])
        self.proj_dropout = Dropout(config.transformer["attention_dropout_rate"])

        self.softmax = Softmax(dim=-1)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_probs = self.softmax(attention_scores)
        weights = attention_probs if self.vis else None
        attention_probs = self.attn_dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.out(context_layer)
        attention_output = self.proj_dropout(attention_output)
        return attention_output, weights


class Mlp(nn.Module):
    def __init__(self, config):
        super(Mlp, self).__init__()
        self.fc1 = Linear(config.hidden_size, config.transformer["mlp_dim"])
        self.fc2 = Linear(config.transformer["mlp_dim"], config.hidden_size)
        self.act_fn = ACT2FN["gelu"]
        self.dropout = Dropout(config.transformer["dropout_rate"])

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.normal_(self.fc1.bias, std=1e-6)
        nn.init.normal_(self.fc2.bias, std=1e-6)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act_fn(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


class Embeddings(nn.Module):
    """Construct the embeddings from patch, position embeddings.
    """
    def __init__(self, config, img_size, in_channels=3):
        super(Embeddings, self).__init__()
        self.hybrid = None
        self.config = config
        img_size = _pair(img_size)

        if config.patches.get("grid") is not None:   # ResNet
            grid_size = config.patches["grid"]
            patch_size = (img_size[0] // 16 // grid_size[0], img_size[1] // 16 // grid_size[1])
            patch_size_real = (patch_size[0] * 16, patch_size[1] * 16)
            n_patches = (img_size[0] // patch_size_real[0]) * (img_size[1] // patch_size_real[1])  
            self.hybrid = True
        else:
            patch_size = _pair(config.patches["size"])
            n_patches = (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1])
            self.hybrid = False

        if self.hybrid:
            self.hybrid_model = ResNetV2(block_units=config.resnet.num_layers, width_factor=config.resnet.width_factor)
            in_channels = self.hybrid_model.width * 16
        self.patch_embeddings = Conv2d(in_channels=in_channels,
                                       out_channels=config.hidden_size,
                                       kernel_size=patch_size,
                                       stride=patch_size)
        self.position_embeddings = nn.Parameter(torch.zeros(1, n_patches, config.hidden_size))

        self.dropout = Dropout(config.transformer["dropout_rate"])


    def forward(self, x):
        if self.hybrid:
            x, features = self.hybrid_model(x)
        else:
            features = None
        x = self.patch_embeddings(x)  # (B, hidden. n_patches^(1/2), n_patches^(1/2))
        x = x.flatten(2)
        x = x.transpose(-1, -2)  # (B, n_patches, hidden)

        embeddings = x + self.position_embeddings
        embeddings = self.dropout(embeddings)
        return embeddings, features


class Block(nn.Module):
    def __init__(self, config, vis):
        super(Block, self).__init__()
        self.hidden_size = config.hidden_size
        self.attention_norm = LayerNorm(config.hidden_size, eps=1e-6)
        self.ffn_norm = LayerNorm(config.hidden_size, eps=1e-6)
        self.ffn = Mlp(config)
        self.attn = Attention(config, vis)

    def forward(self, x):
        h = x
        x = self.attention_norm(x)
        x, weights = self.attn(x)
        x = x + h

        h = x
        x = self.ffn_norm(x)
        x = self.ffn(x)
        x = x + h
        return x, weights

    def load_from(self, weights, n_block):
        ROOT = f"Transformer/encoderblock_{n_block}"
        with torch.no_grad():
            query_weight = np2th(weights[pjoin(ROOT, ATTENTION_Q, "kernel")]).view(self.hidden_size, self.hidden_size).t()
            key_weight = np2th(weights[pjoin(ROOT, ATTENTION_K, "kernel")]).view(self.hidden_size, self.hidden_size).t()
            value_weight = np2th(weights[pjoin(ROOT, ATTENTION_V, "kernel")]).view(self.hidden_size, self.hidden_size).t()
            out_weight = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "kernel")]).view(self.hidden_size, self.hidden_size).t()

            query_bias = np2th(weights[pjoin(ROOT, ATTENTION_Q, "bias")]).view(-1)
            key_bias = np2th(weights[pjoin(ROOT, ATTENTION_K, "bias")]).view(-1)
            value_bias = np2th(weights[pjoin(ROOT, ATTENTION_V, "bias")]).view(-1)
            out_bias = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "bias")]).view(-1)

            self.attn.query.weight.copy_(query_weight)
            self.attn.key.weight.copy_(key_weight)
            self.attn.value.weight.copy_(value_weight)
            self.attn.out.weight.copy_(out_weight)
            self.attn.query.bias.copy_(query_bias)
            self.attn.key.bias.copy_(key_bias)
            self.attn.value.bias.copy_(value_bias)
            self.attn.out.bias.copy_(out_bias)

            mlp_weight_0 = np2th(weights[pjoin(ROOT, FC_0, "kernel")]).t()
            mlp_weight_1 = np2th(weights[pjoin(ROOT, FC_1, "kernel")]).t()
            mlp_bias_0 = np2th(weights[pjoin(ROOT, FC_0, "bias")]).t()
            mlp_bias_1 = np2th(weights[pjoin(ROOT, FC_1, "bias")]).t()

            self.ffn.fc1.weight.copy_(mlp_weight_0)
            self.ffn.fc2.weight.copy_(mlp_weight_1)
            self.ffn.fc1.bias.copy_(mlp_bias_0)
            self.ffn.fc2.bias.copy_(mlp_bias_1)

            self.attention_norm.weight.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "scale")]))
            self.attention_norm.bias.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "bias")]))
            self.ffn_norm.weight.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "scale")]))
            self.ffn_norm.bias.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "bias")]))


class Encoder(nn.Module):
    def __init__(self, config, vis):
        super(Encoder, self).__init__()
        self.vis = vis
        self.layer = nn.ModuleList()
        self.encoder_norm = LayerNorm(config.hidden_size, eps=1e-6)
        for _ in range(config.transformer["num_layers"]):
            layer = Block(config, vis)
            self.layer.append(copy.deepcopy(layer))

    def forward(self, hidden_states):
        attn_weights = []
        for layer_block in self.layer:
            hidden_states, weights = layer_block(hidden_states)
            if self.vis:
                attn_weights.append(weights)
        encoded = self.encoder_norm(hidden_states)
        return encoded, attn_weights


class Transformer(nn.Module):
    def __init__(self, config, img_size, vis):
        super(Transformer, self).__init__()
        self.embeddings = Embeddings(config, img_size=img_size)
        self.encoder = Encoder(config, vis)

    def forward(self, input_ids):
        embedding_output, features = self.embeddings(input_ids)
        encoded, attn_weights = self.encoder(embedding_output)  # (B, n_patch, hidden)
        return encoded, attn_weights, features


class Conv2dReLU(nn.Sequential):
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            padding=0,
            stride=1,
            use_batchnorm=True,
    ):
        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=not (use_batchnorm),
        )
        relu = nn.ReLU(inplace=True)

        bn = nn.BatchNorm2d(out_channels)

        super(Conv2dReLU, self).__init__(conv, bn, relu)


class DecoderBlock(nn.Module):
    def __init__(
            self,
            in_channels,
            out_channels,
            skip_channels=0,
            use_batchnorm=True,
    ):
        super().__init__()
        self.cbam1 = CBAM(in_channels)
        self.conv1 = Conv2dReLU(
            in_channels + skip_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )
        self.cbam2 = CBAM(in_channels+skip_channels)
        self.conv2 = Conv2dReLU(
            out_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )
        self.cbam3 = CBAM(out_channels)
        self.up = nn.UpsamplingBilinear2d(scale_factor=2)


    def forward(self, x, skip=None):
        x=self.cbam1(x)
        x = self.up(x)
        if skip is not None:
            x = torch.cat([x, skip], dim=1)
        x=self.cbam2(x)
        x = self.conv1(x)
        x = self.conv2(x)
        x=self.cbam3(x)
        return x


class SegmentationHead(nn.Sequential):

    def __init__(self, in_channels, out_channels, kernel_size=3, upsampling=1):
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2)
        upsampling = nn.UpsamplingBilinear2d(scale_factor=upsampling) if upsampling > 1 else nn.Identity()
        super().__init__(conv2d, upsampling)


class DecoderCup(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        head_channels = 512
        self.conv_more = Conv2dReLU(
            config.hidden_size,
            head_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=True,
        )
        decoder_channels = config.decoder_channels
        in_channels = [head_channels] + list(decoder_channels[:-1])
        out_channels = decoder_channels

        if self.config.n_skip != 0:
            skip_channels = self.config.skip_channels
            for i in range(4-self.config.n_skip):  # re-select the skip channels according to n_skip
                skip_channels[3-i]=0

        else:
            skip_channels=[0,0,0,0]

        blocks = [
            DecoderBlock(in_ch, out_ch, sk_ch) for in_ch, out_ch, sk_ch in zip(in_channels, out_channels, skip_channels)
        ]
        self.blocks = nn.ModuleList(blocks)

    def forward(self, hidden_states, features=None):
        B, n_patch, hidden = hidden_states.size()  # reshape from (B, n_patch, hidden) to (B, h, w, hidden)
        h, w = int(np.sqrt(n_patch)), int(np.sqrt(n_patch))
        x = hidden_states.permute(0, 2, 1)
        x = x.contiguous().view(B, hidden, h, w)
        x = self.conv_more(x)
        for i, decoder_block in enumerate(self.blocks):
            if features is not None:
                skip = features[i] if (i < self.config.n_skip) else None
            else:
                skip = None
            x = decoder_block(x, skip=skip)
        return x


class VisionTransformer(nn.Module):
    def __init__(self, config, img_size=224, num_classes=21843, zero_head=False, vis=False):
        super(VisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.zero_head = zero_head
        self.classifier = config.classifier
        self.transformer = Transformer(config, img_size, vis)
        self.decoder = DecoderCup(config)
        self.segmentation_head = SegmentationHead(
            in_channels=config['decoder_channels'][-1],
            out_channels=config['n_classes'],
            kernel_size=3,
        )
        self.config = config

    def forward(self, x):
        if x.size()[1] == 1:
            x = x.repeat(1,3,1,1)
        x, attn_weights, features = self.transformer(x)  # (B, n_patch, hidden)
        x = self.decoder(x, features)
        logits = self.segmentation_head(x)
        return logits

    def load_from(self, weights):
        with torch.no_grad():

            res_weight = weights
            self.transformer.embeddings.patch_embeddings.weight.copy_(np2th(weights["embedding/kernel"], conv=True))
            self.transformer.embeddings.patch_embeddings.bias.copy_(np2th(weights["embedding/bias"]))

            self.transformer.encoder.encoder_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"]))
            self.transformer.encoder.encoder_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"]))

            posemb = np2th(weights["Transformer/posembed_input/pos_embedding"])

            posemb_new = self.transformer.embeddings.position_embeddings
            if posemb.size() == posemb_new.size():
                self.transformer.embeddings.position_embeddings.copy_(posemb)
            elif posemb.size()[1]-1 == posemb_new.size()[1]:
                posemb = posemb[:, 1:]
                self.transformer.embeddings.position_embeddings.copy_(posemb)
            else:
                logger.info("load_pretrained: resized variant: %s to %s" % (posemb.size(), posemb_new.size()))
                ntok_new = posemb_new.size(1)
                if self.classifier == "seg":
                    _, posemb_grid = posemb[:, :1], posemb[0, 1:]
                gs_old = int(np.sqrt(len(posemb_grid)))
                gs_new = int(np.sqrt(ntok_new))
                print('load_pretrained: grid-size from %s to %s' % (gs_old, gs_new))
                posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1)
                zoom = (gs_new / gs_old, gs_new / gs_old, 1)
                posemb_grid = ndimage.zoom(posemb_grid, zoom, order=1)  # th2np
                posemb_grid = posemb_grid.reshape(1, gs_new * gs_new, -1)
                posemb = posemb_grid
                self.transformer.embeddings.position_embeddings.copy_(np2th(posemb))

            # Encoder whole
            for bname, block in self.transformer.encoder.named_children():
                for uname, unit in block.named_children():
                    unit.load_from(weights, n_block=uname)

            if self.transformer.embeddings.hybrid:
                self.transformer.embeddings.hybrid_model.root.conv.weight.copy_(np2th(res_weight["conv_root/kernel"], conv=True))
                gn_weight = np2th(res_weight["gn_root/scale"]).view(-1)
                gn_bias = np2th(res_weight["gn_root/bias"]).view(-1)
                self.transformer.embeddings.hybrid_model.root.gn.weight.copy_(gn_weight)
                self.transformer.embeddings.hybrid_model.root.gn.bias.copy_(gn_bias)

                for bname, block in self.transformer.embeddings.hybrid_model.body.named_children():
                    for uname, unit in block.named_children():
                        unit.load_from(res_weight, n_block=bname, n_unit=uname)
                        
CONFIGS = {
#     'ViT-B_32': get_b32_config(),
#     'ViT-L_16': get_l16_config(),
#     'ViT-L_32': get_l32_config(),
#     'ViT-H_14': get_h14_config(),
    'R50-ViT-B_16': get_r50_b16_config()
#     'R50-ViT-L_16': get_r50_l16_config(),
#     'testing': get_testing(),
}

In [30]:
import numpy as np
import torch
from medpy import metric
from scipy.ndimage import zoom
import torch.nn as nn
import SimpleITK as sitk


class DiceLoss(nn.Module):
    def __init__(self, n_classes):
        super(DiceLoss, self).__init__()
        self.n_classes = n_classes

    def _one_hot_encoder(self, input_tensor):
        tensor_list = []
        for i in range(self.n_classes):
            temp_prob = input_tensor == i  # * torch.ones_like(input_tensor)
            tensor_list.append(temp_prob.unsqueeze(1))
        output_tensor = torch.cat(tensor_list, dim=1)
        return output_tensor.float()

    def _dice_loss(self, score, target):
        target = target.float()
        smooth = 1e-5
        intersect = torch.sum(score * target)
        y_sum = torch.sum(target * target)
        z_sum = torch.sum(score * score)
        loss = (2 * intersect + smooth) / (z_sum + y_sum + smooth)
        loss = 1 - loss
        return loss

    def forward(self, inputs, target, weight=None, softmax=False):
        if softmax:
            inputs = torch.softmax(inputs, dim=1)
        target = self._one_hot_encoder(target)
        if weight is None:
            weight = [1] * self.n_classes
        assert inputs.size() == target.size(), 'predict {} & target {} shape do not match'.format(inputs.size(), target.size())
        class_wise_dice = []
        loss = 0.0
        for i in range(0, self.n_classes):
            dice = self._dice_loss(inputs[:, i], target[:, i])
            class_wise_dice.append(1.0 - dice.item())
            loss += dice * weight[i]
        return loss / self.n_classes


def calculate_metric_percase(pred, gt):
    pred[pred > 0] = 1
    gt[gt > 0] = 1
    if pred.sum() > 0 and gt.sum()>0:
        dice = metric.binary.dc(pred, gt)
        hd95 = metric.binary.hd95(pred, gt)
        return dice, hd95
    elif pred.sum() > 0 and gt.sum()==0:
        return 1, 0
    else:
        return 0, 0


def test_single_volume(image, label, net, classes, patch_size=[256, 256], test_save_path=None, case=None, z_spacing=1):
    image, label = image.squeeze(0).cpu().detach().numpy(), label.squeeze(0).cpu().detach().numpy()
    if len(image.shape) == 3:
        prediction = np.zeros_like(label)
        for ind in range(image.shape[0]):
            slice = image[ind, :, :]
            x, y = slice.shape[0], slice.shape[1]
            if x != patch_size[0] or y != patch_size[1]:
                slice = zoom(slice, (patch_size[0] / x, patch_size[1] / y), order=3)  # previous using 0
            input = torch.from_numpy(slice).unsqueeze(0).unsqueeze(0).float().cuda()
            net.eval()
            with torch.no_grad():
                outputs = net(input)
                out = torch.argmax(torch.softmax(outputs, dim=1), dim=1).squeeze(0)
                out = out.cpu().detach().numpy()
                if x != patch_size[0] or y != patch_size[1]:
                    pred = zoom(out, (x / patch_size[0], y / patch_size[1]), order=0)
                else:
                    pred = out
                prediction[ind] = pred
    else:
        input = torch.from_numpy(image).unsqueeze(
            0).unsqueeze(0).float().cuda()
        net.eval()
        with torch.no_grad():
            out = torch.argmax(torch.softmax(net(input), dim=1), dim=1).squeeze(0)
            prediction = out.cpu().detach().numpy()
    metric_list = []
    for i in range(1, classes):
        metric_list.append(calculate_metric_percase(prediction == i, label == i))

    if test_save_path is not None:
        img_itk = sitk.GetImageFromArray(image.astype(np.float32))
        prd_itk = sitk.GetImageFromArray(prediction.astype(np.float32))
        lab_itk = sitk.GetImageFromArray(label.astype(np.float32))
        img_itk.SetSpacing((1, 1, z_spacing))
        prd_itk.SetSpacing((1, 1, z_spacing))
        lab_itk.SetSpacing((1, 1, z_spacing))
        sitk.WriteImage(prd_itk, test_save_path + '/'+case + "_pred.nii.gz")
        sitk.WriteImage(img_itk, test_save_path + '/'+ case + "_img.nii.gz")
        sitk.WriteImage(lab_itk, test_save_path + '/'+ case + "_gt.nii.gz")
    return metric_list

In [31]:
import argparse
import logging
import os
import random
import sys
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.nn.modules.loss import CrossEntropyLoss
from torch.utils.data import DataLoader
from tqdm import tqdm
from torchvision import transforms

def worker_init_fn(worker_id):
    random.seed(1234 + worker_id)

def trainer_synapse(args, model, snapshot_path):
    logging.basicConfig(filename=snapshot_path + "/log.txt", level=logging.INFO,
                        format='[%(asctime)s.%(msecs)03d] %(message)s', datefmt='%H:%M:%S')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info(str(args))
    base_lr = args.base_lr
    num_classes = args.num_classes
    batch_size = args.batch_size * args.n_gpu
    # max_iterations = args.max_iterations
    db_train = Synapse_dataset(base_dir=args.root_path, list_dir=args.list_dir, split="train",
                               transform=transforms.Compose(
                                   [RandomGenerator(output_size=[args.img_size, args.img_size])]))
    print("The length of train set is: {}".format(len(db_train)))



    trainloader = DataLoader(db_train, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True,
                             worker_init_fn=worker_init_fn)
    if args.n_gpu > 1:
        model = nn.DataParallel(model)
    model.train()
    ce_loss = CrossEntropyLoss()
    dice_loss = DiceLoss(num_classes)
    optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9, weight_decay=0.0001)
    writer = SummaryWriter(snapshot_path + '/log')
    iter_num = 0
    max_epoch = args.max_epochs
    max_iterations = args.max_epochs * len(trainloader)  # max_epoch = max_iterations // len(trainloader) + 1
    logging.info("{} iterations per epoch. {} max iterations ".format(len(trainloader), max_iterations))
    best_performance = 0.0
    iterator = tqdm(range(max_epoch), ncols=70)
    for epoch_num in iterator:
        for i_batch, sampled_batch in enumerate(trainloader):
            image_batch, label_batch = sampled_batch['image'], sampled_batch['label']
            image_batch, label_batch = image_batch.cuda(), label_batch.cuda()
            outputs = model(image_batch)
            loss_ce = ce_loss(outputs, label_batch[:].long())
            loss_dice = dice_loss(outputs, label_batch, softmax=True)
            loss = 0.5 * loss_ce + 0.5 * loss_dice
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_ = base_lr * (1.0 - iter_num / max_iterations) ** 0.9
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_

            iter_num = iter_num + 1
            writer.add_scalar('info/lr', lr_, iter_num)
            writer.add_scalar('info/total_loss', loss, iter_num)
            writer.add_scalar('info/loss_ce', loss_ce, iter_num)

            logging.info('iteration %d : loss : %f, loss_ce: %f' % (iter_num, loss.item(), loss_ce.item()))

            if iter_num % 20 == 0:
                image = image_batch[1, 0:1, :, :]
                image = (image - image.min()) / (image.max() - image.min())
                writer.add_image('train/Image', image, iter_num)
                outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1, keepdim=True)
                writer.add_image('train/Prediction', outputs[1, ...] * 50, iter_num)
                labs = label_batch[1, ...].unsqueeze(0) * 50
                writer.add_image('train/GroundTruth', labs, iter_num)

        save_interval = 50  # int(max_epoch/6)
        if epoch_num > int(max_epoch / 2) and (epoch_num + 1) % save_interval == 0:
            save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth')
            torch.save(model.state_dict(), save_mode_path)
            logging.info("save model to {}".format(save_mode_path))

        if epoch_num >= max_epoch - 1:
            save_mode_path = os.path.join(snapshot_path, 'epoch_' + str(epoch_num) + '.pth')
            torch.save(model.state_dict(), save_mode_path)
            logging.info("save model to {}".format(save_mode_path))
            iterator.close()
            break

    writer.close()
    return "Training Finished!"

In [None]:
import argparse
import logging
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import argparse

import torch
torch.cuda.empty_cache()


parser = argparse.ArgumentParser()
parser.add_argument('--root_path', type=str,
                    default='/content/gdrive/MyDrive/Synapse/train_npz', help='root dir for data')
parser.add_argument('--dataset', type=str,
                    default='Synapse', help='experiment_name')
parser.add_argument('--list_dir', type=str,
                    default='/content/gdrive/MyDrive/Synapse/TransUNet-main/lists/lists_Synapse', help='list dir')
parser.add_argument('--num_classes', type=int,
                    default=9, help='output channel of network')
parser.add_argument('--max_iterations', type=int,
                    default=30000, help='maximum epoch number to train')
parser.add_argument('--max_epochs', type=int,
                    default=150, help='maximum epoch number to train')
parser.add_argument('--batch_size', type=int,
                    default=8, help='batch_size per gpu')
parser.add_argument('--n_gpu', type=int, default=1, help='total gpu')
parser.add_argument('--deterministic', type=int,  default=1,
                    help='whether use deterministic training')
parser.add_argument('--base_lr', type=float,  default=0.01,
                    help='segmentation network learning rate')
parser.add_argument('--img_size', type=int,
                    default=224, help='input patch size of network input')
parser.add_argument('--seed', type=int,
                    default=1234, help='random seed')
parser.add_argument('--n_skip', type=int,
                    default=3, help='using number of skip-connect, default is num')
parser.add_argument('--vit_name', type=str,
                    default='R50-ViT-B_16', help='select one vit model')
parser.add_argument('--vit_patches_size', type=int,
                    default=16, help='vit_patches_size, default is 16')
parser.add_argument('-f')
args = parser.parse_args()


if __name__ == "__main__":
    if not args.deterministic:
        cudnn.benchmark = True
        cudnn.deterministic = False
    else:
        cudnn.benchmark = False
        cudnn.deterministic = True

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    dataset_name = args.dataset
    dataset_config = {
        'Synapse': {
            'root_path': '/content/gdrive/MyDrive/Synapse/train_npz',
            'list_dir': '/content/gdrive/MyDrive/Synapse/TransUNet-main/lists/lists_Synapse',
            'num_classes': 9,
        },
    }
    args.num_classes = dataset_config[dataset_name]['num_classes']
    args.root_path = dataset_config[dataset_name]['root_path']
    args.list_dir = dataset_config[dataset_name]['list_dir']
    args.is_pretrain = True
    args.exp = 'TU_' + dataset_name + str(args.img_size)
    snapshot_path = "/content/vit new/model/".format(args.exp, 'TU')
    snapshot_path = snapshot_path + '_pretrain' if args.is_pretrain else snapshot_path
    snapshot_path += '_' + args.vit_name
    snapshot_path = snapshot_path + '_skip' + str(args.n_skip)
    snapshot_path = snapshot_path + '_vitpatch' + str(args.vit_patches_size) if args.vit_patches_size!=16 else snapshot_path
    snapshot_path = snapshot_path+'_'+str(args.max_iterations)[0:2]+'k' if args.max_iterations != 30000 else snapshot_path
    snapshot_path = snapshot_path + '_epo' +str(args.max_epochs) if args.max_epochs != 30 else snapshot_path
    snapshot_path = snapshot_path+'_bs'+str(args.batch_size)
    snapshot_path = snapshot_path + '_lr' + str(args.base_lr) if args.base_lr != 0.01 else snapshot_path
    snapshot_path = snapshot_path + '_'+str(args.img_size)
    snapshot_path = snapshot_path + '_s'+str(args.seed) if args.seed!=1234 else snapshot_path

    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    config_vit=CONFIGS[args.vit_name]
    config_vit.n_classes = args.num_classes
    config_vit.n_skip = args.n_skip
    if args.vit_name.find('R50') != -1:
        config_vit.patches.grid = (int(args.img_size / args.vit_patches_size), int(args.img_size / args.vit_patches_size))
    net = VisionTransformer(config_vit, img_size=args.img_size, num_classes=config_vit.n_classes).cuda()
    net.load_from(weights=np.load(config_vit.pretrained_path))

    trainer = {'Synapse': trainer_synapse,}
    trainer[dataset_name](args, net, snapshot_path)

    # net(torch.zeros((1,3,224,224)))

load_pretrained: grid-size from 24 to 14
Namespace(base_lr=0.01, batch_size=8, dataset='Synapse', deterministic=1, exp='TU_Synapse224', f='/root/.local/share/jupyter/runtime/kernel-103918b3-b150-41c2-8bd9-2382add004ff.json', img_size=224, is_pretrain=True, list_dir='/content/gdrive/MyDrive/Synapse/TransUNet-main/lists/lists_Synapse', max_epochs=150, max_iterations=30000, n_gpu=1, n_skip=3, num_classes=9, root_path='/content/gdrive/MyDrive/Synapse/train_npz', seed=1234, vit_name='R50-ViT-B_16', vit_patches_size=16)
The length of train set is: 2211
277 iterations per epoch. 41550 max iterations 


  cpuset_checked))
  0%|                                         | 0/150 [00:00<?, ?it/s]

iteration 1 : loss : 1.532318, loss_ce: 2.141530
iteration 2 : loss : 1.528255, loss_ce: 2.132218
iteration 3 : loss : 1.540132, loss_ce: 2.108438
iteration 4 : loss : 1.509062, loss_ce: 2.091883
iteration 5 : loss : 1.500453, loss_ce: 2.053112
iteration 6 : loss : 1.474040, loss_ce: 2.021042
iteration 7 : loss : 1.448971, loss_ce: 1.988415
iteration 8 : loss : 1.426161, loss_ce: 1.943095
iteration 9 : loss : 1.407766, loss_ce: 1.889406
iteration 10 : loss : 1.379988, loss_ce: 1.844331
iteration 11 : loss : 1.354470, loss_ce: 1.789371
iteration 12 : loss : 1.321379, loss_ce: 1.729595
iteration 13 : loss : 1.291789, loss_ce: 1.685439
iteration 14 : loss : 1.252116, loss_ce: 1.595566
iteration 15 : loss : 1.229019, loss_ce: 1.550245
iteration 16 : loss : 1.184063, loss_ce: 1.468101
iteration 17 : loss : 1.142364, loss_ce: 1.379033
iteration 18 : loss : 1.107218, loss_ce: 1.304363
iteration 19 : loss : 1.071951, loss_ce: 1.233333
iteration 20 : loss : 1.033754, loss_ce: 1.146507
iteration

  1%|▏                             | 1/150 [03:59<9:55:28, 239.79s/it]

iteration 278 : loss : 0.483133, loss_ce: 0.157916
iteration 279 : loss : 0.471143, loss_ce: 0.104228
iteration 280 : loss : 0.503016, loss_ce: 0.192359
iteration 281 : loss : 0.519991, loss_ce: 0.207117
iteration 282 : loss : 0.510369, loss_ce: 0.200379
iteration 283 : loss : 0.493725, loss_ce: 0.170882
iteration 284 : loss : 0.578140, loss_ce: 0.346245
iteration 285 : loss : 0.474598, loss_ce: 0.120394
iteration 286 : loss : 0.522615, loss_ce: 0.211147
iteration 287 : loss : 0.477952, loss_ce: 0.094608
iteration 288 : loss : 0.459610, loss_ce: 0.097883
iteration 289 : loss : 0.471832, loss_ce: 0.112716
iteration 290 : loss : 0.501979, loss_ce: 0.197606
iteration 291 : loss : 0.469497, loss_ce: 0.109336
iteration 292 : loss : 0.471608, loss_ce: 0.081003
iteration 293 : loss : 0.492294, loss_ce: 0.187600
iteration 294 : loss : 0.556315, loss_ce: 0.294293
iteration 295 : loss : 0.486978, loss_ce: 0.172888
iteration 296 : loss : 0.471454, loss_ce: 0.139170
iteration 297 : loss : 0.496138

  1%|▍                             | 2/150 [07:56<9:46:30, 237.78s/it]

iteration 555 : loss : 0.455986, loss_ce: 0.089313
iteration 556 : loss : 0.430398, loss_ce: 0.102559
iteration 557 : loss : 0.469240, loss_ce: 0.141774
iteration 558 : loss : 0.437475, loss_ce: 0.104999
iteration 559 : loss : 0.480369, loss_ce: 0.190415
iteration 560 : loss : 0.457660, loss_ce: 0.157898
iteration 561 : loss : 0.445271, loss_ce: 0.072042
iteration 562 : loss : 0.432222, loss_ce: 0.119132
iteration 563 : loss : 0.444107, loss_ce: 0.131487
iteration 564 : loss : 0.447036, loss_ce: 0.098598
iteration 565 : loss : 0.454655, loss_ce: 0.118319
iteration 566 : loss : 0.464872, loss_ce: 0.148002
iteration 567 : loss : 0.441194, loss_ce: 0.103758
iteration 568 : loss : 0.448290, loss_ce: 0.133901
iteration 569 : loss : 0.433023, loss_ce: 0.111246
iteration 570 : loss : 0.456967, loss_ce: 0.151257
iteration 571 : loss : 0.464473, loss_ce: 0.137799
iteration 572 : loss : 0.461184, loss_ce: 0.163996
iteration 573 : loss : 0.499736, loss_ce: 0.223134
iteration 574 : loss : 0.428233

  2%|▌                             | 3/150 [11:52<9:40:50, 237.08s/it]

iteration 832 : loss : 0.441623, loss_ce: 0.129826
iteration 833 : loss : 0.474758, loss_ce: 0.207308
iteration 834 : loss : 0.425749, loss_ce: 0.106126
iteration 835 : loss : 0.435140, loss_ce: 0.140862
iteration 836 : loss : 0.439850, loss_ce: 0.106674
iteration 837 : loss : 0.431555, loss_ce: 0.139121
iteration 838 : loss : 0.477099, loss_ce: 0.214199
iteration 839 : loss : 0.445374, loss_ce: 0.142981
iteration 840 : loss : 0.423630, loss_ce: 0.134337
iteration 841 : loss : 0.431120, loss_ce: 0.123893
iteration 842 : loss : 0.430726, loss_ce: 0.121374
iteration 843 : loss : 0.459216, loss_ce: 0.170597
iteration 844 : loss : 0.447233, loss_ce: 0.154203
iteration 845 : loss : 0.415495, loss_ce: 0.098283
iteration 846 : loss : 0.452628, loss_ce: 0.088385
iteration 847 : loss : 0.432119, loss_ce: 0.140040
iteration 848 : loss : 0.419727, loss_ce: 0.131542
iteration 849 : loss : 0.394560, loss_ce: 0.082900
iteration 850 : loss : 0.447363, loss_ce: 0.154318
iteration 851 : loss : 0.477032

  3%|▊                             | 4/150 [15:48<9:36:04, 236.74s/it]

iteration 1109 : loss : 0.416943, loss_ce: 0.062431
iteration 1110 : loss : 0.397631, loss_ce: 0.097116
iteration 1111 : loss : 0.455174, loss_ce: 0.086190
iteration 1112 : loss : 0.451274, loss_ce: 0.136418
iteration 1113 : loss : 0.416594, loss_ce: 0.117558
iteration 1114 : loss : 0.382113, loss_ce: 0.073412
iteration 1115 : loss : 0.462420, loss_ce: 0.186403
iteration 1116 : loss : 0.454327, loss_ce: 0.050513
iteration 1117 : loss : 0.405445, loss_ce: 0.105170
iteration 1118 : loss : 0.431643, loss_ce: 0.135540
iteration 1119 : loss : 0.399482, loss_ce: 0.093695
iteration 1120 : loss : 0.434413, loss_ce: 0.110814
iteration 1121 : loss : 0.451467, loss_ce: 0.105386
iteration 1122 : loss : 0.413382, loss_ce: 0.111853
iteration 1123 : loss : 0.403015, loss_ce: 0.044445
iteration 1124 : loss : 0.430970, loss_ce: 0.146307
iteration 1125 : loss : 0.390264, loss_ce: 0.088229
iteration 1126 : loss : 0.418381, loss_ce: 0.110038
iteration 1127 : loss : 0.459785, loss_ce: 0.152886
iteration 11

  3%|█                             | 5/150 [19:45<9:31:57, 236.67s/it]

iteration 1386 : loss : 0.415269, loss_ce: 0.106844
iteration 1387 : loss : 0.407911, loss_ce: 0.116031
iteration 1388 : loss : 0.436392, loss_ce: 0.080348
iteration 1389 : loss : 0.432209, loss_ce: 0.046713
iteration 1390 : loss : 0.404186, loss_ce: 0.125963
iteration 1391 : loss : 0.398173, loss_ce: 0.099665
iteration 1392 : loss : 0.478683, loss_ce: 0.216643
iteration 1393 : loss : 0.389453, loss_ce: 0.073797
iteration 1394 : loss : 0.372094, loss_ce: 0.071790
iteration 1395 : loss : 0.425001, loss_ce: 0.120005
iteration 1396 : loss : 0.393201, loss_ce: 0.102097
iteration 1397 : loss : 0.382426, loss_ce: 0.083487
iteration 1398 : loss : 0.454684, loss_ce: 0.180255
iteration 1399 : loss : 0.383590, loss_ce: 0.063663
iteration 1400 : loss : 0.381089, loss_ce: 0.093766
iteration 1401 : loss : 0.395734, loss_ce: 0.109588
iteration 1402 : loss : 0.440698, loss_ce: 0.172313
iteration 1403 : loss : 0.388968, loss_ce: 0.095514
iteration 1404 : loss : 0.377145, loss_ce: 0.085372
iteration 14

  4%|█▏                            | 6/150 [23:41<9:27:44, 236.56s/it]

iteration 1663 : loss : 0.397778, loss_ce: 0.092890
iteration 1664 : loss : 0.381988, loss_ce: 0.096648
iteration 1665 : loss : 0.397608, loss_ce: 0.132102
iteration 1666 : loss : 0.368047, loss_ce: 0.098404
iteration 1667 : loss : 0.372821, loss_ce: 0.090415
iteration 1668 : loss : 0.389294, loss_ce: 0.137739
iteration 1669 : loss : 0.444253, loss_ce: 0.138153
iteration 1670 : loss : 0.395735, loss_ce: 0.129704
iteration 1671 : loss : 0.389832, loss_ce: 0.132578
iteration 1672 : loss : 0.371799, loss_ce: 0.099804
iteration 1673 : loss : 0.412840, loss_ce: 0.072210
iteration 1674 : loss : 0.406447, loss_ce: 0.090548
iteration 1675 : loss : 0.418667, loss_ce: 0.097594
iteration 1676 : loss : 0.423773, loss_ce: 0.154209
iteration 1677 : loss : 0.390568, loss_ce: 0.072851
iteration 1678 : loss : 0.385420, loss_ce: 0.112171
iteration 1679 : loss : 0.357319, loss_ce: 0.085585
iteration 1680 : loss : 0.421312, loss_ce: 0.172350
iteration 1681 : loss : 0.398712, loss_ce: 0.057578
iteration 16

  5%|█▍                            | 7/150 [27:37<9:23:15, 236.33s/it]

iteration 1940 : loss : 0.400247, loss_ce: 0.116196
iteration 1941 : loss : 0.376810, loss_ce: 0.112276
iteration 1942 : loss : 0.476260, loss_ce: 0.241685
iteration 1943 : loss : 0.374998, loss_ce: 0.091131
iteration 1944 : loss : 0.417313, loss_ce: 0.107746
iteration 1945 : loss : 0.357785, loss_ce: 0.100403
iteration 1946 : loss : 0.399276, loss_ce: 0.144755
iteration 1947 : loss : 0.372094, loss_ce: 0.112011
iteration 1948 : loss : 0.363713, loss_ce: 0.114756
iteration 1949 : loss : 0.429690, loss_ce: 0.132019
iteration 1950 : loss : 0.424143, loss_ce: 0.132826
iteration 1951 : loss : 0.371169, loss_ce: 0.124057
iteration 1952 : loss : 0.405948, loss_ce: 0.093550
iteration 1953 : loss : 0.453608, loss_ce: 0.225649
iteration 1954 : loss : 0.358525, loss_ce: 0.114968
iteration 1955 : loss : 0.435604, loss_ce: 0.134295
iteration 1956 : loss : 0.371227, loss_ce: 0.078011
iteration 1957 : loss : 0.387377, loss_ce: 0.148542
iteration 1958 : loss : 0.375334, loss_ce: 0.080129
iteration 19

  5%|█▌                            | 8/150 [31:33<9:18:52, 236.14s/it]

iteration 2217 : loss : 0.359583, loss_ce: 0.120865
iteration 2218 : loss : 0.348148, loss_ce: 0.062790
iteration 2219 : loss : 0.464547, loss_ce: 0.053407
iteration 2220 : loss : 0.365878, loss_ce: 0.090694
iteration 2221 : loss : 0.385704, loss_ce: 0.127878
iteration 2222 : loss : 0.330524, loss_ce: 0.071242
iteration 2223 : loss : 0.387239, loss_ce: 0.096204
iteration 2224 : loss : 0.323738, loss_ce: 0.055110
iteration 2225 : loss : 0.373948, loss_ce: 0.141419
iteration 2226 : loss : 0.322695, loss_ce: 0.071678
iteration 2227 : loss : 0.421595, loss_ce: 0.075064
iteration 2228 : loss : 0.367836, loss_ce: 0.111107
iteration 2229 : loss : 0.461390, loss_ce: 0.223881
iteration 2230 : loss : 0.432692, loss_ce: 0.183716
iteration 2231 : loss : 0.328922, loss_ce: 0.084801
iteration 2232 : loss : 0.429889, loss_ce: 0.131296
iteration 2233 : loss : 0.353634, loss_ce: 0.119940
iteration 2234 : loss : 0.338533, loss_ce: 0.124818
iteration 2235 : loss : 0.408609, loss_ce: 0.155141
iteration 22

  6%|█▊                            | 9/150 [35:29<9:14:57, 236.15s/it]

iteration 2494 : loss : 0.437480, loss_ce: 0.084297
iteration 2495 : loss : 0.410228, loss_ce: 0.106686
iteration 2496 : loss : 0.383462, loss_ce: 0.099843
iteration 2497 : loss : 0.355847, loss_ce: 0.107782
iteration 2498 : loss : 0.404423, loss_ce: 0.079531
iteration 2499 : loss : 0.320063, loss_ce: 0.099712
iteration 2500 : loss : 0.426837, loss_ce: 0.066727
iteration 2501 : loss : 0.329619, loss_ce: 0.061648
iteration 2502 : loss : 0.407598, loss_ce: 0.153996
iteration 2503 : loss : 0.441618, loss_ce: 0.200241
iteration 2504 : loss : 0.322741, loss_ce: 0.117681
iteration 2505 : loss : 0.329582, loss_ce: 0.092452
iteration 2506 : loss : 0.412915, loss_ce: 0.179203
iteration 2507 : loss : 0.372089, loss_ce: 0.093278
iteration 2508 : loss : 0.329554, loss_ce: 0.123249
iteration 2509 : loss : 0.371932, loss_ce: 0.078181
iteration 2510 : loss : 0.340345, loss_ce: 0.073074
iteration 2511 : loss : 0.302346, loss_ce: 0.093958
iteration 2512 : loss : 0.403725, loss_ce: 0.161622
iteration 25

  7%|█▉                           | 10/150 [39:25<9:10:46, 236.04s/it]

iteration 2771 : loss : 0.253707, loss_ce: 0.054513
iteration 2772 : loss : 0.303329, loss_ce: 0.071904
iteration 2773 : loss : 0.389496, loss_ce: 0.133327
iteration 2774 : loss : 0.305777, loss_ce: 0.081660
iteration 2775 : loss : 0.330393, loss_ce: 0.085388
iteration 2776 : loss : 0.208896, loss_ce: 0.057290
iteration 2777 : loss : 0.249925, loss_ce: 0.073993
iteration 2778 : loss : 0.383621, loss_ce: 0.068390
iteration 2779 : loss : 0.401490, loss_ce: 0.039146
iteration 2780 : loss : 0.369752, loss_ce: 0.145620
iteration 2781 : loss : 0.362772, loss_ce: 0.181514
iteration 2782 : loss : 0.271743, loss_ce: 0.050266
iteration 2783 : loss : 0.316368, loss_ce: 0.084943
iteration 2784 : loss : 0.345626, loss_ce: 0.128547
iteration 2785 : loss : 0.302748, loss_ce: 0.050546
iteration 2786 : loss : 0.406878, loss_ce: 0.064800
iteration 2787 : loss : 0.219409, loss_ce: 0.058879
iteration 2788 : loss : 0.310897, loss_ce: 0.133336
iteration 2789 : loss : 0.297416, loss_ce: 0.071649
iteration 27

  7%|██▏                          | 11/150 [43:20<9:06:40, 235.98s/it]

iteration 3048 : loss : 0.326984, loss_ce: 0.136855
iteration 3049 : loss : 0.338569, loss_ce: 0.088723
iteration 3050 : loss : 0.390516, loss_ce: 0.183906
iteration 3051 : loss : 0.295376, loss_ce: 0.096929
iteration 3052 : loss : 0.248432, loss_ce: 0.059087
iteration 3053 : loss : 0.281096, loss_ce: 0.057494
iteration 3054 : loss : 0.291410, loss_ce: 0.050616
iteration 3055 : loss : 0.304669, loss_ce: 0.108873
iteration 3056 : loss : 0.246031, loss_ce: 0.049973
iteration 3057 : loss : 0.433426, loss_ce: 0.031607
iteration 3058 : loss : 0.184967, loss_ce: 0.047398
iteration 3059 : loss : 0.325268, loss_ce: 0.087767
iteration 3060 : loss : 0.284391, loss_ce: 0.067933
iteration 3061 : loss : 0.339073, loss_ce: 0.039191
iteration 3062 : loss : 0.223549, loss_ce: 0.047274
iteration 3063 : loss : 0.358979, loss_ce: 0.023367
iteration 3064 : loss : 0.277245, loss_ce: 0.071349
iteration 3065 : loss : 0.303656, loss_ce: 0.131800
iteration 3066 : loss : 0.350268, loss_ce: 0.136095
iteration 30

  8%|██▎                          | 12/150 [47:16<9:02:46, 235.99s/it]

iteration 3325 : loss : 0.314712, loss_ce: 0.123370
iteration 3326 : loss : 0.288545, loss_ce: 0.082656
iteration 3327 : loss : 0.331611, loss_ce: 0.114660
iteration 3328 : loss : 0.309455, loss_ce: 0.083173
iteration 3329 : loss : 0.254092, loss_ce: 0.069217
iteration 3330 : loss : 0.209602, loss_ce: 0.083129
iteration 3331 : loss : 0.233374, loss_ce: 0.056516
iteration 3332 : loss : 0.203401, loss_ce: 0.059935
iteration 3333 : loss : 0.237506, loss_ce: 0.073596
iteration 3334 : loss : 0.340281, loss_ce: 0.132415
iteration 3335 : loss : 0.303377, loss_ce: 0.051310
iteration 3336 : loss : 0.258486, loss_ce: 0.069460
iteration 3337 : loss : 0.352707, loss_ce: 0.046279
iteration 3338 : loss : 0.319156, loss_ce: 0.067997
iteration 3339 : loss : 0.202429, loss_ce: 0.045541
iteration 3340 : loss : 0.302935, loss_ce: 0.070676
iteration 3341 : loss : 0.266675, loss_ce: 0.075207
iteration 3342 : loss : 0.322342, loss_ce: 0.116733
iteration 3343 : loss : 0.208932, loss_ce: 0.045470
iteration 33

  9%|██▌                          | 13/150 [51:12<8:58:44, 235.95s/it]

iteration 3602 : loss : 0.445121, loss_ce: 0.043206
iteration 3603 : loss : 0.431355, loss_ce: 0.027279
iteration 3604 : loss : 0.361535, loss_ce: 0.032884
iteration 3605 : loss : 0.355609, loss_ce: 0.019937
iteration 3606 : loss : 0.285128, loss_ce: 0.131487
iteration 3607 : loss : 0.317429, loss_ce: 0.017540
iteration 3608 : loss : 0.294204, loss_ce: 0.088153
iteration 3609 : loss : 0.395748, loss_ce: 0.199108
iteration 3610 : loss : 0.254767, loss_ce: 0.097888
iteration 3611 : loss : 0.349281, loss_ce: 0.118144
iteration 3612 : loss : 0.302899, loss_ce: 0.057274
iteration 3613 : loss : 0.341189, loss_ce: 0.175244
iteration 3614 : loss : 0.269282, loss_ce: 0.086461
iteration 3615 : loss : 0.254044, loss_ce: 0.076532
iteration 3616 : loss : 0.375047, loss_ce: 0.076535
iteration 3617 : loss : 0.384278, loss_ce: 0.046410
iteration 3618 : loss : 0.207387, loss_ce: 0.083776
iteration 3619 : loss : 0.337712, loss_ce: 0.117261
iteration 3620 : loss : 0.295590, loss_ce: 0.121633
iteration 36

  9%|██▋                          | 14/150 [55:09<8:55:00, 236.04s/it]

iteration 3879 : loss : 0.207685, loss_ce: 0.036142
iteration 3880 : loss : 0.266017, loss_ce: 0.083505
iteration 3881 : loss : 0.291808, loss_ce: 0.081111
iteration 3882 : loss : 0.337319, loss_ce: 0.031405
iteration 3883 : loss : 0.200896, loss_ce: 0.044927
iteration 3884 : loss : 0.305345, loss_ce: 0.060047
iteration 3885 : loss : 0.356544, loss_ce: 0.110262
iteration 3886 : loss : 0.289566, loss_ce: 0.078108
iteration 3887 : loss : 0.275155, loss_ce: 0.096323
iteration 3888 : loss : 0.255872, loss_ce: 0.103370
iteration 3889 : loss : 0.253223, loss_ce: 0.099370
iteration 3890 : loss : 0.262483, loss_ce: 0.085782
iteration 3891 : loss : 0.295534, loss_ce: 0.136478
iteration 3892 : loss : 0.271440, loss_ce: 0.083796
iteration 3893 : loss : 0.234293, loss_ce: 0.070895
iteration 3894 : loss : 0.213524, loss_ce: 0.051470
iteration 3895 : loss : 0.197790, loss_ce: 0.046534
iteration 3896 : loss : 0.342659, loss_ce: 0.041700
iteration 3897 : loss : 0.233805, loss_ce: 0.091294
iteration 38

 10%|██▉                          | 15/150 [59:05<8:51:22, 236.16s/it]

iteration 4156 : loss : 0.294594, loss_ce: 0.152787
iteration 4157 : loss : 0.225954, loss_ce: 0.070096
iteration 4158 : loss : 0.261667, loss_ce: 0.033508
iteration 4159 : loss : 0.287817, loss_ce: 0.041634
iteration 4160 : loss : 0.297376, loss_ce: 0.049683
iteration 4161 : loss : 0.180268, loss_ce: 0.051157
iteration 4162 : loss : 0.298739, loss_ce: 0.071917
iteration 4163 : loss : 0.214501, loss_ce: 0.068421
iteration 4164 : loss : 0.173934, loss_ce: 0.035740
iteration 4165 : loss : 0.301809, loss_ce: 0.028280
iteration 4166 : loss : 0.160412, loss_ce: 0.049565
iteration 4167 : loss : 0.279708, loss_ce: 0.089326
iteration 4168 : loss : 0.331782, loss_ce: 0.090486
iteration 4169 : loss : 0.253352, loss_ce: 0.065355
iteration 4170 : loss : 0.291372, loss_ce: 0.050653
iteration 4171 : loss : 0.329898, loss_ce: 0.067230
iteration 4172 : loss : 0.220350, loss_ce: 0.067345
iteration 4173 : loss : 0.222184, loss_ce: 0.103381
iteration 4174 : loss : 0.258640, loss_ce: 0.041794
iteration 41

 11%|██▉                        | 16/150 [1:03:01<8:47:21, 236.13s/it]

iteration 4433 : loss : 0.260165, loss_ce: 0.043939
iteration 4434 : loss : 0.307507, loss_ce: 0.041120
iteration 4435 : loss : 0.328720, loss_ce: 0.233288
iteration 4436 : loss : 0.199297, loss_ce: 0.078033
iteration 4437 : loss : 0.232318, loss_ce: 0.078360
iteration 4438 : loss : 0.154163, loss_ce: 0.054279
iteration 4439 : loss : 0.265970, loss_ce: 0.124155
iteration 4440 : loss : 0.257040, loss_ce: 0.077912
iteration 4441 : loss : 0.254394, loss_ce: 0.073098
iteration 4442 : loss : 0.239966, loss_ce: 0.082483
iteration 4443 : loss : 0.206061, loss_ce: 0.062196
iteration 4444 : loss : 0.164244, loss_ce: 0.059915
iteration 4445 : loss : 0.250722, loss_ce: 0.107764
iteration 4446 : loss : 0.235161, loss_ce: 0.063622
iteration 4447 : loss : 0.214540, loss_ce: 0.081247
iteration 4448 : loss : 0.294508, loss_ce: 0.052293
iteration 4449 : loss : 0.212864, loss_ce: 0.069027
iteration 4450 : loss : 0.302454, loss_ce: 0.063277
iteration 4451 : loss : 0.279815, loss_ce: 0.094684
iteration 44

 11%|███                        | 17/150 [1:06:57<8:43:26, 236.14s/it]

iteration 4710 : loss : 0.263912, loss_ce: 0.070686
iteration 4711 : loss : 0.186452, loss_ce: 0.054500
iteration 4712 : loss : 0.277939, loss_ce: 0.098512
iteration 4713 : loss : 0.212193, loss_ce: 0.071028
iteration 4714 : loss : 0.220500, loss_ce: 0.106709
iteration 4715 : loss : 0.222342, loss_ce: 0.060175
iteration 4716 : loss : 0.199103, loss_ce: 0.041133
iteration 4717 : loss : 0.191783, loss_ce: 0.044059
iteration 4718 : loss : 0.237286, loss_ce: 0.084006
iteration 4719 : loss : 0.299799, loss_ce: 0.040150
iteration 4720 : loss : 0.186926, loss_ce: 0.081618
iteration 4721 : loss : 0.191598, loss_ce: 0.043373
iteration 4722 : loss : 0.233796, loss_ce: 0.101541
iteration 4723 : loss : 0.259221, loss_ce: 0.072257
iteration 4724 : loss : 0.290322, loss_ce: 0.024045
iteration 4725 : loss : 0.285222, loss_ce: 0.041229
iteration 4726 : loss : 0.216671, loss_ce: 0.094261
iteration 4727 : loss : 0.286092, loss_ce: 0.028873
iteration 4728 : loss : 0.323934, loss_ce: 0.097303
iteration 47

 12%|███▏                       | 18/150 [1:10:54<8:39:37, 236.19s/it]

iteration 4987 : loss : 0.225258, loss_ce: 0.051346
iteration 4988 : loss : 0.230675, loss_ce: 0.059944
iteration 4989 : loss : 0.303968, loss_ce: 0.107847
iteration 4990 : loss : 0.238280, loss_ce: 0.064670
iteration 4991 : loss : 0.141407, loss_ce: 0.044131
iteration 4992 : loss : 0.264335, loss_ce: 0.054705
iteration 4993 : loss : 0.225028, loss_ce: 0.097428
iteration 4994 : loss : 0.229319, loss_ce: 0.077362
iteration 4995 : loss : 0.201117, loss_ce: 0.069547
iteration 4996 : loss : 0.310490, loss_ce: 0.057077
iteration 4997 : loss : 0.199683, loss_ce: 0.068056
iteration 4998 : loss : 0.125843, loss_ce: 0.055676
iteration 4999 : loss : 0.230143, loss_ce: 0.115613
iteration 5000 : loss : 0.238402, loss_ce: 0.115740
iteration 5001 : loss : 0.201151, loss_ce: 0.097495
iteration 5002 : loss : 0.233431, loss_ce: 0.044285
iteration 5003 : loss : 0.168492, loss_ce: 0.031341
iteration 5004 : loss : 0.207028, loss_ce: 0.062401
iteration 5005 : loss : 0.251371, loss_ce: 0.075927
iteration 50

 13%|███▍                       | 19/150 [1:14:50<8:35:44, 236.22s/it]

iteration 5264 : loss : 0.316165, loss_ce: 0.080928
iteration 5265 : loss : 0.203926, loss_ce: 0.080509
iteration 5266 : loss : 0.238238, loss_ce: 0.068875
iteration 5267 : loss : 0.231503, loss_ce: 0.065597
iteration 5268 : loss : 0.266536, loss_ce: 0.135787
iteration 5269 : loss : 0.170833, loss_ce: 0.025698
iteration 5270 : loss : 0.207872, loss_ce: 0.046107
iteration 5271 : loss : 0.277428, loss_ce: 0.027145
iteration 5272 : loss : 0.273893, loss_ce: 0.126619
iteration 5273 : loss : 0.256356, loss_ce: 0.095332
iteration 5274 : loss : 0.202546, loss_ce: 0.040982
iteration 5275 : loss : 0.144861, loss_ce: 0.025248
iteration 5276 : loss : 0.287110, loss_ce: 0.076536
iteration 5277 : loss : 0.141872, loss_ce: 0.075086
iteration 5278 : loss : 0.243377, loss_ce: 0.032489
iteration 5279 : loss : 0.220978, loss_ce: 0.105259
iteration 5280 : loss : 0.268152, loss_ce: 0.014015
iteration 5281 : loss : 0.242185, loss_ce: 0.045505
iteration 5282 : loss : 0.254272, loss_ce: 0.064024
iteration 52

 13%|███▌                       | 20/150 [1:18:46<8:31:47, 236.21s/it]

iteration 5541 : loss : 0.260341, loss_ce: 0.039210
iteration 5542 : loss : 0.313803, loss_ce: 0.061091
iteration 5543 : loss : 0.349485, loss_ce: 0.058329
iteration 5544 : loss : 0.205759, loss_ce: 0.087057
iteration 5545 : loss : 0.142514, loss_ce: 0.045502
iteration 5546 : loss : 0.188017, loss_ce: 0.066516
iteration 5547 : loss : 0.294929, loss_ce: 0.062952
iteration 5548 : loss : 0.253443, loss_ce: 0.067101
iteration 5549 : loss : 0.269102, loss_ce: 0.027258
iteration 5550 : loss : 0.217076, loss_ce: 0.072243
iteration 5551 : loss : 0.293599, loss_ce: 0.060961
iteration 5552 : loss : 0.203568, loss_ce: 0.059749
iteration 5553 : loss : 0.383600, loss_ce: 0.046259
iteration 5554 : loss : 0.209653, loss_ce: 0.016906
iteration 5555 : loss : 0.312142, loss_ce: 0.047836
iteration 5556 : loss : 0.199638, loss_ce: 0.063685
iteration 5557 : loss : 0.190812, loss_ce: 0.022444
iteration 5558 : loss : 0.259329, loss_ce: 0.046770
iteration 5559 : loss : 0.206347, loss_ce: 0.073884
iteration 55

 14%|███▊                       | 21/150 [1:22:42<8:27:57, 236.26s/it]

iteration 5818 : loss : 0.252884, loss_ce: 0.114094
iteration 5819 : loss : 0.421013, loss_ce: 0.104299
iteration 5820 : loss : 0.164310, loss_ce: 0.056444
iteration 5821 : loss : 0.369207, loss_ce: 0.072662
iteration 5822 : loss : 0.263991, loss_ce: 0.073127
iteration 5823 : loss : 0.205217, loss_ce: 0.035998
iteration 5824 : loss : 0.289375, loss_ce: 0.044905
iteration 5825 : loss : 0.280498, loss_ce: 0.063556
iteration 5826 : loss : 0.232121, loss_ce: 0.100791
iteration 5827 : loss : 0.205561, loss_ce: 0.056401
iteration 5828 : loss : 0.308202, loss_ce: 0.109527
iteration 5829 : loss : 0.242448, loss_ce: 0.058232
iteration 5830 : loss : 0.225540, loss_ce: 0.068003
iteration 5831 : loss : 0.221743, loss_ce: 0.091363
iteration 5832 : loss : 0.277444, loss_ce: 0.027190
iteration 5833 : loss : 0.172641, loss_ce: 0.036052
iteration 5834 : loss : 0.152934, loss_ce: 0.044192
iteration 5835 : loss : 0.222211, loss_ce: 0.026230
iteration 5836 : loss : 0.198248, loss_ce: 0.022574
iteration 58

 15%|███▉                       | 22/150 [1:26:39<8:23:57, 236.23s/it]

iteration 6095 : loss : 0.120226, loss_ce: 0.030102
iteration 6096 : loss : 0.197573, loss_ce: 0.050182
iteration 6097 : loss : 0.223598, loss_ce: 0.049715
iteration 6098 : loss : 0.253434, loss_ce: 0.092051
iteration 6099 : loss : 0.151360, loss_ce: 0.071552
iteration 6100 : loss : 0.258307, loss_ce: 0.039172
iteration 6101 : loss : 0.227969, loss_ce: 0.062774
iteration 6102 : loss : 0.226927, loss_ce: 0.073431
iteration 6103 : loss : 0.354672, loss_ce: 0.048814
iteration 6104 : loss : 0.219402, loss_ce: 0.068543
iteration 6105 : loss : 0.246902, loss_ce: 0.032022
iteration 6106 : loss : 0.278961, loss_ce: 0.034473
iteration 6107 : loss : 0.209894, loss_ce: 0.067502
iteration 6108 : loss : 0.210653, loss_ce: 0.035467
iteration 6109 : loss : 0.210558, loss_ce: 0.123434
iteration 6110 : loss : 0.145426, loss_ce: 0.047507
iteration 6111 : loss : 0.177597, loss_ce: 0.057980
iteration 6112 : loss : 0.219698, loss_ce: 0.107010
iteration 6113 : loss : 0.301745, loss_ce: 0.071132
iteration 61

 15%|████▏                      | 23/150 [1:30:35<8:20:02, 236.24s/it]

iteration 6372 : loss : 0.175161, loss_ce: 0.031486
iteration 6373 : loss : 0.175928, loss_ce: 0.051569
iteration 6374 : loss : 0.160637, loss_ce: 0.047319
iteration 6375 : loss : 0.218771, loss_ce: 0.096991
iteration 6376 : loss : 0.195425, loss_ce: 0.050221
iteration 6377 : loss : 0.181537, loss_ce: 0.064678
iteration 6378 : loss : 0.260982, loss_ce: 0.181131
iteration 6379 : loss : 0.297814, loss_ce: 0.059338
iteration 6380 : loss : 0.275030, loss_ce: 0.039432
iteration 6381 : loss : 0.264047, loss_ce: 0.088705
iteration 6382 : loss : 0.234581, loss_ce: 0.053523
iteration 6383 : loss : 0.169515, loss_ce: 0.050365
iteration 6384 : loss : 0.160979, loss_ce: 0.063314
iteration 6385 : loss : 0.239191, loss_ce: 0.060536
iteration 6386 : loss : 0.303177, loss_ce: 0.063666
iteration 6387 : loss : 0.181665, loss_ce: 0.058027
iteration 6388 : loss : 0.228507, loss_ce: 0.021371
iteration 6389 : loss : 0.275800, loss_ce: 0.035259
iteration 6390 : loss : 0.134972, loss_ce: 0.027024
iteration 63

 16%|████▎                      | 24/150 [1:34:31<8:15:49, 236.10s/it]

iteration 6649 : loss : 0.204038, loss_ce: 0.044359
iteration 6650 : loss : 0.220153, loss_ce: 0.036511
iteration 6651 : loss : 0.192517, loss_ce: 0.053330
iteration 6652 : loss : 0.209457, loss_ce: 0.030497
iteration 6653 : loss : 0.092232, loss_ce: 0.028487
iteration 6654 : loss : 0.184929, loss_ce: 0.101956
iteration 6655 : loss : 0.318469, loss_ce: 0.081763
iteration 6656 : loss : 0.248711, loss_ce: 0.023710
iteration 6657 : loss : 0.190788, loss_ce: 0.023571
iteration 6658 : loss : 0.244976, loss_ce: 0.046051
iteration 6659 : loss : 0.133182, loss_ce: 0.051351
iteration 6660 : loss : 0.223365, loss_ce: 0.044064
iteration 6661 : loss : 0.239511, loss_ce: 0.029590
iteration 6662 : loss : 0.327459, loss_ce: 0.045098
iteration 6663 : loss : 0.246296, loss_ce: 0.084001
iteration 6664 : loss : 0.211028, loss_ce: 0.044523
iteration 6665 : loss : 0.130601, loss_ce: 0.031937
iteration 6666 : loss : 0.157391, loss_ce: 0.043509
iteration 6667 : loss : 0.213665, loss_ce: 0.041497
iteration 66

 17%|████▌                      | 25/150 [1:38:27<8:11:56, 236.13s/it]

iteration 6926 : loss : 0.147040, loss_ce: 0.028987
iteration 6927 : loss : 0.176699, loss_ce: 0.048835
iteration 6928 : loss : 0.186294, loss_ce: 0.092050
iteration 6929 : loss : 0.216954, loss_ce: 0.100904
iteration 6930 : loss : 0.133398, loss_ce: 0.037587
iteration 6931 : loss : 0.182978, loss_ce: 0.061673
iteration 6932 : loss : 0.307090, loss_ce: 0.047972
iteration 6933 : loss : 0.161256, loss_ce: 0.029802
iteration 6934 : loss : 0.192881, loss_ce: 0.071434
iteration 6935 : loss : 0.213671, loss_ce: 0.065393
iteration 6936 : loss : 0.242099, loss_ce: 0.063316
iteration 6937 : loss : 0.177395, loss_ce: 0.031557
iteration 6938 : loss : 0.277748, loss_ce: 0.065724
iteration 6939 : loss : 0.213537, loss_ce: 0.040410
iteration 6940 : loss : 0.183280, loss_ce: 0.040144
iteration 6941 : loss : 0.260667, loss_ce: 0.038084
iteration 6942 : loss : 0.167955, loss_ce: 0.031834
iteration 6943 : loss : 0.279516, loss_ce: 0.025291
iteration 6944 : loss : 0.250844, loss_ce: 0.049052
iteration 69

 17%|████▋                      | 26/150 [1:42:23<8:08:02, 236.15s/it]

iteration 7203 : loss : 0.274980, loss_ce: 0.058090
iteration 7204 : loss : 0.221334, loss_ce: 0.042739
iteration 7205 : loss : 0.280283, loss_ce: 0.066772
iteration 7206 : loss : 0.168729, loss_ce: 0.071424
iteration 7207 : loss : 0.120410, loss_ce: 0.037858
iteration 7208 : loss : 0.137676, loss_ce: 0.053339
iteration 7209 : loss : 0.333956, loss_ce: 0.024078
iteration 7210 : loss : 0.107179, loss_ce: 0.022961
iteration 7211 : loss : 0.224899, loss_ce: 0.059999
iteration 7212 : loss : 0.157639, loss_ce: 0.033085
iteration 7213 : loss : 0.210206, loss_ce: 0.094405
iteration 7214 : loss : 0.140442, loss_ce: 0.048012
iteration 7215 : loss : 0.134897, loss_ce: 0.047390
iteration 7216 : loss : 0.146300, loss_ce: 0.040943
iteration 7217 : loss : 0.140508, loss_ce: 0.041270
iteration 7218 : loss : 0.186954, loss_ce: 0.025707
iteration 7219 : loss : 0.211575, loss_ce: 0.028354
iteration 7220 : loss : 0.183141, loss_ce: 0.049476
iteration 7221 : loss : 0.240558, loss_ce: 0.041267
iteration 72

 18%|████▊                      | 27/150 [1:46:19<8:04:10, 236.18s/it]

iteration 7480 : loss : 0.154378, loss_ce: 0.017080
iteration 7481 : loss : 0.144001, loss_ce: 0.060437
iteration 7482 : loss : 0.260616, loss_ce: 0.019161
iteration 7483 : loss : 0.175419, loss_ce: 0.077722
iteration 7484 : loss : 0.372055, loss_ce: 0.019005
iteration 7485 : loss : 0.193902, loss_ce: 0.026023
iteration 7486 : loss : 0.233702, loss_ce: 0.048604
iteration 7487 : loss : 0.267282, loss_ce: 0.015532
iteration 7488 : loss : 0.176229, loss_ce: 0.038485
iteration 7489 : loss : 0.251546, loss_ce: 0.020275
iteration 7490 : loss : 0.175397, loss_ce: 0.040993
iteration 7491 : loss : 0.166163, loss_ce: 0.081622
iteration 7492 : loss : 0.185409, loss_ce: 0.067460
iteration 7493 : loss : 0.208266, loss_ce: 0.047042
iteration 7494 : loss : 0.169897, loss_ce: 0.036553
iteration 7495 : loss : 0.127382, loss_ce: 0.044054
iteration 7496 : loss : 0.128550, loss_ce: 0.053520
iteration 7497 : loss : 0.168453, loss_ce: 0.068953
iteration 7498 : loss : 0.129220, loss_ce: 0.062434
iteration 74

 19%|█████                      | 28/150 [1:50:16<8:00:24, 236.27s/it]

iteration 7757 : loss : 0.290273, loss_ce: 0.026206
iteration 7758 : loss : 0.235168, loss_ce: 0.088532
iteration 7759 : loss : 0.166103, loss_ce: 0.069029
iteration 7760 : loss : 0.256539, loss_ce: 0.028255
iteration 7761 : loss : 0.180501, loss_ce: 0.061910
iteration 7762 : loss : 0.115772, loss_ce: 0.045598
iteration 7763 : loss : 0.204992, loss_ce: 0.050439
iteration 7764 : loss : 0.163399, loss_ce: 0.063635
iteration 7765 : loss : 0.148499, loss_ce: 0.037032
iteration 7766 : loss : 0.243884, loss_ce: 0.044071
iteration 7767 : loss : 0.161101, loss_ce: 0.085331
iteration 7768 : loss : 0.151891, loss_ce: 0.053021
iteration 7769 : loss : 0.307133, loss_ce: 0.098566
iteration 7770 : loss : 0.301130, loss_ce: 0.063849
iteration 7771 : loss : 0.149735, loss_ce: 0.061782
iteration 7772 : loss : 0.195985, loss_ce: 0.089010
iteration 7773 : loss : 0.151073, loss_ce: 0.049914
iteration 7774 : loss : 0.144509, loss_ce: 0.056631
iteration 7775 : loss : 0.140810, loss_ce: 0.048030
iteration 77

 19%|█████▏                     | 29/150 [1:54:12<7:56:27, 236.26s/it]

iteration 8034 : loss : 0.139468, loss_ce: 0.025524
iteration 8035 : loss : 0.247721, loss_ce: 0.077733
iteration 8036 : loss : 0.273949, loss_ce: 0.063691
iteration 8037 : loss : 0.100378, loss_ce: 0.040407
iteration 8038 : loss : 0.144010, loss_ce: 0.034582
iteration 8039 : loss : 0.135707, loss_ce: 0.047224
iteration 8040 : loss : 0.199090, loss_ce: 0.053169
iteration 8041 : loss : 0.188757, loss_ce: 0.092227
iteration 8042 : loss : 0.310084, loss_ce: 0.072171
iteration 8043 : loss : 0.304201, loss_ce: 0.035521
iteration 8044 : loss : 0.176545, loss_ce: 0.066939
iteration 8045 : loss : 0.220449, loss_ce: 0.073448
iteration 8046 : loss : 0.237008, loss_ce: 0.064903
iteration 8047 : loss : 0.125116, loss_ce: 0.039156
iteration 8048 : loss : 0.104888, loss_ce: 0.050671
iteration 8049 : loss : 0.097367, loss_ce: 0.033421
iteration 8050 : loss : 0.176437, loss_ce: 0.091763
iteration 8051 : loss : 0.158075, loss_ce: 0.032997
iteration 8052 : loss : 0.163352, loss_ce: 0.031472
iteration 80

 20%|█████▍                     | 30/150 [1:58:09<7:52:46, 236.38s/it]

iteration 8311 : loss : 0.176723, loss_ce: 0.069118
iteration 8312 : loss : 0.172876, loss_ce: 0.040521
iteration 8313 : loss : 0.146395, loss_ce: 0.058025
iteration 8314 : loss : 0.158409, loss_ce: 0.048247
iteration 8315 : loss : 0.175288, loss_ce: 0.046638
iteration 8316 : loss : 0.118291, loss_ce: 0.038801
iteration 8317 : loss : 0.173008, loss_ce: 0.053457
iteration 8318 : loss : 0.150858, loss_ce: 0.040375
iteration 8319 : loss : 0.158026, loss_ce: 0.025425
iteration 8320 : loss : 0.174636, loss_ce: 0.059229
iteration 8321 : loss : 0.151979, loss_ce: 0.026220
iteration 8322 : loss : 0.145071, loss_ce: 0.042484
iteration 8323 : loss : 0.175210, loss_ce: 0.035136
iteration 8324 : loss : 0.162491, loss_ce: 0.069390
iteration 8325 : loss : 0.224554, loss_ce: 0.076191
iteration 8326 : loss : 0.174674, loss_ce: 0.031211
iteration 8327 : loss : 0.118025, loss_ce: 0.024770
iteration 8328 : loss : 0.194709, loss_ce: 0.032056
iteration 8329 : loss : 0.226408, loss_ce: 0.038959
iteration 83

 21%|█████▌                     | 31/150 [2:02:05<7:48:55, 236.43s/it]

iteration 8588 : loss : 0.262926, loss_ce: 0.046092
iteration 8589 : loss : 0.178115, loss_ce: 0.036217
iteration 8590 : loss : 0.206561, loss_ce: 0.086369
iteration 8591 : loss : 0.170004, loss_ce: 0.034731
iteration 8592 : loss : 0.123883, loss_ce: 0.054997
iteration 8593 : loss : 0.144287, loss_ce: 0.048025
iteration 8594 : loss : 0.133616, loss_ce: 0.025183
iteration 8595 : loss : 0.136970, loss_ce: 0.061427
iteration 8596 : loss : 0.244536, loss_ce: 0.035656
iteration 8597 : loss : 0.341825, loss_ce: 0.043546
iteration 8598 : loss : 0.134307, loss_ce: 0.044814
iteration 8599 : loss : 0.199096, loss_ce: 0.025420
iteration 8600 : loss : 0.248198, loss_ce: 0.046662
iteration 8601 : loss : 0.192175, loss_ce: 0.069006
iteration 8602 : loss : 0.203654, loss_ce: 0.040309
iteration 8603 : loss : 0.182812, loss_ce: 0.019919
iteration 8604 : loss : 0.257815, loss_ce: 0.080265
iteration 8605 : loss : 0.266493, loss_ce: 0.087556
iteration 8606 : loss : 0.127011, loss_ce: 0.062123
iteration 86

 21%|█████▊                     | 32/150 [2:06:01<7:44:56, 236.41s/it]

iteration 8865 : loss : 0.247514, loss_ce: 0.056107
iteration 8866 : loss : 0.199823, loss_ce: 0.073927
iteration 8867 : loss : 0.194903, loss_ce: 0.081489
iteration 8868 : loss : 0.319291, loss_ce: 0.014695
iteration 8869 : loss : 0.216827, loss_ce: 0.027503
iteration 8870 : loss : 0.297849, loss_ce: 0.020673
iteration 8871 : loss : 0.217324, loss_ce: 0.039417
iteration 8872 : loss : 0.124631, loss_ce: 0.040995
iteration 8873 : loss : 0.186769, loss_ce: 0.015864
iteration 8874 : loss : 0.304705, loss_ce: 0.067399
iteration 8875 : loss : 0.192335, loss_ce: 0.066988
iteration 8876 : loss : 0.186233, loss_ce: 0.070956
iteration 8877 : loss : 0.147535, loss_ce: 0.034502
iteration 8878 : loss : 0.167239, loss_ce: 0.034142
iteration 8879 : loss : 0.124937, loss_ce: 0.032205
iteration 8880 : loss : 0.104958, loss_ce: 0.034999
iteration 8881 : loss : 0.160095, loss_ce: 0.021104
iteration 8882 : loss : 0.118042, loss_ce: 0.047839
iteration 8883 : loss : 0.227225, loss_ce: 0.091103
iteration 88

 22%|█████▉                     | 33/150 [2:09:58<7:40:59, 236.41s/it]

iteration 9142 : loss : 0.132333, loss_ce: 0.048923
iteration 9143 : loss : 0.165574, loss_ce: 0.041165
iteration 9144 : loss : 0.138293, loss_ce: 0.073358
iteration 9145 : loss : 0.122159, loss_ce: 0.060153
iteration 9146 : loss : 0.169708, loss_ce: 0.031385
iteration 9147 : loss : 0.208387, loss_ce: 0.043123
iteration 9148 : loss : 0.304460, loss_ce: 0.044037
iteration 9149 : loss : 0.237191, loss_ce: 0.016399
iteration 9150 : loss : 0.142348, loss_ce: 0.020626
iteration 9151 : loss : 0.170285, loss_ce: 0.047856
iteration 9152 : loss : 0.251880, loss_ce: 0.017603
iteration 9153 : loss : 0.213204, loss_ce: 0.032151
iteration 9154 : loss : 0.193922, loss_ce: 0.046376
iteration 9155 : loss : 0.155982, loss_ce: 0.050159
iteration 9156 : loss : 0.181021, loss_ce: 0.049270
iteration 9157 : loss : 0.263653, loss_ce: 0.028173
iteration 9158 : loss : 0.105586, loss_ce: 0.023176
iteration 9159 : loss : 0.217415, loss_ce: 0.029911
iteration 9160 : loss : 0.111236, loss_ce: 0.049327
iteration 91

 23%|██████                     | 34/150 [2:13:54<7:37:07, 236.44s/it]

iteration 9419 : loss : 0.194850, loss_ce: 0.036114
iteration 9420 : loss : 0.106712, loss_ce: 0.056798
iteration 9421 : loss : 0.261665, loss_ce: 0.019309
iteration 9422 : loss : 0.192747, loss_ce: 0.046168
iteration 9423 : loss : 0.188705, loss_ce: 0.044191
iteration 9424 : loss : 0.134964, loss_ce: 0.042886
iteration 9425 : loss : 0.131474, loss_ce: 0.027606
iteration 9426 : loss : 0.134453, loss_ce: 0.027142
iteration 9427 : loss : 0.167074, loss_ce: 0.039038
iteration 9428 : loss : 0.182718, loss_ce: 0.039835
iteration 9429 : loss : 0.086497, loss_ce: 0.034528
iteration 9430 : loss : 0.211583, loss_ce: 0.019205
iteration 9431 : loss : 0.233225, loss_ce: 0.028807
iteration 9432 : loss : 0.266208, loss_ce: 0.020743
iteration 9433 : loss : 0.263137, loss_ce: 0.020953
iteration 9434 : loss : 0.076073, loss_ce: 0.025659
iteration 9435 : loss : 0.206677, loss_ce: 0.011634
iteration 9436 : loss : 0.130510, loss_ce: 0.022943
iteration 9437 : loss : 0.157008, loss_ce: 0.073024
iteration 94

 23%|██████▎                    | 35/150 [2:17:51<7:33:12, 236.45s/it]

iteration 9696 : loss : 0.179568, loss_ce: 0.021438
iteration 9697 : loss : 0.297571, loss_ce: 0.057557
iteration 9698 : loss : 0.148591, loss_ce: 0.039426
iteration 9699 : loss : 0.169837, loss_ce: 0.048567
iteration 9700 : loss : 0.168810, loss_ce: 0.012840
iteration 9701 : loss : 0.322505, loss_ce: 0.013957
iteration 9702 : loss : 0.203192, loss_ce: 0.037556
iteration 9703 : loss : 0.141464, loss_ce: 0.034142
iteration 9704 : loss : 0.161266, loss_ce: 0.056668
iteration 9705 : loss : 0.165419, loss_ce: 0.023584
iteration 9706 : loss : 0.143734, loss_ce: 0.040166
iteration 9707 : loss : 0.226135, loss_ce: 0.019531
iteration 9708 : loss : 0.129875, loss_ce: 0.047684
iteration 9709 : loss : 0.161236, loss_ce: 0.045772
iteration 9710 : loss : 0.086008, loss_ce: 0.022491
iteration 9711 : loss : 0.267466, loss_ce: 0.015395
iteration 9712 : loss : 0.147165, loss_ce: 0.054292
iteration 9713 : loss : 0.140128, loss_ce: 0.028895
iteration 9714 : loss : 0.141674, loss_ce: 0.045782
iteration 97

 24%|██████▍                    | 36/150 [2:21:47<7:29:18, 236.48s/it]

iteration 9973 : loss : 0.346645, loss_ce: 0.011279
iteration 9974 : loss : 0.140774, loss_ce: 0.044067
iteration 9975 : loss : 0.211090, loss_ce: 0.027236
iteration 9976 : loss : 0.230877, loss_ce: 0.042940
iteration 9977 : loss : 0.258506, loss_ce: 0.102559
iteration 9978 : loss : 0.165920, loss_ce: 0.055078
iteration 9979 : loss : 0.111749, loss_ce: 0.015101
iteration 9980 : loss : 0.196538, loss_ce: 0.020313
iteration 9981 : loss : 0.108987, loss_ce: 0.048599
iteration 9982 : loss : 0.081918, loss_ce: 0.030930
iteration 9983 : loss : 0.115871, loss_ce: 0.029315
iteration 9984 : loss : 0.244947, loss_ce: 0.035373
iteration 9985 : loss : 0.221580, loss_ce: 0.063729
iteration 9986 : loss : 0.153507, loss_ce: 0.071939
iteration 9987 : loss : 0.103441, loss_ce: 0.036275
iteration 9988 : loss : 0.153509, loss_ce: 0.047324
iteration 9989 : loss : 0.205848, loss_ce: 0.024950
iteration 9990 : loss : 0.139312, loss_ce: 0.060145
iteration 9991 : loss : 0.200960, loss_ce: 0.050037
iteration 99

 25%|██████▋                    | 37/150 [2:25:44<7:25:28, 236.54s/it]

iteration 10250 : loss : 0.104434, loss_ce: 0.033712
iteration 10251 : loss : 0.101723, loss_ce: 0.043879
iteration 10252 : loss : 0.283675, loss_ce: 0.103898
iteration 10253 : loss : 0.159611, loss_ce: 0.032125
iteration 10254 : loss : 0.136623, loss_ce: 0.062464
iteration 10255 : loss : 0.143175, loss_ce: 0.029743
iteration 10256 : loss : 0.227590, loss_ce: 0.035521
iteration 10257 : loss : 0.150465, loss_ce: 0.036887
iteration 10258 : loss : 0.094878, loss_ce: 0.037052
iteration 10259 : loss : 0.156257, loss_ce: 0.042855
iteration 10260 : loss : 0.212447, loss_ce: 0.022500
iteration 10261 : loss : 0.145851, loss_ce: 0.019018
iteration 10262 : loss : 0.251026, loss_ce: 0.037758
iteration 10263 : loss : 0.151126, loss_ce: 0.026111
iteration 10264 : loss : 0.207679, loss_ce: 0.115089
iteration 10265 : loss : 0.160280, loss_ce: 0.038405
iteration 10266 : loss : 0.203073, loss_ce: 0.038192
iteration 10267 : loss : 0.228233, loss_ce: 0.047050
iteration 10268 : loss : 0.154930, loss_ce: 0.

 25%|██████▊                    | 38/150 [2:29:41<7:21:30, 236.52s/it]

iteration 10527 : loss : 0.143862, loss_ce: 0.080348
iteration 10528 : loss : 0.259979, loss_ce: 0.052099
iteration 10529 : loss : 0.201636, loss_ce: 0.038504
iteration 10530 : loss : 0.376185, loss_ce: 0.014539
iteration 10531 : loss : 0.132342, loss_ce: 0.033434
iteration 10532 : loss : 0.128062, loss_ce: 0.039619
iteration 10533 : loss : 0.121805, loss_ce: 0.047732
iteration 10534 : loss : 0.143365, loss_ce: 0.034095
iteration 10535 : loss : 0.188144, loss_ce: 0.028423
iteration 10536 : loss : 0.136944, loss_ce: 0.053016
iteration 10537 : loss : 0.180297, loss_ce: 0.065226
iteration 10538 : loss : 0.110015, loss_ce: 0.046200
iteration 10539 : loss : 0.140255, loss_ce: 0.048100
iteration 10540 : loss : 0.182390, loss_ce: 0.013702
iteration 10541 : loss : 0.150020, loss_ce: 0.030053
iteration 10542 : loss : 0.120811, loss_ce: 0.042621
iteration 10543 : loss : 0.156368, loss_ce: 0.045852
iteration 10544 : loss : 0.104666, loss_ce: 0.042832
iteration 10545 : loss : 0.145789, loss_ce: 0.

 26%|███████                    | 39/150 [2:33:37<7:17:43, 236.61s/it]

iteration 10804 : loss : 0.330935, loss_ce: 0.050628
iteration 10805 : loss : 0.097800, loss_ce: 0.053443
iteration 10806 : loss : 0.129059, loss_ce: 0.024367
iteration 10807 : loss : 0.137445, loss_ce: 0.033308
iteration 10808 : loss : 0.111996, loss_ce: 0.037177
iteration 10809 : loss : 0.158419, loss_ce: 0.019437
iteration 10810 : loss : 0.133187, loss_ce: 0.021121
iteration 10811 : loss : 0.174273, loss_ce: 0.039270
iteration 10812 : loss : 0.216545, loss_ce: 0.028732
iteration 10813 : loss : 0.298904, loss_ce: 0.022052
iteration 10814 : loss : 0.221834, loss_ce: 0.013250
iteration 10815 : loss : 0.176750, loss_ce: 0.021800
iteration 10816 : loss : 0.206923, loss_ce: 0.057507
iteration 10817 : loss : 0.129605, loss_ce: 0.044488
iteration 10818 : loss : 0.205321, loss_ce: 0.023765
iteration 10819 : loss : 0.179056, loss_ce: 0.064215
iteration 10820 : loss : 0.271009, loss_ce: 0.042145
iteration 10821 : loss : 0.219934, loss_ce: 0.050463
iteration 10822 : loss : 0.281818, loss_ce: 0.

 27%|███████▏                   | 40/150 [2:37:34<7:13:45, 236.60s/it]

iteration 11081 : loss : 0.156690, loss_ce: 0.033917
iteration 11082 : loss : 0.143742, loss_ce: 0.052008
iteration 11083 : loss : 0.114593, loss_ce: 0.043088
iteration 11084 : loss : 0.138879, loss_ce: 0.050063
iteration 11085 : loss : 0.169685, loss_ce: 0.031368
iteration 11086 : loss : 0.085519, loss_ce: 0.031359
iteration 11087 : loss : 0.336999, loss_ce: 0.010646
iteration 11088 : loss : 0.329632, loss_ce: 0.027694
iteration 11089 : loss : 0.157011, loss_ce: 0.038920
iteration 11090 : loss : 0.274526, loss_ce: 0.021948
iteration 11091 : loss : 0.096295, loss_ce: 0.030997
iteration 11092 : loss : 0.132301, loss_ce: 0.045310
iteration 11093 : loss : 0.173607, loss_ce: 0.015180
iteration 11094 : loss : 0.153468, loss_ce: 0.053518
iteration 11095 : loss : 0.184531, loss_ce: 0.083698
iteration 11096 : loss : 0.106467, loss_ce: 0.035313
iteration 11097 : loss : 0.133903, loss_ce: 0.043218
iteration 11098 : loss : 0.197542, loss_ce: 0.024623
iteration 11099 : loss : 0.301730, loss_ce: 0.

 27%|███████▍                   | 41/150 [2:41:31<7:09:48, 236.59s/it]

iteration 11358 : loss : 0.170741, loss_ce: 0.081512
iteration 11359 : loss : 0.203707, loss_ce: 0.029761
iteration 11360 : loss : 0.137266, loss_ce: 0.050591
iteration 11361 : loss : 0.101238, loss_ce: 0.050771
iteration 11362 : loss : 0.220320, loss_ce: 0.022382
iteration 11363 : loss : 0.141325, loss_ce: 0.031808
iteration 11364 : loss : 0.084684, loss_ce: 0.035106
iteration 11365 : loss : 0.127581, loss_ce: 0.034190
iteration 11366 : loss : 0.260404, loss_ce: 0.025594
iteration 11367 : loss : 0.211810, loss_ce: 0.063693
iteration 11368 : loss : 0.067231, loss_ce: 0.018045
iteration 11369 : loss : 0.115498, loss_ce: 0.055311
iteration 11370 : loss : 0.182670, loss_ce: 0.043181
iteration 11371 : loss : 0.186876, loss_ce: 0.046699
iteration 11372 : loss : 0.145693, loss_ce: 0.030424
iteration 11373 : loss : 0.195919, loss_ce: 0.017722
iteration 11374 : loss : 0.133246, loss_ce: 0.039781
iteration 11375 : loss : 0.175908, loss_ce: 0.054840
iteration 11376 : loss : 0.110730, loss_ce: 0.

 28%|███████▌                   | 42/150 [2:45:27<7:05:52, 236.59s/it]

iteration 11635 : loss : 0.126622, loss_ce: 0.050301
iteration 11636 : loss : 0.178667, loss_ce: 0.043483
iteration 11637 : loss : 0.321755, loss_ce: 0.013412
iteration 11638 : loss : 0.273419, loss_ce: 0.011432
iteration 11639 : loss : 0.094194, loss_ce: 0.031911
iteration 11640 : loss : 0.102175, loss_ce: 0.036490
iteration 11641 : loss : 0.158609, loss_ce: 0.045899
iteration 11642 : loss : 0.111993, loss_ce: 0.031748
iteration 11643 : loss : 0.113174, loss_ce: 0.031197
iteration 11644 : loss : 0.156540, loss_ce: 0.040345
iteration 11645 : loss : 0.217195, loss_ce: 0.048994
iteration 11646 : loss : 0.087207, loss_ce: 0.038693
iteration 11647 : loss : 0.083461, loss_ce: 0.020348
iteration 11648 : loss : 0.170713, loss_ce: 0.036297
iteration 11649 : loss : 0.232098, loss_ce: 0.061824
iteration 11650 : loss : 0.185769, loss_ce: 0.015368
iteration 11651 : loss : 0.105571, loss_ce: 0.027917
iteration 11652 : loss : 0.180095, loss_ce: 0.019382
iteration 11653 : loss : 0.298179, loss_ce: 0.

 29%|███████▋                   | 43/150 [2:49:24<7:01:49, 236.54s/it]

iteration 11912 : loss : 0.127624, loss_ce: 0.045170
iteration 11913 : loss : 0.246811, loss_ce: 0.032784
iteration 11914 : loss : 0.099893, loss_ce: 0.027873
iteration 11915 : loss : 0.268590, loss_ce: 0.028067
iteration 11916 : loss : 0.140604, loss_ce: 0.041831
iteration 11917 : loss : 0.100119, loss_ce: 0.042523
iteration 11918 : loss : 0.253865, loss_ce: 0.013519
iteration 11919 : loss : 0.355025, loss_ce: 0.012257
iteration 11920 : loss : 0.092593, loss_ce: 0.037702
iteration 11921 : loss : 0.120577, loss_ce: 0.029284
iteration 11922 : loss : 0.150353, loss_ce: 0.049432
iteration 11923 : loss : 0.144433, loss_ce: 0.020444
iteration 11924 : loss : 0.130860, loss_ce: 0.054059
iteration 11925 : loss : 0.143691, loss_ce: 0.048661
iteration 11926 : loss : 0.194102, loss_ce: 0.040212
iteration 11927 : loss : 0.068830, loss_ce: 0.041462
iteration 11928 : loss : 0.085679, loss_ce: 0.019466
iteration 11929 : loss : 0.266052, loss_ce: 0.013000
iteration 11930 : loss : 0.135300, loss_ce: 0.

 29%|███████▉                   | 44/150 [2:53:20<6:58:01, 236.62s/it]

iteration 12189 : loss : 0.179304, loss_ce: 0.014192
iteration 12190 : loss : 0.069397, loss_ce: 0.022427
iteration 12191 : loss : 0.377526, loss_ce: 0.011360
iteration 12192 : loss : 0.097502, loss_ce: 0.034723
iteration 12193 : loss : 0.063207, loss_ce: 0.019111
iteration 12194 : loss : 0.202858, loss_ce: 0.040910
iteration 12195 : loss : 0.085589, loss_ce: 0.034640
iteration 12196 : loss : 0.078531, loss_ce: 0.018720
iteration 12197 : loss : 0.068739, loss_ce: 0.021579
iteration 12198 : loss : 0.171474, loss_ce: 0.046178
iteration 12199 : loss : 0.204464, loss_ce: 0.027621
iteration 12200 : loss : 0.211927, loss_ce: 0.023899
iteration 12201 : loss : 0.105486, loss_ce: 0.026727
iteration 12202 : loss : 0.145807, loss_ce: 0.030307
iteration 12203 : loss : 0.221662, loss_ce: 0.022385
iteration 12204 : loss : 0.302690, loss_ce: 0.007160
iteration 12205 : loss : 0.113672, loss_ce: 0.032376
iteration 12206 : loss : 0.094846, loss_ce: 0.041498
iteration 12207 : loss : 0.221430, loss_ce: 0.

 30%|████████                   | 45/150 [2:57:17<6:54:06, 236.64s/it]

iteration 12466 : loss : 0.339788, loss_ce: 0.017486
iteration 12467 : loss : 0.101029, loss_ce: 0.024882
iteration 12468 : loss : 0.150143, loss_ce: 0.048920
iteration 12469 : loss : 0.198568, loss_ce: 0.029650
iteration 12470 : loss : 0.212408, loss_ce: 0.062575
iteration 12471 : loss : 0.080688, loss_ce: 0.020327
iteration 12472 : loss : 0.140788, loss_ce: 0.028040
iteration 12473 : loss : 0.318738, loss_ce: 0.017238
iteration 12474 : loss : 0.236476, loss_ce: 0.024595
iteration 12475 : loss : 0.220850, loss_ce: 0.052070
iteration 12476 : loss : 0.053276, loss_ce: 0.026258
iteration 12477 : loss : 0.182465, loss_ce: 0.040955
iteration 12478 : loss : 0.156981, loss_ce: 0.032710
iteration 12479 : loss : 0.181167, loss_ce: 0.027566
iteration 12480 : loss : 0.210219, loss_ce: 0.013866
iteration 12481 : loss : 0.106362, loss_ce: 0.055145
iteration 12482 : loss : 0.080817, loss_ce: 0.033767
iteration 12483 : loss : 0.139886, loss_ce: 0.056947
iteration 12484 : loss : 0.085990, loss_ce: 0.

 31%|████████▎                  | 46/150 [3:01:14<6:50:07, 236.61s/it]

iteration 12743 : loss : 0.145018, loss_ce: 0.029569
iteration 12744 : loss : 0.206874, loss_ce: 0.022289
iteration 12745 : loss : 0.175563, loss_ce: 0.030257
iteration 12746 : loss : 0.256589, loss_ce: 0.016837
iteration 12747 : loss : 0.097344, loss_ce: 0.033381
iteration 12748 : loss : 0.073601, loss_ce: 0.025975
iteration 12749 : loss : 0.190582, loss_ce: 0.015386
iteration 12750 : loss : 0.257335, loss_ce: 0.023106
iteration 12751 : loss : 0.170593, loss_ce: 0.032847
iteration 12752 : loss : 0.136313, loss_ce: 0.026540
iteration 12753 : loss : 0.097675, loss_ce: 0.046596
iteration 12754 : loss : 0.179032, loss_ce: 0.061316
iteration 12755 : loss : 0.129333, loss_ce: 0.024077
iteration 12756 : loss : 0.160575, loss_ce: 0.016641
iteration 12757 : loss : 0.085239, loss_ce: 0.027030
iteration 12758 : loss : 0.067235, loss_ce: 0.029779
iteration 12759 : loss : 0.073433, loss_ce: 0.022511
iteration 12760 : loss : 0.111575, loss_ce: 0.048190
iteration 12761 : loss : 0.111154, loss_ce: 0.

 31%|████████▍                  | 47/150 [3:05:10<6:46:05, 236.56s/it]

iteration 13020 : loss : 0.119821, loss_ce: 0.019575
iteration 13021 : loss : 0.105617, loss_ce: 0.030731
iteration 13022 : loss : 0.103949, loss_ce: 0.020648
iteration 13023 : loss : 0.151490, loss_ce: 0.058910
iteration 13024 : loss : 0.127585, loss_ce: 0.023372
iteration 13025 : loss : 0.085939, loss_ce: 0.018734
iteration 13026 : loss : 0.099341, loss_ce: 0.025761
iteration 13027 : loss : 0.093918, loss_ce: 0.045331
iteration 13028 : loss : 0.138654, loss_ce: 0.018647
iteration 13029 : loss : 0.091253, loss_ce: 0.022319
iteration 13030 : loss : 0.077578, loss_ce: 0.027155
iteration 13031 : loss : 0.134967, loss_ce: 0.034669
iteration 13032 : loss : 0.136405, loss_ce: 0.044498
iteration 13033 : loss : 0.052763, loss_ce: 0.015204
iteration 13034 : loss : 0.162981, loss_ce: 0.033234
iteration 13035 : loss : 0.106574, loss_ce: 0.028512
iteration 13036 : loss : 0.128180, loss_ce: 0.031312
iteration 13037 : loss : 0.119705, loss_ce: 0.019485
iteration 13038 : loss : 0.165152, loss_ce: 0.

 32%|████████▋                  | 48/150 [3:09:07<6:42:14, 236.61s/it]

iteration 13297 : loss : 0.142503, loss_ce: 0.038103
iteration 13298 : loss : 0.080426, loss_ce: 0.025157
iteration 13299 : loss : 0.285308, loss_ce: 0.019846
iteration 13300 : loss : 0.176378, loss_ce: 0.023881
iteration 13301 : loss : 0.092233, loss_ce: 0.055381
iteration 13302 : loss : 0.135919, loss_ce: 0.024360
iteration 13303 : loss : 0.095125, loss_ce: 0.035962
iteration 13304 : loss : 0.176168, loss_ce: 0.020903
iteration 13305 : loss : 0.120580, loss_ce: 0.055641
iteration 13306 : loss : 0.103999, loss_ce: 0.048669
iteration 13307 : loss : 0.292765, loss_ce: 0.015445
iteration 13308 : loss : 0.166160, loss_ce: 0.055919
iteration 13309 : loss : 0.134130, loss_ce: 0.028711
iteration 13310 : loss : 0.069021, loss_ce: 0.033086
iteration 13311 : loss : 0.110759, loss_ce: 0.021128
iteration 13312 : loss : 0.250228, loss_ce: 0.018842
iteration 13313 : loss : 0.157883, loss_ce: 0.017445
iteration 13314 : loss : 0.116154, loss_ce: 0.034041
iteration 13315 : loss : 0.166960, loss_ce: 0.

 33%|████████▊                  | 49/150 [3:13:03<6:38:18, 236.62s/it]

iteration 13574 : loss : 0.241854, loss_ce: 0.020853
iteration 13575 : loss : 0.272269, loss_ce: 0.013907
iteration 13576 : loss : 0.135883, loss_ce: 0.019065
iteration 13577 : loss : 0.146820, loss_ce: 0.034702
iteration 13578 : loss : 0.150809, loss_ce: 0.027706
iteration 13579 : loss : 0.077591, loss_ce: 0.021910
iteration 13580 : loss : 0.242032, loss_ce: 0.014789
iteration 13581 : loss : 0.132382, loss_ce: 0.034827
iteration 13582 : loss : 0.183495, loss_ce: 0.013625
iteration 13583 : loss : 0.124531, loss_ce: 0.072180
iteration 13584 : loss : 0.136493, loss_ce: 0.019345
iteration 13585 : loss : 0.243775, loss_ce: 0.010342
iteration 13586 : loss : 0.128898, loss_ce: 0.058934
iteration 13587 : loss : 0.260018, loss_ce: 0.024894
iteration 13588 : loss : 0.149889, loss_ce: 0.025692
iteration 13589 : loss : 0.117584, loss_ce: 0.040408
iteration 13590 : loss : 0.158644, loss_ce: 0.058073
iteration 13591 : loss : 0.150661, loss_ce: 0.028536
iteration 13592 : loss : 0.104166, loss_ce: 0.

 33%|█████████                  | 50/150 [3:17:00<6:34:23, 236.63s/it]

iteration 13851 : loss : 0.124812, loss_ce: 0.037717
iteration 13852 : loss : 0.187457, loss_ce: 0.027123
iteration 13853 : loss : 0.112702, loss_ce: 0.020001
iteration 13854 : loss : 0.058737, loss_ce: 0.017782
iteration 13855 : loss : 0.096949, loss_ce: 0.045188
iteration 13856 : loss : 0.063836, loss_ce: 0.024499
iteration 13857 : loss : 0.069855, loss_ce: 0.024953
iteration 13858 : loss : 0.090252, loss_ce: 0.032263
iteration 13859 : loss : 0.123633, loss_ce: 0.026221
iteration 13860 : loss : 0.166561, loss_ce: 0.031168
iteration 13861 : loss : 0.152568, loss_ce: 0.037339
iteration 13862 : loss : 0.053698, loss_ce: 0.018927
iteration 13863 : loss : 0.110019, loss_ce: 0.036314
iteration 13864 : loss : 0.271816, loss_ce: 0.055856
iteration 13865 : loss : 0.292128, loss_ce: 0.034966
iteration 13866 : loss : 0.318581, loss_ce: 0.003468
iteration 13867 : loss : 0.154843, loss_ce: 0.060772
iteration 13868 : loss : 0.093153, loss_ce: 0.044808
iteration 13869 : loss : 0.141535, loss_ce: 0.

 34%|█████████▏                 | 51/150 [3:20:57<6:30:34, 236.71s/it]

iteration 14128 : loss : 0.178120, loss_ce: 0.021481
iteration 14129 : loss : 0.184609, loss_ce: 0.045932
iteration 14130 : loss : 0.112398, loss_ce: 0.044334
iteration 14131 : loss : 0.144213, loss_ce: 0.036919
iteration 14132 : loss : 0.073708, loss_ce: 0.044374
iteration 14133 : loss : 0.187839, loss_ce: 0.018462
iteration 14134 : loss : 0.187096, loss_ce: 0.018945
iteration 14135 : loss : 0.108446, loss_ce: 0.043449
iteration 14136 : loss : 0.088679, loss_ce: 0.016456
iteration 14137 : loss : 0.158880, loss_ce: 0.039112
iteration 14138 : loss : 0.081991, loss_ce: 0.036658
iteration 14139 : loss : 0.087221, loss_ce: 0.033585
iteration 14140 : loss : 0.248976, loss_ce: 0.036196
iteration 14141 : loss : 0.134652, loss_ce: 0.036174
iteration 14142 : loss : 0.113418, loss_ce: 0.034103
iteration 14143 : loss : 0.112817, loss_ce: 0.030964
iteration 14144 : loss : 0.149267, loss_ce: 0.015232
iteration 14145 : loss : 0.096236, loss_ce: 0.039100
iteration 14146 : loss : 0.156680, loss_ce: 0.

 35%|█████████▎                 | 52/150 [3:24:54<6:26:36, 236.70s/it]

iteration 14405 : loss : 0.099876, loss_ce: 0.020570
iteration 14406 : loss : 0.161737, loss_ce: 0.084549
iteration 14407 : loss : 0.214582, loss_ce: 0.050906
iteration 14408 : loss : 0.219439, loss_ce: 0.024025
iteration 14409 : loss : 0.083509, loss_ce: 0.020783
iteration 14410 : loss : 0.115124, loss_ce: 0.034967
iteration 14411 : loss : 0.146305, loss_ce: 0.032993
iteration 14412 : loss : 0.232791, loss_ce: 0.008471
iteration 14413 : loss : 0.103142, loss_ce: 0.023581
iteration 14414 : loss : 0.153702, loss_ce: 0.046267
iteration 14415 : loss : 0.096241, loss_ce: 0.018009
iteration 14416 : loss : 0.211107, loss_ce: 0.025910
iteration 14417 : loss : 0.097171, loss_ce: 0.047674
iteration 14418 : loss : 0.204168, loss_ce: 0.020311
iteration 14419 : loss : 0.121567, loss_ce: 0.026183
iteration 14420 : loss : 0.137117, loss_ce: 0.008712
iteration 14421 : loss : 0.117902, loss_ce: 0.025242
iteration 14422 : loss : 0.100954, loss_ce: 0.025771
iteration 14423 : loss : 0.159376, loss_ce: 0.

 35%|█████████▌                 | 53/150 [3:28:50<6:22:35, 236.66s/it]

iteration 14682 : loss : 0.154550, loss_ce: 0.018169
iteration 14683 : loss : 0.166349, loss_ce: 0.023651
iteration 14684 : loss : 0.164071, loss_ce: 0.022757
iteration 14685 : loss : 0.275381, loss_ce: 0.015742
iteration 14686 : loss : 0.094019, loss_ce: 0.040450
iteration 14687 : loss : 0.106517, loss_ce: 0.032863
iteration 14688 : loss : 0.136301, loss_ce: 0.035742
iteration 14689 : loss : 0.202583, loss_ce: 0.016910
iteration 14690 : loss : 0.099387, loss_ce: 0.042269
iteration 14691 : loss : 0.183741, loss_ce: 0.011167
iteration 14692 : loss : 0.105152, loss_ce: 0.035277
iteration 14693 : loss : 0.165329, loss_ce: 0.046202
iteration 14694 : loss : 0.136084, loss_ce: 0.061198
iteration 14695 : loss : 0.121335, loss_ce: 0.023219
iteration 14696 : loss : 0.235965, loss_ce: 0.009075
iteration 14697 : loss : 0.164180, loss_ce: 0.027965
iteration 14698 : loss : 0.071452, loss_ce: 0.017757
iteration 14699 : loss : 0.075325, loss_ce: 0.031826
iteration 14700 : loss : 0.088967, loss_ce: 0.

 36%|█████████▋                 | 54/150 [3:32:47<6:18:35, 236.62s/it]

iteration 14959 : loss : 0.116902, loss_ce: 0.022325
iteration 14960 : loss : 0.110465, loss_ce: 0.016143
iteration 14961 : loss : 0.141003, loss_ce: 0.035288
iteration 14962 : loss : 0.103159, loss_ce: 0.033892
iteration 14963 : loss : 0.189500, loss_ce: 0.013430
iteration 14964 : loss : 0.069231, loss_ce: 0.032275
iteration 14965 : loss : 0.269247, loss_ce: 0.011461
iteration 14966 : loss : 0.245621, loss_ce: 0.013118
iteration 14967 : loss : 0.071665, loss_ce: 0.026019
iteration 14968 : loss : 0.103095, loss_ce: 0.038351
iteration 14969 : loss : 0.401118, loss_ce: 0.007304
iteration 14970 : loss : 0.097151, loss_ce: 0.018404
iteration 14971 : loss : 0.178396, loss_ce: 0.019337
iteration 14972 : loss : 0.116609, loss_ce: 0.022997
iteration 14973 : loss : 0.152342, loss_ce: 0.066175
iteration 14974 : loss : 0.141295, loss_ce: 0.046864
iteration 14975 : loss : 0.085221, loss_ce: 0.023886
iteration 14976 : loss : 0.156804, loss_ce: 0.034513
iteration 14977 : loss : 0.106068, loss_ce: 0.

 37%|█████████▉                 | 55/150 [3:36:44<6:14:43, 236.67s/it]

iteration 15236 : loss : 0.260731, loss_ce: 0.018458
iteration 15237 : loss : 0.138167, loss_ce: 0.028348
iteration 15238 : loss : 0.127039, loss_ce: 0.051462
iteration 15239 : loss : 0.168587, loss_ce: 0.033122
iteration 15240 : loss : 0.103374, loss_ce: 0.029844
iteration 15241 : loss : 0.083514, loss_ce: 0.024919
iteration 15242 : loss : 0.081494, loss_ce: 0.036005
iteration 15243 : loss : 0.092343, loss_ce: 0.050309
iteration 15244 : loss : 0.313172, loss_ce: 0.010865
iteration 15245 : loss : 0.119808, loss_ce: 0.052818
iteration 15246 : loss : 0.246928, loss_ce: 0.018598
iteration 15247 : loss : 0.132444, loss_ce: 0.033404
iteration 15248 : loss : 0.258730, loss_ce: 0.012673
iteration 15249 : loss : 0.125131, loss_ce: 0.036623
iteration 15250 : loss : 0.114607, loss_ce: 0.060139
iteration 15251 : loss : 0.145240, loss_ce: 0.020441
iteration 15252 : loss : 0.119805, loss_ce: 0.015469
iteration 15253 : loss : 0.089714, loss_ce: 0.028669
iteration 15254 : loss : 0.134920, loss_ce: 0.

 37%|██████████                 | 56/150 [3:40:41<6:10:55, 236.76s/it]

iteration 15513 : loss : 0.180873, loss_ce: 0.009610
iteration 15514 : loss : 0.165875, loss_ce: 0.068893
iteration 15515 : loss : 0.145895, loss_ce: 0.015545
iteration 15516 : loss : 0.094389, loss_ce: 0.025433
iteration 15517 : loss : 0.087419, loss_ce: 0.029253
iteration 15518 : loss : 0.171542, loss_ce: 0.007986
iteration 15519 : loss : 0.223075, loss_ce: 0.083245
iteration 15520 : loss : 0.116128, loss_ce: 0.029706
iteration 15521 : loss : 0.179507, loss_ce: 0.007826
iteration 15522 : loss : 0.407642, loss_ce: 0.019196
iteration 15523 : loss : 0.121764, loss_ce: 0.018234
iteration 15524 : loss : 0.070252, loss_ce: 0.022971
iteration 15525 : loss : 0.128158, loss_ce: 0.057215
iteration 15526 : loss : 0.134410, loss_ce: 0.036940
iteration 15527 : loss : 0.096521, loss_ce: 0.049285
iteration 15528 : loss : 0.108142, loss_ce: 0.077604
iteration 15529 : loss : 0.081198, loss_ce: 0.021580
iteration 15530 : loss : 0.122672, loss_ce: 0.043004
iteration 15531 : loss : 0.176158, loss_ce: 0.

 38%|██████████▎                | 57/150 [3:44:37<6:07:03, 236.81s/it]

iteration 15790 : loss : 0.137422, loss_ce: 0.062578
iteration 15791 : loss : 0.070877, loss_ce: 0.025148
iteration 15792 : loss : 0.120249, loss_ce: 0.047170
iteration 15793 : loss : 0.119645, loss_ce: 0.054214
iteration 15794 : loss : 0.114571, loss_ce: 0.043436
iteration 15795 : loss : 0.109372, loss_ce: 0.012074
iteration 15796 : loss : 0.080859, loss_ce: 0.024108
iteration 15797 : loss : 0.124424, loss_ce: 0.015178
iteration 15798 : loss : 0.080057, loss_ce: 0.008733
iteration 15799 : loss : 0.099829, loss_ce: 0.030280
iteration 15800 : loss : 0.043883, loss_ce: 0.010451
iteration 15801 : loss : 0.106086, loss_ce: 0.031602
iteration 15802 : loss : 0.121906, loss_ce: 0.021049
iteration 15803 : loss : 0.266310, loss_ce: 0.010225
iteration 15804 : loss : 0.168058, loss_ce: 0.015819
iteration 15805 : loss : 0.203687, loss_ce: 0.014688
iteration 15806 : loss : 0.066210, loss_ce: 0.033264
iteration 15807 : loss : 0.076823, loss_ce: 0.031631
iteration 15808 : loss : 0.186107, loss_ce: 0.

 39%|██████████▍                | 58/150 [3:48:34<6:03:06, 236.81s/it]

iteration 16067 : loss : 0.160287, loss_ce: 0.019775
iteration 16068 : loss : 0.078580, loss_ce: 0.049558
iteration 16069 : loss : 0.097962, loss_ce: 0.034521
iteration 16070 : loss : 0.164483, loss_ce: 0.021235
iteration 16071 : loss : 0.238415, loss_ce: 0.027843
iteration 16072 : loss : 0.158679, loss_ce: 0.023134
iteration 16073 : loss : 0.069844, loss_ce: 0.023050
iteration 16074 : loss : 0.122626, loss_ce: 0.017341
iteration 16075 : loss : 0.084139, loss_ce: 0.030276
iteration 16076 : loss : 0.208507, loss_ce: 0.006386
iteration 16077 : loss : 0.154483, loss_ce: 0.009587
iteration 16078 : loss : 0.073486, loss_ce: 0.045183
iteration 16079 : loss : 0.190817, loss_ce: 0.029972
iteration 16080 : loss : 0.131187, loss_ce: 0.028933
iteration 16081 : loss : 0.359294, loss_ce: 0.012906
iteration 16082 : loss : 0.057275, loss_ce: 0.018664
iteration 16083 : loss : 0.090318, loss_ce: 0.025406
iteration 16084 : loss : 0.117803, loss_ce: 0.028418
iteration 16085 : loss : 0.137831, loss_ce: 0.

 39%|██████████▌                | 59/150 [3:52:31<5:59:18, 236.91s/it]

iteration 16344 : loss : 0.107271, loss_ce: 0.037078
iteration 16345 : loss : 0.096016, loss_ce: 0.028300
iteration 16346 : loss : 0.119483, loss_ce: 0.028170
iteration 16347 : loss : 0.076222, loss_ce: 0.016828
iteration 16348 : loss : 0.069472, loss_ce: 0.025071
iteration 16349 : loss : 0.226475, loss_ce: 0.009590
iteration 16350 : loss : 0.082178, loss_ce: 0.032180
iteration 16351 : loss : 0.100904, loss_ce: 0.027457
iteration 16352 : loss : 0.391914, loss_ce: 0.002093
iteration 16353 : loss : 0.112616, loss_ce: 0.049924
iteration 16354 : loss : 0.165315, loss_ce: 0.015133
iteration 16355 : loss : 0.075736, loss_ce: 0.048208
iteration 16356 : loss : 0.075072, loss_ce: 0.025787
iteration 16357 : loss : 0.113372, loss_ce: 0.012542
iteration 16358 : loss : 0.148517, loss_ce: 0.050670
iteration 16359 : loss : 0.247346, loss_ce: 0.017312
iteration 16360 : loss : 0.081646, loss_ce: 0.018218
iteration 16361 : loss : 0.073259, loss_ce: 0.018924
iteration 16362 : loss : 0.069768, loss_ce: 0.

 40%|██████████▊                | 60/150 [3:56:29<5:55:40, 237.11s/it]

iteration 16621 : loss : 0.141318, loss_ce: 0.026116
iteration 16622 : loss : 0.109335, loss_ce: 0.019451
iteration 16623 : loss : 0.109955, loss_ce: 0.019134
iteration 16624 : loss : 0.159633, loss_ce: 0.018808
iteration 16625 : loss : 0.180899, loss_ce: 0.015866
iteration 16626 : loss : 0.058857, loss_ce: 0.027987
iteration 16627 : loss : 0.096641, loss_ce: 0.027907
iteration 16628 : loss : 0.129662, loss_ce: 0.043575
iteration 16629 : loss : 0.116845, loss_ce: 0.027933
iteration 16630 : loss : 0.146761, loss_ce: 0.017621
iteration 16631 : loss : 0.240898, loss_ce: 0.010456
iteration 16632 : loss : 0.046436, loss_ce: 0.014912
iteration 16633 : loss : 0.188424, loss_ce: 0.026924
iteration 16634 : loss : 0.141192, loss_ce: 0.028088
iteration 16635 : loss : 0.128167, loss_ce: 0.031739
iteration 16636 : loss : 0.155610, loss_ce: 0.040496
iteration 16637 : loss : 0.104512, loss_ce: 0.049138
iteration 16638 : loss : 0.104095, loss_ce: 0.018776
iteration 16639 : loss : 0.132630, loss_ce: 0.

 41%|██████████▉                | 61/150 [4:00:27<5:51:57, 237.27s/it]

iteration 16898 : loss : 0.327796, loss_ce: 0.011332
iteration 16899 : loss : 0.199537, loss_ce: 0.027328
iteration 16900 : loss : 0.113176, loss_ce: 0.028554
iteration 16901 : loss : 0.079836, loss_ce: 0.022217
iteration 16902 : loss : 0.138930, loss_ce: 0.059168
iteration 16903 : loss : 0.100170, loss_ce: 0.019464
iteration 16904 : loss : 0.092239, loss_ce: 0.030369
iteration 16905 : loss : 0.387715, loss_ce: 0.004165
iteration 16906 : loss : 0.102319, loss_ce: 0.028360
iteration 16907 : loss : 0.243527, loss_ce: 0.025193
iteration 16908 : loss : 0.093516, loss_ce: 0.031408
iteration 16909 : loss : 0.080392, loss_ce: 0.024109
iteration 16910 : loss : 0.129166, loss_ce: 0.039628
iteration 16911 : loss : 0.130190, loss_ce: 0.035091
iteration 16912 : loss : 0.180246, loss_ce: 0.022239
iteration 16913 : loss : 0.079281, loss_ce: 0.052378
iteration 16914 : loss : 0.239274, loss_ce: 0.009952
iteration 16915 : loss : 0.081984, loss_ce: 0.043065
iteration 16916 : loss : 0.090969, loss_ce: 0.

 41%|███████████▏               | 62/150 [4:04:24<5:47:58, 237.25s/it]

iteration 17175 : loss : 0.195041, loss_ce: 0.029076
iteration 17176 : loss : 0.120417, loss_ce: 0.048136
iteration 17177 : loss : 0.089368, loss_ce: 0.032481
iteration 17178 : loss : 0.121543, loss_ce: 0.026378
iteration 17179 : loss : 0.204563, loss_ce: 0.017423
iteration 17180 : loss : 0.160513, loss_ce: 0.048438
iteration 17181 : loss : 0.187453, loss_ce: 0.008221
iteration 17182 : loss : 0.060486, loss_ce: 0.018278
iteration 17183 : loss : 0.220705, loss_ce: 0.041856
iteration 17184 : loss : 0.174674, loss_ce: 0.022853
iteration 17185 : loss : 0.077267, loss_ce: 0.025212
iteration 17186 : loss : 0.065184, loss_ce: 0.023312
iteration 17187 : loss : 0.112586, loss_ce: 0.020901
iteration 17188 : loss : 0.122546, loss_ce: 0.027881
iteration 17189 : loss : 0.389166, loss_ce: 0.006524
iteration 17190 : loss : 0.142991, loss_ce: 0.016639
iteration 17191 : loss : 0.065586, loss_ce: 0.023028
iteration 17192 : loss : 0.083735, loss_ce: 0.024364
iteration 17193 : loss : 0.112625, loss_ce: 0.

 42%|███████████▎               | 63/150 [4:08:21<5:43:59, 237.24s/it]

iteration 17452 : loss : 0.090591, loss_ce: 0.042868
iteration 17453 : loss : 0.117909, loss_ce: 0.030342
iteration 17454 : loss : 0.123992, loss_ce: 0.030936
iteration 17455 : loss : 0.086448, loss_ce: 0.033235
iteration 17456 : loss : 0.371828, loss_ce: 0.015056
iteration 17457 : loss : 0.097730, loss_ce: 0.036187
iteration 17458 : loss : 0.078868, loss_ce: 0.047607
iteration 17459 : loss : 0.124030, loss_ce: 0.021637
iteration 17460 : loss : 0.342264, loss_ce: 0.011571
iteration 17461 : loss : 0.100147, loss_ce: 0.040742
iteration 17462 : loss : 0.172744, loss_ce: 0.016436
iteration 17463 : loss : 0.164300, loss_ce: 0.016575
iteration 17464 : loss : 0.131917, loss_ce: 0.033608
iteration 17465 : loss : 0.129546, loss_ce: 0.021837
iteration 17466 : loss : 0.065953, loss_ce: 0.024264
iteration 17467 : loss : 0.127296, loss_ce: 0.027328
iteration 17468 : loss : 0.116207, loss_ce: 0.029896
iteration 17469 : loss : 0.444492, loss_ce: 0.000721
iteration 17470 : loss : 0.251808, loss_ce: 0.

 43%|███████████▌               | 64/150 [4:12:18<5:39:52, 237.12s/it]

iteration 17729 : loss : 0.148269, loss_ce: 0.030476
iteration 17730 : loss : 0.090422, loss_ce: 0.026271
iteration 17731 : loss : 0.319615, loss_ce: 0.030048
iteration 17732 : loss : 0.221925, loss_ce: 0.027015
iteration 17733 : loss : 0.234995, loss_ce: 0.031592
iteration 17734 : loss : 0.130892, loss_ce: 0.010791
iteration 17735 : loss : 0.107108, loss_ce: 0.035115
iteration 17736 : loss : 0.286095, loss_ce: 0.025140
iteration 17737 : loss : 0.264632, loss_ce: 0.033740
iteration 17738 : loss : 0.121632, loss_ce: 0.036919
iteration 17739 : loss : 0.126123, loss_ce: 0.019896
iteration 17740 : loss : 0.101582, loss_ce: 0.020731
iteration 17741 : loss : 0.163849, loss_ce: 0.010734
iteration 17742 : loss : 0.070304, loss_ce: 0.044770
iteration 17743 : loss : 0.077200, loss_ce: 0.036729
iteration 17744 : loss : 0.096016, loss_ce: 0.020445
iteration 17745 : loss : 0.105781, loss_ce: 0.027926
iteration 17746 : loss : 0.198751, loss_ce: 0.016020
iteration 17747 : loss : 0.077421, loss_ce: 0.

 43%|███████████▋               | 65/150 [4:16:15<5:35:47, 237.03s/it]

iteration 18006 : loss : 0.099464, loss_ce: 0.049139
iteration 18007 : loss : 0.107349, loss_ce: 0.033506
iteration 18008 : loss : 0.079171, loss_ce: 0.037898
iteration 18009 : loss : 0.101892, loss_ce: 0.050314
iteration 18010 : loss : 0.087147, loss_ce: 0.022652
iteration 18011 : loss : 0.064364, loss_ce: 0.014504
iteration 18012 : loss : 0.076249, loss_ce: 0.030003
iteration 18013 : loss : 0.060080, loss_ce: 0.028854
iteration 18014 : loss : 0.053139, loss_ce: 0.012504
iteration 18015 : loss : 0.204565, loss_ce: 0.029402
iteration 18016 : loss : 0.052104, loss_ce: 0.016259
iteration 18017 : loss : 0.109303, loss_ce: 0.026648
iteration 18018 : loss : 0.252355, loss_ce: 0.008522
iteration 18019 : loss : 0.108469, loss_ce: 0.029633
iteration 18020 : loss : 0.225484, loss_ce: 0.006786
iteration 18021 : loss : 0.082858, loss_ce: 0.044930
iteration 18022 : loss : 0.214479, loss_ce: 0.012842
iteration 18023 : loss : 0.115522, loss_ce: 0.027997
iteration 18024 : loss : 0.074504, loss_ce: 0.

 44%|███████████▉               | 66/150 [4:20:11<5:31:43, 236.95s/it]

iteration 18283 : loss : 0.078534, loss_ce: 0.031630
iteration 18284 : loss : 0.116958, loss_ce: 0.031846
iteration 18285 : loss : 0.081178, loss_ce: 0.027773
iteration 18286 : loss : 0.125705, loss_ce: 0.028989
iteration 18287 : loss : 0.149318, loss_ce: 0.029452
iteration 18288 : loss : 0.069828, loss_ce: 0.018228
iteration 18289 : loss : 0.108816, loss_ce: 0.054408
iteration 18290 : loss : 0.170271, loss_ce: 0.017400
iteration 18291 : loss : 0.300260, loss_ce: 0.008631
iteration 18292 : loss : 0.161922, loss_ce: 0.010966
iteration 18293 : loss : 0.048353, loss_ce: 0.014440
iteration 18294 : loss : 0.132699, loss_ce: 0.054560
iteration 18295 : loss : 0.067271, loss_ce: 0.028012
iteration 18296 : loss : 0.163913, loss_ce: 0.022735
iteration 18297 : loss : 0.337674, loss_ce: 0.001802
iteration 18298 : loss : 0.270817, loss_ce: 0.018248
iteration 18299 : loss : 0.190271, loss_ce: 0.011740
iteration 18300 : loss : 0.130435, loss_ce: 0.029771
iteration 18301 : loss : 0.103238, loss_ce: 0.

 45%|████████████               | 67/150 [4:24:08<5:27:46, 236.94s/it]

iteration 18560 : loss : 0.074901, loss_ce: 0.039598
iteration 18561 : loss : 0.079694, loss_ce: 0.018357
iteration 18562 : loss : 0.096869, loss_ce: 0.019598
iteration 18563 : loss : 0.082092, loss_ce: 0.035645
iteration 18564 : loss : 0.075682, loss_ce: 0.021752
iteration 18565 : loss : 0.094875, loss_ce: 0.006741
iteration 18566 : loss : 0.141809, loss_ce: 0.041924
iteration 18567 : loss : 0.140390, loss_ce: 0.040740
iteration 18568 : loss : 0.059145, loss_ce: 0.023015
iteration 18569 : loss : 0.166510, loss_ce: 0.039471
iteration 18570 : loss : 0.152792, loss_ce: 0.025500
iteration 18571 : loss : 0.058818, loss_ce: 0.027664
iteration 18572 : loss : 0.124909, loss_ce: 0.020188
iteration 18573 : loss : 0.093819, loss_ce: 0.017180
iteration 18574 : loss : 0.152318, loss_ce: 0.034930
iteration 18575 : loss : 0.115953, loss_ce: 0.014343
iteration 18576 : loss : 0.064678, loss_ce: 0.013240
iteration 18577 : loss : 0.197439, loss_ce: 0.011103
iteration 18578 : loss : 0.188290, loss_ce: 0.

 45%|████████████▏              | 68/150 [4:28:05<5:23:51, 236.97s/it]

iteration 18837 : loss : 0.083998, loss_ce: 0.035007
iteration 18838 : loss : 0.079655, loss_ce: 0.024714
iteration 18839 : loss : 0.258803, loss_ce: 0.022452
iteration 18840 : loss : 0.113124, loss_ce: 0.032964
iteration 18841 : loss : 0.093792, loss_ce: 0.026591
iteration 18842 : loss : 0.243208, loss_ce: 0.016785
iteration 18843 : loss : 0.072299, loss_ce: 0.023322
iteration 18844 : loss : 0.058355, loss_ce: 0.020627
iteration 18845 : loss : 0.123326, loss_ce: 0.022407
iteration 18846 : loss : 0.246574, loss_ce: 0.014983
iteration 18847 : loss : 0.071626, loss_ce: 0.016427
iteration 18848 : loss : 0.104116, loss_ce: 0.017420
iteration 18849 : loss : 0.109591, loss_ce: 0.022619
iteration 18850 : loss : 0.100408, loss_ce: 0.025468
iteration 18851 : loss : 0.102412, loss_ce: 0.025845
iteration 18852 : loss : 0.053330, loss_ce: 0.022563
iteration 18853 : loss : 0.103268, loss_ce: 0.022263
iteration 18854 : loss : 0.063660, loss_ce: 0.032372
iteration 18855 : loss : 0.173956, loss_ce: 0.

 46%|████████████▍              | 69/150 [4:32:02<5:19:53, 236.96s/it]

iteration 19114 : loss : 0.245832, loss_ce: 0.013238
iteration 19115 : loss : 0.053954, loss_ce: 0.021240
iteration 19116 : loss : 0.179136, loss_ce: 0.043932
iteration 19117 : loss : 0.067253, loss_ce: 0.025774
iteration 19118 : loss : 0.078213, loss_ce: 0.017058
iteration 19119 : loss : 0.178322, loss_ce: 0.014447
iteration 19120 : loss : 0.095095, loss_ce: 0.065921
iteration 19121 : loss : 0.121876, loss_ce: 0.012921
iteration 19122 : loss : 0.071165, loss_ce: 0.034403
iteration 19123 : loss : 0.098379, loss_ce: 0.034086
iteration 19124 : loss : 0.058724, loss_ce: 0.029213
iteration 19125 : loss : 0.053593, loss_ce: 0.015585
iteration 19126 : loss : 0.078530, loss_ce: 0.029878
iteration 19127 : loss : 0.080409, loss_ce: 0.037485
iteration 19128 : loss : 0.061489, loss_ce: 0.014380
iteration 19129 : loss : 0.093519, loss_ce: 0.027643
iteration 19130 : loss : 0.134813, loss_ce: 0.036383
iteration 19131 : loss : 0.202542, loss_ce: 0.009287
iteration 19132 : loss : 0.064554, loss_ce: 0.

 47%|████████████▌              | 70/150 [4:35:59<5:15:52, 236.91s/it]

iteration 19391 : loss : 0.076835, loss_ce: 0.024838
iteration 19392 : loss : 0.133783, loss_ce: 0.013453
iteration 19393 : loss : 0.116472, loss_ce: 0.036350
iteration 19394 : loss : 0.155527, loss_ce: 0.011221
iteration 19395 : loss : 0.041318, loss_ce: 0.012995
iteration 19396 : loss : 0.073447, loss_ce: 0.017414
iteration 19397 : loss : 0.083519, loss_ce: 0.027464
iteration 19398 : loss : 0.113293, loss_ce: 0.021290
iteration 19399 : loss : 0.063093, loss_ce: 0.027556
iteration 19400 : loss : 0.055714, loss_ce: 0.019143
iteration 19401 : loss : 0.191222, loss_ce: 0.012035
iteration 19402 : loss : 0.073138, loss_ce: 0.028660
iteration 19403 : loss : 0.145620, loss_ce: 0.019791
iteration 19404 : loss : 0.119293, loss_ce: 0.013673
iteration 19405 : loss : 0.067362, loss_ce: 0.022527
iteration 19406 : loss : 0.098472, loss_ce: 0.024262
iteration 19407 : loss : 0.080728, loss_ce: 0.009981
iteration 19408 : loss : 0.190766, loss_ce: 0.014543
iteration 19409 : loss : 0.096613, loss_ce: 0.

 47%|████████████▊              | 71/150 [4:39:56<5:12:01, 236.98s/it]

iteration 19668 : loss : 0.108131, loss_ce: 0.017069
iteration 19669 : loss : 0.228514, loss_ce: 0.021758
iteration 19670 : loss : 0.141526, loss_ce: 0.038738
iteration 19671 : loss : 0.242355, loss_ce: 0.024984
iteration 19672 : loss : 0.098012, loss_ce: 0.038270
iteration 19673 : loss : 0.122082, loss_ce: 0.046785
iteration 19674 : loss : 0.206391, loss_ce: 0.023693
iteration 19675 : loss : 0.067803, loss_ce: 0.037014
iteration 19676 : loss : 0.259428, loss_ce: 0.016316
iteration 19677 : loss : 0.299967, loss_ce: 0.018762
iteration 19678 : loss : 0.129628, loss_ce: 0.102502
iteration 19679 : loss : 0.087116, loss_ce: 0.041833
iteration 19680 : loss : 0.088780, loss_ce: 0.025735
iteration 19681 : loss : 0.109372, loss_ce: 0.021613
iteration 19682 : loss : 0.081849, loss_ce: 0.031681
iteration 19683 : loss : 0.072405, loss_ce: 0.023661
iteration 19684 : loss : 0.123148, loss_ce: 0.062413
iteration 19685 : loss : 0.116009, loss_ce: 0.037742
iteration 19686 : loss : 0.211907, loss_ce: 0.

 48%|████████████▉              | 72/150 [4:43:53<5:08:04, 236.98s/it]

iteration 19945 : loss : 0.162216, loss_ce: 0.031475
iteration 19946 : loss : 0.226455, loss_ce: 0.011822
iteration 19947 : loss : 0.253757, loss_ce: 0.019344
iteration 19948 : loss : 0.093088, loss_ce: 0.048336
iteration 19949 : loss : 0.113830, loss_ce: 0.023247
iteration 19950 : loss : 0.061751, loss_ce: 0.023185
iteration 19951 : loss : 0.083377, loss_ce: 0.014590
iteration 19952 : loss : 0.055541, loss_ce: 0.019757
iteration 19953 : loss : 0.080706, loss_ce: 0.020248
iteration 19954 : loss : 0.082066, loss_ce: 0.032766
iteration 19955 : loss : 0.133156, loss_ce: 0.035502
iteration 19956 : loss : 0.059023, loss_ce: 0.028350
iteration 19957 : loss : 0.135856, loss_ce: 0.042063
iteration 19958 : loss : 0.085696, loss_ce: 0.043874
iteration 19959 : loss : 0.126622, loss_ce: 0.023403
iteration 19960 : loss : 0.204126, loss_ce: 0.009362
iteration 19961 : loss : 0.128405, loss_ce: 0.015552
iteration 19962 : loss : 0.138170, loss_ce: 0.037854
iteration 19963 : loss : 0.154614, loss_ce: 0.

 49%|█████████████▏             | 73/150 [4:47:50<5:04:08, 237.00s/it]

iteration 20222 : loss : 0.123809, loss_ce: 0.012799
iteration 20223 : loss : 0.065419, loss_ce: 0.036756
iteration 20224 : loss : 0.113298, loss_ce: 0.023652
iteration 20225 : loss : 0.103390, loss_ce: 0.021211
iteration 20226 : loss : 0.053622, loss_ce: 0.026200
iteration 20227 : loss : 0.152778, loss_ce: 0.020421
iteration 20228 : loss : 0.077557, loss_ce: 0.034046
iteration 20229 : loss : 0.051507, loss_ce: 0.033698
iteration 20230 : loss : 0.080186, loss_ce: 0.009127
iteration 20231 : loss : 0.106246, loss_ce: 0.025421
iteration 20232 : loss : 0.117526, loss_ce: 0.008054
iteration 20233 : loss : 0.070325, loss_ce: 0.047189
iteration 20234 : loss : 0.132701, loss_ce: 0.014047
iteration 20235 : loss : 0.097758, loss_ce: 0.018391
iteration 20236 : loss : 0.195309, loss_ce: 0.010397
iteration 20237 : loss : 0.111941, loss_ce: 0.034050
iteration 20238 : loss : 0.109545, loss_ce: 0.031074
iteration 20239 : loss : 0.217698, loss_ce: 0.027127
iteration 20240 : loss : 0.108836, loss_ce: 0.

 49%|█████████████▎             | 74/150 [4:51:47<5:00:13, 237.02s/it]

iteration 20499 : loss : 0.168441, loss_ce: 0.028146
iteration 20500 : loss : 0.072440, loss_ce: 0.020897
iteration 20501 : loss : 0.167605, loss_ce: 0.030545
iteration 20502 : loss : 0.129616, loss_ce: 0.034532
iteration 20503 : loss : 0.143157, loss_ce: 0.007090
iteration 20504 : loss : 0.094900, loss_ce: 0.007826
iteration 20505 : loss : 0.138513, loss_ce: 0.033320
iteration 20506 : loss : 0.079650, loss_ce: 0.018531
iteration 20507 : loss : 0.097103, loss_ce: 0.038980
iteration 20508 : loss : 0.075257, loss_ce: 0.038477
iteration 20509 : loss : 0.068618, loss_ce: 0.030455
iteration 20510 : loss : 0.285439, loss_ce: 0.004712
iteration 20511 : loss : 0.204012, loss_ce: 0.024548
iteration 20512 : loss : 0.120715, loss_ce: 0.016282
iteration 20513 : loss : 0.124923, loss_ce: 0.035899
iteration 20514 : loss : 0.307886, loss_ce: 0.011460
iteration 20515 : loss : 0.090544, loss_ce: 0.014876
iteration 20516 : loss : 0.100819, loss_ce: 0.013657
iteration 20517 : loss : 0.111701, loss_ce: 0.

 50%|█████████████▌             | 75/150 [4:55:45<4:56:22, 237.09s/it]

iteration 20776 : loss : 0.103542, loss_ce: 0.017268
iteration 20777 : loss : 0.108321, loss_ce: 0.016789
iteration 20778 : loss : 0.086004, loss_ce: 0.022186
iteration 20779 : loss : 0.165641, loss_ce: 0.011299
iteration 20780 : loss : 0.047640, loss_ce: 0.010422
iteration 20781 : loss : 0.043296, loss_ce: 0.011423
iteration 20782 : loss : 0.166369, loss_ce: 0.006815
iteration 20783 : loss : 0.052936, loss_ce: 0.017194
iteration 20784 : loss : 0.061384, loss_ce: 0.012206
iteration 20785 : loss : 0.170611, loss_ce: 0.045588
iteration 20786 : loss : 0.157314, loss_ce: 0.044428
iteration 20787 : loss : 0.093704, loss_ce: 0.041840
iteration 20788 : loss : 0.115922, loss_ce: 0.011823
iteration 20789 : loss : 0.115927, loss_ce: 0.020586
iteration 20790 : loss : 0.072082, loss_ce: 0.028359
iteration 20791 : loss : 0.073189, loss_ce: 0.037532
iteration 20792 : loss : 0.104183, loss_ce: 0.007404
iteration 20793 : loss : 0.338337, loss_ce: 0.001694
iteration 20794 : loss : 0.194879, loss_ce: 0.

 51%|█████████████▋             | 76/150 [4:59:42<4:52:26, 237.11s/it]

iteration 21053 : loss : 0.151375, loss_ce: 0.020098
iteration 21054 : loss : 0.083410, loss_ce: 0.023580
iteration 21055 : loss : 0.160151, loss_ce: 0.038778
iteration 21056 : loss : 0.143247, loss_ce: 0.009905
iteration 21057 : loss : 0.130759, loss_ce: 0.022656
iteration 21058 : loss : 0.076137, loss_ce: 0.030484
iteration 21059 : loss : 0.103734, loss_ce: 0.020126
iteration 21060 : loss : 0.062558, loss_ce: 0.021224
iteration 21061 : loss : 0.078035, loss_ce: 0.034307
iteration 21062 : loss : 0.130951, loss_ce: 0.025875
iteration 21063 : loss : 0.064844, loss_ce: 0.024701
iteration 21064 : loss : 0.066275, loss_ce: 0.020750
iteration 21065 : loss : 0.197295, loss_ce: 0.014697
iteration 21066 : loss : 0.080249, loss_ce: 0.028431
iteration 21067 : loss : 0.242172, loss_ce: 0.018169
iteration 21068 : loss : 0.352684, loss_ce: 0.012628
iteration 21069 : loss : 0.072087, loss_ce: 0.027803
iteration 21070 : loss : 0.052287, loss_ce: 0.029201
iteration 21071 : loss : 0.073620, loss_ce: 0.

 51%|█████████████▊             | 77/150 [5:03:39<4:48:27, 237.08s/it]

iteration 21330 : loss : 0.074681, loss_ce: 0.037094
iteration 21331 : loss : 0.348764, loss_ce: 0.015355
iteration 21332 : loss : 0.219031, loss_ce: 0.015844
iteration 21333 : loss : 0.063841, loss_ce: 0.028897
iteration 21334 : loss : 0.049675, loss_ce: 0.021735
iteration 21335 : loss : 0.058305, loss_ce: 0.030931
iteration 21336 : loss : 0.112982, loss_ce: 0.034138
iteration 21337 : loss : 0.048446, loss_ce: 0.016956
iteration 21338 : loss : 0.159677, loss_ce: 0.026409
iteration 21339 : loss : 0.205010, loss_ce: 0.028790
iteration 21340 : loss : 0.113736, loss_ce: 0.016152
iteration 21341 : loss : 0.249686, loss_ce: 0.012293
iteration 21342 : loss : 0.155058, loss_ce: 0.012578
iteration 21343 : loss : 0.237667, loss_ce: 0.007755
iteration 21344 : loss : 0.106253, loss_ce: 0.016792
iteration 21345 : loss : 0.113069, loss_ce: 0.028249
iteration 21346 : loss : 0.122622, loss_ce: 0.015524
iteration 21347 : loss : 0.077698, loss_ce: 0.023095
iteration 21348 : loss : 0.160093, loss_ce: 0.

 52%|██████████████             | 78/150 [5:07:36<4:44:28, 237.06s/it]

iteration 21607 : loss : 0.145411, loss_ce: 0.021501
iteration 21608 : loss : 0.063280, loss_ce: 0.024432
iteration 21609 : loss : 0.174941, loss_ce: 0.009766
iteration 21610 : loss : 0.050607, loss_ce: 0.023349
iteration 21611 : loss : 0.055780, loss_ce: 0.015310
iteration 21612 : loss : 0.059870, loss_ce: 0.039373
iteration 21613 : loss : 0.098080, loss_ce: 0.011869
iteration 21614 : loss : 0.132840, loss_ce: 0.013131
iteration 21615 : loss : 0.107617, loss_ce: 0.019961
iteration 21616 : loss : 0.123812, loss_ce: 0.031290
iteration 21617 : loss : 0.109055, loss_ce: 0.014244
iteration 21618 : loss : 0.236003, loss_ce: 0.002408
iteration 21619 : loss : 0.065238, loss_ce: 0.023606
iteration 21620 : loss : 0.086199, loss_ce: 0.032556
iteration 21621 : loss : 0.136524, loss_ce: 0.026703
iteration 21622 : loss : 0.097130, loss_ce: 0.012478
iteration 21623 : loss : 0.081068, loss_ce: 0.015091
iteration 21624 : loss : 0.119984, loss_ce: 0.013721
iteration 21625 : loss : 0.090450, loss_ce: 0.

 53%|██████████████▏            | 79/150 [5:11:33<4:40:37, 237.15s/it]

iteration 21884 : loss : 0.062058, loss_ce: 0.027885
iteration 21885 : loss : 0.202982, loss_ce: 0.018424
iteration 21886 : loss : 0.158018, loss_ce: 0.016684
iteration 21887 : loss : 0.052598, loss_ce: 0.014861
iteration 21888 : loss : 0.067303, loss_ce: 0.016235
iteration 21889 : loss : 0.068146, loss_ce: 0.031663
iteration 21890 : loss : 0.106277, loss_ce: 0.011577
iteration 21891 : loss : 0.123209, loss_ce: 0.032699
iteration 21892 : loss : 0.066474, loss_ce: 0.023257
iteration 21893 : loss : 0.085159, loss_ce: 0.029348
iteration 21894 : loss : 0.059137, loss_ce: 0.022551
iteration 21895 : loss : 0.112291, loss_ce: 0.032610
iteration 21896 : loss : 0.105214, loss_ce: 0.023907
iteration 21897 : loss : 0.054573, loss_ce: 0.017215
iteration 21898 : loss : 0.063236, loss_ce: 0.020285
iteration 21899 : loss : 0.098372, loss_ce: 0.010707
iteration 21900 : loss : 0.108236, loss_ce: 0.008624
iteration 21901 : loss : 0.205827, loss_ce: 0.007024
iteration 21902 : loss : 0.163227, loss_ce: 0.

 53%|██████████████▍            | 80/150 [5:15:30<4:36:36, 237.10s/it]

iteration 22161 : loss : 0.148382, loss_ce: 0.028228
iteration 22162 : loss : 0.065850, loss_ce: 0.028815
iteration 22163 : loss : 0.093907, loss_ce: 0.021755
iteration 22164 : loss : 0.187371, loss_ce: 0.010637
iteration 22165 : loss : 0.070156, loss_ce: 0.024499
iteration 22166 : loss : 0.109489, loss_ce: 0.012977
iteration 22167 : loss : 0.167397, loss_ce: 0.026336
iteration 22168 : loss : 0.135557, loss_ce: 0.015636
iteration 22169 : loss : 0.107517, loss_ce: 0.031243
iteration 22170 : loss : 0.072189, loss_ce: 0.029269
iteration 22171 : loss : 0.160088, loss_ce: 0.021522
iteration 22172 : loss : 0.057923, loss_ce: 0.031682
iteration 22173 : loss : 0.141072, loss_ce: 0.051903
iteration 22174 : loss : 0.085969, loss_ce: 0.016042
iteration 22175 : loss : 0.212776, loss_ce: 0.022865
iteration 22176 : loss : 0.059577, loss_ce: 0.033918
iteration 22177 : loss : 0.062462, loss_ce: 0.022634
iteration 22178 : loss : 0.091999, loss_ce: 0.010330
iteration 22179 : loss : 0.044184, loss_ce: 0.

 54%|██████████████▌            | 81/150 [5:19:27<4:32:36, 237.05s/it]

iteration 22438 : loss : 0.083595, loss_ce: 0.038830
iteration 22439 : loss : 0.141457, loss_ce: 0.017636
iteration 22440 : loss : 0.068676, loss_ce: 0.040439
iteration 22441 : loss : 0.121541, loss_ce: 0.036924
iteration 22442 : loss : 0.055541, loss_ce: 0.028500
iteration 22443 : loss : 0.054240, loss_ce: 0.034113
iteration 22444 : loss : 0.084631, loss_ce: 0.010446
iteration 22445 : loss : 0.057401, loss_ce: 0.021543
iteration 22446 : loss : 0.111886, loss_ce: 0.023765
iteration 22447 : loss : 0.194218, loss_ce: 0.015265
iteration 22448 : loss : 0.077295, loss_ce: 0.015715
iteration 22449 : loss : 0.139861, loss_ce: 0.017842
iteration 22450 : loss : 0.074103, loss_ce: 0.043208
iteration 22451 : loss : 0.065166, loss_ce: 0.033353
iteration 22452 : loss : 0.097931, loss_ce: 0.046041
iteration 22453 : loss : 0.067796, loss_ce: 0.021468
iteration 22454 : loss : 0.055947, loss_ce: 0.025860
iteration 22455 : loss : 0.280998, loss_ce: 0.006713
iteration 22456 : loss : 0.158904, loss_ce: 0.

 55%|██████████████▊            | 82/150 [5:23:24<4:28:39, 237.05s/it]

iteration 22715 : loss : 0.223615, loss_ce: 0.011834
iteration 22716 : loss : 0.128382, loss_ce: 0.021633
iteration 22717 : loss : 0.073191, loss_ce: 0.022771
iteration 22718 : loss : 0.240185, loss_ce: 0.015284
iteration 22719 : loss : 0.071697, loss_ce: 0.027334
iteration 22720 : loss : 0.092177, loss_ce: 0.013476
iteration 22721 : loss : 0.074162, loss_ce: 0.042488
iteration 22722 : loss : 0.059068, loss_ce: 0.024332
iteration 22723 : loss : 0.219519, loss_ce: 0.019467
iteration 22724 : loss : 0.169739, loss_ce: 0.028816
iteration 22725 : loss : 0.132165, loss_ce: 0.025754
iteration 22726 : loss : 0.054600, loss_ce: 0.013713
iteration 22727 : loss : 0.123076, loss_ce: 0.050812
iteration 22728 : loss : 0.086725, loss_ce: 0.026060
iteration 22729 : loss : 0.080107, loss_ce: 0.013355
iteration 22730 : loss : 0.146477, loss_ce: 0.010041
iteration 22731 : loss : 0.246430, loss_ce: 0.008099
iteration 22732 : loss : 0.058121, loss_ce: 0.023326
iteration 22733 : loss : 0.115505, loss_ce: 0.

 55%|██████████████▉            | 83/150 [5:27:21<4:24:39, 237.00s/it]

iteration 22992 : loss : 0.128505, loss_ce: 0.030116
iteration 22993 : loss : 0.343357, loss_ce: 0.004821
iteration 22994 : loss : 0.064414, loss_ce: 0.027834
iteration 22995 : loss : 0.184705, loss_ce: 0.007555
iteration 22996 : loss : 0.079074, loss_ce: 0.027785
iteration 22997 : loss : 0.246316, loss_ce: 0.003685
iteration 22998 : loss : 0.096922, loss_ce: 0.010470
iteration 22999 : loss : 0.165347, loss_ce: 0.016942
iteration 23000 : loss : 0.169308, loss_ce: 0.010505
iteration 23001 : loss : 0.115063, loss_ce: 0.032352
iteration 23002 : loss : 0.074202, loss_ce: 0.024132
iteration 23003 : loss : 0.073245, loss_ce: 0.024906
iteration 23004 : loss : 0.104184, loss_ce: 0.010466
iteration 23005 : loss : 0.059449, loss_ce: 0.026242
iteration 23006 : loss : 0.110705, loss_ce: 0.014267
iteration 23007 : loss : 0.054594, loss_ce: 0.034166
iteration 23008 : loss : 0.114264, loss_ce: 0.039555
iteration 23009 : loss : 0.076135, loss_ce: 0.043515
iteration 23010 : loss : 0.163706, loss_ce: 0.

 56%|███████████████            | 84/150 [5:31:20<4:21:14, 237.49s/it]

iteration 23269 : loss : 0.130629, loss_ce: 0.019932
iteration 23270 : loss : 0.045746, loss_ce: 0.016172
iteration 23271 : loss : 0.100546, loss_ce: 0.028950
iteration 23272 : loss : 0.146292, loss_ce: 0.014181
iteration 23273 : loss : 0.114461, loss_ce: 0.021434
iteration 23274 : loss : 0.087667, loss_ce: 0.014337
iteration 23275 : loss : 0.160012, loss_ce: 0.018680
iteration 23276 : loss : 0.061699, loss_ce: 0.019616
iteration 23277 : loss : 0.065746, loss_ce: 0.015926
iteration 23278 : loss : 0.098142, loss_ce: 0.013263
iteration 23279 : loss : 0.061156, loss_ce: 0.029925
iteration 23280 : loss : 0.108324, loss_ce: 0.023029
iteration 23281 : loss : 0.113494, loss_ce: 0.019626
iteration 23282 : loss : 0.119182, loss_ce: 0.017771
iteration 23283 : loss : 0.155398, loss_ce: 0.026839
iteration 23284 : loss : 0.145846, loss_ce: 0.013855
iteration 23285 : loss : 0.056474, loss_ce: 0.017712
iteration 23286 : loss : 0.077929, loss_ce: 0.028242
iteration 23287 : loss : 0.119913, loss_ce: 0.

 57%|███████████████▎           | 85/150 [5:35:18<4:17:34, 237.76s/it]

iteration 23546 : loss : 0.071171, loss_ce: 0.040209
iteration 23547 : loss : 0.044645, loss_ce: 0.018074
iteration 23548 : loss : 0.085374, loss_ce: 0.056824
iteration 23549 : loss : 0.143788, loss_ce: 0.013807
iteration 23550 : loss : 0.073294, loss_ce: 0.035898
iteration 23551 : loss : 0.188355, loss_ce: 0.008920
iteration 23552 : loss : 0.143965, loss_ce: 0.031452
iteration 23553 : loss : 0.107368, loss_ce: 0.028635
iteration 23554 : loss : 0.223146, loss_ce: 0.016990
iteration 23555 : loss : 0.061667, loss_ce: 0.019608
iteration 23556 : loss : 0.161878, loss_ce: 0.019535
iteration 23557 : loss : 0.047871, loss_ce: 0.024773
iteration 23558 : loss : 0.119284, loss_ce: 0.025530
iteration 23559 : loss : 0.057995, loss_ce: 0.010414
iteration 23560 : loss : 0.192553, loss_ce: 0.018292
iteration 23561 : loss : 0.147540, loss_ce: 0.016842
iteration 23562 : loss : 0.104353, loss_ce: 0.040047
iteration 23563 : loss : 0.124421, loss_ce: 0.028929
iteration 23564 : loss : 0.103627, loss_ce: 0.

 57%|███████████████▍           | 86/150 [5:39:17<4:14:05, 238.20s/it]

iteration 23823 : loss : 0.064651, loss_ce: 0.023905
iteration 23824 : loss : 0.074419, loss_ce: 0.027878
iteration 23825 : loss : 0.125111, loss_ce: 0.025511
iteration 23826 : loss : 0.047100, loss_ce: 0.013980
iteration 23827 : loss : 0.076974, loss_ce: 0.020755
iteration 23828 : loss : 0.156075, loss_ce: 0.027643
iteration 23829 : loss : 0.078384, loss_ce: 0.014716
iteration 23830 : loss : 0.150289, loss_ce: 0.020486
iteration 23831 : loss : 0.145709, loss_ce: 0.008893
iteration 23832 : loss : 0.099869, loss_ce: 0.029033
iteration 23833 : loss : 0.115598, loss_ce: 0.021039
iteration 23834 : loss : 0.149753, loss_ce: 0.020787
iteration 23835 : loss : 0.105980, loss_ce: 0.024078
iteration 23836 : loss : 0.068924, loss_ce: 0.022361
iteration 23837 : loss : 0.094485, loss_ce: 0.034883
iteration 23838 : loss : 0.340904, loss_ce: 0.002511
iteration 23839 : loss : 0.077231, loss_ce: 0.017106
iteration 23840 : loss : 0.083456, loss_ce: 0.034708
iteration 23841 : loss : 0.098115, loss_ce: 0.

 58%|███████████████▋           | 87/150 [5:43:15<4:09:59, 238.09s/it]

iteration 24100 : loss : 0.084067, loss_ce: 0.025015
iteration 24101 : loss : 0.104536, loss_ce: 0.018753
iteration 24102 : loss : 0.179208, loss_ce: 0.018759
iteration 24103 : loss : 0.046988, loss_ce: 0.019760
iteration 24104 : loss : 0.109355, loss_ce: 0.009250
iteration 24105 : loss : 0.051696, loss_ce: 0.011088
iteration 24106 : loss : 0.253730, loss_ce: 0.008659
iteration 24107 : loss : 0.066625, loss_ce: 0.028570
iteration 24108 : loss : 0.049437, loss_ce: 0.025992
iteration 24109 : loss : 0.155071, loss_ce: 0.014724
iteration 24110 : loss : 0.114920, loss_ce: 0.015398
iteration 24111 : loss : 0.251050, loss_ce: 0.028007
iteration 24112 : loss : 0.125823, loss_ce: 0.029651
iteration 24113 : loss : 0.076084, loss_ce: 0.030510
iteration 24114 : loss : 0.063479, loss_ce: 0.019750
iteration 24115 : loss : 0.094918, loss_ce: 0.012773
iteration 24116 : loss : 0.071220, loss_ce: 0.045667
iteration 24117 : loss : 0.071546, loss_ce: 0.029004
iteration 24118 : loss : 0.120945, loss_ce: 0.

 59%|███████████████▊           | 88/150 [5:47:12<4:05:43, 237.79s/it]

iteration 24377 : loss : 0.043977, loss_ce: 0.022022
iteration 24378 : loss : 0.069485, loss_ce: 0.039110
iteration 24379 : loss : 0.292515, loss_ce: 0.006219
iteration 24380 : loss : 0.081829, loss_ce: 0.057750
iteration 24381 : loss : 0.119472, loss_ce: 0.009040
iteration 24382 : loss : 0.133848, loss_ce: 0.026251
iteration 24383 : loss : 0.151665, loss_ce: 0.011957
iteration 24384 : loss : 0.113792, loss_ce: 0.015155
iteration 24385 : loss : 0.061668, loss_ce: 0.034139
iteration 24386 : loss : 0.260661, loss_ce: 0.007283
iteration 24387 : loss : 0.106242, loss_ce: 0.008391
iteration 24388 : loss : 0.065771, loss_ce: 0.030897
iteration 24389 : loss : 0.185179, loss_ce: 0.006162
iteration 24390 : loss : 0.061560, loss_ce: 0.025020
iteration 24391 : loss : 0.159416, loss_ce: 0.015499
iteration 24392 : loss : 0.149283, loss_ce: 0.019409
iteration 24393 : loss : 0.246214, loss_ce: 0.017107
iteration 24394 : loss : 0.064613, loss_ce: 0.021851
iteration 24395 : loss : 0.069913, loss_ce: 0.

 59%|████████████████           | 89/150 [5:51:09<4:01:31, 237.56s/it]

iteration 24654 : loss : 0.247656, loss_ce: 0.014008
iteration 24655 : loss : 0.083244, loss_ce: 0.033870
iteration 24656 : loss : 0.077040, loss_ce: 0.020392
iteration 24657 : loss : 0.240917, loss_ce: 0.006692
iteration 24658 : loss : 0.066701, loss_ce: 0.021923
iteration 24659 : loss : 0.042331, loss_ce: 0.020897
iteration 24660 : loss : 0.142665, loss_ce: 0.017856
iteration 24661 : loss : 0.232633, loss_ce: 0.005115
iteration 24662 : loss : 0.045936, loss_ce: 0.025957
iteration 24663 : loss : 0.073366, loss_ce: 0.035061
iteration 24664 : loss : 0.076479, loss_ce: 0.022585
iteration 24665 : loss : 0.095125, loss_ce: 0.035951
iteration 24666 : loss : 0.058195, loss_ce: 0.032136
iteration 24667 : loss : 0.060365, loss_ce: 0.018627
iteration 24668 : loss : 0.054217, loss_ce: 0.021903
iteration 24669 : loss : 0.053937, loss_ce: 0.019430
iteration 24670 : loss : 0.402744, loss_ce: 0.003722
iteration 24671 : loss : 0.101418, loss_ce: 0.025327
iteration 24672 : loss : 0.150942, loss_ce: 0.

In [20]:
from torchinfo import summary

summary(net, (1,3,224,224))

Layer (type:depth-idx)                                                 Output Shape              Param #
VisionTransformer                                                      --                        --
├─Transformer: 1                                                       --                        --
│    └─Encoder: 2                                                      --                        --
│    │    └─ModuleList: 3-1                                            --                        85,054,464
├─DecoderCup: 1                                                        --                        --
│    └─ModuleList: 2-1                                                 --                        --
├─Transformer: 1-1                                                     [1, 196, 768]             --
│    └─Embeddings: 2-2                                                 [1, 196, 768]             --
│    │    └─ResNetV2: 3-2                                              [1, 1024, 14, 14