In [1]:
import os
import random
import pandas as pd
import numpy as np
import mxnet as mx
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.utils.data import Dataset, DataLoader
from pytorch_metric_learning import losses
from einops import rearrange, repeat
import optuna
from optuna.trial import TrialState
from tqdm import tqdm

In [2]:
def file_to_embed(embeds, file):
    emb = []
    for f in file:
        emb.append(embeds[f][0])
    return torch.stack(emb)

In [3]:
MIN_NUM_PATCHES = 16

In [4]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=1)

In [5]:
class AdienceDataset(Dataset):
    def __init__(self, annot_file, img_dir):
        self.img_lbls = pd.read_csv(annot_file, header=None)
        self.img_dir = img_dir
    
    def __len__(self):
        return len(self.img_lbls)
    
    def __getitem__(self, idx):
        img_file = self.img_lbls.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_file)
        image = mx.image.imread(img_path)
        if image.shape[1] != 112:
            image = mx.image.resize_short(image, 112)
        image = mx.nd.transpose(image, axes=(2,0,1))
        image = torch.tensor(image.asnumpy()).type(torch.FloatTensor)
        label = self.img_lbls.iloc[idx, 1]

        return image, label, img_file

In [6]:
train_data = AdienceDataset("../train.csv", "../cropped_Adience/")
val_data = AdienceDataset("../val.csv", "../cropped_Adience/")

In [7]:
class CosFace(nn.Module):
    r"""Implement of CosFace (https://arxiv.org/pdf/1801.09414.pdf):
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        device_id: the ID of GPU where the model will be trained by model parallel.
                       if device_id=None, it will be trained on CPU without model parallel.
        s: norm of input feature
        m: margin
        cos(theta)-m
    """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.35):
        super(CosFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id
        self.s = s
        self.m = m
        print("self.device_id", self.device_id)
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------

        if self.device_id == None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])),
                                   dim=1)
        phi = cosine - self.m
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id != None:
            one_hot = one_hot.cuda(self.device_id[0])
        # one_hot = one_hot.cuda() if cosine.is_cuda else one_hot

        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + (
                    (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s

        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features = ' + str(self.in_features) \
               + ', out_features = ' + str(self.out_features) \
               + ', s = ' + str(self.s) \
               + ', m = ' + str(self.m) + ')'

In [8]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x

In [9]:
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

In [10]:
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [11]:
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)

        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        #embed()
        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
            dots.masked_fill_(~mask, mask_value)
            del mask

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)

        return out

In [12]:
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))
    def forward(self, x, mask = None):
        for attn, ff in self.layers:
            x = attn(x, mask = mask)
            #embed()
            x = ff(x)
        return x

In [13]:
class ViT_face(nn.Module):
    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, dim, depth, heads, mlp_dim, pool = 'mean', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.patch_size = patch_size

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID
        if self.loss_type == 'None':
            print("no loss for vit_face")
        else:
            if self.loss_type == 'CosFace':
                self.loss = CosFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)

    def forward(self, img, label=None, mask=None):
        p = self.patch_size
        
        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        x = self.transformer(x, mask)

        # y = x[:, 0]
        z = x[:, 1:].mean(dim = 1)

        # y = self.to_latent(y)
        # emb_y = self.mlp_head(y)
        z = self.to_latent(z)
        emb_z = self.mlp_head(z)
        # emb = torch.cat((emb_y, emb_z), dim=1)
        emb = emb_z
        if label is not None:
            x = self.loss(emb, label)
            return x, emb
        else:
            return emb

In [14]:
class ViT_plus(nn.Module):
    def __init__(self):
        super(ViT_plus, self).__init__()
        
        self.fc1 = nn.Linear(in_features=512, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=2)
        
    def forward(self, x):
        x = self.fc1(x)
        x_cosface = x
        x_classification = self.fc2(x)
        
        return x_cosface, x_classification

In [15]:
model = ViT_face(
            image_size=112,
            patch_size=8,
            loss_type='CosFace',
            GPU_ID= [device],
            num_class=93431,
            dim=512,
            depth=20,
            heads=8,
            mlp_dim=2048,
            dropout=0.1,
            emb_dropout=0.1
        ).to(device)
model.load_state_dict(
    torch.load("../Backbone_VIT_Epoch_2_Batch_20000_Time_2021-01-12-16-48_checkpoint.pth", map_location=device)
)

self.device_id [device(type='cuda', index=1)]


<All keys matched successfully>

In [16]:
for param in model.parameters():
    param.requires_grad = False

In [17]:
embeds = {}
model.eval()

with torch.no_grad():
    for img, label, file in train_data:
        img = img.to(device)
        embeds[file] = model(torch.unsqueeze(img, 0))

    for img, label, file in val_data:
        img = img.to(device)
        embeds[file] = model(torch.unsqueeze(img, 0))

In [18]:
best_accu = 0.9284737706184387
def objective(trial):
    model_xtr = ViT_plus().to(device)
    
    loss_lr = trial.suggest_float("loss_learning_rate", 1e-4, 1e-2, log=True)
    arc_margin = losses.ArcFaceLoss(2, 512).to(device)
    loss_optimizer = opt.AdamW(arc_margin.parameters(), lr=loss_lr)
    
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    wd = trial.suggest_float('weight_decay', 1e-4, 1e-2, log=True)
    eps = trial.suggest_float("epsilon", 1e-9, 1e-7, log=True)
    optimizer = opt.AdamW(model_xtr.parameters(), lr=lr, eps=eps, weight_decay=wd)
    
    criterion = nn.CrossEntropyLoss()
    
    batch_size = trial.suggest_int('batch_size', 50, 300)
    num_epochs = trial.suggest_int('epochs', 10, 100)
    
    print("Learning rate for Loss: "+ str(loss_lr))
    print("Learning rate: "+ str(lr))
    print("Weight decay: "+ str(wd))
    print("Epsilon: "+ str(eps))
    print("Batch size: "+ str(batch_size))
    print("Number of epochs: "+ str(num_epochs))
    
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
        val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4)
        
        # training loop
        running_loss = []
        running_accu = []
        
        model_xtr.train()
        for img, label, file in tqdm(train_loader, desc="Training", leave=False):
            img, label = img.to(device), label.to(device)

            x = file_to_embed(embeds, file)
            
            optimizer.zero_grad()
            embed, output = model_xtr(x)
            
            pred = torch.argmax(output, 1)
            accuracy = torch.eq(pred, label).sum() / len(img)

            class_loss = criterion(output, label)
            arc_loss = arc_margin(embed, label)
            loss = class_loss + arc_loss
            loss.backward()
            loss_optimizer.step()
            optimizer.step()

            running_accu.append(accuracy.cpu().detach().numpy())
            running_loss.append(loss.cpu().detach().numpy())
        print("Epoch: {}/{} - Loss: {:.4f} - Accuracy: {:.4f}".format(epoch+1, num_epochs, np.mean(running_loss), np.mean(running_accu)))
        
        # validation loop
        val_loss = []
        val_accu = []

        model_xtr.eval()
        with torch.no_grad():
            for img, label, file in tqdm(val_loader):
                img, label = img.to(device), label.to(device)
                
                x = file_to_embed(embeds, file)
                
                embed, output = model_xtr(x)
                
                pred = torch.argmax(output, 1)
                accuracy = torch.eq(pred, label).sum() / len(img)
                
                class_loss = criterion(output, label)
                arc_loss = arc_margin(embed, label)
                loss = class_loss + arc_loss
                
                val_accu.append(accuracy.cpu().detach().numpy())
                val_loss.append(loss.cpu().detach().numpy())
        val_accu = np.mean(val_accu)
        val_loss = np.mean(val_loss)
        print("Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(val_loss, val_accu))
        
        trial.report(val_accu, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    global best_accu
    if val_accu > best_accu:
        best_accu = val_accu
        print("Saving best model...")
        torch.save(model_xtr.state_dict(), "../vit_8-8_arcface_mean_only.pt")
            
    return val_accu

In [21]:
study = optuna.create_study(direction='maximize',
                            study_name='arcface-8-8-mean-only-vit-study',
                            storage='sqlite:///study1.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=5)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# Display the study statistics
print("\nStudy statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

[32m[I 2023-12-14 11:37:21,264][0m Using an existing study with name 'arcface-8-8-mean-only-vit-study' instead of creating a new one.[0m


Learning rate for Loss: 0.0008973117805300103
Learning rate: 0.0004208581724265923
Weight decay: 0.008769956140978797
Epsilon: 5.981951515551795e-09
Batch size: 174
Number of epochs: 28


Epochs:   0%|          | 0/28 [00:00<?, ?it/s]
Training:   0%|          | 0/81 [00:00<?, ?it/s][A
Training:   1%|          | 1/81 [00:01<01:37,  1.22s/it][A
Training:   5%|▍         | 4/81 [00:01<00:20,  3.81it/s][A
Training:   7%|▋         | 6/81 [00:02<00:22,  3.30it/s][A
Training:  11%|█         | 9/81 [00:02<00:19,  3.64it/s][A
Training:  14%|█▎        | 11/81 [00:02<00:14,  4.90it/s][A
Training:  16%|█▌        | 13/81 [00:03<00:17,  3.90it/s][A
Training:  19%|█▊        | 15/81 [00:03<00:12,  5.18it/s][A
Training:  21%|██        | 17/81 [00:04<00:16,  3.83it/s][A
Training:  23%|██▎       | 19/81 [00:04<00:12,  5.05it/s][A
Training:  26%|██▌       | 21/81 [00:05<00:14,  4.02it/s][A
Training:  28%|██▊       | 23/81 [00:05<00:10,  5.28it/s][A
Training:  31%|███       | 25/81 [00:06<00:13,  4.02it/s][A
Training:  36%|███▌      | 29/81 [00:07<00:12,  4.33it/s][A
Training:  38%|███▊      | 31/81 [00:07<00:09,  5.36it/s][A
Training:  41%|████      | 33/81 [00:07<00:11,  4.3

Epoch: 1/28 - Loss: 13.8411 - Accuracy: 0.7461



  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:01<00:11,  1.16s/it][A
 36%|███▋      | 4/11 [00:01<00:01,  4.03it/s][A
 55%|█████▍    | 6/11 [00:02<00:01,  3.12it/s][A
 82%|████████▏ | 9/11 [00:02<00:00,  3.66it/s][A
100%|██████████| 11/11 [00:03<00:00,  3.64it/s][A
Epochs:   4%|▎         | 1/28 [00:20<09:12, 20.47s/it]

Val Loss: 6.6824 - Val Accuracy: 0.8845



Training:   0%|          | 0/81 [00:00<?, ?it/s][A
Training:   1%|          | 1/81 [00:01<01:24,  1.06s/it][A
Training:   2%|▏         | 2/81 [00:01<00:39,  2.01it/s][A
Training:   6%|▌         | 5/81 [00:02<00:26,  2.91it/s][A
Training:   7%|▋         | 6/81 [00:02<00:21,  3.53it/s][A
Training:  11%|█         | 9/81 [00:02<00:19,  3.76it/s][A
Training:  14%|█▎        | 11/81 [00:03<00:14,  4.93it/s][A
Training:  16%|█▌        | 13/81 [00:03<00:16,  4.15it/s][A
Training:  17%|█▋        | 14/81 [00:03<00:14,  4.47it/s][A
Training:  21%|██        | 17/81 [00:04<00:14,  4.33it/s][A
Training:  26%|██▌       | 21/81 [00:05<00:13,  4.40it/s][A
Training:  27%|██▋       | 22/81 [00:05<00:12,  4.77it/s][A
Training:  30%|██▉       | 24/81 [00:05<00:09,  6.03it/s][A
Training:  32%|███▏      | 26/81 [00:06<00:12,  4.56it/s][A
Training:  36%|███▌      | 29/81 [00:07<00:12,  4.27it/s][A
Training:  41%|████      | 33/81 [00:08<00:11,  4.19it/s][A
Training:  44%|████▍     | 36/81 [00:

Epoch: 2/28 - Loss: 5.4711 - Accuracy: 0.8889



  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:01<00:16,  1.62s/it][A
 27%|██▋       | 3/11 [00:01<00:03,  2.18it/s][A
 45%|████▌     | 5/11 [00:02<00:02,  2.23it/s][A
100%|██████████| 11/11 [00:03<00:00,  2.92it/s][A
Epochs:   4%|▎         | 1/28 [00:42<19:17, 42.89s/it]
[32m[I 2023-12-14 11:38:04,901][0m Trial 15 pruned. [0m


Val Loss: 4.5865 - Val Accuracy: 0.8898
Learning rate for Loss: 0.0002785264376987574
Learning rate: 0.020706733651149094
Weight decay: 0.0007312113943213927
Epsilon: 3.018628619307904e-09
Batch size: 111
Number of epochs: 67


Epochs:   0%|          | 0/67 [00:00<?, ?it/s]
Training:   0%|          | 0/126 [00:00<?, ?it/s][A
Training:   1%|          | 1/126 [00:01<02:19,  1.12s/it][A
Training:   2%|▏         | 3/126 [00:01<00:42,  2.91it/s][A
Training:   4%|▍         | 5/126 [00:01<00:36,  3.31it/s][A
Training:   6%|▋         | 8/126 [00:01<00:19,  6.04it/s][A
Training:   8%|▊         | 10/126 [00:02<00:26,  4.30it/s][A
Training:  10%|█         | 13/126 [00:03<00:26,  4.22it/s][A
Training:  11%|█         | 14/126 [00:03<00:24,  4.60it/s][A
Training:  13%|█▎        | 17/126 [00:04<00:21,  4.99it/s][A
Training:  15%|█▌        | 19/126 [00:04<00:17,  6.23it/s][A
Training:  17%|█▋        | 21/126 [00:04<00:19,  5.26it/s][A
Training:  18%|█▊        | 23/126 [00:04<00:15,  6.60it/s][A
Training:  20%|█▉        | 25/126 [00:05<00:18,  5.41it/s][A
Training:  23%|██▎       | 29/126 [00:05<00:16,  6.00it/s][A
Training:  25%|██▍       | 31/126 [00:06<00:13,  7.13it/s][A
Training:  26%|██▌       | 33/126 [0

Epoch: 1/67 - Loss: 15.4062 - Accuracy: 0.7719



  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:01<00:18,  1.21s/it][A
 31%|███▏      | 5/16 [00:01<00:03,  2.96it/s][A
 50%|█████     | 8/16 [00:02<00:01,  5.16it/s][A
 62%|██████▎   | 10/16 [00:02<00:01,  4.51it/s][A
100%|██████████| 16/16 [00:03<00:00,  4.73it/s][A
Epochs:   1%|▏         | 1/67 [00:25<28:32, 25.95s/it]

Val Loss: 4.0577 - Val Accuracy: 0.9020



Training:   0%|          | 0/126 [00:00<?, ?it/s][A
Training:   1%|          | 1/126 [00:01<02:04,  1.00it/s][A
Training:   2%|▏         | 3/126 [00:01<00:39,  3.15it/s][A
Training:   4%|▍         | 5/126 [00:01<00:33,  3.61it/s][A
Training:   6%|▌         | 7/126 [00:01<00:22,  5.18it/s][A
Training:   7%|▋         | 9/126 [00:02<00:28,  4.16it/s][A
Training:  10%|█         | 13/126 [00:03<00:23,  4.83it/s][A
Training:  13%|█▎        | 16/126 [00:03<00:16,  6.76it/s][A
Training:  14%|█▍        | 18/126 [00:03<00:20,  5.20it/s][A
Training:  16%|█▌        | 20/126 [00:04<00:16,  6.43it/s][A
Training:  17%|█▋        | 22/126 [00:04<00:20,  5.06it/s][A
Training:  19%|█▉        | 24/126 [00:04<00:16,  6.34it/s][A
Training:  21%|██        | 26/126 [00:05<00:19,  5.08it/s][A
Training:  23%|██▎       | 29/126 [00:06<00:20,  4.76it/s][A
Training:  25%|██▍       | 31/126 [00:06<00:16,  5.91it/s][A
Training:  26%|██▌       | 33/126 [00:06<00:19,  4.72it/s][A
Training:  29%|██▊   

Epoch: 2/67 - Loss: 4.1000 - Accuracy: 0.8742



  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:01<00:16,  1.13s/it][A
 25%|██▌       | 4/16 [00:01<00:03,  3.97it/s][A
 38%|███▊      | 6/16 [00:01<00:02,  3.91it/s][A
 50%|█████     | 8/16 [00:01<00:01,  5.55it/s][A
 62%|██████▎   | 10/16 [00:02<00:01,  4.35it/s][A
 75%|███████▌  | 12/16 [00:02<00:00,  5.77it/s][A
100%|██████████| 16/16 [00:03<00:00,  4.61it/s][A
Epochs:   1%|▏         | 1/67 [00:51<56:53, 51.72s/it]
[32m[I 2023-12-14 11:38:57,281][0m Trial 16 pruned. [0m


Val Loss: 3.4416 - Val Accuracy: 0.8707
Learning rate for Loss: 0.0005329964504300528
Learning rate: 0.00019818998664349456
Weight decay: 0.0011944103450635386
Epsilon: 1.0310585528692037e-08
Batch size: 248
Number of epochs: 100


Epochs:   0%|          | 0/100 [00:00<?, ?it/s]
Training:   0%|          | 0/57 [00:00<?, ?it/s][A
Training:   2%|▏         | 1/57 [00:02<01:51,  2.00s/it][A
Training:   4%|▎         | 2/57 [00:02<00:49,  1.11it/s][A
Training:   7%|▋         | 4/57 [00:02<00:19,  2.72it/s][A
Training:  11%|█         | 6/57 [00:03<00:29,  1.73it/s][A
Training:  12%|█▏        | 7/57 [00:03<00:23,  2.16it/s][A
Training:  16%|█▌        | 9/57 [00:05<00:26,  1.83it/s][A
Training:  18%|█▊        | 10/57 [00:05<00:21,  2.17it/s][A
Training:  21%|██        | 12/57 [00:05<00:13,  3.33it/s][A
Training:  23%|██▎       | 13/57 [00:06<00:21,  2.05it/s][A
Training:  25%|██▍       | 14/57 [00:06<00:17,  2.50it/s][A
Training:  28%|██▊       | 16/57 [00:07<00:10,  3.77it/s][A
Training:  30%|██▉       | 17/57 [00:08<00:18,  2.13it/s][A
Training:  32%|███▏      | 18/57 [00:08<00:14,  2.63it/s][A
Training:  35%|███▌      | 20/57 [00:08<00:09,  3.99it/s][A
Training:  39%|███▊      | 22/57 [00:10<00:16,  2.15

Epoch: 1/100 - Loss: 18.2483 - Accuracy: 0.6484



  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:01<00:13,  1.95s/it][A
 62%|██████▎   | 5/8 [00:03<00:01,  1.80it/s][A
100%|██████████| 8/8 [00:03<00:00,  2.25it/s][A
Epochs:   0%|          | 0/100 [00:26<?, ?it/s]
[32m[I 2023-12-14 11:39:24,732][0m Trial 17 pruned. [0m


Val Loss: 12.2941 - Val Accuracy: 0.7421
Learning rate for Loss: 0.0029646983161703507
Learning rate: 0.0013293119848171884
Weight decay: 0.00038483722607233036
Epsilon: 1.4163547089060394e-09
Batch size: 148
Number of epochs: 51


Epochs:   0%|          | 0/51 [00:00<?, ?it/s]
Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:09,  1.38s/it][A
Training:   3%|▎         | 3/95 [00:01<00:36,  2.49it/s][A
Training:   5%|▌         | 5/95 [00:02<00:35,  2.54it/s][A
Training:   8%|▊         | 8/95 [00:02<00:19,  4.46it/s][A
Training:   9%|▉         | 9/95 [00:03<00:27,  3.17it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:19,  4.30it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:18,  4.50it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:26,  3.12it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.57it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:17,  4.46it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.25it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.73it/s][A
Training:  21%|██        | 20/95 [00:05<00:15,  4.83it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:19,  3.83it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:14,  4.81

Epoch: 1/51 - Loss: 11.2548 - Accuracy: 0.7797



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.27s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.56it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.48it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.83it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.50it/s][A
Epochs:   2%|▏         | 1/51 [00:26<21:48, 26.17s/it]

Val Loss: 4.5632 - Val Accuracy: 0.8809



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:56,  1.24s/it][A
Training:   2%|▏         | 2/95 [00:01<00:58,  1.58it/s][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.39it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.42it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.15it/s][A
Training:   8%|▊         | 8/95 [00:02<00:16,  5.14it/s][A
Training:  11%|█         | 10/95 [00:03<00:23,  3.68it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:19,  4.20it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:20,  4.09it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:20,  4.02it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:18,  4.13it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:18,  4.18it/s][A
Training:  21%|██        | 20/95 [00:05<00:13,  5.60it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:20,  3.69it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:19,  3.79it/s][A
Training:  25%|██▌       | 24/95 [00:0

Epoch: 2/51 - Loss: 4.0686 - Accuracy: 0.8978



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.19s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.86it/s][A
 42%|████▏     | 5/12 [00:01<00:02,  2.94it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.76it/s][A
Epochs:   4%|▍         | 2/51 [00:52<21:33, 26.40s/it]

Val Loss: 4.0986 - Val Accuracy: 0.9102



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:14,  1.43s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.42it/s][A
Training:   5%|▌         | 5/95 [00:02<00:34,  2.63it/s][A
Training:   6%|▋         | 6/95 [00:02<00:27,  3.19it/s][A
Training:   8%|▊         | 8/95 [00:02<00:17,  4.97it/s][A
Training:  11%|█         | 10/95 [00:03<00:26,  3.25it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:18,  4.52it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:22,  3.56it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:16,  4.86it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:22,  3.49it/s][A
Training:  21%|██        | 20/95 [00:05<00:15,  4.69it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:21,  3.45it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:20,  3.39it/s][A
Training:  27%|██▋       | 26/95 [00:07<00:19,  3.48it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:14,  4.50it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 3/51 - Loss: 3.9936 - Accuracy: 0.9071



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.37s/it][A
 42%|████▏     | 5/12 [00:02<00:02,  2.45it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.63it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.45it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.57it/s][A
Epochs:   6%|▌         | 3/51 [01:18<20:58, 26.21s/it]

Val Loss: 3.3288 - Val Accuracy: 0.9172



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:08,  1.37s/it][A
Training:   2%|▏         | 2/95 [00:01<01:00,  1.54it/s][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.40it/s][A
Training:   5%|▌         | 5/95 [00:02<00:33,  2.68it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.08it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.72it/s][A
Training:   9%|▉         | 9/95 [00:03<00:26,  3.26it/s][A
Training:  11%|█         | 10/95 [00:03<00:21,  3.90it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:15,  5.30it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:24,  3.31it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:16,  4.80it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  4.96it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:24,  3.22it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.68it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.59it/s][A
Training:  24%|██▍       | 23/95 [00:06

Epoch: 4/51 - Loss: 3.1178 - Accuracy: 0.9206



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:17,  1.56s/it][A
 17%|█▋        | 2/12 [00:01<00:07,  1.42it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.48it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.98it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.21it/s][A
Epochs:   8%|▊         | 4/51 [01:46<20:55, 26.72s/it]

Val Loss: 2.9060 - Val Accuracy: 0.9280



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:10,  1.39s/it][A
Training:   4%|▍         | 4/95 [00:01<00:27,  3.36it/s][A
Training:   6%|▋         | 6/95 [00:02<00:30,  2.93it/s][A
Training:   9%|▉         | 9/95 [00:03<00:28,  3.00it/s][A
Training:  11%|█         | 10/95 [00:03<00:24,  3.44it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:18,  4.58it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.42it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.50it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.35it/s][A
Training:  21%|██        | 20/95 [00:05<00:14,  5.33it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:17,  4.09it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:19,  3.67it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:12,  5.17it/s][A
Training:  32%|███▏      | 30/95 [00:08<00:16,  3.89it/s][A
Training:  35%|███▍      | 33/95 [00:08<00:16,  3.84it/s][A
Training:  36%|███▌      | 34/95 [00

Epoch: 5/51 - Loss: 2.8265 - Accuracy: 0.9248



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:16,  1.47s/it][A
 17%|█▋        | 2/12 [00:01<00:07,  1.40it/s][A
 33%|███▎      | 4/12 [00:01<00:02,  3.33it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.89it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.35it/s][A
 83%|████████▎ | 10/12 [00:03<00:00,  3.26it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.16it/s][A
Epochs:  10%|▉         | 5/51 [02:13<20:31, 26.78s/it]

Val Loss: 3.0758 - Val Accuracy: 0.9199



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:09,  1.37s/it][A
Training:   2%|▏         | 2/95 [00:01<01:00,  1.54it/s][A
Training:   5%|▌         | 5/95 [00:02<00:31,  2.86it/s][A
Training:   6%|▋         | 6/95 [00:02<00:31,  2.83it/s][A
Training:   8%|▊         | 8/95 [00:02<00:20,  4.23it/s][A
Training:   9%|▉         | 9/95 [00:03<00:23,  3.70it/s][A
Training:  11%|█         | 10/95 [00:03<00:28,  2.98it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:19,  4.13it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:26,  3.03it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:18,  4.20it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:20,  3.89it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:24,  3.18it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:17,  4.34it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:20,  3.56it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:15,  4.62it/s][A
Training:  26%|██▋       | 25/95 [00:0

Epoch: 6/51 - Loss: 2.6738 - Accuracy: 0.9301



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.33s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.45it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.85it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.43it/s][A
 75%|███████▌  | 9/12 [00:03<00:01,  2.73it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.27it/s][A
Epochs:  12%|█▏        | 6/51 [02:40<20:18, 27.07s/it]

Val Loss: 2.6977 - Val Accuracy: 0.9268



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:20,  1.49s/it][A
Training:   2%|▏         | 2/95 [00:01<01:05,  1.42it/s][A
Training:   5%|▌         | 5/95 [00:02<00:35,  2.54it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.08it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.77it/s][A
Training:  11%|█         | 10/95 [00:03<00:23,  3.61it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:17,  4.82it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.25it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:21,  3.76it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  5.14it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:25,  3.04it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:21,  3.54it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:19,  3.82it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:14,  5.04it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:17,  3.95it/s][A
Training:  29%|██▉       | 28/95 [00:

Epoch: 7/51 - Loss: 2.6050 - Accuracy: 0.9281



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.37s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.29it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.80it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.04it/s][A
 75%|███████▌  | 9/12 [00:03<00:01,  2.93it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.49it/s][A
Epochs:  14%|█▎        | 7/51 [03:09<20:12, 27.57s/it]

Val Loss: 2.6696 - Val Accuracy: 0.9243



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:25,  1.54s/it][A
Training:   4%|▍         | 4/95 [00:01<00:30,  3.02it/s][A
Training:   6%|▋         | 6/95 [00:02<00:35,  2.50it/s][A
Training:   9%|▉         | 9/95 [00:03<00:31,  2.72it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:22,  3.65it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.18it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  4.94it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:22,  3.47it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:21,  3.40it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:16,  4.34it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:19,  3.56it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:15,  4.52it/s][A
Training:  31%|███       | 29/95 [00:08<00:17,  3.79it/s][A
Training:  33%|███▎      | 31/95 [00:08<00:13,  4.90it/s][A
Training:  35%|███▍      | 33/95 [00:09<00:16,  3.78it/s][A
Training:  37%|███▋      | 35/95 [00

Epoch: 8/51 - Loss: 2.7534 - Accuracy: 0.9258



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.26s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.69it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.54it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.75it/s][A
Epochs:  16%|█▌        | 8/51 [03:36<19:34, 27.32s/it]

Val Loss: 2.6322 - Val Accuracy: 0.9274



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:12,  1.41s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.41it/s][A
Training:   5%|▌         | 5/95 [00:02<00:34,  2.63it/s][A
Training:   7%|▋         | 7/95 [00:02<00:27,  3.26it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.35it/s][A
Training:  11%|█         | 10/95 [00:03<00:21,  3.86it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.18it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:21,  3.86it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.35it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:20,  3.88it/s][A
Training:  20%|██        | 19/95 [00:05<00:19,  3.81it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:15,  4.69it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:16,  4.49it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:20,  3.52it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:14,  4.87it/s][A
Training:  27%|██▋       | 26/95 [00:

Epoch: 9/51 - Loss: 2.5279 - Accuracy: 0.9321



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.17s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.84it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.47it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.74it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.61it/s][A
Epochs:  18%|█▊        | 9/51 [04:02<18:52, 26.96s/it]

Val Loss: 2.6070 - Val Accuracy: 0.9229



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:17,  1.47s/it][A
Training:   2%|▏         | 2/95 [00:01<01:01,  1.50it/s][A
Training:   4%|▍         | 4/95 [00:01<00:25,  3.58it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.18it/s][A
Training:   9%|▉         | 9/95 [00:03<00:26,  3.24it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:17,  4.85it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.25it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.33it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.29it/s][A
Training:  20%|██        | 19/95 [00:05<00:17,  4.37it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:22,  3.30it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:14,  5.06it/s][A
Training:  27%|██▋       | 26/95 [00:07<00:16,  4.09it/s][A
Training:  31%|███       | 29/95 [00:08<00:16,  4.01it/s][A
Training:  32%|███▏      | 30/95 [00:08<00:15,  4.14it/s][A
Training:  33%|███▎      | 31/95 [00:

Epoch: 10/51 - Loss: 2.5344 - Accuracy: 0.9333



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.34s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.54it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.51it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.89it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.18it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.45it/s][A
Epochs:  20%|█▉        | 10/51 [04:28<18:18, 26.80s/it]

Val Loss: 2.6848 - Val Accuracy: 0.9273



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:11,  1.40s/it][A
Training:   3%|▎         | 3/95 [00:01<00:41,  2.23it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.41it/s][A
Training:   7%|▋         | 7/95 [00:02<00:23,  3.80it/s][A
Training:   9%|▉         | 9/95 [00:03<00:29,  2.92it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.10it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.16it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:16,  4.89it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:18,  4.09it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.65it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:16,  4.44it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:17,  4.10it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:12,  5.26it/s][A
Training:  31%|███       | 29/95 [00:07<00:15,  4.38it/s][A
Training:  34%|███▎      | 32/95 [00:07<00:09,  6.47it/s][A
Training:  36%|███▌      | 34/95 [00:

Epoch: 11/51 - Loss: 2.5666 - Accuracy: 0.9332



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:16,  1.51s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.11it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.47it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.65it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.27it/s][A
Epochs:  22%|██▏       | 11/51 [04:54<17:40, 26.51s/it]

Val Loss: 2.7944 - Val Accuracy: 0.9202



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:48,  1.16s/it][A
Training:   3%|▎         | 3/95 [00:01<00:35,  2.63it/s][A
Training:   5%|▌         | 5/95 [00:02<00:34,  2.61it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.11it/s][A
Training:   7%|▋         | 7/95 [00:02<00:23,  3.80it/s][A
Training:   9%|▉         | 9/95 [00:03<00:23,  3.59it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:21,  3.93it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:21,  3.84it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:18,  4.32it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.36it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:16,  4.67it/s][A
Training:  19%|█▉        | 18/95 [00:04<00:16,  4.79it/s][A
Training:  20%|██        | 19/95 [00:05<00:18,  4.15it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:17,  4.34it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:18,  3.90it/s][A
Training:  26%|██▋       | 25/95 [00:0

Epoch: 12/51 - Loss: 2.5139 - Accuracy: 0.9341



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.36s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.55it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.41it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.76it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.41it/s][A
Epochs:  24%|██▎       | 12/51 [05:20<17:11, 26.45s/it]

Val Loss: 2.6694 - Val Accuracy: 0.9247



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:13,  1.42s/it][A
Training:   3%|▎         | 3/95 [00:01<00:39,  2.33it/s][A
Training:   5%|▌         | 5/95 [00:02<00:40,  2.24it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.54it/s][A
Training:   9%|▉         | 9/95 [00:03<00:29,  2.93it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.11it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:24,  3.33it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.31it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.29it/s][A
Training:  20%|██        | 19/95 [00:05<00:17,  4.31it/s][A
Training:  21%|██        | 20/95 [00:05<00:15,  4.75it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:23,  3.12it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:13,  5.31it/s][A
Training:  27%|██▋       | 26/95 [00:07<00:18,  3.68it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:13,  4.84it/s][A
Training:  32%|███▏      | 30/95 [00:

Epoch: 13/51 - Loss: 2.4530 - Accuracy: 0.9364



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:16,  1.48s/it][A
 42%|████▏     | 5/12 [00:02<00:02,  2.38it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.26it/s][A
Epochs:  25%|██▌       | 13/51 [05:47<16:51, 26.61s/it]

Val Loss: 2.5457 - Val Accuracy: 0.9290



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:01,  1.29s/it][A
Training:   2%|▏         | 2/95 [00:01<00:54,  1.69it/s][A
Training:   5%|▌         | 5/95 [00:02<00:30,  2.91it/s][A
Training:   6%|▋         | 6/95 [00:02<00:26,  3.37it/s][A
Training:   8%|▊         | 8/95 [00:02<00:17,  5.06it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.38it/s][A
Training:  11%|█         | 10/95 [00:03<00:21,  4.00it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:22,  3.71it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:22,  3.66it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.25it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:20,  3.73it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.55it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:19,  3.79it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:15,  4.74it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:16,  4.17it/s][A
Training:  28%|██▊       | 27/95 [00:0

Epoch: 14/51 - Loss: 2.4641 - Accuracy: 0.9343



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.39s/it][A
 17%|█▋        | 2/12 [00:01<00:07,  1.42it/s][A
 33%|███▎      | 4/12 [00:01<00:02,  3.25it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.49it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.89it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.61it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.13it/s][A
Epochs:  27%|██▋       | 14/51 [06:14<16:21, 26.52s/it]

Val Loss: 2.8994 - Val Accuracy: 0.9190



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:19,  1.48s/it][A
Training:   2%|▏         | 2/95 [00:01<01:03,  1.47it/s][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.37it/s][A
Training:   5%|▌         | 5/95 [00:02<00:38,  2.33it/s][A
Training:   6%|▋         | 6/95 [00:02<00:29,  3.02it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.65it/s][A
Training:   9%|▉         | 9/95 [00:03<00:31,  2.75it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:19,  4.28it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:27,  2.95it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:16,  4.68it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:18,  4.10it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:17,  4.24it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:15,  4.63it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:16,  4.30it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:11,  5.99it/s][A
Training:  32%|███▏      | 30/95 [00:08

Epoch: 15/51 - Loss: 2.5763 - Accuracy: 0.9301



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:16,  1.46s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.24it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.88it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.11it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.46it/s][A
Epochs:  29%|██▉       | 15/51 [06:40<15:57, 26.61s/it]

Val Loss: 3.0325 - Val Accuracy: 0.9216



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:56,  1.24s/it][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.47it/s][A
Training:   5%|▌         | 5/95 [00:02<00:33,  2.65it/s][A
Training:   6%|▋         | 6/95 [00:02<00:27,  3.20it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.78it/s][A
Training:   9%|▉         | 9/95 [00:02<00:26,  3.26it/s][A
Training:  11%|█         | 10/95 [00:03<00:23,  3.56it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:18,  4.60it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.42it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:22,  3.56it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:14,  5.30it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:21,  3.68it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:22,  3.43it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:16,  4.36it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:20,  3.51it/s][A
Training:  25%|██▌       | 24/95 [00:0

Epoch: 16/51 - Loss: 2.5007 - Accuracy: 0.9320



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.34s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.49it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.77it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.48it/s][A
Epochs:  31%|███▏      | 16/51 [07:08<15:35, 26.74s/it]

Val Loss: 2.5840 - Val Accuracy: 0.9302



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:26,  1.55s/it][A
Training:   3%|▎         | 3/95 [00:01<00:41,  2.22it/s][A
Training:   5%|▌         | 5/95 [00:02<00:38,  2.37it/s][A
Training:   7%|▋         | 7/95 [00:02<00:23,  3.73it/s][A
Training:   9%|▉         | 9/95 [00:03<00:29,  2.92it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.09it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.54it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:20,  3.82it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.65it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:19,  3.77it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:14,  4.81it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:18,  3.73it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:14,  4.67it/s][A
Training:  31%|███       | 29/95 [00:08<00:17,  3.75it/s][A
Training:  33%|███▎      | 31/95 [00:08<00:13,  4.88it/s][A
Training:  35%|███▍      | 33/95 [00:

Epoch: 17/51 - Loss: 2.4979 - Accuracy: 0.9306



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.36s/it][A
 17%|█▋        | 2/12 [00:01<00:06,  1.62it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.56it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.43it/s][A
Epochs:  33%|███▎      | 17/51 [07:34<15:08, 26.71s/it]

Val Loss: 2.6516 - Val Accuracy: 0.9216



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:11,  1.40s/it][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.39it/s][A
Training:   6%|▋         | 6/95 [00:02<00:30,  2.96it/s][A
Training:   8%|▊         | 8/95 [00:02<00:19,  4.36it/s][A
Training:  11%|█         | 10/95 [00:03<00:26,  3.23it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:22,  3.71it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.26it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:21,  3.74it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  5.16it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.30it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.73it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:21,  3.45it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:13,  5.40it/s][A
Training:  27%|██▋       | 26/95 [00:07<00:20,  3.34it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:15,  4.37it/s][A
Training:  32%|███▏      | 30/95 [00

Epoch: 18/51 - Loss: 2.4085 - Accuracy: 0.9339



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.29s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.50it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.94it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.93it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.47it/s][A
Epochs:  35%|███▌      | 18/51 [08:01<14:44, 26.80s/it]

Val Loss: 2.5884 - Val Accuracy: 0.9197



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:03,  1.31s/it][A
Training:   3%|▎         | 3/95 [00:01<00:35,  2.61it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.43it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.58it/s][A
Training:   9%|▉         | 9/95 [00:03<00:26,  3.27it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:21,  3.92it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:20,  3.95it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:16,  4.96it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:18,  4.23it/s][A
Training:  20%|██        | 19/95 [00:05<00:14,  5.13it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:18,  4.07it/s][A
Training:  24%|██▍       | 23/95 [00:05<00:14,  5.07it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:18,  3.82it/s][A
Training:  27%|██▋       | 26/95 [00:06<00:16,  4.25it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:15,  4.35it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 19/51 - Loss: 2.2835 - Accuracy: 0.9360



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.27s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.73it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.63it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.86it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.45it/s][A
Epochs:  37%|███▋      | 19/51 [08:28<14:18, 26.81s/it]

Val Loss: 2.8262 - Val Accuracy: 0.9267



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:02,  1.31s/it][A
Training:   2%|▏         | 2/95 [00:01<01:00,  1.55it/s][A
Training:   4%|▍         | 4/95 [00:01<00:25,  3.57it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.40it/s][A
Training:   6%|▋         | 6/95 [00:02<00:30,  2.93it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.35it/s][A
Training:  11%|█         | 10/95 [00:03<00:22,  3.71it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:16,  4.94it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:21,  3.86it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:20,  3.99it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:16,  4.80it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:18,  4.31it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:18,  4.17it/s][A
Training:  21%|██        | 20/95 [00:05<00:15,  4.83it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:16,  4.57it/s][A
Training:  23%|██▎       | 22/95 [00:0

Epoch: 20/51 - Loss: 2.3237 - Accuracy: 0.9377



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.21s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.77it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.69it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.26it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.48it/s][A
 83%|████████▎ | 10/12 [00:03<00:00,  3.79it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.48it/s][A
Epochs:  39%|███▉      | 20/51 [08:55<13:56, 26.99s/it]

Val Loss: 2.6500 - Val Accuracy: 0.9268



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:47,  1.14s/it][A
Training:   3%|▎         | 3/95 [00:01<00:31,  2.96it/s][A
Training:   5%|▌         | 5/95 [00:02<00:36,  2.47it/s][A
Training:   8%|▊         | 8/95 [00:02<00:19,  4.44it/s][A
Training:  11%|█         | 10/95 [00:03<00:25,  3.28it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.47it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.50it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:21,  3.62it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.70it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.64it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:15,  4.65it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:16,  4.42it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:20,  3.48it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:14,  4.67it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:14,  4.75it/s][A
Training:  31%|███       | 29/95 [00

Epoch: 21/51 - Loss: 2.4293 - Accuracy: 0.9384



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.29s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.65it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.75it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.11it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.75it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.42it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.46it/s][A
Epochs:  41%|████      | 21/51 [09:22<13:24, 26.82s/it]

Val Loss: 2.5584 - Val Accuracy: 0.9254



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:11,  1.40s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.37it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.41it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.35it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:17,  4.79it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:26,  3.10it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:19,  4.14it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.53it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.58it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.66it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:18,  3.72it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:14,  4.57it/s][A
Training:  31%|███       | 29/95 [00:08<00:17,  3.69it/s][A
Training:  34%|███▎      | 32/95 [00:08<00:11,  5.33it/s][A
Training:  36%|███▌      | 34/95 [00:09<00:15,  3.97it/s][A
Training:  39%|███▉      | 37/95 [00

Epoch: 22/51 - Loss: 2.3595 - Accuracy: 0.9357



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.39s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.39it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.72it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.96it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.47it/s][A
Epochs:  43%|████▎     | 22/51 [09:47<12:47, 26.45s/it]

Val Loss: 2.5126 - Val Accuracy: 0.9285



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:55,  1.23s/it][A
Training:   2%|▏         | 2/95 [00:01<00:55,  1.67it/s][A
Training:   5%|▌         | 5/95 [00:02<00:32,  2.78it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.16it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.40it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:20,  4.05it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:13,  5.74it/s][A
Training:  19%|█▉        | 18/95 [00:04<00:17,  4.40it/s][A
Training:  21%|██        | 20/95 [00:05<00:13,  5.38it/s][A
Training:  23%|██▎       | 22/95 [00:05<00:17,  4.24it/s][A
Training:  25%|██▌       | 24/95 [00:05<00:13,  5.21it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:16,  4.20it/s][A
Training:  27%|██▋       | 26/95 [00:06<00:16,  4.10it/s][A
Training:  29%|██▉       | 28/95 [00:06<00:12,  5.29it/s][A
Training:  31%|███       | 29/95 [00:07<00:17,  3.86it/s][A
Training:  32%|███▏      | 30/95 [00:

Epoch: 23/51 - Loss: 2.2915 - Accuracy: 0.9392



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.39s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.41it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.73it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.79it/s][A
 75%|███████▌  | 9/12 [00:03<00:01,  2.87it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.38it/s][A
Epochs:  45%|████▌     | 23/51 [10:13<12:14, 26.22s/it]

Val Loss: 2.5606 - Val Accuracy: 0.9278



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:16,  1.45s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.38it/s][A
Training:   5%|▌         | 5/95 [00:02<00:36,  2.45it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.57it/s][A
Training:   9%|▉         | 9/95 [00:03<00:28,  3.06it/s][A
Training:  11%|█         | 10/95 [00:03<00:24,  3.54it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.16it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:24,  3.36it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:16,  4.76it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.51it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.68it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:19,  3.87it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:13,  5.18it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:17,  3.95it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:13,  5.19it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 24/51 - Loss: 2.2973 - Accuracy: 0.9386



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:18,  1.72s/it][A
 25%|██▌       | 3/12 [00:01<00:04,  1.95it/s][A
 42%|████▏     | 5/12 [00:02<00:03,  2.14it/s][A
 50%|█████     | 6/12 [00:03<00:02,  2.44it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.11it/s][A
100%|██████████| 12/12 [00:04<00:00,  2.76it/s][A
Epochs:  47%|████▋     | 24/51 [10:41<12:00, 26.69s/it]

Val Loss: 2.5514 - Val Accuracy: 0.9264



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:01,  1.29s/it][A
Training:   3%|▎         | 3/95 [00:01<00:40,  2.29it/s][A
Training:   5%|▌         | 5/95 [00:02<00:32,  2.80it/s][A
Training:   6%|▋         | 6/95 [00:02<00:25,  3.44it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.56it/s][A
Training:   8%|▊         | 8/95 [00:02<00:20,  4.24it/s][A
Training:   9%|▉         | 9/95 [00:03<00:25,  3.36it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.12it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:20,  4.02it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.35it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:16,  4.78it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:20,  3.74it/s][A
Training:  20%|██        | 19/95 [00:05<00:18,  4.06it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:18,  3.92it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:16,  4.38it/s][A
Training:  26%|██▋       | 25/95 [00:06

Epoch: 25/51 - Loss: 2.4016 - Accuracy: 0.9391



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.34s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.50it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.12it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.57it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.64it/s][A
Epochs:  49%|████▉     | 25/51 [11:07<11:33, 26.66s/it]

Val Loss: 2.5284 - Val Accuracy: 0.9301



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:02,  1.30s/it][A
Training:   4%|▍         | 4/95 [00:01<00:25,  3.54it/s][A
Training:   6%|▋         | 6/95 [00:02<00:30,  2.89it/s][A
Training:   8%|▊         | 8/95 [00:02<00:20,  4.29it/s][A
Training:  11%|█         | 10/95 [00:03<00:26,  3.26it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:22,  3.76it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.17it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.44it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:20,  3.72it/s][A
Training:  20%|██        | 19/95 [00:05<00:15,  4.83it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:19,  3.74it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:14,  4.96it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:20,  3.47it/s][A
Training:  27%|██▋       | 26/95 [00:07<00:17,  3.90it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:12,  5.28it/s][A
Training:  32%|███▏      | 30/95 [00

Epoch: 26/51 - Loss: 2.3027 - Accuracy: 0.9397



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.26s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.43it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.45it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.06it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.68it/s][A
 75%|███████▌  | 9/12 [00:03<00:01,  2.85it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.52it/s][A
Epochs:  51%|█████     | 26/51 [11:33<10:55, 26.22s/it]

Val Loss: 2.5204 - Val Accuracy: 0.9263



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:20,  1.49s/it][A
Training:   3%|▎         | 3/95 [00:01<00:39,  2.30it/s][A
Training:   4%|▍         | 4/95 [00:01<00:29,  3.04it/s][A
Training:   5%|▌         | 5/95 [00:02<00:45,  1.99it/s][A
Training:   6%|▋         | 6/95 [00:02<00:33,  2.67it/s][A
Training:   8%|▊         | 8/95 [00:02<00:20,  4.32it/s][A
Training:   9%|▉         | 9/95 [00:03<00:32,  2.69it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.16it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:18,  4.43it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:29,  2.79it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:17,  4.49it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:24,  3.18it/s][A
Training:  21%|██        | 20/95 [00:05<00:14,  5.23it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:16,  4.32it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:12,  5.55it/s][A
Training:  27%|██▋       | 26/95 [00:07

Epoch: 27/51 - Loss: 2.2346 - Accuracy: 0.9403



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.36s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.56it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.38it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.76it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.38it/s][A
Epochs:  53%|█████▎    | 27/51 [12:00<10:36, 26.53s/it]

Val Loss: 2.5435 - Val Accuracy: 0.9302



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:09,  1.38s/it][A
Training:   3%|▎         | 3/95 [00:01<00:39,  2.33it/s][A
Training:   4%|▍         | 4/95 [00:01<00:29,  3.05it/s][A
Training:   5%|▌         | 5/95 [00:02<00:36,  2.45it/s][A
Training:   7%|▋         | 7/95 [00:02<00:25,  3.42it/s][A
Training:   9%|▉         | 9/95 [00:03<00:24,  3.45it/s][A
Training:  11%|█         | 10/95 [00:03<00:23,  3.58it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:19,  4.22it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:22,  3.61it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:19,  4.19it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.62it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.54it/s][A
Training:  20%|██        | 19/95 [00:05<00:15,  4.87it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.63it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:14,  4.88it/s][A
Training:  26%|██▋       | 25/95 [00:0

Epoch: 28/51 - Loss: 2.3987 - Accuracy: 0.9356



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.30s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.62it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.89it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.24it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.62it/s][A
Epochs:  55%|█████▍    | 28/51 [12:27<10:10, 26.56s/it]

Val Loss: 2.7310 - Val Accuracy: 0.9297



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:13,  1.42s/it][A
Training:   3%|▎         | 3/95 [00:01<00:37,  2.46it/s][A
Training:   5%|▌         | 5/95 [00:02<00:35,  2.53it/s][A
Training:   7%|▋         | 7/95 [00:02<00:22,  3.95it/s][A
Training:   9%|▉         | 9/95 [00:03<00:26,  3.26it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:19,  4.41it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:17,  4.76it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:27,  3.03it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.23it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.43it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.74it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:21,  3.42it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:16,  4.45it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:20,  3.47it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:14,  4.55it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 29/51 - Loss: 2.3396 - Accuracy: 0.9383



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.32s/it][A
 33%|███▎      | 4/12 [00:01<00:02,  3.47it/s][A
 50%|█████     | 6/12 [00:02<00:02,  2.72it/s][A
 75%|███████▌  | 9/12 [00:03<00:01,  2.93it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.36it/s][A
Epochs:  57%|█████▋    | 29/51 [12:53<09:42, 26.49s/it]

Val Loss: 2.6607 - Val Accuracy: 0.9287



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:03,  1.31s/it][A
Training:   4%|▍         | 4/95 [00:01<00:26,  3.38it/s][A
Training:   6%|▋         | 6/95 [00:02<00:29,  3.06it/s][A
Training:   8%|▊         | 8/95 [00:02<00:19,  4.37it/s][A
Training:   9%|▉         | 9/95 [00:03<00:30,  2.85it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:21,  3.98it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.43it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.24it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:23,  3.38it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:20,  3.85it/s][A
Training:  20%|██        | 19/95 [00:05<00:17,  4.35it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:22,  3.32it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:16,  4.48it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:18,  3.72it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:13,  4.97it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 30/51 - Loss: 2.2883 - Accuracy: 0.9381



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.31s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.55it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.56it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.67it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.47it/s][A
Epochs:  59%|█████▉    | 30/51 [13:19<09:14, 26.38s/it]

Val Loss: 2.4748 - Val Accuracy: 0.9302



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:02,  1.30s/it][A
Training:   3%|▎         | 3/95 [00:01<00:34,  2.63it/s][A
Training:   5%|▌         | 5/95 [00:02<00:33,  2.72it/s][A
Training:   6%|▋         | 6/95 [00:02<00:27,  3.26it/s][A
Training:   8%|▊         | 8/95 [00:02<00:17,  4.99it/s][A
Training:  11%|█         | 10/95 [00:03<00:25,  3.35it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:21,  3.85it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:21,  3.86it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:17,  4.56it/s][A
Training:  20%|██        | 19/95 [00:04<00:13,  5.75it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:15,  4.86it/s][A
Training:  25%|██▌       | 24/95 [00:05<00:10,  6.57it/s][A
Training:  27%|██▋       | 26/95 [00:06<00:14,  4.60it/s][A
Training:  29%|██▉       | 28/95 [00:06<00:11,  5.78it/s][A
Training:  32%|███▏      | 30/95 [00:07<00:16,  3.99it/s][A
Training:  34%|███▎      | 32/95 [00:

Epoch: 31/51 - Loss: 2.3096 - Accuracy: 0.9385



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.37s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.54it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.56it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.93it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.49it/s][A
Epochs:  61%|██████    | 31/51 [13:45<08:45, 26.27s/it]

Val Loss: 2.5197 - Val Accuracy: 0.9297



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:57,  1.25s/it][A
Training:   2%|▏         | 2/95 [00:01<00:54,  1.72it/s][A
Training:   4%|▍         | 4/95 [00:01<00:23,  3.88it/s][A
Training:   5%|▌         | 5/95 [00:02<00:34,  2.60it/s][A
Training:   6%|▋         | 6/95 [00:02<00:27,  3.20it/s][A
Training:   8%|▊         | 8/95 [00:02<00:17,  5.09it/s][A
Training:   9%|▉         | 9/95 [00:03<00:27,  3.18it/s][A
Training:  11%|█         | 10/95 [00:03<00:24,  3.43it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:15,  5.32it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:21,  3.82it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  5.26it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:20,  3.77it/s][A
Training:  20%|██        | 19/95 [00:05<00:18,  4.14it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:21,  3.48it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:15,  4.55it/s][A
Training:  26%|██▋       | 25/95 [00:06

Epoch: 32/51 - Loss: 2.4618 - Accuracy: 0.9334



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.42s/it][A
 17%|█▋        | 2/12 [00:01<00:06,  1.53it/s][A
 25%|██▌       | 3/12 [00:01<00:03,  2.45it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.56it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.22it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  5.01it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.43it/s][A
Epochs:  63%|██████▎   | 32/51 [14:11<08:19, 26.29s/it]

Val Loss: 2.4844 - Val Accuracy: 0.9285



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:01,  1.29s/it][A
Training:   3%|▎         | 3/95 [00:01<00:40,  2.29it/s][A
Training:   4%|▍         | 4/95 [00:01<00:29,  3.07it/s][A
Training:   5%|▌         | 5/95 [00:02<00:35,  2.55it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.17it/s][A
Training:   8%|▊         | 8/95 [00:02<00:16,  5.29it/s][A
Training:  11%|█         | 10/95 [00:03<00:22,  3.73it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.08it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:24,  3.38it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.34it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:20,  3.90it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:19,  4.05it/s][A
Training:  20%|██        | 19/95 [00:05<00:17,  4.30it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:18,  4.04it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:18,  3.93it/s][A
Training:  24%|██▍       | 23/95 [00:0

Epoch: 33/51 - Loss: 2.3530 - Accuracy: 0.9362



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.44s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.37it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.41it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.83it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.42it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.44it/s][A
Epochs:  65%|██████▍   | 33/51 [14:38<07:53, 26.33s/it]

Val Loss: 2.5713 - Val Accuracy: 0.9273



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:12,  1.41s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.38it/s][A
Training:   4%|▍         | 4/95 [00:01<00:28,  3.22it/s][A
Training:   5%|▌         | 5/95 [00:02<00:36,  2.45it/s][A
Training:   7%|▋         | 7/95 [00:02<00:21,  4.10it/s][A
Training:   9%|▉         | 9/95 [00:03<00:24,  3.58it/s][A
Training:  11%|█         | 10/95 [00:03<00:21,  4.01it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:14,  5.78it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:22,  3.66it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  4.99it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:21,  3.53it/s][A
Training:  21%|██        | 20/95 [00:05<00:15,  4.73it/s][A
Training:  23%|██▎       | 22/95 [00:06<00:19,  3.72it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:15,  4.58it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:17,  3.93it/s][A
Training:  27%|██▋       | 26/95 [00:0

Epoch: 34/51 - Loss: 2.2785 - Accuracy: 0.9383



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.41s/it][A
 17%|█▋        | 2/12 [00:01<00:06,  1.56it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.81it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.19it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.97it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.52it/s][A
Epochs:  67%|██████▋   | 34/51 [15:04<07:24, 26.17s/it]

Val Loss: 2.6998 - Val Accuracy: 0.9275



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:00,  1.29s/it][A
Training:   4%|▍         | 4/95 [00:01<00:25,  3.59it/s][A
Training:   6%|▋         | 6/95 [00:02<00:31,  2.86it/s][A
Training:   8%|▊         | 8/95 [00:02<00:20,  4.26it/s][A
Training:  11%|█         | 10/95 [00:03<00:24,  3.52it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:21,  3.75it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:20,  3.79it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.63it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:18,  3.90it/s][A
Training:  24%|██▍       | 23/95 [00:05<00:15,  4.79it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:18,  3.84it/s][A
Training:  28%|██▊       | 27/95 [00:06<00:13,  4.87it/s][A
Training:  31%|███       | 29/95 [00:07<00:17,  3.83it/s][A
Training:  35%|███▍      | 33/95 [00:08<00:15,  3.91it/s][A
Training:  38%|███▊      | 36/95 [00:08<00:11,  5.32it/s][A
Training:  40%|████      | 38/95 [00

Epoch: 35/51 - Loss: 2.2071 - Accuracy: 0.9397



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:11,  1.04s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.75it/s][A
 42%|████▏     | 5/12 [00:01<00:02,  2.89it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.48it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.94it/s][A
 75%|███████▌  | 9/12 [00:02<00:00,  3.44it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.73it/s][A
Epochs:  69%|██████▊   | 35/51 [15:29<06:56, 26.01s/it]

Val Loss: 2.4787 - Val Accuracy: 0.9309



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:05,  1.33s/it][A
Training:   5%|▌         | 5/95 [00:02<00:35,  2.52it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.65it/s][A
Training:   9%|▉         | 9/95 [00:03<00:29,  2.91it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:23,  3.46it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:18,  4.41it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:21,  3.60it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.62it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:19,  3.75it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:17,  3.96it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:12,  5.40it/s][A
Training:  32%|███▏      | 30/95 [00:07<00:14,  4.43it/s][A
Training:  35%|███▍      | 33/95 [00:08<00:14,  4.15it/s][A
Training:  36%|███▌      | 34/95 [00:08<00:13,  4.51it/s][A
Training:  38%|███▊      | 36/95 [00:09<00:10,  5.38it/s][A
Training:  39%|███▉      | 37/95 [00

Epoch: 36/51 - Loss: 2.2197 - Accuracy: 0.9401



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:15,  1.44s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.37it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.47it/s][A
 58%|█████▊    | 7/12 [00:02<00:01,  3.89it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.13it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.39it/s][A
Epochs:  71%|███████   | 36/51 [15:55<06:29, 25.99s/it]

Val Loss: 2.6028 - Val Accuracy: 0.9308



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:15,  1.45s/it][A
Training:   3%|▎         | 3/95 [00:01<00:39,  2.32it/s][A
Training:   5%|▌         | 5/95 [00:02<00:37,  2.42it/s][A
Training:   6%|▋         | 6/95 [00:02<00:29,  3.01it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.69it/s][A
Training:  11%|█         | 10/95 [00:03<00:24,  3.53it/s][A
Training:  12%|█▏        | 11/95 [00:03<00:20,  4.08it/s][A
Training:  14%|█▎        | 13/95 [00:04<00:25,  3.28it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.47it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.49it/s][A
Training:  20%|██        | 19/95 [00:05<00:16,  4.73it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:22,  3.23it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:17,  4.23it/s][A
Training:  26%|██▋       | 25/95 [00:07<00:20,  3.46it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:15,  4.53it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 37/51 - Loss: 2.1589 - Accuracy: 0.9392



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:14,  1.32s/it][A
 25%|██▌       | 3/12 [00:01<00:03,  2.62it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.79it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.26it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  5.09it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.46it/s][A
Epochs:  73%|███████▎  | 37/51 [16:22<06:05, 26.11s/it]

Val Loss: 2.5338 - Val Accuracy: 0.9291



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:56,  1.24s/it][A
Training:   2%|▏         | 2/95 [00:01<00:54,  1.72it/s][A
Training:   4%|▍         | 4/95 [00:01<00:22,  3.99it/s][A
Training:   6%|▋         | 6/95 [00:02<00:28,  3.14it/s][A
Training:   8%|▊         | 8/95 [00:02<00:18,  4.68it/s][A
Training:  11%|█         | 10/95 [00:03<00:23,  3.67it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:22,  3.62it/s][A
Training:  16%|█▌        | 15/95 [00:04<00:17,  4.61it/s][A
Training:  18%|█▊        | 17/95 [00:04<00:21,  3.64it/s][A
Training:  19%|█▉        | 18/95 [00:05<00:19,  4.02it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:17,  4.25it/s][A
Training:  23%|██▎       | 22/95 [00:05<00:16,  4.40it/s][A
Training:  25%|██▌       | 24/95 [00:06<00:12,  5.82it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:16,  4.24it/s][A
Training:  27%|██▋       | 26/95 [00:06<00:15,  4.35it/s][A
Training:  29%|██▉       | 28/95 [00:

Epoch: 38/51 - Loss: 2.3301 - Accuracy: 0.9337



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:13,  1.22s/it][A
 17%|█▋        | 2/12 [00:01<00:05,  1.70it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  3.09it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.36it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  4.95it/s][A
 75%|███████▌  | 9/12 [00:02<00:00,  3.56it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.60it/s][A
Epochs:  75%|███████▍  | 38/51 [16:46<05:33, 25.65s/it]

Val Loss: 2.4980 - Val Accuracy: 0.9293



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:04,  1.33s/it][A
Training:   3%|▎         | 3/95 [00:01<00:35,  2.57it/s][A
Training:   5%|▌         | 5/95 [00:02<00:34,  2.61it/s][A
Training:   9%|▉         | 9/95 [00:02<00:23,  3.68it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:14,  5.56it/s][A
Training:  15%|█▍        | 14/95 [00:03<00:19,  4.11it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:15,  4.98it/s][A
Training:  19%|█▉        | 18/95 [00:04<00:19,  3.89it/s][A
Training:  20%|██        | 19/95 [00:05<00:17,  4.33it/s][A
Training:  22%|██▏       | 21/95 [00:05<00:21,  3.49it/s][A
Training:  24%|██▍       | 23/95 [00:05<00:15,  4.58it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:17,  4.03it/s][A
Training:  28%|██▊       | 27/95 [00:06<00:13,  5.22it/s][A
Training:  31%|███       | 29/95 [00:07<00:17,  3.86it/s][A
Training:  33%|███▎      | 31/95 [00:07<00:12,  5.06it/s][A
Training:  35%|███▍      | 33/95 [00

Epoch: 39/51 - Loss: 2.2445 - Accuracy: 0.9374



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.18s/it][A
 17%|█▋        | 2/12 [00:01<00:05,  1.80it/s][A
 33%|███▎      | 4/12 [00:01<00:02,  3.94it/s][A
 42%|████▏     | 5/12 [00:02<00:02,  2.69it/s][A
 50%|█████     | 6/12 [00:02<00:01,  3.26it/s][A
 67%|██████▋   | 8/12 [00:02<00:00,  5.03it/s][A
 75%|███████▌  | 9/12 [00:03<00:00,  3.31it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.61it/s][A
Epochs:  76%|███████▋  | 39/51 [17:11<05:06, 25.51s/it]

Val Loss: 2.5504 - Val Accuracy: 0.9306



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<02:09,  1.38s/it][A
Training:   3%|▎         | 3/95 [00:01<00:38,  2.42it/s][A
Training:   5%|▌         | 5/95 [00:02<00:38,  2.31it/s][A
Training:   7%|▋         | 7/95 [00:02<00:24,  3.54it/s][A
Training:   9%|▉         | 9/95 [00:03<00:29,  2.92it/s][A
Training:  13%|█▎        | 12/95 [00:03<00:17,  4.68it/s][A
Training:  15%|█▍        | 14/95 [00:04<00:21,  3.69it/s][A
Training:  17%|█▋        | 16/95 [00:04<00:17,  4.58it/s][A
Training:  18%|█▊        | 17/95 [00:05<00:22,  3.54it/s][A
Training:  21%|██        | 20/95 [00:05<00:14,  5.31it/s][A
Training:  22%|██▏       | 21/95 [00:06<00:20,  3.54it/s][A
Training:  24%|██▍       | 23/95 [00:06<00:15,  4.75it/s][A
Training:  26%|██▋       | 25/95 [00:06<00:17,  4.01it/s][A
Training:  28%|██▊       | 27/95 [00:07<00:13,  5.15it/s][A
Training:  29%|██▉       | 28/95 [00:07<00:11,  5.63it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 40/51 - Loss: 2.2263 - Accuracy: 0.9363



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.17s/it][A
 42%|████▏     | 5/12 [00:01<00:02,  3.15it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.44it/s][A
Epochs:  78%|███████▊  | 40/51 [17:35<04:34, 24.98s/it]

Val Loss: 2.5012 - Val Accuracy: 0.9276



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:44,  1.11s/it][A
Training:   5%|▌         | 5/95 [00:01<00:27,  3.25it/s][A
Training:   7%|▋         | 7/95 [00:01<00:18,  4.66it/s][A
Training:   9%|▉         | 9/95 [00:02<00:22,  3.88it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:18,  4.42it/s][A
Training:  17%|█▋        | 16/95 [00:03<00:12,  6.26it/s][A
Training:  19%|█▉        | 18/95 [00:04<00:14,  5.27it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:15,  4.79it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:13,  5.03it/s][A
Training:  28%|██▊       | 27/95 [00:05<00:11,  5.96it/s][A
Training:  31%|███       | 29/95 [00:06<00:12,  5.23it/s][A
Training:  34%|███▎      | 32/95 [00:06<00:08,  7.08it/s][A
Training:  36%|███▌      | 34/95 [00:06<00:10,  5.80it/s][A
Training:  39%|███▉      | 37/95 [00:07<00:11,  5.26it/s][A
Training:  43%|████▎     | 41/95 [00:08<00:09,  5.49it/s][A
Training:  45%|████▌     | 43/95 [00

Epoch: 41/51 - Loss: 2.1789 - Accuracy: 0.9407



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:10,  1.09it/s][A
 33%|███▎      | 4/12 [00:01<00:01,  4.90it/s][A
 50%|█████     | 6/12 [00:01<00:01,  4.16it/s][A
 67%|██████▋   | 8/12 [00:01<00:00,  5.97it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.71it/s][A
Epochs:  80%|████████  | 41/51 [17:55<03:54, 23.47s/it]

Val Loss: 2.6144 - Val Accuracy: 0.9298



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:34,  1.01s/it][A
Training:   5%|▌         | 5/95 [00:01<00:25,  3.58it/s][A
Training:   9%|▉         | 9/95 [00:02<00:20,  4.25it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:17,  4.77it/s][A
Training:  16%|█▌        | 15/95 [00:03<00:14,  5.59it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:16,  4.83it/s][A
Training:  21%|██        | 20/95 [00:03<00:11,  6.73it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:14,  5.19it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:14,  4.90it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:13,  5.24it/s][A
Training:  31%|███       | 29/95 [00:06<00:13,  4.85it/s][A
Training:  33%|███▎      | 31/95 [00:06<00:10,  6.00it/s][A
Training:  35%|███▍      | 33/95 [00:06<00:11,  5.56it/s][A
Training:  36%|███▌      | 34/95 [00:06<00:11,  5.34it/s][A
Training:  39%|███▉      | 37/95 [00:07<00:10,  5.65it/s][A
Training:  40%|████      | 38/95 [0

Epoch: 42/51 - Loss: 2.2291 - Accuracy: 0.9380



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.17s/it][A
 42%|████▏     | 5/12 [00:01<00:02,  3.11it/s][A
 58%|█████▊    | 7/12 [00:01<00:01,  4.47it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.36it/s][A
Epochs:  82%|████████▏ | 42/51 [18:15<03:22, 22.51s/it]

Val Loss: 2.6323 - Val Accuracy: 0.9254



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:41,  1.08s/it][A
Training:   5%|▌         | 5/95 [00:01<00:27,  3.25it/s][A
Training:   9%|▉         | 9/95 [00:02<00:18,  4.54it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:15,  5.14it/s][A
Training:  17%|█▋        | 16/95 [00:03<00:11,  6.81it/s][A
Training:  19%|█▉        | 18/95 [00:03<00:14,  5.31it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:13,  5.30it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:13,  5.45it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:13,  5.22it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:12,  5.34it/s][A
Training:  29%|██▉       | 28/95 [00:05<00:10,  6.66it/s][A
Training:  31%|███       | 29/95 [00:05<00:13,  4.92it/s][A
Training:  33%|███▎      | 31/95 [00:06<00:10,  6.35it/s][A
Training:  35%|███▍      | 33/95 [00:06<00:11,  5.20it/s][A
Training:  37%|███▋      | 35/95 [00:06<00:09,  6.65it/s][A
Training:  39%|███▉      | 37/95 [0

Epoch: 43/51 - Loss: 2.1638 - Accuracy: 0.9398



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:10,  1.03it/s][A
 42%|████▏     | 5/12 [00:01<00:02,  3.39it/s][A
 58%|█████▊    | 7/12 [00:01<00:01,  4.73it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.71it/s][A
Epochs:  84%|████████▍ | 43/51 [18:35<02:53, 21.74s/it]

Val Loss: 2.5763 - Val Accuracy: 0.9312



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:39,  1.06s/it][A
Training:   5%|▌         | 5/95 [00:01<00:28,  3.21it/s][A
Training:   7%|▋         | 7/95 [00:01<00:18,  4.65it/s][A
Training:   9%|▉         | 9/95 [00:02<00:21,  3.94it/s][A
Training:  12%|█▏        | 11/95 [00:02<00:15,  5.36it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:17,  4.56it/s][A
Training:  16%|█▌        | 15/95 [00:03<00:13,  6.00it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:16,  4.69it/s][A
Training:  21%|██        | 20/95 [00:04<00:10,  6.98it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:14,  5.07it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:15,  4.50it/s][A
Training:  29%|██▉       | 28/95 [00:05<00:10,  6.34it/s][A
Training:  32%|███▏      | 30/95 [00:06<00:13,  4.89it/s][A
Training:  35%|███▍      | 33/95 [00:07<00:13,  4.72it/s][A
Training:  39%|███▉      | 37/95 [00:07<00:11,  5.03it/s][A
Training:  42%|████▏     | 40/95 [00

Epoch: 44/51 - Loss: 2.1815 - Accuracy: 0.9434



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:10,  1.02it/s][A
 33%|███▎      | 4/12 [00:01<00:01,  4.67it/s][A
 50%|█████     | 6/12 [00:01<00:01,  3.88it/s][A
 67%|██████▋   | 8/12 [00:01<00:00,  5.61it/s][A
 83%|████████▎ | 10/12 [00:02<00:00,  4.48it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.54it/s][A
Epochs:  86%|████████▋ | 44/51 [18:56<02:29, 21.42s/it]

Val Loss: 2.8608 - Val Accuracy: 0.9228



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:42,  1.09s/it][A
Training:   3%|▎         | 3/95 [00:01<00:29,  3.11it/s][A
Training:   5%|▌         | 5/95 [00:01<00:26,  3.43it/s][A
Training:   7%|▋         | 7/95 [00:01<00:17,  5.12it/s][A
Training:   9%|▉         | 9/95 [00:02<00:20,  4.19it/s][A
Training:  12%|█▏        | 11/95 [00:02<00:15,  5.52it/s][A
Training:  13%|█▎        | 12/95 [00:02<00:13,  5.98it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:18,  4.32it/s][A
Training:  16%|█▌        | 15/95 [00:03<00:13,  5.86it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:17,  4.52it/s][A
Training:  20%|██        | 19/95 [00:04<00:13,  5.81it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:14,  4.94it/s][A
Training:  24%|██▍       | 23/95 [00:04<00:12,  5.64it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:14,  4.80it/s][A
Training:  28%|██▊       | 27/95 [00:05<00:11,  5.88it/s][A
Training:  31%|███       | 29/95 [00:

Epoch: 45/51 - Loss: 2.1888 - Accuracy: 0.9387



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:10,  1.04it/s][A
 42%|████▏     | 5/12 [00:01<00:01,  3.51it/s][A
 58%|█████▊    | 7/12 [00:01<00:00,  5.10it/s][A
 75%|███████▌  | 9/12 [00:02<00:00,  4.26it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.75it/s][A
Epochs:  88%|████████▊ | 45/51 [19:16<02:06, 21.04s/it]

Val Loss: 2.6257 - Val Accuracy: 0.9291



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:00<01:32,  1.01it/s][A
Training:   5%|▌         | 5/95 [00:01<00:26,  3.41it/s][A
Training:   6%|▋         | 6/95 [00:01<00:22,  3.97it/s][A
Training:   9%|▉         | 9/95 [00:02<00:19,  4.32it/s][A
Training:  11%|█         | 10/95 [00:02<00:17,  4.73it/s][A
Training:  13%|█▎        | 12/95 [00:02<00:13,  6.28it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:17,  4.67it/s][A
Training:  15%|█▍        | 14/95 [00:03<00:16,  4.78it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:15,  5.08it/s][A
Training:  19%|█▉        | 18/95 [00:04<00:15,  5.04it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:14,  5.23it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:13,  5.29it/s][A
Training:  25%|██▌       | 24/95 [00:04<00:10,  6.96it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:14,  4.83it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:13,  5.17it/s][A
Training:  31%|███       | 29/95 [00

Epoch: 46/51 - Loss: 2.3178 - Accuracy: 0.9402



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:11,  1.01s/it][A
 25%|██▌       | 3/12 [00:01<00:02,  3.22it/s][A
 42%|████▏     | 5/12 [00:01<00:02,  3.29it/s][A
 67%|██████▋   | 8/12 [00:01<00:00,  5.65it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.53it/s][A
Epochs:  90%|█████████ | 46/51 [19:36<01:43, 20.79s/it]

Val Loss: 2.8061 - Val Accuracy: 0.9247



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:47,  1.14s/it][A
Training:   3%|▎         | 3/95 [00:01<00:30,  2.98it/s][A
Training:   5%|▌         | 5/95 [00:01<00:26,  3.34it/s][A
Training:   9%|▉         | 9/95 [00:02<00:18,  4.66it/s][A
Training:  12%|█▏        | 11/95 [00:02<00:14,  5.98it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:16,  4.92it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:14,  5.56it/s][A
Training:  20%|██        | 19/95 [00:03<00:11,  6.65it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:13,  5.51it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:12,  5.86it/s][A
Training:  25%|██▌       | 24/95 [00:04<00:09,  7.49it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:13,  5.30it/s][A
Training:  29%|██▉       | 28/95 [00:05<00:10,  6.63it/s][A
Training:  32%|███▏      | 30/95 [00:05<00:12,  5.33it/s][A
Training:  35%|███▍      | 33/95 [00:06<00:11,  5.28it/s][A
Training:  37%|███▋      | 35/95 [00

Epoch: 47/51 - Loss: 2.2754 - Accuracy: 0.9380



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:11,  1.02s/it][A
 42%|████▏     | 5/12 [00:01<00:02,  3.09it/s][A
 67%|██████▋   | 8/12 [00:01<00:00,  5.38it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.62it/s][A
Epochs:  92%|█████████▏| 47/51 [19:56<01:22, 20.61s/it]

Val Loss: 2.5203 - Val Accuracy: 0.9231



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:40,  1.06s/it][A
Training:   3%|▎         | 3/95 [00:01<00:30,  3.00it/s][A
Training:   5%|▌         | 5/95 [00:01<00:28,  3.21it/s][A
Training:   8%|▊         | 8/95 [00:01<00:14,  6.01it/s][A
Training:  11%|█         | 10/95 [00:02<00:16,  5.02it/s][A
Training:  13%|█▎        | 12/95 [00:02<00:13,  6.13it/s][A
Training:  15%|█▍        | 14/95 [00:03<00:17,  4.63it/s][A
Training:  17%|█▋        | 16/95 [00:03<00:13,  5.99it/s][A
Training:  19%|█▉        | 18/95 [00:03<00:15,  4.89it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:15,  4.74it/s][A
Training:  25%|██▌       | 24/95 [00:04<00:10,  6.59it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:13,  5.15it/s][A
Training:  31%|███       | 29/95 [00:05<00:12,  5.19it/s][A
Training:  32%|███▏      | 30/95 [00:06<00:12,  5.20it/s][A
Training:  35%|███▍      | 33/95 [00:06<00:11,  5.43it/s][A
Training:  36%|███▌      | 34/95 [00

Epoch: 48/51 - Loss: 2.1661 - Accuracy: 0.9388



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:11,  1.04s/it][A
 42%|████▏     | 5/12 [00:01<00:02,  3.02it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.36it/s][A
Epochs:  94%|█████████▍| 48/51 [20:17<01:01, 20.61s/it]

Val Loss: 2.5293 - Val Accuracy: 0.9268



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:00<01:31,  1.02it/s][A
Training:   4%|▍         | 4/95 [00:01<00:19,  4.67it/s][A
Training:   6%|▋         | 6/95 [00:01<00:23,  3.79it/s][A
Training:   9%|▉         | 9/95 [00:02<00:20,  4.17it/s][A
Training:  11%|█         | 10/95 [00:02<00:18,  4.62it/s][A
Training:  13%|█▎        | 12/95 [00:02<00:13,  6.26it/s][A
Training:  15%|█▍        | 14/95 [00:03<00:16,  5.03it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:15,  5.12it/s][A
Training:  19%|█▉        | 18/95 [00:03<00:14,  5.27it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:13,  5.32it/s][A
Training:  23%|██▎       | 22/95 [00:04<00:13,  5.33it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:12,  5.45it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:12,  5.42it/s][A
Training:  29%|██▉       | 28/95 [00:05<00:09,  6.78it/s][A
Training:  31%|███       | 29/95 [00:05<00:13,  5.07it/s][A
Training:  32%|███▏      | 30/95 [00

Epoch: 49/51 - Loss: 2.2597 - Accuracy: 0.9386



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.15s/it][A
 33%|███▎      | 4/12 [00:01<00:01,  4.02it/s][A
 50%|█████     | 6/12 [00:01<00:01,  3.54it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.54it/s][A
Epochs:  96%|█████████▌| 49/51 [20:37<00:40, 20.42s/it]

Val Loss: 2.5867 - Val Accuracy: 0.9276



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:01<01:43,  1.10s/it][A
Training:   3%|▎         | 3/95 [00:01<00:30,  3.05it/s][A
Training:   5%|▌         | 5/95 [00:01<00:29,  3.05it/s][A
Training:   8%|▊         | 8/95 [00:02<00:15,  5.60it/s][A
Training:  11%|█         | 10/95 [00:02<00:18,  4.52it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:17,  4.56it/s][A
Training:  16%|█▌        | 15/95 [00:03<00:15,  5.26it/s][A
Training:  18%|█▊        | 17/95 [00:03<00:15,  4.97it/s][A
Training:  20%|██        | 19/95 [00:04<00:12,  5.86it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:14,  5.08it/s][A
Training:  24%|██▍       | 23/95 [00:04<00:11,  6.07it/s][A
Training:  26%|██▋       | 25/95 [00:05<00:13,  5.14it/s][A
Training:  28%|██▊       | 27/95 [00:05<00:10,  6.44it/s][A
Training:  31%|███       | 29/95 [00:05<00:11,  5.56it/s][A
Training:  33%|███▎      | 31/95 [00:06<00:10,  5.88it/s][A
Training:  35%|███▍      | 33/95 [00

Epoch: 50/51 - Loss: 2.1268 - Accuracy: 0.9410



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:09,  1.14it/s][A
 17%|█▋        | 2/12 [00:01<00:04,  2.14it/s][A
 42%|████▏     | 5/12 [00:01<00:01,  4.00it/s][A
 50%|█████     | 6/12 [00:01<00:01,  4.01it/s][A
 67%|██████▋   | 8/12 [00:01<00:00,  5.91it/s][A
 75%|███████▌  | 9/12 [00:02<00:00,  4.12it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.53it/s][A
Epochs:  98%|█████████▊| 50/51 [20:57<00:20, 20.32s/it]

Val Loss: 3.8427 - Val Accuracy: 0.8887



Training:   0%|          | 0/95 [00:00<?, ?it/s][A
Training:   1%|          | 1/95 [00:00<01:30,  1.04it/s][A
Training:   3%|▎         | 3/95 [00:01<00:27,  3.39it/s][A
Training:   5%|▌         | 5/95 [00:01<00:28,  3.21it/s][A
Training:   7%|▋         | 7/95 [00:01<00:18,  4.73it/s][A
Training:   9%|▉         | 9/95 [00:02<00:21,  4.07it/s][A
Training:  12%|█▏        | 11/95 [00:02<00:15,  5.56it/s][A
Training:  14%|█▎        | 13/95 [00:03<00:16,  4.94it/s][A
Training:  17%|█▋        | 16/95 [00:03<00:10,  7.46it/s][A
Training:  19%|█▉        | 18/95 [00:03<00:12,  6.06it/s][A
Training:  22%|██▏       | 21/95 [00:04<00:13,  5.63it/s][A
Training:  24%|██▍       | 23/95 [00:04<00:10,  6.83it/s][A
Training:  26%|██▋       | 25/95 [00:04<00:12,  5.43it/s][A
Training:  27%|██▋       | 26/95 [00:05<00:12,  5.50it/s][A
Training:  31%|███       | 29/95 [00:05<00:11,  5.75it/s][A
Training:  32%|███▏      | 30/95 [00:05<00:12,  5.39it/s][A
Training:  35%|███▍      | 33/95 [00:

Epoch: 51/51 - Loss: 2.3133 - Accuracy: 0.9365



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:10,  1.03it/s][A
 33%|███▎      | 4/12 [00:01<00:01,  4.54it/s][A
 50%|█████     | 6/12 [00:01<00:01,  3.75it/s][A
100%|██████████| 12/12 [00:02<00:00,  4.77it/s][A
Epochs: 100%|██████████| 51/51 [21:17<00:00, 25.05s/it]


Val Loss: 2.6536 - Val Accuracy: 0.9214


[32m[I 2023-12-14 12:00:42,880][0m Trial 18 finished with value: 0.921368420124054 and parameters: {'loss_learning_rate': 0.0029646983161703507, 'learning_rate': 0.0013293119848171884, 'weight_decay': 0.00038483722607233036, 'epsilon': 1.4163547089060394e-09, 'batch_size': 148, 'epochs': 51}. Best is trial 3 with value: 0.9284737706184387.[0m


Learning rate for Loss: 0.0001811448599871288
Learning rate: 0.09241933688157715
Weight decay: 0.0030758549603975196
Epsilon: 2.4542434170139457e-09
Batch size: 184
Number of epochs: 10


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]
Training:   0%|          | 0/76 [00:00<?, ?it/s][A
Training:   1%|▏         | 1/76 [00:01<01:33,  1.24s/it][A
Training:   3%|▎         | 2/76 [00:01<00:43,  1.71it/s][A
Training:   7%|▋         | 5/76 [00:02<00:26,  2.68it/s][A
Training:  12%|█▏        | 9/76 [00:03<00:18,  3.58it/s][A
Training:  17%|█▋        | 13/76 [00:04<00:16,  3.77it/s][A
Training:  21%|██        | 16/76 [00:04<00:11,  5.19it/s][A
Training:  24%|██▎       | 18/76 [00:04<00:13,  4.23it/s][A
Training:  26%|██▋       | 20/76 [00:05<00:10,  5.21it/s][A
Training:  29%|██▉       | 22/76 [00:05<00:12,  4.33it/s][A
Training:  33%|███▎      | 25/76 [00:06<00:12,  4.02it/s][A
Training:  34%|███▍      | 26/76 [00:06<00:11,  4.30it/s][A
Training:  38%|███▊      | 29/76 [00:07<00:10,  4.28it/s][A
Training:  39%|███▉      | 30/76 [00:07<00:10,  4.58it/s][A
Training:  43%|████▎     | 33/76 [00:08<00:09,  4.57it/s][A
Training:  45%|████▍     | 34/76 [00:08<00:09,  4.5

Epoch: 1/10 - Loss: 149.2462 - Accuracy: 0.7103



  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:01<00:11,  1.27s/it][A
 40%|████      | 4/10 [00:01<00:01,  3.70it/s][A
 60%|██████    | 6/10 [00:02<00:01,  3.19it/s][A
100%|██████████| 10/10 [00:03<00:00,  3.30it/s][A
Epochs:   0%|          | 0/10 [00:20<?, ?it/s]
[32m[I 2023-12-14 12:01:03,856][0m Trial 19 pruned. [0m


Val Loss: 7.4951 - Val Accuracy: 0.7756

Study statistics: 
  Number of finished trials:  20
  Number of pruned trials:  13
  Number of complete trials:  7


In [22]:
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value:  0.9284737706184387
  Params: 
    batch_size: 129
    epochs: 72
    epsilon: 2.733741931810298e-09
    learning_rate: 7.291524057212647e-05
    loss_learning_rate: 0.0012301344876773943
    weight_decay: 0.002392860364578965


In [None]:
# ViT P8-S8 ArcFace Mean

Best trial:
Value:  0.9284737706184387
Params: 
batch_size: 129
epochs: 72
epsilon: 2.733741931810298e-09
learning_rate: 7.291524057212647e-05
loss_learning_rate: 0.0012301344876773943
weight_decay: 0.002392860364578965