In [1]:
import os
import random
import pandas as pd
import numpy as np
import mxnet as mx
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.utils.data import Dataset, DataLoader
from einops import rearrange, repeat
import optuna
from optuna.trial import TrialState
from tqdm import tqdm

In [2]:
def file_to_embed(embeds, file):
    """Collect cached embeddings for a batch of file names.

    Args:
        embeds: mapping from file name to a cached tensor; element ``[0]`` of
            each value is used.
        file: iterable of file names to look up, in output order.

    Returns:
        The selected tensors stacked along a new leading dimension.
    """
    return torch.stack([embeds[name][0] for name in file])

In [3]:
# Patch-count threshold: ViT_face asserts num_patches > MIN_NUM_PATCHES at construction.
MIN_NUM_PATCHES = 16

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
class AdienceDataset(Dataset):
    """Image/label dataset driven by a header-less CSV annotation file.

    Column 0 of the CSV is the image file name (relative to ``img_dir``) and
    column 1 is the label.

    With ``train=False`` an item is ``(image, label, img_file)``.
    With ``train=True`` an item is the triplet
    ``(image, pos_image, neg_image, label, img_file, positive_img, negative_img)``
    where the positive is a random *other* row with the same label and the
    negative is a random row with a different label.

    Args:
        annot_file: path to the CSV annotation file (read with ``header=None``).
        img_dir: directory that the file names in column 0 are relative to.
        train: when True, ``__getitem__`` returns triplets.
    """

    def __init__(self, annot_file, img_dir, train=False):
        self.img_lbls = pd.read_csv(annot_file, header=None)
        self.img_dir = img_dir
        self.is_train = train

    def __len__(self):
        """Number of annotated rows."""
        return len(self.img_lbls)

    def _load_image(self, img_file):
        """Read one image and return it as a float tensor with the last axis moved first."""
        img_path = os.path.join(self.img_dir, img_file)
        image = mx.image.imread(img_path)
        # Only axis 1 is compared against 112; resize_short then rescales the
        # shorter edge to 112.
        if image.shape[1] != 112:
            image = mx.image.resize_short(image, 112)
        # presumably H x W x C -> C x H x W (imread layout not shown here; verify)
        image = mx.nd.transpose(image, axes=(2, 0, 1))
        return torch.tensor(image.asnumpy()).type(torch.FloatTensor)

    @staticmethod
    def _choose_row(candidates, kind, label):
        """Pick one row position at random, failing loudly when there is none.

        A ValueError is raised (rather than letting ``random.choice`` raise
        IndexError) because an IndexError escaping ``__getitem__`` is treated
        as "end of data" by Python's legacy sequence iteration and would
        silently truncate a ``for item in dataset`` loop.
        """
        if len(candidates) == 0:
            raise ValueError(
                "no {} sample available for label {!r}".format(kind, label)
            )
        return random.choice(candidates)

    def __getitem__(self, idx):
        """Return the sample (or triplet, in training mode) at row ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range (raised by ``iloc``).
            ValueError: in training mode, if the label has no other sample
                (no positive) or every row shares the label (no negative).
        """
        img_file = self.img_lbls.iloc[idx, 0]
        label = self.img_lbls.iloc[idx, 1]
        image = self._load_image(img_file)

        if not self.is_train:
            return image, label, img_file

        # Work with row *positions* so the results are valid for ``iloc``
        # regardless of the DataFrame's index labels.
        labels = self.img_lbls.iloc[:, 1].to_numpy()
        same_label = np.flatnonzero(labels == label)
        positive_rows = same_label[same_label != idx]  # exclude the anchor itself
        negative_rows = np.flatnonzero(labels != label)

        positive_item = self._choose_row(positive_rows, "positive", label)
        negative_item = self._choose_row(negative_rows, "negative", label)

        positive_img = self.img_lbls.iloc[positive_item, 0]
        negative_img = self.img_lbls.iloc[negative_item, 0]
        pos_image = self._load_image(positive_img)
        neg_image = self._load_image(negative_img)

        return image, pos_image, neg_image, label, img_file, positive_img, negative_img

In [6]:
# Plain (non-triplet) datasets: with train=False each item is (image, label, file name).
train_data = AdienceDataset("../train.csv", "../cropped_Adience/", train=False)
val_data = AdienceDataset("../val.csv", "../cropped_Adience/", train=False)

In [20]:
# Triplet datasets: with train=True each item is the 7-tuple
# (image, pos_image, neg_image, label, img_file, positive_img, negative_img).
# NOTE(review): this cell rebinds train_data / val_data and carries execution
# count In [20] although it is placed before In [7]. Any code that unpacks
# 3-tuples from these names will fail if it runs after this cell.
train_data = AdienceDataset("../train.csv", "../cropped_Adience/", train=True)
val_data = AdienceDataset("../val.csv", "../cropped_Adience/", train=True)

In [7]:
class TripletLoss(nn.Module):
    """Margin-based triplet loss on squared Euclidean distances.

    Args:
        margin: amount by which the negative distance must exceed the
            positive distance before a triplet stops contributing.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def calc_euclidean(self, x1, x2):
        """Squared L2 distance between rows of ``x1`` and ``x2`` (summed over dim 1, no sqrt)."""
        diff = x1 - x2
        return torch.square(diff).sum(1)

    def forward(self, anchor, positive, negative):
        """Return the batch mean of ``relu(d(a, p) - d(a, n) + margin)``."""
        d_pos = self.calc_euclidean(anchor, positive)
        d_neg = self.calc_euclidean(anchor, negative)
        hinge = torch.relu(d_pos - d_neg + self.margin)
        return hinge.mean()

In [8]:
class CombinedLoss(nn.Module):
    """Weighted sum of a triplet loss and a cross-entropy classification loss.

    ``total = beta * triplet(anchor, positive, negative) + cross_entropy(logits, labels)``

    Args:
        beta: weight applied to the triplet term.
        margin: margin forwarded to ``TripletLoss`` (defaults to 1.0, the
            value that used to be hard-coded here).
    """

    def __init__(self, beta=1.0, margin=1.0):
        super(CombinedLoss, self).__init__()
        self.beta = beta
        self.triplet = TripletLoss(margin=margin)
        self.classification = nn.CrossEntropyLoss()

    def forward(self, anchor, positive, negative, classification_out, labels):
        """Compute the combined loss.

        Args:
            anchor, positive, negative: embeddings passed to the triplet loss.
            classification_out: logits passed to ``nn.CrossEntropyLoss``.
            labels: class targets passed to ``nn.CrossEntropyLoss``.

        Returns:
            Scalar tensor ``beta * triplet_loss + classification_loss``.
        """
        triplet_loss = self.triplet(anchor, positive, negative)
        classification_loss = self.classification(classification_out, labels)
        return (self.beta * triplet_loss) + classification_loss

In [9]:
class CosFace(nn.Module):
    r"""Implementation of CosFace (https://arxiv.org/pdf/1801.09414.pdf).

    The logit of the target class is ``s * (cos(theta) - m)``; every other
    logit is ``s * cos(theta)``, where ``cos(theta)`` is the cosine between the
    L2-normalised input and the L2-normalised class weight.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample (number of classes)
        device_id: list of GPU devices/ids; the weight matrix is chunked across
                   them along the class dimension. If ``None``, the cosine is
                   computed on whatever device the input and weight live on.
        s: scale applied to the output logits
        m: margin subtracted from the target-class cosine
    """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.35):
        super(CosFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id
        self.s = s
        self.m = m
        print("self.device_id", self.device_id)
        # Uninitialised storage; filled by Xavier initialisation on the next line.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        """Return margin-adjusted, scaled cosine logits.

        Args:
            input: feature batch of shape (batch, in_features).
            label: target class index per sample; reshaped to (batch, 1).

        Returns:
            Tensor of shape (batch, out_features).
        """
        # --------------------------- cos(theta) ---------------------------
        if self.device_id is None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            primary = self.device_id[0]
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            partial = []
            # Each device scores its own slice of classes; results are gathered
            # on the first device and concatenated along the class dimension.
            for dev, sub_weight in zip(self.device_id, sub_weights):
                scores = F.linear(F.normalize(input.cuda(dev)),
                                  F.normalize(sub_weight.cuda(dev)))
                partial.append(scores.cuda(primary))
            cosine = torch.cat(partial, dim=1)

        # --------------------------- one-hot targets ---------------------------
        # zeros_like keeps the mask on cosine's device and dtype, so this also
        # works when device_id is None but the module runs on a GPU.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)

        # Subtract the margin only at the target position, then scale.
        output = (cosine - one_hot * self.m) * self.s

        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features = ' + str(self.in_features) \
               + ', out_features = ' + str(self.out_features) \
               + ', s = ' + str(self.s) \
               + ', m = ' + str(self.m) + ')'

In [10]:
class Residual(nn.Module):
    """Skip-connection wrapper: ``forward`` returns ``fn(x, **kwargs) + x``."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        branch = self.fn(x, **kwargs)
        return branch + x

In [11]:
class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm(dim)`` to the input before calling the wrapped ``fn``.

    Extra keyword arguments given to ``forward`` are passed through to ``fn``.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, **kwargs)

In [12]:
class FeedForward(nn.Module):
    """Two-layer MLP: Linear(dim -> hidden_dim), GELU, Dropout, Linear(hidden_dim -> dim), Dropout.

    Args:
        dim: input and output feature size.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability used after the activation and after the
            output projection.
    """

    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        # Order matters: positions in the Sequential become state-dict keys.
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        return out

In [13]:
class Attention(nn.Module):
    """Multi-head self-attention with an optional key/query padding mask.

    Args:
        dim: input and output feature size per token.
        heads: number of attention heads.
        dim_head: feature size of each head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        # NOTE(review): the logits are scaled by dim ** -0.5 (model width), not
        # dim_head ** -0.5. Left unchanged because changing it alters the
        # function computed by already-trained weights; confirm before touching.
        self.scale = dim ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None):
        """Attend over the token axis of ``x``.

        Args:
            x: tensor of shape (batch, tokens, dim).
            mask: optional boolean tensor, flattened to (batch, tokens - 1);
                True marks a token to keep. A True entry is prepended for the
                first token, so that token is always kept.

        Returns:
            Tensor of shape (batch, tokens, dim).
        """
        b, n, _ = x.shape
        h = self.heads

        # Split the fused projection into q, k, v and move heads next to batch:
        # (b, n, h*d) -> (b, h, n, d).
        q, k, v = (
            t.reshape(b, n, h, -1).permute(0, 2, 1, 3)
            for t in self.to_qkv(x).chunk(3, dim = -1)
        )
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale

        if mask is not None:
            mask = F.pad(mask.flatten(1).bool(), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            # Pairwise mask shaped (b, 1, n, n) so it broadcasts over heads.
            # A (b, n, n) mask would be aligned against (h, n, n) and only
            # works when batch == heads.
            pair_mask = mask[:, None, :, None] & mask[:, None, None, :]
            dots = dots.masked_fill(~pair_mask, -torch.finfo(dots.dtype).max)

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        # Merge heads back: (b, h, n, d) -> (b, n, h*d).
        out = out.permute(0, 2, 1, 3).reshape(b, n, -1)
        out = self.to_out(out)

        return out

In [14]:
class Transformer(nn.Module):
    """Stack of ``depth`` layers, each a residual pre-norm attention block
    followed by a residual pre-norm feed-forward block.

    Args:
        dim: token feature size.
        depth: number of layers.
        heads: attention heads per layer.
        dim_head: feature size per attention head.
        mlp_dim: hidden width of each feed-forward block.
        dropout: dropout probability passed to both sub-blocks.
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _layer_idx in range(depth):
            attn_block = Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)))
            ff_block = Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            self.layers.append(nn.ModuleList([attn_block, ff_block]))

    def forward(self, x, mask = None):
        """Run ``x`` through every layer; ``mask`` is given to the attention blocks only."""
        for attn_block, ff_block in self.layers:
            x = ff_block(attn_block(x, mask = mask))
        return x

In [15]:
class ViT_face(nn.Module):
    """Vision Transformer backbone producing one embedding per image, with an
    optional CosFace head used when labels are passed to ``forward``.

    Keyword-only args:
        loss_type: 'None' (no head, a message is printed) or 'CosFace'
            (creates ``self.loss``). Any other value creates no head.
        GPU_ID: forwarded to ``CosFace`` as ``device_id``.
        num_class: number of classes for the CosFace head.
        image_size: input side length; must be divisible by ``patch_size``.
        patch_size: side length of each square patch.
        dim: token / embedding feature size.
        depth, heads, mlp_dim, dim_head, dropout: forwarded to ``Transformer``.
        pool: must be 'cls' or 'mean'; it is stored but ``forward`` always
            averages the non-class tokens.
        channels: number of image channels.
        emb_dropout: dropout applied to the token sequence before the transformer.
    """

    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, dim, depth, heads, mlp_dim, pool = 'mean', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        patches_per_side = image_size // patch_size
        num_patches = patches_per_side ** 2
        patch_dim = channels * patch_size ** 2
        # The comparison is strict, so exactly MIN_NUM_PATCHES patches is rejected.
        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Plain configuration attributes.
        self.patch_size = patch_size
        self.pool = pool
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID

        # Parameters and sub-modules; names and creation order are kept so
        # state-dict keys and seeded initialisation stay the same.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
        self.to_latent = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )

        if loss_type == 'None':
            print("no loss for vit_face")
        elif loss_type == 'CosFace':
            self.loss = CosFace(in_features=dim, out_features=num_class, device_id=GPU_ID)

    def forward(self, img, label=None, mask=None):
        """Embed a batch of images.

        Args:
            img: tensor laid out as (batch, channels, height, width).
            label: optional targets; when given, the loss head is applied.
            mask: optional mask forwarded to the transformer.

        Returns:
            ``emb`` when ``label`` is None, otherwise ``(loss_head_output, emb)``.
        """
        patch = self.patch_size

        # Cut the image into patch x patch tiles and flatten each tile.
        tokens = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch, p2 = patch)
        tokens = self.patch_to_embedding(tokens)
        batch, n_tokens, _ = tokens.shape

        # Prepend the class token, then add positional embeddings.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = batch)
        seq = torch.cat((cls_tokens, tokens), dim=1)
        seq += self.pos_embedding[:, :(n_tokens + 1)]
        seq = self.transformer(self.dropout(seq), mask)

        # Mean over the patch tokens only; the class token at index 0 is skipped.
        pooled = seq[:, 1:].mean(dim = 1)
        emb = self.mlp_head(self.to_latent(pooled))

        if label is None:
            return emb
        return self.loss(emb, label), emb

In [16]:
class ViT_plus(nn.Module):
    """Small trainable head applied to precomputed backbone embeddings.

    ``fc1`` projects the input embedding; its output is returned as the metric
    embedding and is also fed to ``fc2`` to produce classification logits.
    No non-linearity is applied between the two layers.

    Args:
        in_features: size of the incoming embedding (default 512).
        embed_dim: size of the projected embedding returned first (default 512).
        num_classes: number of classification logits (default 2).

    The defaults reproduce the previously hard-coded 512 -> 512 -> 2 layout, so
    ``ViT_plus()`` and existing saved state dicts are unaffected.
    """

    def __init__(self, in_features=512, embed_dim=512, num_classes=2):
        super(ViT_plus, self).__init__()

        self.fc1 = nn.Linear(in_features=in_features, out_features=embed_dim)
        self.fc2 = nn.Linear(in_features=embed_dim, out_features=num_classes)

    def forward(self, x):
        """Return ``(projected_embedding, class_logits)`` for input ``x``."""
        x = self.fc1(x)
        x_cosface = x
        x_classification = self.fc2(x)

        return x_cosface, x_classification

In [17]:
# Backbone configuration, collected in one place so it is easy to inspect.
vit_kwargs = dict(
    loss_type='CosFace',
    GPU_ID=[device],
    num_class=93431,
    image_size=112,
    patch_size=8,
    dim=512,
    depth=20,
    heads=8,
    mlp_dim=2048,
    dropout=0.1,
    emb_dropout=0.1,
)
model = ViT_face(**vit_kwargs).to(device)

# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint_path = "../Backbone_VIT_Epoch_2_Batch_20000_Time_2021-01-12-16-48_checkpoint.pth"
# Kept as the cell's last expression so the key-matching report is displayed.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

self.device_id [device(type='cuda', index=0)]


<All keys matched successfully>

In [18]:
# Freeze the backbone: none of its parameters should receive gradients.
for backbone_param in model.parameters():
    backbone_param.requires_grad_(False)

In [19]:
# Cache one backbone embedding per image file, keyed by file name.
embeds = {}
model.eval()

# Build dedicated non-triplet datasets here instead of reading the
# train_data / val_data globals: those names are rebound to train=True
# datasets elsewhere in the notebook, whose items are 7-tuples and would
# break the 3-way unpacking below (and load three images per item).
embed_splits = [
    AdienceDataset("../train.csv", "../cropped_Adience/", train=False),
    AdienceDataset("../val.csv", "../cropped_Adience/", train=False),
]

with torch.no_grad():
    for split in embed_splits:
        # Explicit indexing: iterating the dataset object directly would rely
        # on an IndexError to stop and could hide a real one.
        for row in range(len(split)):
            img, _label, file = split[row]
            # The model expects a batch dimension; each cached value has batch size 1.
            embeds[file] = model(torch.unsqueeze(img.to(device), 0))

In [21]:
# Best validation accuracy seen so far; a trial's head is saved only if it beats this.
best_accu = 0.9341313242912292


def _run_epoch(model_xtr, loader, criterion, optimizer=None, desc=None):
    """Run one pass over ``loader``; trains when ``optimizer`` is given, else evaluates.

    Returns:
        ``(loss, accuracy)`` as Python floats, both averaged over samples so a
        smaller final batch is weighted by its actual size.
    """
    training = optimizer is not None
    model_xtr.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        # The image tensors in each batch are not needed: the head consumes the
        # precomputed embeddings looked up by file name, so only the labels are
        # moved to the device.
        # TODO(review): the dataset still decodes three images per sample;
        # a dataset that yields only names and labels would avoid that cost.
        for _img, _pos_img, _neg_img, label, img_file, pos_file, neg_file in tqdm(loader, desc=desc, leave=False):
            label = label.to(device)

            x1 = file_to_embed(embeds, img_file)
            x2 = file_to_embed(embeds, pos_file)
            x3 = file_to_embed(embeds, neg_file)

            if training:
                optimizer.zero_grad()
            anchor, output = model_xtr(x1)
            pos, _ = model_xtr(x2)
            neg, _ = model_xtr(x3)

            loss = criterion(anchor, pos, neg, output, label)
            if training:
                loss.backward()
                optimizer.step()

            batch_n = label.size(0)
            pred = torch.argmax(output, 1)
            n_correct += torch.eq(pred, label).sum().item()
            loss_sum += loss.item() * batch_n
            n_seen += batch_n

    n_seen = max(n_seen, 1)  # guard against an empty loader
    return loss_sum / n_seen, n_correct / n_seen


def objective(trial):
    """Optuna objective: train a ``ViT_plus`` head and return its validation accuracy.

    Samples learning rate, weight decay, AdamW epsilon, batch size and epoch
    count from ``trial``, reports validation accuracy after every epoch so the
    study can prune, and saves the head's state dict when the final accuracy
    beats the module-level ``best_accu``.

    Raises:
        optuna.exceptions.TrialPruned: when the study asks to prune the trial.
    """
    global best_accu

    model_xtr = ViT_plus().to(device)

    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    wd = trial.suggest_float('weight_decay', 1e-4, 1e-2, log=True)
    eps = trial.suggest_float("epsilon", 1e-9, 1e-7, log=True)
    optimizer = opt.AdamW(model_xtr.parameters(), lr=lr, eps=eps, weight_decay=wd)

    criterion = CombinedLoss().to(device)

    batch_size = trial.suggest_int('batch_size', 50, 300)
    num_epochs = trial.suggest_int('epochs', 10, 100)

    print("Learning rate: "+ str(lr))
    print("Weight decay: "+ str(wd))
    print("Epsilon: "+ str(eps))
    print("Batch size: "+ str(batch_size))
    print("Number of epochs: "+ str(num_epochs))

    # The loaders do not change between epochs, so build them once;
    # shuffle=True still reshuffles on every new iteration.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4)

    val_accu = 0.0
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loss, train_accu = _run_epoch(model_xtr, train_loader, criterion, optimizer=optimizer, desc="Training")
        print("Epoch: {}/{} - Loss: {:.4f} - Accuracy: {:.4f}".format(epoch+1, num_epochs, train_loss, train_accu))

        val_loss, val_accu = _run_epoch(model_xtr, val_loader, criterion, desc="Validation")
        print("Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(val_loss, val_accu))

        trial.report(val_accu, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Only trials that ran every epoch reach this point; the comparison uses
    # the final epoch's validation accuracy.
    if val_accu > best_accu:
        best_accu = val_accu
        print("Saving best model...")
        torch.save(model_xtr.state_dict(), "../vit_8-8_triplet_mean_only.pt")

    return val_accu

In [24]:
# Create the persistent study, or resume it if it already exists, then run five more trials.
study = optuna.create_study(
    study_name='triplet-8-8-mean-only-vit-study',
    storage='sqlite:///study.db',
    direction='maximize',
    load_if_exists=True,
)
study.optimize(objective, n_trials=5)

# Group the study's trials by final state for the summary below.
trials_by_state = {
    state: study.get_trials(deepcopy=False, states=[state])
    for state in (TrialState.PRUNED, TrialState.COMPLETE)
}
pruned_trials = trials_by_state[TrialState.PRUNED]
complete_trials = trials_by_state[TrialState.COMPLETE]

# Display the study statistics
# (the first count is every trial stored in the study, whatever its state)
print("\nStudy statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

[32m[I 2023-12-14 14:06:02,661][0m Using an existing study with name 'triplet-8-8-mean-only-vit-study' instead of creating a new one.[0m


Learning rate: 3.75080952256679e-05
Weight decay: 0.0028493538344439326
Epsilon: 4.413367650088382e-08
Batch size: 83
Number of epochs: 45


Epochs:   0%|          | 0/45 [00:00<?, ?it/s]
Training:   0%|          | 0/169 [00:00<?, ?it/s][A
Training:   1%|          | 1/169 [00:01<05:20,  1.91s/it][A
Training:   3%|▎         | 5/169 [00:03<01:33,  1.75it/s][A
Training:   5%|▍         | 8/169 [00:03<00:50,  3.16it/s][A
Training:   6%|▌         | 10/169 [00:04<01:06,  2.41it/s][A
Training:   8%|▊         | 13/169 [00:05<01:05,  2.38it/s][A
Training:   9%|▉         | 16/169 [00:06<00:43,  3.55it/s][A
Training:  11%|█         | 18/169 [00:07<00:56,  2.68it/s][A
Training:  12%|█▏        | 21/169 [00:08<00:58,  2.54it/s][A
Training:  13%|█▎        | 22/169 [00:08<00:52,  2.81it/s][A
Training:  14%|█▍        | 24/169 [00:08<00:38,  3.75it/s][A
Training:  15%|█▌        | 26/169 [00:10<00:53,  2.66it/s][A
Training:  17%|█▋        | 29/169 [00:11<00:54,  2.57it/s][A
Training:  18%|█▊        | 30/169 [00:11<00:48,  2.86it/s][A
Training:  20%|█▉        | 33/169 [00:12<00:51,  2.65it/s][A
Training:  21%|██        | 35/169 [

Epoch: 1/45 - Loss: 3.6568 - Accuracy: 0.5617



  0%|          | 0/22 [00:00<?, ?it/s][A
  5%|▍         | 1/22 [00:01<00:35,  1.68s/it][A
 23%|██▎       | 5/22 [00:02<00:08,  1.93it/s][A
 41%|████      | 9/22 [00:04<00:05,  2.38it/s][A
 55%|█████▍    | 12/22 [00:04<00:02,  3.59it/s][A
 64%|██████▎   | 14/22 [00:05<00:03,  2.64it/s][A
 77%|███████▋  | 17/22 [00:06<00:01,  2.73it/s][A
 82%|████████▏ | 18/22 [00:07<00:01,  2.88it/s][A
100%|██████████| 22/22 [00:07<00:00,  2.77it/s][A
Epochs:   0%|          | 0/45 [01:05<?, ?it/s]
[32m[I 2023-12-14 14:07:09,071][0m Trial 15 pruned. [0m


Val Loss: 3.0701 - Val Accuracy: 0.6777
Learning rate: 0.0023459311141505977
Weight decay: 0.00016674470588385387
Epsilon: 1.5170457868268162e-08
Batch size: 289
Number of epochs: 64


Epochs:   0%|          | 0/64 [00:00<?, ?it/s]
Training:   0%|          | 0/49 [00:00<?, ?it/s][A
Training:   2%|▏         | 1/49 [00:05<04:10,  5.22s/it][A
Training:   6%|▌         | 3/49 [00:05<01:04,  1.41s/it][A
Training:  10%|█         | 5/49 [00:09<01:22,  1.87s/it][A
Training:  14%|█▍        | 7/49 [00:10<00:46,  1.11s/it][A
Training:  18%|█▊        | 9/49 [00:14<01:01,  1.54s/it][A
Training:  20%|██        | 10/49 [00:14<00:48,  1.24s/it][A
Training:  24%|██▍       | 12/49 [00:14<00:29,  1.25it/s][A
Training:  27%|██▋       | 13/49 [00:19<00:56,  1.56s/it][A
Training:  31%|███       | 15/49 [00:19<00:33,  1.00it/s][A
Training:  35%|███▍      | 17/49 [00:23<00:44,  1.40s/it][A
Training:  39%|███▉      | 19/49 [00:23<00:28,  1.04it/s][A
Training:  43%|████▎     | 21/49 [00:28<00:38,  1.37s/it][A
Training:  47%|████▋     | 23/49 [00:28<00:25,  1.04it/s][A
Training:  51%|█████     | 25/49 [00:32<00:32,  1.36s/it][A
Training:  55%|█████▌    | 27/49 [00:33<00:21,  1.03

Epoch: 1/64 - Loss: 2.4162 - Accuracy: 0.7651



  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:04<00:29,  4.88s/it][A
 29%|██▊       | 2/7 [00:04<00:10,  2.07s/it][A
 43%|████▎     | 3/7 [00:05<00:04,  1.18s/it][A
 71%|███████▏  | 5/7 [00:22<00:10,  5.34s/it][A
100%|██████████| 7/7 [00:22<00:00,  3.19s/it][A
Epochs:   0%|          | 0/64 [01:20<?, ?it/s]
[32m[I 2023-12-14 14:08:29,594][0m Trial 16 pruned. [0m


Val Loss: 2.0107 - Val Accuracy: 0.8895
Learning rate: 0.00046688870570582087
Weight decay: 0.0003123198452835231
Epsilon: 5.0252826743915564e-08
Batch size: 97
Number of epochs: 98


Epochs:   0%|          | 0/98 [00:00<?, ?it/s]
Training:   0%|          | 0/145 [00:00<?, ?it/s][A
Training:   1%|          | 1/145 [00:01<04:33,  1.90s/it][A
Training:   3%|▎         | 4/145 [00:02<00:54,  2.57it/s][A
Training:   3%|▎         | 4/145 [00:13<00:54,  2.57it/s][A
Training:   3%|▎         | 5/145 [00:16<10:04,  4.31s/it][A
Training:   6%|▌         | 8/145 [00:16<04:34,  2.00s/it][A
Training:   7%|▋         | 10/145 [00:18<03:31,  1.57s/it][A
Training:   8%|▊         | 12/145 [00:18<02:22,  1.07s/it][A
Training:  10%|▉         | 14/145 [00:19<02:06,  1.04it/s][A
Training:  11%|█         | 16/145 [00:19<01:27,  1.47it/s][A
Training:  12%|█▏        | 18/145 [00:21<01:29,  1.42it/s][A
Training:  13%|█▎        | 19/145 [00:21<01:15,  1.67it/s][A
Training:  14%|█▍        | 21/145 [00:22<01:18,  1.59it/s][A
Training:  16%|█▌        | 23/145 [00:22<00:53,  2.27it/s][A
Training:  17%|█▋        | 25/145 [00:24<01:04,  1.87it/s][A
Training:  19%|█▉        | 28/145 [00

Epoch: 1/98 - Loss: 2.2876 - Accuracy: 0.8153



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:01<00:31,  1.86s/it][A
 22%|██▏       | 4/18 [00:01<00:05,  2.58it/s][A
 33%|███▎      | 6/18 [00:03<00:06,  1.92it/s][A
 50%|█████     | 9/18 [00:04<00:04,  1.98it/s][A
 67%|██████▋   | 12/18 [00:04<00:01,  3.19it/s][A
 78%|███████▊  | 14/18 [00:06<00:01,  2.36it/s][A
 89%|████████▉ | 16/18 [00:06<00:00,  3.14it/s][A
100%|██████████| 18/18 [00:07<00:00,  2.33it/s][A
Epochs:   0%|          | 0/98 [01:18<?, ?it/s]
[32m[I 2023-12-14 14:09:48,075][0m Trial 17 pruned. [0m


Val Loss: 1.5143 - Val Accuracy: 0.8888
Learning rate: 0.0012027201009631523
Weight decay: 0.0010233374008044094
Epsilon: 5.814687699948533e-09
Batch size: 143
Number of epochs: 26


Epochs:   0%|          | 0/26 [00:00<?, ?it/s]
Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:26,  2.74s/it][A
Training:   4%|▍         | 4/98 [00:02<00:51,  1.82it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.25it/s][A
Training:   9%|▉         | 9/98 [00:07<01:07,  1.32it/s][A
Training:  10%|█         | 10/98 [00:07<00:57,  1.52it/s][A
Training:  13%|█▎        | 13/98 [00:09<00:57,  1.47it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:49,  1.70it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:51,  1.58it/s][A
Training:  18%|█▊        | 18/98 [00:11<00:44,  1.81it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:47,  1.61it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:29,  2.48it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:48,  1.51it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:40,  1.77it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:43,  1.59it/s][A
Training:  31%|███       | 30/98 [00:18<00:36,  1.8

Epoch: 1/26 - Loss: 2.0620 - Accuracy: 0.8212



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:30,  2.56s/it][A
 15%|█▌        | 2/13 [00:02<00:12,  1.13s/it][A
 31%|███       | 4/13 [00:02<00:04,  2.20it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.39it/s][A
 69%|██████▉   | 9/13 [00:06<00:02,  1.45it/s][A
 77%|███████▋  | 10/13 [00:06<00:01,  1.70it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.78it/s][A
Epochs:   4%|▍         | 1/26 [01:04<26:52, 64.51s/it]

Val Loss: 1.5190 - Val Accuracy: 0.9135



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:23,  2.72s/it][A
Training:   4%|▍         | 4/98 [00:02<00:51,  1.84it/s][A
Training:   6%|▌         | 6/98 [00:05<01:11,  1.28it/s][A
Training:   9%|▉         | 9/98 [00:07<01:07,  1.32it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:39,  2.15it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:54,  1.55it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:39,  2.06it/s][A
Training:  17%|█▋        | 17/98 [00:11<01:02,  1.29it/s][A
Training:  19%|█▉        | 19/98 [00:12<00:44,  1.79it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:53,  1.43it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:40,  1.85it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:34,  2.17it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:55,  1.33it/s][A
Training:  28%|██▊       | 27/98 [00:16<00:38,  1.84it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:46,  1.49it/s][A
Training:  32%|███▏      | 31/98 [00

Epoch: 2/26 - Loss: 1.2649 - Accuracy: 0.9109



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:33,  2.81s/it][A
 31%|███       | 4/13 [00:02<00:05,  1.77it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.32it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  2.04it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.48it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.73it/s][A
Epochs:   8%|▊         | 2/26 [02:09<25:56, 64.86s/it]

Val Loss: 1.4186 - Val Accuracy: 0.8915



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:13,  2.61s/it][A
Training:   3%|▎         | 3/98 [00:02<01:09,  1.37it/s][A
Training:   5%|▌         | 5/98 [00:04<01:22,  1.12it/s][A
Training:   7%|▋         | 7/98 [00:05<00:50,  1.80it/s][A
Training:   9%|▉         | 9/98 [00:07<01:06,  1.35it/s][A
Training:  10%|█         | 10/98 [00:07<00:54,  1.61it/s][A
Training:  11%|█         | 11/98 [00:07<00:43,  1.98it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.40it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:40,  2.04it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:54,  1.48it/s][A
Training:  18%|█▊        | 18/98 [00:11<00:46,  1.73it/s][A
Training:  20%|██        | 20/98 [00:12<00:30,  2.55it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:56,  1.37it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:36,  2.06it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:50,  1.44it/s][A
Training:  27%|██▋       | 26/98 [00:

Epoch: 3/26 - Loss: 1.1569 - Accuracy: 0.9179



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.73s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.81it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.27it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.33it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.70it/s][A
Epochs:  12%|█▏        | 3/26 [03:14<24:56, 65.06s/it]

Val Loss: 1.1047 - Val Accuracy: 0.9253



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:24,  2.73s/it][A
Training:   3%|▎         | 3/98 [00:02<01:11,  1.33it/s][A
Training:   5%|▌         | 5/98 [00:05<01:28,  1.05it/s][A
Training:   8%|▊         | 8/98 [00:05<00:42,  2.10it/s][A
Training:  10%|█         | 10/98 [00:07<00:59,  1.49it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:40,  2.11it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:56,  1.49it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:56,  1.42it/s][A
Training:  20%|██        | 20/98 [00:12<00:35,  2.17it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:47,  1.61it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:48,  1.51it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:31,  2.21it/s][A
Training:  31%|███       | 30/98 [00:18<00:41,  1.65it/s][A
Training:  34%|███▎      | 33/98 [00:20<00:41,  1.56it/s][A
Training:  35%|███▍      | 34/98 [00:21<00:37,  1.72it/s][A
Training:  38%|███▊      | 37/98 [00

Epoch: 4/26 - Loss: 1.0698 - Accuracy: 0.9212



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:34,  2.87s/it][A
 31%|███       | 4/13 [00:02<00:05,  1.74it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.29it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.32it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.70it/s][A
Epochs:  15%|█▌        | 4/26 [04:20<23:52, 65.13s/it]

Val Loss: 1.1080 - Val Accuracy: 0.9211



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:16,  2.65s/it][A
Training:   3%|▎         | 3/98 [00:02<01:09,  1.38it/s][A
Training:   5%|▌         | 5/98 [00:04<01:22,  1.13it/s][A
Training:   6%|▌         | 6/98 [00:04<01:01,  1.48it/s][A
Training:   8%|▊         | 8/98 [00:05<00:37,  2.42it/s][A
Training:   9%|▉         | 9/98 [00:07<01:10,  1.26it/s][A
Training:  10%|█         | 10/98 [00:07<00:55,  1.58it/s][A
Training:  11%|█         | 11/98 [00:07<00:43,  1.98it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:02,  1.37it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:40,  2.06it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:54,  1.48it/s][A
Training:  19%|█▉        | 19/98 [00:11<00:37,  2.09it/s][A
Training:  21%|██▏       | 21/98 [00:13<00:50,  1.54it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:35,  2.10it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:47,  1.54it/s][A
Training:  28%|██▊       | 27/98 [00:1

Epoch: 5/26 - Loss: 1.0017 - Accuracy: 0.9209



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.64s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.37it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.13it/s][A
 62%|██████▏   | 8/13 [00:04<00:02,  2.27it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.55it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.75it/s][A
Epochs:  19%|█▉        | 5/26 [05:24<22:43, 64.91s/it]

Val Loss: 1.1825 - Val Accuracy: 0.9086



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:23,  2.72s/it][A
Training:   3%|▎         | 3/98 [00:02<01:11,  1.34it/s][A
Training:   5%|▌         | 5/98 [00:04<01:23,  1.11it/s][A
Training:   8%|▊         | 8/98 [00:05<00:40,  2.23it/s][A
Training:  10%|█         | 10/98 [00:07<00:57,  1.53it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:40,  2.13it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:05,  1.29it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:43,  1.91it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:56,  1.43it/s][A
Training:  19%|█▉        | 19/98 [00:11<00:39,  1.99it/s][A
Training:  21%|██▏       | 21/98 [00:13<00:52,  1.47it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:37,  2.02it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:47,  1.52it/s][A
Training:  28%|██▊       | 27/98 [00:16<00:36,  1.95it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:44,  1.56it/s][A
Training:  32%|███▏      | 31/98 [00

Epoch: 6/26 - Loss: 0.9777 - Accuracy: 0.9229



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:33,  2.77s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.80it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.26it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  1.91it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.15it/s][A
 92%|█████████▏| 12/13 [00:07<00:00,  2.07it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.67it/s][A
Epochs:  23%|██▎       | 6/26 [06:29<21:36, 64.80s/it]

Val Loss: 1.2946 - Val Accuracy: 0.8780



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:30,  2.79s/it][A
Training:   4%|▍         | 4/98 [00:02<00:53,  1.77it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.25it/s][A
Training:   9%|▉         | 9/98 [00:07<01:09,  1.28it/s][A
Training:  11%|█         | 11/98 [00:07<00:47,  1.82it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:02,  1.37it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:43,  1.91it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:55,  1.45it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:47,  1.69it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:49,  1.55it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:42,  1.78it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:45,  1.59it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:39,  1.84it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:43,  1.59it/s][A
Training:  31%|███       | 30/98 [00:19<00:36,  1.85it/s][A
Training:  34%|███▎      | 33/98 [00

Epoch: 7/26 - Loss: 0.9515 - Accuracy: 0.9233



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.62s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.89it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.31it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.32it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.72it/s][A
Epochs:  27%|██▋       | 7/26 [07:35<20:37, 65.13s/it]

Val Loss: 1.0883 - Val Accuracy: 0.9210



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:35,  2.84s/it][A
Training:   4%|▍         | 4/98 [00:02<00:53,  1.75it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.24it/s][A
Training:   9%|▉         | 9/98 [00:07<01:09,  1.29it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:40,  2.10it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:54,  1.54it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:55,  1.45it/s][A
Training:  20%|██        | 20/98 [00:12<00:35,  2.18it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:47,  1.59it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:48,  1.50it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:32,  2.18it/s][A
Training:  31%|███       | 30/98 [00:19<00:42,  1.60it/s][A
Training:  34%|███▎      | 33/98 [00:21<00:43,  1.49it/s][A
Training:  37%|███▋      | 36/98 [00:21<00:28,  2.15it/s][A
Training:  39%|███▉      | 38/98 [00:23<00:37,  1.60it/s][A
Training:  41%|████      | 40/98 [00

Epoch: 8/26 - Loss: 0.9484 - Accuracy: 0.9241



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.60s/it][A
 15%|█▌        | 2/13 [00:02<00:12,  1.15s/it][A
 31%|███       | 4/13 [00:02<00:04,  2.16it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.37it/s][A
 69%|██████▉   | 9/13 [00:06<00:02,  1.41it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.65it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.75it/s][A
Epochs:  31%|███       | 8/26 [08:40<19:34, 65.27s/it]

Val Loss: 1.0573 - Val Accuracy: 0.9183



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:30,  2.79s/it][A
Training:   4%|▍         | 4/98 [00:02<00:52,  1.78it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.25it/s][A
Training:   9%|▉         | 9/98 [00:07<01:08,  1.30it/s][A
Training:  11%|█         | 11/98 [00:07<00:47,  1.83it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.40it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:36,  2.24it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:51,  1.57it/s][A
Training:  20%|██        | 20/98 [00:12<00:37,  2.08it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:55,  1.38it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:46,  1.65it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:30,  2.44it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:46,  1.55it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:46,  1.49it/s][A
Training:  31%|███       | 30/98 [00:18<00:39,  1.72it/s][A
Training:  34%|███▎      | 33/98 [00

Epoch: 9/26 - Loss: 0.9392 - Accuracy: 0.9221



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.71s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.83it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.28it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  1.99it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.43it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.70it/s][A
Epochs:  35%|███▍      | 9/26 [09:45<18:29, 65.26s/it]

Val Loss: 1.0088 - Val Accuracy: 0.9205



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:30,  2.79s/it][A
Training:   4%|▍         | 4/98 [00:02<00:52,  1.79it/s][A
Training:   6%|▌         | 6/98 [00:05<01:12,  1.26it/s][A
Training:   9%|▉         | 9/98 [00:07<01:10,  1.27it/s][A
Training:  11%|█         | 11/98 [00:07<00:48,  1.79it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.40it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:43,  1.90it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:55,  1.46it/s][A
Training:  19%|█▉        | 19/98 [00:12<00:39,  1.99it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:52,  1.46it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:37,  2.01it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:50,  1.45it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:30,  2.29it/s][A
Training:  31%|███       | 30/98 [00:19<00:42,  1.61it/s][A
Training:  33%|███▎      | 32/98 [00:19<00:30,  2.17it/s][A
Training:  35%|███▍      | 34/98 [00

Epoch: 10/26 - Loss: 0.9341 - Accuracy: 0.9278



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.67s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.83it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.30it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  2.02it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.41it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.68it/s][A
Epochs:  38%|███▊      | 10/26 [10:51<17:27, 65.46s/it]

Val Loss: 0.9926 - Val Accuracy: 0.9275



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:10,  2.59s/it][A
Training:   2%|▏         | 2/98 [00:02<01:48,  1.13s/it][A
Training:   5%|▌         | 5/98 [00:04<01:18,  1.19it/s][A
Training:   7%|▋         | 7/98 [00:05<00:48,  1.89it/s][A
Training:   9%|▉         | 9/98 [00:07<01:07,  1.32it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:38,  2.25it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:55,  1.53it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:56,  1.44it/s][A
Training:  20%|██        | 20/98 [00:12<00:36,  2.16it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:47,  1.61it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:34,  2.13it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:45,  1.57it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:34,  2.04it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:51,  1.35it/s][A
Training:  33%|███▎      | 32/98 [00:19<00:31,  2.08it/s][A
Training:  34%|███▎      | 33/98 [00:

Epoch: 11/26 - Loss: 0.9340 - Accuracy: 0.9187



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.71s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.83it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.28it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.32it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.70it/s][A
Epochs:  42%|████▏     | 11/26 [11:57<16:21, 65.46s/it]

Val Loss: 1.0068 - Val Accuracy: 0.9039



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:18,  2.66s/it][A
Training:   2%|▏         | 2/98 [00:02<01:51,  1.16s/it][A
Training:   3%|▎         | 3/98 [00:02<01:05,  1.46it/s][A
Training:   5%|▌         | 5/98 [00:04<01:21,  1.14it/s][A
Training:   6%|▌         | 6/98 [00:05<01:01,  1.50it/s][A
Training:   9%|▉         | 9/98 [00:07<01:02,  1.42it/s][A
Training:  11%|█         | 11/98 [00:07<00:43,  2.02it/s][A
Training:  13%|█▎        | 13/98 [00:09<00:58,  1.46it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:40,  2.03it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:55,  1.45it/s][A
Training:  20%|██        | 20/98 [00:12<00:33,  2.33it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:46,  1.62it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:33,  2.19it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:46,  1.55it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:47,  1.46it/s][A
Training:  32%|███▏      | 31/98 [00:1

Epoch: 12/26 - Loss: 0.9051 - Accuracy: 0.9229



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:30,  2.56s/it][A
 15%|█▌        | 2/13 [00:02<00:12,  1.15s/it][A
 38%|███▊      | 5/13 [00:04<00:06,  1.22it/s][A
 46%|████▌     | 6/13 [00:04<00:04,  1.52it/s][A
 54%|█████▍    | 7/13 [00:05<00:03,  1.93it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.38it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.68it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.72it/s][A
Epochs:  46%|████▌     | 12/26 [13:03<15:19, 65.69s/it]

Val Loss: 1.0469 - Val Accuracy: 0.9259



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:39,  2.88s/it][A
Training:   3%|▎         | 3/98 [00:03<01:16,  1.25it/s][A
Training:   5%|▌         | 5/98 [00:05<01:28,  1.05it/s][A
Training:   8%|▊         | 8/98 [00:05<00:42,  2.11it/s][A
Training:  10%|█         | 10/98 [00:07<00:59,  1.47it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:41,  2.08it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:57,  1.47it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:39,  2.06it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:53,  1.49it/s][A
Training:  20%|██        | 20/98 [00:12<00:38,  2.05it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:51,  1.48it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:36,  2.02it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:56,  1.29it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:46,  1.55it/s][A
Training:  30%|██▉       | 29/98 [00:19<00:47,  1.44it/s][A
Training:  31%|███       | 30/98 [00

Epoch: 13/26 - Loss: 0.9222 - Accuracy: 0.9268



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:35,  2.96s/it][A
 23%|██▎       | 3/13 [00:03<00:08,  1.24it/s][A
 38%|███▊      | 5/13 [00:05<00:07,  1.03it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  2.07it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.43it/s][A
100%|██████████| 13/13 [00:08<00:00,  1.60it/s][A
Epochs:  50%|█████     | 13/26 [14:10<14:20, 66.20s/it]

Val Loss: 1.0770 - Val Accuracy: 0.9205



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:33,  2.82s/it][A
Training:   3%|▎         | 3/98 [00:02<01:13,  1.30it/s][A
Training:   5%|▌         | 5/98 [00:05<01:28,  1.05it/s][A
Training:   7%|▋         | 7/98 [00:05<00:51,  1.75it/s][A
Training:   9%|▉         | 9/98 [00:07<01:10,  1.25it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:39,  2.18it/s][A
Training:  14%|█▍        | 14/98 [00:10<00:55,  1.51it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:57,  1.42it/s][A
Training:  20%|██        | 20/98 [00:12<00:36,  2.13it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:50,  1.50it/s][A
Training:  24%|██▍       | 24/98 [00:15<00:37,  1.99it/s][A
Training:  27%|██▋       | 26/98 [00:17<00:47,  1.50it/s][A
Training:  28%|██▊       | 27/98 [00:17<00:41,  1.72it/s][A
Training:  30%|██▉       | 29/98 [00:19<00:51,  1.35it/s][A
Training:  32%|███▏      | 31/98 [00:19<00:36,  1.82it/s][A
Training:  34%|███▎      | 33/98 [00:

Epoch: 14/26 - Loss: 0.9018 - Accuracy: 0.9255



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:33,  2.78s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.31it/s][A
 38%|███▊      | 5/13 [00:05<00:07,  1.08it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  2.14it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.49it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.68it/s][A
Epochs:  54%|█████▍    | 14/26 [15:17<13:17, 66.49s/it]

Val Loss: 1.0112 - Val Accuracy: 0.9049



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:23,  2.71s/it][A
Training:   3%|▎         | 3/98 [00:02<01:11,  1.33it/s][A
Training:   5%|▌         | 5/98 [00:04<01:23,  1.11it/s][A
Training:   7%|▋         | 7/98 [00:05<00:50,  1.81it/s][A
Training:   9%|▉         | 9/98 [00:07<01:08,  1.31it/s][A
Training:  11%|█         | 11/98 [00:07<00:47,  1.84it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:01,  1.39it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:42,  1.97it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:55,  1.46it/s][A
Training:  19%|█▉        | 19/98 [00:12<00:39,  2.02it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:51,  1.50it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:43,  1.75it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:46,  1.58it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:40,  1.77it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:42,  1.62it/s][A
Training:  31%|███       | 30/98 [00:

Epoch: 15/26 - Loss: 0.8856 - Accuracy: 0.9265



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.64s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.36it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.11it/s][A
 46%|████▌     | 6/13 [00:05<00:04,  1.45it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.44it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.71it/s][A
Epochs:  58%|█████▊    | 15/26 [16:23<12:07, 66.17s/it]

Val Loss: 1.0129 - Val Accuracy: 0.9167



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:26,  2.74s/it][A
Training:   4%|▍         | 4/98 [00:02<00:52,  1.79it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.25it/s][A
Training:   9%|▉         | 9/98 [00:07<01:09,  1.29it/s][A
Training:  11%|█         | 11/98 [00:07<00:48,  1.80it/s][A
Training:  13%|█▎        | 13/98 [00:09<00:59,  1.44it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:36,  2.24it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:57,  1.40it/s][A
Training:  20%|██        | 20/98 [00:12<00:35,  2.19it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:53,  1.44it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:44,  1.69it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:29,  2.48it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:52,  1.40it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:43,  1.65it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:27,  2.54it/s][A
Training:  30%|██▉       | 29/98 [00

Epoch: 16/26 - Loss: 0.8944 - Accuracy: 0.9281



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:30,  2.55s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.40it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.11it/s][A
 62%|██████▏   | 8/13 [00:04<00:02,  2.22it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.55it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.72it/s][A
Epochs:  62%|██████▏   | 16/26 [17:28<10:58, 65.87s/it]

Val Loss: 0.9794 - Val Accuracy: 0.9210



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:19,  2.67s/it][A
Training:   3%|▎         | 3/98 [00:02<01:09,  1.36it/s][A
Training:   5%|▌         | 5/98 [00:04<01:25,  1.09it/s][A
Training:   8%|▊         | 8/98 [00:05<00:41,  2.18it/s][A
Training:  10%|█         | 10/98 [00:07<00:59,  1.48it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.40it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:37,  2.17it/s][A
Training:  18%|█▊        | 18/98 [00:11<00:50,  1.58it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:51,  1.49it/s][A
Training:  23%|██▎       | 23/98 [00:14<00:38,  1.94it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:48,  1.51it/s][A
Training:  28%|██▊       | 27/98 [00:16<00:35,  2.02it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:45,  1.51it/s][A
Training:  33%|███▎      | 32/98 [00:18<00:28,  2.33it/s][A
Training:  35%|███▍      | 34/98 [00:21<00:38,  1.65it/s][A
Training:  38%|███▊      | 37/98 [00

Epoch: 17/26 - Loss: 0.9047 - Accuracy: 0.9160



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.63s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.37it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.12it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.46it/s][A
 85%|████████▍ | 11/13 [00:07<00:00,  2.01it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.74it/s][A
Epochs:  65%|██████▌   | 17/26 [18:34<09:52, 65.82s/it]

Val Loss: 1.0632 - Val Accuracy: 0.9210



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:41,  2.90s/it][A
Training:   3%|▎         | 3/98 [00:03<01:15,  1.26it/s][A
Training:   5%|▌         | 5/98 [00:05<01:27,  1.06it/s][A
Training:   8%|▊         | 8/98 [00:05<00:42,  2.12it/s][A
Training:  10%|█         | 10/98 [00:07<01:01,  1.42it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:01,  1.39it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:38,  2.15it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:50,  1.58it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:52,  1.47it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:34,  2.17it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:44,  1.63it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:45,  1.52it/s][A
Training:  32%|███▏      | 31/98 [00:19<00:34,  1.95it/s][A
Training:  34%|███▎      | 33/98 [00:21<00:44,  1.47it/s][A
Training:  37%|███▋      | 36/98 [00:21<00:27,  2.22it/s][A
Training:  39%|███▉      | 38/98 [00

Epoch: 18/26 - Loss: 0.9124 - Accuracy: 0.9250



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.71s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.83it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.27it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.31it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.69it/s][A
Epochs:  69%|██████▉   | 18/26 [19:40<08:47, 65.99s/it]

Val Loss: 0.9975 - Val Accuracy: 0.9275



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:36,  2.85s/it][A
Training:   3%|▎         | 3/98 [00:02<01:14,  1.27it/s][A
Training:   5%|▌         | 5/98 [00:05<01:27,  1.07it/s][A
Training:   8%|▊         | 8/98 [00:05<00:42,  2.14it/s][A
Training:  10%|█         | 10/98 [00:07<01:00,  1.46it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.41it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:37,  2.18it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:51,  1.56it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:51,  1.49it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:45,  1.67it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:32,  2.28it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:51,  1.42it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:44,  1.63it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:28,  2.44it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:50,  1.36it/s][A
Training:  31%|███       | 30/98 [00

Epoch: 19/26 - Loss: 0.9202 - Accuracy: 0.9265



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:33,  2.78s/it][A
 31%|███       | 4/13 [00:02<00:05,  1.79it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.30it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.34it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.72it/s][A
Epochs:  73%|███████▎  | 19/26 [20:46<07:41, 65.99s/it]

Val Loss: 1.0517 - Val Accuracy: 0.9227



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:39,  2.88s/it][A
Training:   4%|▍         | 4/98 [00:03<00:54,  1.72it/s][A
Training:   6%|▌         | 6/98 [00:05<01:15,  1.23it/s][A
Training:   9%|▉         | 9/98 [00:07<01:10,  1.27it/s][A
Training:  12%|█▏        | 12/98 [00:07<00:41,  2.06it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:55,  1.51it/s][A
Training:  16%|█▋        | 16/98 [00:10<00:40,  2.04it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:54,  1.48it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:54,  1.40it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:35,  2.11it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:45,  1.57it/s][A
Training:  29%|██▊       | 28/98 [00:17<00:33,  2.07it/s][A
Training:  31%|███       | 30/98 [00:19<00:44,  1.54it/s][A
Training:  34%|███▎      | 33/98 [00:21<00:44,  1.46it/s][A
Training:  37%|███▋      | 36/98 [00:21<00:28,  2.18it/s][A
Training:  39%|███▉      | 38/98 [00

Epoch: 20/26 - Loss: 0.9235 - Accuracy: 0.9295



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.64s/it][A
 23%|██▎       | 3/13 [00:02<00:07,  1.38it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.14it/s][A
 46%|████▌     | 6/13 [00:04<00:04,  1.49it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.44it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.74it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.74it/s][A
Epochs:  77%|███████▋  | 20/26 [21:52<06:36, 66.06s/it]

Val Loss: 1.0434 - Val Accuracy: 0.9205



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:29,  2.78s/it][A
Training:   2%|▏         | 2/98 [00:02<01:59,  1.25s/it][A
Training:   5%|▌         | 5/98 [00:05<01:19,  1.17it/s][A
Training:   6%|▌         | 6/98 [00:05<01:01,  1.50it/s][A
Training:   8%|▊         | 8/98 [00:05<00:36,  2.46it/s][A
Training:  10%|█         | 10/98 [00:07<00:59,  1.49it/s][A
Training:  13%|█▎        | 13/98 [00:09<00:59,  1.42it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:36,  2.24it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:49,  1.60it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:50,  1.51it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:44,  1.70it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:46,  1.56it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:41,  1.74it/s][A
Training:  29%|██▊       | 28/98 [00:16<00:29,  2.41it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:47,  1.44it/s][A
Training:  31%|███       | 30/98 [00:

Epoch: 21/26 - Loss: 0.8863 - Accuracy: 0.9317



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:32,  2.71s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.84it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.28it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.31it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.68it/s][A
Epochs:  81%|████████  | 21/26 [22:58<05:29, 66.00s/it]

Val Loss: 0.9337 - Val Accuracy: 0.9253



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:29,  2.78s/it][A
Training:   4%|▍         | 4/98 [00:02<00:52,  1.79it/s][A
Training:   6%|▌         | 6/98 [00:05<01:13,  1.26it/s][A
Training:   9%|▉         | 9/98 [00:07<01:09,  1.29it/s][A
Training:  10%|█         | 10/98 [00:07<00:58,  1.52it/s][A
Training:  13%|█▎        | 13/98 [00:09<00:59,  1.43it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:50,  1.67it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:51,  1.56it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:45,  1.75it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:47,  1.61it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:42,  1.80it/s][A
Training:  26%|██▌       | 25/98 [00:16<00:44,  1.62it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:40,  1.80it/s][A
Training:  30%|██▉       | 29/98 [00:18<00:42,  1.64it/s][A
Training:  31%|███       | 30/98 [00:19<00:37,  1.80it/s][A
Training:  34%|███▎      | 33/98 [00

Epoch: 22/26 - Loss: 0.9037 - Accuracy: 0.9267



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.65s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.88it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.33it/s][A
 69%|██████▉   | 9/13 [00:06<00:02,  1.38it/s][A
 85%|████████▍ | 11/13 [00:07<00:01,  1.92it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.76it/s][A
Epochs:  85%|████████▍ | 22/26 [24:04<04:23, 65.93s/it]

Val Loss: 1.1881 - Val Accuracy: 0.8925



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:15,  2.63s/it][A
Training:   2%|▏         | 2/98 [00:02<01:54,  1.19s/it][A
Training:   5%|▌         | 5/98 [00:04<01:19,  1.18it/s][A
Training:   6%|▌         | 6/98 [00:05<01:02,  1.48it/s][A
Training:   8%|▊         | 8/98 [00:05<00:37,  2.38it/s][A
Training:   9%|▉         | 9/98 [00:07<01:10,  1.27it/s][A
Training:  10%|█         | 10/98 [00:07<00:55,  1.59it/s][A
Training:  11%|█         | 11/98 [00:07<00:42,  2.03it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.41it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:49,  1.70it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:39,  2.11it/s][A
Training:  17%|█▋        | 17/98 [00:11<00:57,  1.42it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:46,  1.72it/s][A
Training:  19%|█▉        | 19/98 [00:12<00:36,  2.15it/s][A
Training:  21%|██▏       | 21/98 [00:14<00:54,  1.42it/s][A
Training:  22%|██▏       | 22/98 [00:1

Epoch: 23/26 - Loss: 0.8864 - Accuracy: 0.9267



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:30,  2.55s/it][A
 15%|█▌        | 2/13 [00:02<00:12,  1.16s/it][A
 31%|███       | 4/13 [00:02<00:04,  2.12it/s][A
 38%|███▊      | 5/13 [00:04<00:07,  1.11it/s][A
 46%|████▌     | 6/13 [00:04<00:04,  1.45it/s][A
 69%|██████▉   | 9/13 [00:07<00:02,  1.43it/s][A
 77%|███████▋  | 10/13 [00:07<00:01,  1.74it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.73it/s][A
Epochs:  88%|████████▊ | 23/26 [25:10<03:17, 65.91s/it]

Val Loss: 0.9071 - Val Accuracy: 0.9146



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:23,  2.72s/it][A
Training:   3%|▎         | 3/98 [00:02<01:11,  1.33it/s][A
Training:   5%|▌         | 5/98 [00:05<01:27,  1.06it/s][A
Training:   8%|▊         | 8/98 [00:05<00:42,  2.12it/s][A
Training:  10%|█         | 10/98 [00:07<01:00,  1.45it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:00,  1.42it/s][A
Training:  15%|█▌        | 15/98 [00:09<00:43,  1.92it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:56,  1.44it/s][A
Training:  20%|██        | 20/98 [00:12<00:34,  2.23it/s][A
Training:  22%|██▏       | 22/98 [00:14<00:47,  1.60it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:34,  2.14it/s][A
Training:  27%|██▋       | 26/98 [00:16<00:46,  1.54it/s][A
Training:  30%|██▉       | 29/98 [00:19<00:48,  1.42it/s][A
Training:  32%|███▏      | 31/98 [00:19<00:35,  1.88it/s][A
Training:  34%|███▎      | 33/98 [00:21<00:44,  1.48it/s][A
Training:  35%|███▍      | 34/98 [00

Epoch: 24/26 - Loss: 0.8842 - Accuracy: 0.9302



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:33,  2.79s/it][A
 15%|█▌        | 2/13 [00:02<00:13,  1.23s/it][A
 31%|███       | 4/13 [00:03<00:04,  2.02it/s][A
 46%|████▌     | 6/13 [00:05<00:05,  1.32it/s][A
 62%|██████▏   | 8/13 [00:05<00:02,  2.12it/s][A
 77%|███████▋  | 10/13 [00:07<00:02,  1.48it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.68it/s][A
Epochs:  92%|█████████▏| 24/26 [26:17<02:12, 66.25s/it]

Val Loss: 1.0224 - Val Accuracy: 0.9237



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:28,  2.77s/it][A
Training:   4%|▍         | 4/98 [00:02<00:52,  1.79it/s][A
Training:   6%|▌         | 6/98 [00:05<01:15,  1.22it/s][A
Training:   9%|▉         | 9/98 [00:07<01:12,  1.23it/s][A
Training:  11%|█         | 11/98 [00:07<00:50,  1.74it/s][A
Training:  13%|█▎        | 13/98 [00:10<01:05,  1.29it/s][A
Training:  15%|█▌        | 15/98 [00:10<00:45,  1.81it/s][A
Training:  17%|█▋        | 17/98 [00:12<00:59,  1.36it/s][A
Training:  19%|█▉        | 19/98 [00:12<00:41,  1.90it/s][A
Training:  21%|██▏       | 21/98 [00:15<00:56,  1.36it/s][A
Training:  23%|██▎       | 23/98 [00:15<00:39,  1.88it/s][A
Training:  26%|██▌       | 25/98 [00:17<00:52,  1.40it/s][A
Training:  29%|██▊       | 28/98 [00:17<00:31,  2.22it/s][A
Training:  31%|███       | 30/98 [00:19<00:43,  1.57it/s][A
Training:  34%|███▎      | 33/98 [00:22<00:46,  1.41it/s][A
Training:  36%|███▌      | 35/98 [00

Epoch: 25/26 - Loss: 0.8841 - Accuracy: 0.9303



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.64s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.84it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.29it/s][A
 69%|██████▉   | 9/13 [00:07<00:03,  1.33it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.71it/s][A
Epochs:  96%|█████████▌| 25/26 [27:24<01:06, 66.49s/it]

Val Loss: 1.0244 - Val Accuracy: 0.9195



Training:   0%|          | 0/98 [00:00<?, ?it/s][A
Training:   1%|          | 1/98 [00:02<04:12,  2.60s/it][A
Training:   2%|▏         | 2/98 [00:02<01:50,  1.15s/it][A
Training:   4%|▍         | 4/98 [00:02<00:43,  2.16it/s][A
Training:   6%|▌         | 6/98 [00:05<01:10,  1.30it/s][A
Training:   8%|▊         | 8/98 [00:05<00:43,  2.08it/s][A
Training:  10%|█         | 10/98 [00:07<01:03,  1.38it/s][A
Training:  13%|█▎        | 13/98 [00:09<01:01,  1.38it/s][A
Training:  14%|█▍        | 14/98 [00:09<00:52,  1.59it/s][A
Training:  16%|█▋        | 16/98 [00:09<00:35,  2.28it/s][A
Training:  17%|█▋        | 17/98 [00:12<01:01,  1.32it/s][A
Training:  18%|█▊        | 18/98 [00:12<00:49,  1.62it/s][A
Training:  20%|██        | 20/98 [00:12<00:32,  2.39it/s][A
Training:  21%|██▏       | 21/98 [00:14<01:01,  1.24it/s][A
Training:  24%|██▍       | 24/98 [00:14<00:32,  2.28it/s][A
Training:  27%|██▋       | 26/98 [00:17<00:48,  1.49it/s][A
Training:  30%|██▉       | 29/98 [00:

Epoch: 26/26 - Loss: 0.8973 - Accuracy: 0.9293



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:31,  2.60s/it][A
 31%|███       | 4/13 [00:02<00:04,  1.85it/s][A
 46%|████▌     | 6/13 [00:04<00:05,  1.33it/s][A
 62%|██████▏   | 8/13 [00:04<00:02,  2.06it/s][A
 77%|███████▋  | 10/13 [00:06<00:02,  1.50it/s][A
 92%|█████████▏| 12/13 [00:07<00:00,  2.11it/s][A
100%|██████████| 13/13 [00:07<00:00,  1.78it/s][A
Epochs: 100%|██████████| 26/26 [28:30<00:00, 65.78s/it]
[32m[I 2023-12-14 14:38:18,731][0m Trial 18 finished with value: 0.9146185517311096 and parameters: {'learning_rate': 0.0012027201009631523, 'weight_decay': 0.0010233374008044094, 'epsilon': 5.814687699948533e-09, 'batch_size': 143, 'epochs': 26}. Best is trial 0 with value: 0.9341313242912292.[0m


Val Loss: 1.3634 - Val Accuracy: 0.9146
Learning rate: 0.005462467829433099
Weight decay: 0.00015276509357746738
Epsilon: 2.2222063767151552e-08
Batch size: 50
Number of epochs: 50


Epochs:   0%|          | 0/50 [00:00<?, ?it/s]
Training:   0%|          | 0/280 [00:00<?, ?it/s][A
Training:   0%|          | 1/280 [00:01<05:38,  1.21s/it][A
Training:   1%|          | 3/280 [00:01<01:39,  2.79it/s][A
Training:   2%|▏         | 5/280 [00:01<01:32,  2.98it/s][A
Training:   3%|▎         | 8/280 [00:02<00:49,  5.50it/s][A
Training:   4%|▎         | 10/280 [00:02<01:04,  4.17it/s][A
Training:   5%|▍         | 13/280 [00:03<01:09,  3.86it/s][A
Training:   6%|▌         | 16/280 [00:03<00:46,  5.71it/s][A
Training:   6%|▋         | 18/280 [00:04<00:56,  4.68it/s][A
Training:   8%|▊         | 21/280 [00:05<01:02,  4.15it/s][A
Training:   9%|▉         | 25/280 [00:06<00:56,  4.54it/s][A
Training:  10%|█         | 29/280 [00:06<00:53,  4.67it/s][A
Training:  12%|█▏        | 33/280 [00:07<00:52,  4.68it/s][A
Training:  13%|█▎        | 37/280 [00:08<00:48,  5.01it/s][A
Training:  15%|█▍        | 41/280 [00:09<00:49,  4.80it/s][A
Training:  16%|█▌        | 45/280 [0

Epoch: 1/50 - Loss: 13.7616 - Accuracy: 0.8528



  0%|          | 0/35 [00:00<?, ?it/s][A
  3%|▎         | 1/35 [00:01<00:39,  1.16s/it][A
 14%|█▍        | 5/35 [00:02<00:10,  2.79it/s][A
 26%|██▌       | 9/35 [00:02<00:06,  3.78it/s][A
 37%|███▋      | 13/35 [00:03<00:05,  4.28it/s][A
 49%|████▊     | 17/35 [00:04<00:03,  4.53it/s][A
 60%|██████    | 21/35 [00:05<00:02,  4.82it/s][A
 71%|███████▏  | 25/35 [00:05<00:02,  4.94it/s][A
 83%|████████▎ | 29/35 [00:06<00:01,  4.96it/s][A
100%|██████████| 35/35 [00:07<00:00,  4.72it/s][A
Epochs:   0%|          | 0/50 [01:04<?, ?it/s]
[32m[I 2023-12-14 14:39:23,712][0m Trial 19 pruned. [0m


Val Loss: 21.4897 - Val Accuracy: 0.8641

Study statistics: 
  Number of finished trials:  20
  Number of pruned trials:  12
  Number of complete trials:  8


In [25]:
# Report the trial that the Optuna study ranks as best: first its objective
# value, then every hyperparameter it was run with, one per line.
trial = study.best_trial
best_params = trial.params

print("Best trial:")
# Two positional arguments, so print() puts one extra space before the value.
print("  Value: ", trial.value)
print("  Params: ")
for name, setting in best_params.items():
    print(f"    {name}: {setting}")

Best trial:
  Value:  0.9341313242912292
  Params: 
    batch_size: 94
    epochs: 44
    epsilon: 6.245854766658258e-08
    learning_rate: 0.0007271403431439114
    weight_decay: 0.00017653547772457346


In [None]:
# ViT P8-S8 Triplet Mean

# Best trial:
# Value:  0.9341313242912292
# Params:
# batch_size: 94
# epochs: 44
# epsilon: 6.245854766658258e-08
# learning_rate: 0.0007271403431439114
# weight_decay: 0.00017653547772457346