In [1]:
import os
import random
import pandas as pd
import numpy as np
import mxnet as mx
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.utils.data import Dataset, DataLoader
from pytorch_metric_learning import losses
from einops import rearrange, repeat
import optuna
from optuna.trial import TrialState
from tqdm.notebook import tqdm

In [2]:
def file_to_embed(embeds, file):
    emb = []
    for f in file:
        emb.append(embeds[f][0])
    return torch.stack(emb)

In [3]:
MIN_NUM_PATCHES = 16

In [4]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=1)

In [5]:
class AdienceDataset(Dataset):
    def __init__(self, annot_file, img_dir, train=False):
        self.img_lbls = pd.read_csv(annot_file, header=None)
        self.img_dir = img_dir
        self.is_train = train
    
    def __len__(self):
        return len(self.img_lbls)
    
    def __getitem__(self, idx):
        img_file = self.img_lbls.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_file)
        image = mx.image.imread(img_path)
        if image.shape[1] != 112:
            image = mx.image.resize_short(image, 112)
        image = mx.nd.transpose(image, axes=(2,0,1))
        image = torch.tensor(image.asnumpy()).type(torch.FloatTensor)
        label = self.img_lbls.iloc[idx, 1]
        
        if self.is_train:
            positive_list = self.img_lbls[self.img_lbls.iloc[:, 1] == label].index.values
            positive_list = np.setdiff1d(positive_list, np.array([idx]))
            positive_item = random.choice(positive_list)
            positive_img = self.img_lbls.iloc[positive_item, 0]
            pos_img_path = os.path.join(self.img_dir, positive_img)
            pos_image = mx.image.imread(pos_img_path)
            if pos_image.shape[1] != 112:
                pos_image = mx.image.resize_short(pos_image, 112)
            pos_image = mx.nd.transpose(pos_image, axes=(2,0,1))
            pos_image = torch.tensor(pos_image.asnumpy()).type(torch.FloatTensor)
            
            negative_list = self.img_lbls[self.img_lbls.iloc[:, 1] != label].index.values
            negative_item = random.choice(negative_list)
            negative_img = self.img_lbls.iloc[negative_item, 0]
            neg_img_path = os.path.join(self.img_dir, negative_img)
            neg_image = mx.image.imread(neg_img_path)
            if neg_image.shape[1] != 112:
                neg_image = mx.image.resize_short(neg_image, 112)
            neg_image = mx.nd.transpose(neg_image, axes=(2,0,1))
            neg_image = torch.tensor(neg_image.asnumpy()).type(torch.FloatTensor)
            
            return image, pos_image, neg_image, label, img_file, positive_img, negative_img

        return image, label, img_file

In [6]:
train_data = AdienceDataset("../train.csv", "../cropped_Adience/", train=False)
val_data = AdienceDataset("../val.csv", "../cropped_Adience/", train=False)

In [20]:
train_data = AdienceDataset("../train.csv", "../cropped_Adience/", train=True)
val_data = AdienceDataset("../val.csv", "../cropped_Adience/", train=True)

In [7]:
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin
        
    def calc_euclidean(self, x1, x2):
        return (x1 - x2).pow(2).sum(1)
    
    def forward(self, anchor, positive, negative):
        distance_positive = self.calc_euclidean(anchor, positive)
        distance_negative = self.calc_euclidean(anchor, negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)

        return losses.mean()

In [8]:
class CombinedLoss(nn.Module):
    def __init__(self, beta=1.0):
        super(CombinedLoss, self).__init__()
        self.beta = beta
        self.triplet = TripletLoss(margin=1.0)
        self.classification = nn.CrossEntropyLoss()
        
    def forward(self, anchor, positive, negative, classification_out, labels):
        triplet_loss = self.triplet(anchor, positive, negative)
        classification_loss = self.classification(classification_out, labels)
        total_loss = (self.beta * triplet_loss) + classification_loss
        
        return total_loss

In [9]:
class CosFace(nn.Module):
    r"""Implement of CosFace (https://arxiv.org/pdf/1801.09414.pdf):
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        device_id: the ID of GPU where the model will be trained by model parallel.
                       if device_id=None, it will be trained on CPU without model parallel.
        s: norm of input feature
        m: margin
        cos(theta)-m
    """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.35):
        super(CosFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id
        self.s = s
        self.m = m
        print("self.device_id", self.device_id)
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------

        if self.device_id == None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])),
                                   dim=1)
        phi = cosine - self.m
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id != None:
            one_hot = one_hot.cuda(self.device_id[0])
        # one_hot = one_hot.cuda() if cosine.is_cuda else one_hot

        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + (
                    (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s

        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features = ' + str(self.in_features) \
               + ', out_features = ' + str(self.out_features) \
               + ', s = ' + str(self.s) \
               + ', m = ' + str(self.m) + ')'

In [10]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x

In [11]:
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

In [12]:
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [13]:
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)

        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        #embed()
        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
            dots.masked_fill_(~mask, mask_value)
            del mask

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)

        return out

In [14]:
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))
    def forward(self, x, mask = None):
        for attn, ff in self.layers:
            x = attn(x, mask = mask)
            #embed()
            x = ff(x)
        return x

In [15]:
class ViT_face(nn.Module):
    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, dim, depth, heads, mlp_dim, pool = 'mean', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.patch_size = patch_size

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID
        if self.loss_type == 'None':
            print("no loss for vit_face")
        else:
            if self.loss_type == 'CosFace':
                self.loss = CosFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)

    def forward(self, img, label=None, mask=None):
        p = self.patch_size
        
        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        x = self.transformer(x, mask)

        # y = x[:, 0]
        z = x[:, 1:].mean(dim = 1)

        # y = self.to_latent(y)
        # emb_y = self.mlp_head(y)
        z = self.to_latent(z)
        emb_z = self.mlp_head(z)
        # emb = torch.cat((emb_y, emb_z), dim=1)
        emb = emb_z
        if label is not None:
            x = self.loss(emb, label)
            return x, emb
        else:
            return emb

In [16]:
class ViT_plus(nn.Module):
    def __init__(self):
        super(ViT_plus, self).__init__()
        
        self.fc1 = nn.Linear(in_features=512, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=2)
        
    def forward(self, x):
        x = self.fc1(x)
        x_cosface = x
        x_classification = self.fc2(x)
        
        return x_cosface, x_classification

In [17]:
model = ViT_face(
            image_size=112,
            patch_size=8,
            loss_type='CosFace',
            GPU_ID= [device],
            num_class=93431,
            dim=512,
            depth=20,
            heads=8,
            mlp_dim=2048,
            dropout=0.1,
            emb_dropout=0.1
        ).to(device)
model.load_state_dict(
    torch.load("../Face-Transformer/results/ViT-P8S8_ms1m_cosface/Backbone_VIT_Epoch_2_Batch_20000_Time_2021-01-12-16-48_checkpoint.pth", map_location=device)
)

self.device_id [device(type='cuda', index=1)]


<All keys matched successfully>

In [18]:
for param in model.parameters():
    param.requires_grad = False

In [19]:
embeds = {}
model.eval()

with torch.no_grad():
    for img, _, file in train_data:
        img = img.to(device)
        embeds[file] = model(torch.unsqueeze(img, 0))

    for img, _, file in val_data:
        img = img.to(device)
        embeds[file] = model(torch.unsqueeze(img, 0))

In [21]:
best_accu = 0.9349909424781799
def objective(trial):
    model_xtr = ViT_plus().to(device)
    
    loss_lr = trial.suggest_float("loss_learning_rate", 1e-4, 1e-2, log=True)
    arc_margin = losses.ArcFaceLoss(2, 512).to(device)
    cos_margin = losses.CosFaceLoss(2, 512).to(device)
    loss_optimizer_1 = opt.AdamW(arc_margin.parameters(), lr=loss_lr)
    loss_optimizer_2 = opt.AdamW(cos_margin.parameters(), lr=loss_lr)
    
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    wd = trial.suggest_float('weight_decay', 1e-4, 1e-2, log=True)
    eps = trial.suggest_float("epsilon", 1e-9, 1e-7, log=True)
    optimizer = opt.AdamW(model_xtr.parameters(), lr=lr, eps=eps, weight_decay=wd)
    
    beta = trial.suggest_float("beta", 0.1, 1.0, step=0.1)
    gamma = trial.suggest_float("gamma", 0.1, 1.0, step=0.1)
    theta = trial.suggest_float("theta", 0.1, 1.0, step=0.1)
    criterion = CombinedLoss(beta=beta)
    
    batch_size = trial.suggest_int('batch_size', 50, 300)
    num_epochs = trial.suggest_int('epochs', 10, 100)
    
    print("Learning rate for Loss: "+ str(loss_lr))
    print("Learning rate: "+ str(lr))
    print("Weight decay: "+ str(wd))
    print("Epsilon: "+ str(eps))
    print("Beta: "+ str(beta))
    print("Gamma: "+ str(gamma))
    print("Theta: "+ str(theta))
    print("Batch size: "+ str(batch_size))
    print("Number of epochs: "+ str(num_epochs))
    
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
        val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4)
        
        # training loop
        running_loss = []
        running_accu = []
        
        model_xtr.train()
        for img, pos_img, neg_img, label, img_file, pos_file, neg_file in tqdm(train_loader, desc="Training", leave=False):
            img, pos_img, neg_img, label = img.to(device), pos_img.to(device), neg_img.to(device), label.to(device)

            x1 = file_to_embed(embeds, img_file)
            x2 = file_to_embed(embeds, pos_file)
            x3 = file_to_embed(embeds, neg_file)
            
            optimizer.zero_grad()
            anchor, output = model_xtr(x1)
            pos, _ = model_xtr(x2)
            neg, _ = model_xtr(x3)
            
            pred = torch.argmax(output, 1)
            accuracy = torch.eq(pred, label).sum() / len(img)

            class_triplet_loss = criterion(anchor, pos, neg, output, label)
            arc_loss = arc_margin(anchor, label)
            cos_loss = cos_margin(anchor, label)
            loss = (theta * cos_loss) + (gamma * arc_loss) + class_triplet_loss
            loss.backward()
            loss_optimizer_1.step()
            loss_optimizer_2.step()
            optimizer.step()

            running_accu.append(accuracy.cpu().detach().numpy())
            running_loss.append(loss.cpu().detach().numpy())
        print("Epoch: {}/{} - Loss: {:.4f} - Accuracy: {:.4f}".format(epoch+1, num_epochs, np.mean(running_loss), np.mean(running_accu)))
        
        # validation loop
        val_loss = []
        val_accu = []

        model_xtr.eval()
        with torch.no_grad():
            for img, pos_img, neg_img, label, img_file, pos_file, neg_file in tqdm(val_loader):
                img, pos_img, neg_img, label = img.to(device), pos_img.to(device), neg_img.to(device), label.to(device)
                
                x1 = file_to_embed(embeds, img_file)
                x2 = file_to_embed(embeds, pos_file)
                x3 = file_to_embed(embeds, neg_file)
                
                anchor, output = model_xtr(x1)
                pos, _ = model_xtr(x2)
                neg, _ = model_xtr(x3)
                
                pred = torch.argmax(output, 1)
                accuracy = torch.eq(pred, label).sum() / len(img)
                
                class_triplet_loss = criterion(anchor, pos, neg, output, label)
                arc_loss = arc_margin(anchor, label)
                cos_loss = cos_margin(anchor, label)
                loss = (theta * cos_loss) + (gamma * arc_loss) + class_triplet_loss
                
                val_accu.append(accuracy.cpu().detach().numpy())
                val_loss.append(loss.cpu().detach().numpy())
        val_accu = np.mean(val_accu)
        val_loss = np.mean(val_loss)
        print("Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(val_loss, val_accu))
        
        trial.report(val_accu, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    global best_accu
    if val_accu > best_accu:
        best_accu = val_accu
        print("Saving best model...")
        torch.save(model_xtr.state_dict(), "../vit_8-8_triplet_arcface_cosface_mean_only.pt")
            
    return val_accu

In [24]:
study = optuna.create_study(direction='maximize',
                            study_name='triplet-arcface-cosface-8-8-mean-only-vit-study',
                            storage='sqlite:///study1.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=5)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# Display the study statistics
print("\nStudy statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

[32m[I 2023-12-14 18:36:10,203][0m Using an existing study with name 'triplet-arcface-cosface-8-8-mean-only-vit-study' instead of creating a new one.[0m


Learning rate for Loss: 0.002459889186968526
Learning rate: 0.001298428103854156
Weight decay: 0.0014810737671563491
Epsilon: 6.9334420738657474e-09
Beta: 0.4
Gamma: 0.1
Theta: 0.8
Batch size: 136
Number of epochs: 85


Epochs:   0%|          | 0/85 [00:00<?, ?it/s]

Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 1/85 - Loss: 10.3047 - Accuracy: 0.8232


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 6.1831 - Val Accuracy: 0.8967


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 2/85 - Loss: 5.4787 - Accuracy: 0.9093


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.2036 - Val Accuracy: 0.9180


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 3/85 - Loss: 5.0828 - Accuracy: 0.9173


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5821 - Val Accuracy: 0.9285


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 4/85 - Loss: 4.9811 - Accuracy: 0.9193


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.7421 - Val Accuracy: 0.9233


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 5/85 - Loss: 4.7186 - Accuracy: 0.9213


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.5996 - Val Accuracy: 0.9082


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 6/85 - Loss: 4.4440 - Accuracy: 0.9274


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5889 - Val Accuracy: 0.9275


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 7/85 - Loss: 4.6728 - Accuracy: 0.9232


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4130 - Val Accuracy: 0.9290


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 8/85 - Loss: 4.5568 - Accuracy: 0.9245


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3241 - Val Accuracy: 0.9286


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 9/85 - Loss: 4.5744 - Accuracy: 0.9237


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.6026 - Val Accuracy: 0.9237


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 10/85 - Loss: 4.6116 - Accuracy: 0.9248


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4876 - Val Accuracy: 0.9269


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 11/85 - Loss: 4.2883 - Accuracy: 0.9321


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.8678 - Val Accuracy: 0.9108


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 12/85 - Loss: 4.2627 - Accuracy: 0.9300


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5946 - Val Accuracy: 0.9258


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 13/85 - Loss: 4.1763 - Accuracy: 0.9331


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.8699 - Val Accuracy: 0.9168


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 14/85 - Loss: 4.4202 - Accuracy: 0.9276


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.0699 - Val Accuracy: 0.9179


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 15/85 - Loss: 4.0982 - Accuracy: 0.9338


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.9564 - Val Accuracy: 0.9165


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 16/85 - Loss: 4.1902 - Accuracy: 0.9301


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4986 - Val Accuracy: 0.9165


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 17/85 - Loss: 4.0555 - Accuracy: 0.9338


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1696 - Val Accuracy: 0.9327


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 18/85 - Loss: 4.3774 - Accuracy: 0.9264


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3214 - Val Accuracy: 0.9294


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 19/85 - Loss: 4.4563 - Accuracy: 0.9271


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2560 - Val Accuracy: 0.9286


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 20/85 - Loss: 4.2290 - Accuracy: 0.9275


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2263 - Val Accuracy: 0.9280


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 21/85 - Loss: 4.2375 - Accuracy: 0.9293


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2289 - Val Accuracy: 0.9309


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 22/85 - Loss: 3.9249 - Accuracy: 0.9360


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4124 - Val Accuracy: 0.9251


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 23/85 - Loss: 4.0438 - Accuracy: 0.9325


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.8683 - Val Accuracy: 0.9186


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 24/85 - Loss: 4.0447 - Accuracy: 0.9354


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.7934 - Val Accuracy: 0.9170


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 25/85 - Loss: 4.3279 - Accuracy: 0.9297


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1676 - Val Accuracy: 0.9332


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 26/85 - Loss: 4.1543 - Accuracy: 0.9312


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5029 - Val Accuracy: 0.9257


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 27/85 - Loss: 4.0008 - Accuracy: 0.9330


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.9807 - Val Accuracy: 0.9181


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 28/85 - Loss: 3.9314 - Accuracy: 0.9336


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1082 - Val Accuracy: 0.9315


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 29/85 - Loss: 3.8707 - Accuracy: 0.9360


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4051 - Val Accuracy: 0.9276


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 30/85 - Loss: 4.0453 - Accuracy: 0.9336


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3453 - Val Accuracy: 0.9285


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 31/85 - Loss: 3.9533 - Accuracy: 0.9359


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3415 - Val Accuracy: 0.9298


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 32/85 - Loss: 4.1019 - Accuracy: 0.9305


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.0181 - Val Accuracy: 0.9322


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 33/85 - Loss: 4.0094 - Accuracy: 0.9328


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1257 - Val Accuracy: 0.9327


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 34/85 - Loss: 4.0189 - Accuracy: 0.9312


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4260 - Val Accuracy: 0.9251


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 35/85 - Loss: 3.8741 - Accuracy: 0.9359


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2023 - Val Accuracy: 0.9320


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 36/85 - Loss: 4.0375 - Accuracy: 0.9337


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.7971 - Val Accuracy: 0.9224


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 37/85 - Loss: 4.3798 - Accuracy: 0.9274


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.3352 - Val Accuracy: 0.9089


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 38/85 - Loss: 4.0610 - Accuracy: 0.9336


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.0877 - Val Accuracy: 0.9292


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 39/85 - Loss: 4.0566 - Accuracy: 0.9335


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3662 - Val Accuracy: 0.9265


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 40/85 - Loss: 3.9495 - Accuracy: 0.9342


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1013 - Val Accuracy: 0.9274


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 41/85 - Loss: 3.9979 - Accuracy: 0.9357


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4644 - Val Accuracy: 0.9348


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 42/85 - Loss: 4.1647 - Accuracy: 0.9332


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.3182 - Val Accuracy: 0.9084


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 43/85 - Loss: 4.0820 - Accuracy: 0.9307


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2722 - Val Accuracy: 0.9304


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 44/85 - Loss: 3.8621 - Accuracy: 0.9375


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3163 - Val Accuracy: 0.9290


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 45/85 - Loss: 3.8492 - Accuracy: 0.9359


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3103 - Val Accuracy: 0.9305


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 46/85 - Loss: 3.8899 - Accuracy: 0.9357


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4201 - Val Accuracy: 0.9281


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 47/85 - Loss: 3.8660 - Accuracy: 0.9349


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.6342 - Val Accuracy: 0.9253


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 48/85 - Loss: 4.0115 - Accuracy: 0.9333


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1705 - Val Accuracy: 0.9349


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 49/85 - Loss: 4.0302 - Accuracy: 0.9341


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3463 - Val Accuracy: 0.9298


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 50/85 - Loss: 3.7809 - Accuracy: 0.9377


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1829 - Val Accuracy: 0.9317


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 51/85 - Loss: 3.8820 - Accuracy: 0.9366


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2052 - Val Accuracy: 0.9286


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 52/85 - Loss: 4.1363 - Accuracy: 0.9329


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1172 - Val Accuracy: 0.9328


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 53/85 - Loss: 4.0604 - Accuracy: 0.9327


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1782 - Val Accuracy: 0.9345


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 54/85 - Loss: 3.9248 - Accuracy: 0.9349


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1478 - Val Accuracy: 0.9310


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 55/85 - Loss: 3.8585 - Accuracy: 0.9384


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1252 - Val Accuracy: 0.9344


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 56/85 - Loss: 3.9380 - Accuracy: 0.9343


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 5.4770 - Val Accuracy: 0.9167


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 57/85 - Loss: 3.8857 - Accuracy: 0.9376


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2672 - Val Accuracy: 0.9287


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 58/85 - Loss: 4.0278 - Accuracy: 0.9322


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.0634 - Val Accuracy: 0.9321


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 59/85 - Loss: 3.9304 - Accuracy: 0.9365


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3404 - Val Accuracy: 0.9293


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 60/85 - Loss: 3.7687 - Accuracy: 0.9371


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3377 - Val Accuracy: 0.9221


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 61/85 - Loss: 3.9120 - Accuracy: 0.9377


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.6131 - Val Accuracy: 0.9255


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 62/85 - Loss: 4.0011 - Accuracy: 0.9343


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1971 - Val Accuracy: 0.9280


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 63/85 - Loss: 4.0672 - Accuracy: 0.9315


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.6614 - Val Accuracy: 0.9222


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 64/85 - Loss: 3.9443 - Accuracy: 0.9358


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1966 - Val Accuracy: 0.9287


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 65/85 - Loss: 3.7947 - Accuracy: 0.9391


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.3846 - Val Accuracy: 0.9283


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 66/85 - Loss: 3.7777 - Accuracy: 0.9383


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.8076 - Val Accuracy: 0.9215


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 67/85 - Loss: 3.9496 - Accuracy: 0.9355


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5589 - Val Accuracy: 0.9269


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 68/85 - Loss: 3.7825 - Accuracy: 0.9370


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4669 - Val Accuracy: 0.9167


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 69/85 - Loss: 4.0324 - Accuracy: 0.9348


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.6189 - Val Accuracy: 0.9206


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 70/85 - Loss: 3.9867 - Accuracy: 0.9348


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1995 - Val Accuracy: 0.9332


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 71/85 - Loss: 3.8307 - Accuracy: 0.9378


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.1445 - Val Accuracy: 0.9290


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 72/85 - Loss: 3.7762 - Accuracy: 0.9383


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4622 - Val Accuracy: 0.9228


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 73/85 - Loss: 3.7435 - Accuracy: 0.9392


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.4296 - Val Accuracy: 0.9252


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 74/85 - Loss: 4.0281 - Accuracy: 0.9357


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.2281 - Val Accuracy: 0.9263


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 75/85 - Loss: 3.8190 - Accuracy: 0.9371


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 4.5630 - Val Accuracy: 0.9274


Training:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 76/85 - Loss: 3.8497 - Accuracy: 0.9363


  0%|          | 0/13 [00:00<?, ?it/s]

[32m[I 2023-12-14 19:15:49,613][0m Trial 15 pruned. [0m


Val Loss: 4.1669 - Val Accuracy: 0.9298
Learning rate for Loss: 0.0009980475590442442
Learning rate: 0.005660712121161554
Weight decay: 0.0002635035929503344
Epsilon: 2.5406084146161566e-08
Beta: 0.7000000000000001
Gamma: 0.4
Theta: 0.1
Batch size: 82
Number of epochs: 67


Epochs:   0%|          | 0/67 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 1/67 - Loss: 10.6019 - Accuracy: 0.8222


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 9.7049 - Val Accuracy: 0.8994


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 2/67 - Loss: 10.2187 - Accuracy: 0.9080


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 10.0386 - Val Accuracy: 0.9150


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 3/67 - Loss: 11.3384 - Accuracy: 0.8905


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 12.7840 - Val Accuracy: 0.8507


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 4/67 - Loss: 11.8642 - Accuracy: 0.9055


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 13.8714 - Val Accuracy: 0.8607


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 5/67 - Loss: 12.8892 - Accuracy: 0.9098


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 14.7530 - Val Accuracy: 0.9159


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 6/67 - Loss: 14.1773 - Accuracy: 0.9135


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 14.1480 - Val Accuracy: 0.8971


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 7/67 - Loss: 14.0526 - Accuracy: 0.9113


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 13.8827 - Val Accuracy: 0.8203


Training:   0%|          | 0/171 [00:00<?, ?it/s]

Epoch: 8/67 - Loss: 13.5868 - Accuracy: 0.9118


  0%|          | 0/22 [00:00<?, ?it/s]

[32m[I 2023-12-14 19:19:53,166][0m Trial 16 pruned. [0m


Val Loss: 13.2968 - Val Accuracy: 0.9156
Learning rate for Loss: 0.0025526305692198685
Learning rate: 8.903784902873777e-05
Weight decay: 0.0005806300863680938
Epsilon: 9.555074515186742e-08
Beta: 0.4
Gamma: 0.2
Theta: 0.6
Batch size: 164
Number of epochs: 38


Epochs:   0%|          | 0/38 [00:00<?, ?it/s]

Training:   0%|          | 0/86 [00:00<?, ?it/s]

Epoch: 1/38 - Loss: 18.3767 - Accuracy: 0.6446


  0%|          | 0/11 [00:00<?, ?it/s]

[32m[I 2023-12-14 19:20:22,579][0m Trial 17 pruned. [0m


Val Loss: 14.6289 - Val Accuracy: 0.7628
Learning rate for Loss: 0.000332911341794275
Learning rate: 0.0003825693752576565
Weight decay: 0.0012580399112685702
Epsilon: 1.0775249790421578e-09
Beta: 0.7000000000000001
Gamma: 0.5
Theta: 0.4
Batch size: 71
Number of epochs: 21


Epochs:   0%|          | 0/21 [00:00<?, ?it/s]

Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 1/21 - Loss: 13.3270 - Accuracy: 0.8097


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 7.9922 - Val Accuracy: 0.8829


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 2/21 - Loss: 6.3626 - Accuracy: 0.9126


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 5.6033 - Val Accuracy: 0.9230


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 3/21 - Loss: 5.2845 - Accuracy: 0.9229


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.9898 - Val Accuracy: 0.9240


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 4/21 - Loss: 4.8423 - Accuracy: 0.9214


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.3623 - Val Accuracy: 0.9298


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 5/21 - Loss: 4.3711 - Accuracy: 0.9276


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.4174 - Val Accuracy: 0.9281


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 6/21 - Loss: 4.1593 - Accuracy: 0.9289


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.0052 - Val Accuracy: 0.9315


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 7/21 - Loss: 4.1167 - Accuracy: 0.9270


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.9588 - Val Accuracy: 0.9270


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 8/21 - Loss: 3.9911 - Accuracy: 0.9292


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.9197 - Val Accuracy: 0.9305


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 9/21 - Loss: 3.6789 - Accuracy: 0.9324


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.6013 - Val Accuracy: 0.9315


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 10/21 - Loss: 3.6739 - Accuracy: 0.9311


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.6745 - Val Accuracy: 0.9326


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 11/21 - Loss: 3.5842 - Accuracy: 0.9332


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.6340 - Val Accuracy: 0.9309


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 12/21 - Loss: 3.5241 - Accuracy: 0.9351


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.4984 - Val Accuracy: 0.9326


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 13/21 - Loss: 3.5286 - Accuracy: 0.9334


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.7480 - Val Accuracy: 0.9241


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 14/21 - Loss: 3.5832 - Accuracy: 0.9316


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.9112 - Val Accuracy: 0.9226


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 15/21 - Loss: 3.4517 - Accuracy: 0.9358


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.9379 - Val Accuracy: 0.9283


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 16/21 - Loss: 3.6484 - Accuracy: 0.9313


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.5584 - Val Accuracy: 0.9360


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 17/21 - Loss: 3.6476 - Accuracy: 0.9334


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.0473 - Val Accuracy: 0.9202


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 18/21 - Loss: 3.4444 - Accuracy: 0.9350


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.5114 - Val Accuracy: 0.9292


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 19/21 - Loss: 3.4584 - Accuracy: 0.9344


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 3.4121 - Val Accuracy: 0.9337


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 20/21 - Loss: 3.4652 - Accuracy: 0.9342


  0%|          | 0/25 [00:00<?, ?it/s]

Val Loss: 4.7280 - Val Accuracy: 0.9106


Training:   0%|          | 0/197 [00:00<?, ?it/s]

Epoch: 21/21 - Loss: 3.5625 - Accuracy: 0.9343


  0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2023-12-14 19:30:49,476][0m Trial 18 finished with value: 0.9201922416687012 and parameters: {'loss_learning_rate': 0.000332911341794275, 'learning_rate': 0.0003825693752576565, 'weight_decay': 0.0012580399112685702, 'epsilon': 1.0775249790421578e-09, 'beta': 0.7000000000000001, 'gamma': 0.5, 'theta': 0.4, 'batch_size': 71, 'epochs': 21}. Best is trial 0 with value: 0.9349909424781799.[0m


Val Loss: 4.2118 - Val Accuracy: 0.9202
Learning rate for Loss: 0.008090726976531503
Learning rate: 6.511512087637365e-05
Weight decay: 0.00022943914110879211
Epsilon: 2.3726545160206325e-09
Beta: 0.1
Gamma: 0.2
Theta: 0.8
Batch size: 110
Number of epochs: 60


Epochs:   0%|          | 0/60 [00:00<?, ?it/s]

Training:   0%|          | 0/128 [00:00<?, ?it/s]

Epoch: 1/60 - Loss: 19.2282 - Accuracy: 0.5957


  0%|          | 0/16 [00:00<?, ?it/s]

[32m[I 2023-12-14 19:31:23,778][0m Trial 19 pruned. [0m


Val Loss: 11.2245 - Val Accuracy: 0.7441

Study statistics: 
  Number of finished trials:  20
  Number of pruned trials:  10
  Number of complete trials:  10


In [25]:
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value:  0.9349909424781799
  Params: 
    batch_size: 65
    beta: 0.4
    epochs: 20
    epsilon: 1.0709899116448299e-09
    gamma: 0.1
    learning_rate: 3.3017711667502184e-05
    loss_learning_rate: 0.003155816486445591
    theta: 0.5
    weight_decay: 0.0005152733886150015


In [None]:
# ViT P8-S8 Triplet ArcFace CosFace Mean

Best trial:
Value:  0.9349909424781799
Params: 
batch_size: 65
beta: 0.4
epochs: 20
epsilon: 1.0709899116448299e-09
gamma: 0.1
learning_rate: 3.3017711667502184e-05
loss_learning_rate: 0.003155816486445591
theta: 0.5
weight_decay: 0.0005152733886150015