In [1]:
import os
import math
import numpy as np
import pandas as pd
import mxnet as mx
from einops import rearrange, repeat
from pytorch_metric_learning import losses
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import optuna
from optuna.trial import TrialState

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
MIN_NUM_PATCHES = 16

In [4]:
class AdienceDataset(Dataset):
    def __init__(self, annot_file, img_dir):
        self.img_lbls = pd.read_csv(annot_file, header=None)
        self.img_dir = img_dir
    
    def __len__(self):
        return len(self.img_lbls)
    
    def __getitem__(self, idx):
        img_file = self.img_lbls.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_file)
        image = mx.image.imread(img_path)
        if image.shape[1] != 112:
            image = mx.image.resize_short(image, 112)
        image = mx.nd.transpose(image, axes=(2,0,1))
        image = torch.tensor(image.asnumpy()).type(torch.FloatTensor)
        label = self.img_lbls.iloc[idx, 1]

        return image, label

In [5]:
train_data = AdienceDataset("../train.csv", "../cropped_Adience/")
test_data = AdienceDataset("../test.csv", "../cropped_Adience/")
val_data = AdienceDataset("../val.csv", "../cropped_Adience/")

In [6]:
class CosFace(nn.Module):
    r"""Implement of CosFace (https://arxiv.org/pdf/1801.09414.pdf):
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        device_id: the ID of GPU where the model will be trained by model parallel.
                       if device_id=None, it will be trained on CPU without model parallel.
        s: norm of input feature
        m: margin
        cos(theta)-m
    """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.35):
        super(CosFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id
        self.s = s
        self.m = m
        print("self.device_id", self.device_id)
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------

        if self.device_id == None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])),
                                   dim=1)
        phi = cosine - self.m
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id != None:
            one_hot = one_hot.cuda(self.device_id[0])
        # one_hot = one_hot.cuda() if cosine.is_cuda else one_hot

        one_hot.scatter_(1, label.cuda(self.device_id[0]).view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + (
                    (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s

        return output

    def __repr__(self):
        return self.__class__.__name__ + '(' \
               + 'in_features = ' + str(self.in_features) \
               + ', out_features = ' + str(self.out_features) \
               + ', s = ' + str(self.s) \
               + ', m = ' + str(self.m) + ')'

In [7]:
class ArcFace(nn.Module):
    r"""Implement of ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf):
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            device_id: the ID of GPU where the model will be trained by model parallel.
                       if device_id=None, it will be trained on CPU without model parallel.
            s: norm of input feature
            m: margin
            cos(theta+m)
        """

    def __init__(self, in_features, out_features, device_id, s=64.0, m=0.50, easy_margin=False):
        super(ArcFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.device_id = device_id

        self.s = s
        self.m = m

        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        if self.device_id == None:
            cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        else:
            x = input
            sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
            temp_x = x.cuda(self.device_id[0])
            weight = sub_weights[0].cuda(self.device_id[0])
            cosine = F.linear(F.normalize(temp_x), F.normalize(weight))
            for i in range(1, len(self.device_id)):
                temp_x = x.cuda(self.device_id[i])
                weight = sub_weights[i].cuda(self.device_id[i])
                cosine = torch.cat((cosine, F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(self.device_id[0])),
                                   dim=1)
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size())
        if self.device_id != None:
            one_hot = one_hot.cuda(self.device_id[0])
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + (
                    (1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s

        return output

In [8]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x

In [9]:
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

In [10]:
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [11]:
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask = None):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)

        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
        mask_value = -torch.finfo(dots.dtype).max
        #embed()
        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
            dots.masked_fill_(~mask, mask_value)
            del mask

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)

        return out

In [12]:
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))
    def forward(self, x, mask = None):
        for attn, ff in self.layers:
            x = attn(x, mask = mask)
            #embed()
            x = ff(x)
        return x

In [13]:
class ViT_face(nn.Module):
    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.patch_size = patch_size

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID
        if self.loss_type == 'None':
            print("no loss for vit_face")
        else:
            if self.loss_type == 'Softmax':
                self.loss = Softmax(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'CosFace':
                self.loss = CosFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'ArcFace':
                self.loss = ArcFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'SFace':
                self.loss = SFaceLoss(in_features=dim, out_features=num_class, device_id=self.GPU_ID)

    def forward(self, img, label=None, mask=None):
        p = self.patch_size
        
        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        x = self.transformer(x, mask)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        emb = self.mlp_head(x)
        if label is not None:
            x = self.loss(emb, label)
            return x, emb
        else:
            return emb

In [14]:
class ViTs_face(nn.Module):
    def __init__(self, *, loss_type, GPU_ID, num_class, image_size, patch_size, ac_patch_size,
                         pad, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * ac_patch_size ** 2
        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective (at least 16). Try decreasing your patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.patch_size = patch_size
        self.soft_split = nn.Unfold(kernel_size=(ac_patch_size, ac_patch_size), stride=(self.patch_size, self.patch_size), padding=(pad, pad))


        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
        )
        self.loss_type = loss_type
        self.GPU_ID = GPU_ID
        if self.loss_type == 'None':
            print("no loss for vit_face")
        else:
            if self.loss_type == 'Softmax':
                self.loss = Softmax(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'CosFace':
                self.loss = CosFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'ArcFace':
                self.loss = ArcFace(in_features=dim, out_features=num_class, device_id=self.GPU_ID)
            elif self.loss_type == 'SFace':
                self.loss = SFaceLoss(in_features=dim, out_features=num_class, device_id=self.GPU_ID)

    def forward(self, img, label= None , mask = None):
        p = self.patch_size
        x = self.soft_split(img).transpose(1, 2)
        x = self.patch_to_embedding(x)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        x = self.transformer(x, mask)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        emb = self.mlp_head(x)
        if label is not None:
            x = self.loss(emb, label)
            return x, emb
        else:
            return emb

In [15]:
model = ViT_face(
            image_size=112,
            patch_size=8,
            loss_type='CosFace',
            GPU_ID= [device],
            num_class=93431,
            dim=512,
            depth=20,
            heads=8,
            mlp_dim=2048,
            dropout=0.1,
            emb_dropout=0.1
        )
model.load_state_dict(
    torch.load("../Face-Transformer/results/ViT-P8S8_ms1m_cosface/Backbone_VIT_Epoch_2_Batch_20000_Time_2021-01-12-16-48_checkpoint.pth", map_location=device)
)

self.device_id [device(type='cuda', index=0)]


<All keys matched successfully>

In [16]:
for param in model.parameters():
    param.requires_grad = False

model = nn.Sequential(
    model,
    nn.Linear(512, 512),
).to(device)

In [17]:
best_accu = 0.0
def objective(trial):
    loss_lr = trial.suggest_float("loss_learning_rate", 1e-4, 1e-2, log=True)
    arc_margin = ArcFace(512, 2, [device]).to(device)
    loss_optimizer = opt.AdamW(arc_margin.parameters(), lr=loss_lr)
    
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    wd = trial.suggest_float("weight_decay", 1e-3, 1e-1, log=True)
    eps = trial.suggest_float("epsilon", 1e-9, 1e-6, log=True)
    optimizer = opt.AdamW(filter(lambda p : p.requires_grad, model.parameters()), lr=lr, weight_decay=wd, eps=eps)
    criterion = nn.CrossEntropyLoss()
    
    bs = trial.suggest_int("batch_size", 50, 300)
    num_epochs = trial.suggest_int("epochs", 10, 50)
    
    print("Learning rate for Loss: "+ str(loss_lr))
    print("Learning rate: "+ str(lr))
    print("Weight decay: "+ str(wd))
    print("Epsilon: "+ str(eps))
    print("Batch size: "+ str(bs))
    print("Number of epochs: "+ str(num_epochs))
    
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loader = DataLoader(train_data, batch_size=bs, shuffle=True, num_workers=4)
        val_loader = DataLoader(val_data, batch_size=bs, shuffle=False, num_workers=4)


        model.train()
        running_loss = []
        running_accu = []
        
        for img, label in tqdm(train_loader, desc="Training", leave=False):
            img, label = img.to(device), label.to(device)

            emb = model(img)
            arc_out = arc_margin(emb, label)
            loss = criterion(arc_out, label)

            pred = torch.argmax(arc_out, 1)
            accuracy = torch.eq(pred, label).sum() / len(img)

            optimizer.zero_grad()
            loss.backward()
            loss_optimizer.step()
            optimizer.step()

            running_accu.append(accuracy.cpu().detach().numpy())
            running_loss.append(loss.cpu().detach().numpy())
        print("Epoch: {}/{} - Loss: {:.4f} - Accuracy: {:.4f}".format(epoch+1, num_epochs, np.mean(running_loss), np.mean(running_accu)))
        

        model.eval()
        val_loss = []
        val_accu = []
        
        with torch.no_grad():
            for img, label in tqdm(val_loader):
                img, label = img.to(device), label.to(device)
                
                embed = model(img)
                out = arc_margin(embed, label)
                loss = criterion(out, label)
                
                pred = torch.argmax(out, 1)
                accuracy = torch.eq(pred, label).sum() / len(img)
                
                val_accu.append(accuracy.cpu().detach().numpy())
                val_loss.append(loss.cpu().detach().numpy())
        val_accu = np.mean(val_accu)
        val_loss = np.mean(val_loss)
        print("Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(val_loss, val_accu))
        
        trial.report(val_accu, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    global best_accu
    if val_accu > best_accu:
        best_accu = val_accu
        print("Saving best model...")
        torch.save(model.state_dict(), "../vit_arcface.pt")
            
    return val_accu

In [18]:
study = optuna.create_study(direction='maximize',
                            study_name='vit-arcface-study',
                            storage='sqlite:///study3.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=5)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# Display the study statistics
print("\nStudy statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

[32m[I 2023-11-27 13:42:23,766][0m Using an existing study with name 'vit-arcface-study' instead of creating a new one.[0m


Learning rate for Loss: 0.005717229561247942
Learning rate: 5.095612467690143e-05
Weight decay: 0.01865088689760723
Epsilon: 3.45520088694907e-07
Batch size: 256
Number of epochs: 35


Epochs:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/35 - Loss: 20.1762 - Accuracy: 0.0800


  0%|          | 0/7 [00:00<?, ?it/s]

[32m[I 2023-11-27 13:43:45,604][0m Trial 7 pruned. [0m


Val Loss: 15.4464 - Val Accuracy: 0.1647
Learning rate for Loss: 0.004294472911099488
Learning rate: 0.0018282933871611087
Weight decay: 0.07988837973986813
Epsilon: 1.6683513303232733e-07
Batch size: 229
Number of epochs: 47


Epochs:   0%|          | 0/47 [00:00<?, ?it/s]

Training:   0%|          | 0/62 [00:00<?, ?it/s]

Epoch: 1/47 - Loss: 7.6777 - Accuracy: 0.6627


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-11-27 13:45:07,434][0m Trial 8 pruned. [0m


Val Loss: 5.5452 - Val Accuracy: 0.7758
Learning rate for Loss: 0.00025463771956104177
Learning rate: 0.0026765360027718437
Weight decay: 0.056523950360720936
Epsilon: 3.2933290504782507e-07
Batch size: 139
Number of epochs: 29


Epochs:   0%|          | 0/29 [00:00<?, ?it/s]

Training:   0%|          | 0/101 [00:00<?, ?it/s]

Epoch: 1/29 - Loss: 5.8805 - Accuracy: 0.6999


  0%|          | 0/13 [00:00<?, ?it/s]

[32m[I 2023-11-27 13:46:29,495][0m Trial 9 pruned. [0m


Val Loss: 3.9869 - Val Accuracy: 0.7727
Learning rate for Loss: 0.0003827337885935781
Learning rate: 0.030128559432486882
Weight decay: 0.03172068493230515
Epsilon: 6.397755245591623e-07
Batch size: 246
Number of epochs: 43


Epochs:   0%|          | 0/43 [00:00<?, ?it/s]

Training:   0%|          | 0/57 [00:00<?, ?it/s]

Epoch: 1/43 - Loss: 5.1453 - Accuracy: 0.7088


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-11-27 13:47:51,293][0m Trial 10 pruned. [0m


Val Loss: 3.7390 - Val Accuracy: 0.7310
Learning rate for Loss: 0.001981613227973426
Learning rate: 0.09789105936796073
Weight decay: 0.08670879201316586
Epsilon: 2.2561577848103715e-07
Batch size: 181
Number of epochs: 50


Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Training:   0%|          | 0/78 [00:00<?, ?it/s]

Epoch: 1/50 - Loss: 4.9648 - Accuracy: 0.6925


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2023-11-27 13:49:12,499][0m Trial 11 pruned. [0m


Val Loss: 3.6686 - Val Accuracy: 0.7180

Study statistics: 
  Number of finished trials:  12
  Number of pruned trials:  5
  Number of complete trials:  5


In [19]:
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value:  0.8180497288703918
  Params: 
    batch_size: 282
    epochs: 34
    epsilon: 8.098612014260704e-08
    learning_rate: 0.0010250459833447548
    loss_learning_rate: 0.0005050136792180132
    weight_decay: 0.04781824979310061


In [None]:
# ViT P8-S8 CosFace

Best trial:
Value:  0.844062328338623
Params: 
batch_size: 239
epochs: 10
epsilon: 8.937985831226569e-07
learning_rate: 0.0005430849641573857
loss_learning_rate: 0.0031328725979152287
weight_decay: 0.006123139639234927

In [None]:
# ViT P8-S8 ArcFace

Best trial:
Value:  0.8180497288703918
Params: 
batch_size: 282
epochs: 34
epsilon: 8.098612014260704e-08
learning_rate: 0.0010250459833447548
loss_learning_rate: 0.0005050136792180132
weight_decay: 0.04781824979310061

In [15]:
model = ViTs_face(
            loss_type='CosFace',
            GPU_ID=[device],
            num_class=93431,
            image_size=112,
            patch_size=8,
            ac_patch_size=12,
            pad=4,
            dim=512,
            depth=20,
            heads=8,
            mlp_dim=2048,
            dropout=0.1,
            emb_dropout=0.1
        )
model.load_state_dict(
    torch.load("../Face-Transformer/results/ViT-P12S8_ms1m_cosface/Backbone_VITs_Epoch_2_Batch_12000_Time_2021-03-17-04-05_checkpoint.pth", map_location=device)
)

self.device_id [device(type='cuda', index=0)]


<All keys matched successfully>

In [16]:
for param in model.parameters():
    param.requires_grad = False

model = nn.Sequential(
    model,
    nn.Linear(512, 512),
).to(device)

In [17]:
best_accu = 0.0
def objective(trial):
    loss_lr = trial.suggest_float("loss_learning_rate", 1e-4, 1e-2, log=True)
    arc_margin = ArcFace(512, 2, [device]).to(device)
    loss_optimizer = opt.AdamW(arc_margin.parameters(), lr=loss_lr)
    
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    wd = trial.suggest_float("weight_decay", 1e-3, 1e-1, log=True)
    eps = trial.suggest_float("epsilon", 1e-9, 1e-6, log=True)
    optimizer = opt.AdamW(filter(lambda p : p.requires_grad, model.parameters()), lr=lr, weight_decay=wd, eps=eps)
    criterion = nn.CrossEntropyLoss()
    
    bs = trial.suggest_int("batch_size", 50, 300)
    num_epochs = trial.suggest_int("epochs", 10, 50)
    
    print("Learning rate for Loss: "+ str(loss_lr))
    print("Learning rate: "+ str(lr))
    print("Weight decay: "+ str(wd))
    print("Epsilon: "+ str(eps))
    print("Batch size: "+ str(bs))
    print("Number of epochs: "+ str(num_epochs))
    
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        train_loader = DataLoader(train_data, batch_size=bs, shuffle=True, num_workers=4)
        val_loader = DataLoader(val_data, batch_size=bs, shuffle=False, num_workers=4)


        model.train()
        running_loss = []
        running_accu = []
        
        for img, label in tqdm(train_loader, desc="Training", leave=False):
            img, label = img.to(device), label.to(device)

            emb = model(img)
            arc_out = arc_margin(emb, label)
            loss = criterion(arc_out, label)

            pred = torch.argmax(arc_out, 1)
            accuracy = torch.eq(pred, label).sum() / len(img)

            optimizer.zero_grad()
            loss.backward()
            loss_optimizer.step()
            optimizer.step()

            running_accu.append(accuracy.cpu().detach().numpy())
            running_loss.append(loss.cpu().detach().numpy())
        print("Epoch: {}/{} - Loss: {:.4f} - Accuracy: {:.4f}".format(epoch+1, num_epochs, np.mean(running_loss), np.mean(running_accu)))
        

        model.eval()
        val_loss = []
        val_accu = []
        
        with torch.no_grad():
            for img, label in tqdm(val_loader):
                img, label = img.to(device), label.to(device)
                
                embed = model(img)
                out = arc_margin(embed, label)
                loss = criterion(out, label)
                
                pred = torch.argmax(out, 1)
                accuracy = torch.eq(pred, label).sum() / len(img)
                
                val_accu.append(accuracy.cpu().detach().numpy())
                val_loss.append(loss.cpu().detach().numpy())
        val_accu = np.mean(val_accu)
        val_loss = np.mean(val_loss)
        print("Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(val_loss, val_accu))
        
        trial.report(val_accu, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    global best_accu
    if val_accu > best_accu:
        best_accu = val_accu
        print("Saving best model...")
        torch.save(model.state_dict(), "../vit_12-8_arcface.pt")
            
    return val_accu

In [18]:
study = optuna.create_study(direction='maximize',
                            study_name='vit-arcface-12-study',
                            storage='sqlite:///study4.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=10)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

# Display the study statistics
print("\nStudy statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

[32m[I 2023-11-27 15:11:03,456][0m A new study created in RDB with name: vit-arcface-12-study[0m


Learning rate for Loss: 0.0005287012966014011
Learning rate: 3.9480188370206865e-05
Weight decay: 0.018861547338784878
Epsilon: 1.0993114732445902e-09
Batch size: 55
Number of epochs: 17


Epochs:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 1/17 - Loss: 16.2934 - Accuracy: 0.2291


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 9.1203 - Val Accuracy: 0.4766


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 2/17 - Loss: 8.8014 - Accuracy: 0.5493


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 7.0444 - Val Accuracy: 0.6107


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 3/17 - Loss: 7.5018 - Accuracy: 0.6444


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 6.2802 - Val Accuracy: 0.6759


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 4/17 - Loss: 6.9104 - Accuracy: 0.6852


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 5.7905 - Val Accuracy: 0.7104


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 5/17 - Loss: 6.6146 - Accuracy: 0.7156


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 5.5101 - Val Accuracy: 0.7360


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 6/17 - Loss: 6.2994 - Accuracy: 0.7362


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 5.3264 - Val Accuracy: 0.7599


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 7/17 - Loss: 6.2009 - Accuracy: 0.7416


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 5.1267 - Val Accuracy: 0.7741


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 8/17 - Loss: 6.2170 - Accuracy: 0.7569


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 5.0090 - Val Accuracy: 0.7849


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 9/17 - Loss: 6.0614 - Accuracy: 0.7626


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.9080 - Val Accuracy: 0.7946


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 10/17 - Loss: 5.9873 - Accuracy: 0.7680


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.8780 - Val Accuracy: 0.7991


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 11/17 - Loss: 5.9745 - Accuracy: 0.7751


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.8371 - Val Accuracy: 0.8065


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 12/17 - Loss: 5.6956 - Accuracy: 0.7831


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.8089 - Val Accuracy: 0.8139


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 13/17 - Loss: 5.8513 - Accuracy: 0.7830


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.7907 - Val Accuracy: 0.8156


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 14/17 - Loss: 5.7114 - Accuracy: 0.7847


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.7178 - Val Accuracy: 0.8202


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 15/17 - Loss: 5.7199 - Accuracy: 0.7897


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.7174 - Val Accuracy: 0.8241


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 16/17 - Loss: 5.6384 - Accuracy: 0.7854


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.6688 - Val Accuracy: 0.8263


Training:   0%|          | 0/255 [00:00<?, ?it/s]

Epoch: 17/17 - Loss: 5.7067 - Accuracy: 0.7937


  0%|          | 0/32 [00:00<?, ?it/s]

Val Loss: 4.6202 - Val Accuracy: 0.8261
Saving best model...


[32m[I 2023-11-27 15:36:18,338][0m Trial 0 finished with value: 0.8260653018951416 and parameters: {'loss_learning_rate': 0.0005287012966014011, 'learning_rate': 3.9480188370206865e-05, 'weight_decay': 0.018861547338784878, 'epsilon': 1.0993114732445902e-09, 'batch_size': 55, 'epochs': 17}. Best is trial 0 with value: 0.8260653018951416.[0m


Learning rate for Loss: 0.0064512419663663905
Learning rate: 8.414269697509338e-05
Weight decay: 0.002079896551324374
Epsilon: 2.817232586541786e-08
Batch size: 122
Number of epochs: 37


Epochs:   0%|          | 0/37 [00:00<?, ?it/s]

Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 1/37 - Loss: 6.3350 - Accuracy: 0.7515


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.8179 - Val Accuracy: 0.8301


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 2/37 - Loss: 5.5334 - Accuracy: 0.7980


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.7740 - Val Accuracy: 0.8290


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 3/37 - Loss: 5.6169 - Accuracy: 0.8015


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.6720 - Val Accuracy: 0.8330


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 4/37 - Loss: 5.6007 - Accuracy: 0.8013


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.5946 - Val Accuracy: 0.8351


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 5/37 - Loss: 5.3330 - Accuracy: 0.8090


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.6142 - Val Accuracy: 0.8373


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 6/37 - Loss: 5.4387 - Accuracy: 0.8062


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.5847 - Val Accuracy: 0.8357


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 7/37 - Loss: 5.2645 - Accuracy: 0.8055


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.5229 - Val Accuracy: 0.8397


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 8/37 - Loss: 5.3715 - Accuracy: 0.8060


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.5367 - Val Accuracy: 0.8412


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 9/37 - Loss: 5.1702 - Accuracy: 0.8112


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4848 - Val Accuracy: 0.8421


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 10/37 - Loss: 5.2484 - Accuracy: 0.8111


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4949 - Val Accuracy: 0.8423


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 11/37 - Loss: 5.2093 - Accuracy: 0.8120


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4257 - Val Accuracy: 0.8450


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 12/37 - Loss: 5.2401 - Accuracy: 0.8130


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4533 - Val Accuracy: 0.8480


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 13/37 - Loss: 5.1778 - Accuracy: 0.8157


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4581 - Val Accuracy: 0.8410


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 14/37 - Loss: 5.0849 - Accuracy: 0.8170


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4556 - Val Accuracy: 0.8450


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 15/37 - Loss: 5.3076 - Accuracy: 0.8117


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4661 - Val Accuracy: 0.8441


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 16/37 - Loss: 5.1720 - Accuracy: 0.8125


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4702 - Val Accuracy: 0.8443


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 17/37 - Loss: 5.0895 - Accuracy: 0.8118


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.4274 - Val Accuracy: 0.8461


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 18/37 - Loss: 5.1460 - Accuracy: 0.8190


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.3926 - Val Accuracy: 0.8546


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 19/37 - Loss: 5.1829 - Accuracy: 0.8158


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.3186 - Val Accuracy: 0.8546


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 20/37 - Loss: 5.0998 - Accuracy: 0.8188


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.3058 - Val Accuracy: 0.8535


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 21/37 - Loss: 5.0486 - Accuracy: 0.8208


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.3224 - Val Accuracy: 0.8535


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 22/37 - Loss: 5.0751 - Accuracy: 0.8187


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2797 - Val Accuracy: 0.8506


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 23/37 - Loss: 5.1171 - Accuracy: 0.8198


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2157 - Val Accuracy: 0.8523


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 24/37 - Loss: 5.1451 - Accuracy: 0.8201


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2442 - Val Accuracy: 0.8517


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 25/37 - Loss: 5.1197 - Accuracy: 0.8214


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2486 - Val Accuracy: 0.8566


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 26/37 - Loss: 5.0046 - Accuracy: 0.8202


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1922 - Val Accuracy: 0.8564


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 27/37 - Loss: 4.9511 - Accuracy: 0.8214


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1788 - Val Accuracy: 0.8586


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 28/37 - Loss: 5.0457 - Accuracy: 0.8208


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2171 - Val Accuracy: 0.8557


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 29/37 - Loss: 4.9995 - Accuracy: 0.8220


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1688 - Val Accuracy: 0.8526


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 30/37 - Loss: 5.0402 - Accuracy: 0.8244


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1481 - Val Accuracy: 0.8546


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 31/37 - Loss: 4.9404 - Accuracy: 0.8248


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1507 - Val Accuracy: 0.8544


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 32/37 - Loss: 4.9407 - Accuracy: 0.8258


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1829 - Val Accuracy: 0.8561


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 33/37 - Loss: 4.9928 - Accuracy: 0.8233


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1139 - Val Accuracy: 0.8595


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 34/37 - Loss: 5.0129 - Accuracy: 0.8243


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1480 - Val Accuracy: 0.8572


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 35/37 - Loss: 4.9946 - Accuracy: 0.8230


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.1669 - Val Accuracy: 0.8602


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 36/37 - Loss: 4.9277 - Accuracy: 0.8289


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2142 - Val Accuracy: 0.8612


Training:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch: 37/37 - Loss: 4.9341 - Accuracy: 0.8271


  0%|          | 0/15 [00:00<?, ?it/s]

Val Loss: 4.2400 - Val Accuracy: 0.8595
Saving best model...


[32m[I 2023-11-27 16:27:44,639][0m Trial 1 finished with value: 0.8595185875892639 and parameters: {'loss_learning_rate': 0.0064512419663663905, 'learning_rate': 8.414269697509338e-05, 'weight_decay': 0.002079896551324374, 'epsilon': 2.817232586541786e-08, 'batch_size': 122, 'epochs': 37}. Best is trial 1 with value: 0.8595185875892639.[0m


Learning rate for Loss: 0.002048066701138052
Learning rate: 0.002294502691943137
Weight decay: 0.07883431997632344
Epsilon: 1.9496876915079775e-09
Batch size: 111
Number of epochs: 26


Epochs:   0%|          | 0/26 [00:00<?, ?it/s]

Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 1/26 - Loss: 6.0900 - Accuracy: 0.7646


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 4.3831 - Val Accuracy: 0.8440


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 2/26 - Loss: 5.2255 - Accuracy: 0.8107


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 3.8396 - Val Accuracy: 0.8488


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 3/26 - Loss: 4.4939 - Accuracy: 0.8185


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 3.5347 - Val Accuracy: 0.8472


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 4/26 - Loss: 3.9937 - Accuracy: 0.8119


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 3.0705 - Val Accuracy: 0.8608


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 5/26 - Loss: 3.6514 - Accuracy: 0.8128


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.7847 - Val Accuracy: 0.8509


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 6/26 - Loss: 3.3695 - Accuracy: 0.8059


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.4639 - Val Accuracy: 0.8481


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 7/26 - Loss: 3.0959 - Accuracy: 0.8012


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.4029 - Val Accuracy: 0.8333


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 8/26 - Loss: 2.9156 - Accuracy: 0.7917


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2912 - Val Accuracy: 0.8324


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 9/26 - Loss: 2.6530 - Accuracy: 0.7932


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.1768 - Val Accuracy: 0.8269


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 10/26 - Loss: 2.5208 - Accuracy: 0.7857


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.0997 - Val Accuracy: 0.8315


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 11/26 - Loss: 2.5407 - Accuracy: 0.7771


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.1007 - Val Accuracy: 0.8124


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 12/26 - Loss: 2.5706 - Accuracy: 0.7701


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.1418 - Val Accuracy: 0.8089


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 13/26 - Loss: 2.6316 - Accuracy: 0.7633


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2120 - Val Accuracy: 0.8071


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 14/26 - Loss: 2.6893 - Accuracy: 0.7641


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2083 - Val Accuracy: 0.8104


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 15/26 - Loss: 2.6903 - Accuracy: 0.7757


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.1800 - Val Accuracy: 0.8178


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 16/26 - Loss: 2.7523 - Accuracy: 0.7777


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2549 - Val Accuracy: 0.8226


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 17/26 - Loss: 2.8141 - Accuracy: 0.7868


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.3845 - Val Accuracy: 0.8300


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 18/26 - Loss: 2.7892 - Accuracy: 0.7931


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.3827 - Val Accuracy: 0.8330


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 19/26 - Loss: 2.8304 - Accuracy: 0.8040


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.3685 - Val Accuracy: 0.8480


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 20/26 - Loss: 2.8143 - Accuracy: 0.8110


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2975 - Val Accuracy: 0.8534


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 21/26 - Loss: 2.8504 - Accuracy: 0.8076


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2861 - Val Accuracy: 0.8388


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 22/26 - Loss: 2.8325 - Accuracy: 0.8040


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2098 - Val Accuracy: 0.8452


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 23/26 - Loss: 2.8429 - Accuracy: 0.8051


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2880 - Val Accuracy: 0.8405


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 24/26 - Loss: 2.7943 - Accuracy: 0.8074


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.1888 - Val Accuracy: 0.8476


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 25/26 - Loss: 2.7256 - Accuracy: 0.8087


  0%|          | 0/16 [00:00<?, ?it/s]

Val Loss: 2.2000 - Val Accuracy: 0.8500


Training:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch: 26/26 - Loss: 2.7727 - Accuracy: 0.7960


  0%|          | 0/16 [00:00<?, ?it/s]

[32m[I 2023-11-27 17:04:23,924][0m Trial 2 finished with value: 0.829969048500061 and parameters: {'loss_learning_rate': 0.002048066701138052, 'learning_rate': 0.002294502691943137, 'weight_decay': 0.07883431997632344, 'epsilon': 1.9496876915079775e-09, 'batch_size': 111, 'epochs': 26}. Best is trial 1 with value: 0.8595185875892639.[0m


Val Loss: 2.2080 - Val Accuracy: 0.8300
Learning rate for Loss: 0.005105394341604069
Learning rate: 0.011774973968694286
Weight decay: 0.01974560386402252
Epsilon: 2.158051575526208e-09
Batch size: 80
Number of epochs: 14


Epochs:   0%|          | 0/14 [00:00<?, ?it/s]

Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 1/14 - Loss: 4.5888 - Accuracy: 0.7416


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.5438 - Val Accuracy: 0.7857


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 2/14 - Loss: 2.8446 - Accuracy: 0.7644


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.3006 - Val Accuracy: 0.8031


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 3/14 - Loss: 2.6955 - Accuracy: 0.7614


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.2309 - Val Accuracy: 0.8042


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 4/14 - Loss: 2.6264 - Accuracy: 0.7705


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.0615 - Val Accuracy: 0.8022


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 5/14 - Loss: 2.7216 - Accuracy: 0.7566


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.1703 - Val Accuracy: 0.8196


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 6/14 - Loss: 2.7259 - Accuracy: 0.7626


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.1854 - Val Accuracy: 0.8003


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 7/14 - Loss: 2.6766 - Accuracy: 0.7600


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.2219 - Val Accuracy: 0.7949


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 8/14 - Loss: 2.7143 - Accuracy: 0.7672


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.3179 - Val Accuracy: 0.7935


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 9/14 - Loss: 2.6362 - Accuracy: 0.7666


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.1337 - Val Accuracy: 0.8063


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 10/14 - Loss: 2.5726 - Accuracy: 0.7728


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.3256 - Val Accuracy: 0.8043


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 11/14 - Loss: 2.6960 - Accuracy: 0.7652


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.2252 - Val Accuracy: 0.8128


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 12/14 - Loss: 2.6961 - Accuracy: 0.7637


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.0956 - Val Accuracy: 0.8019


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 13/14 - Loss: 2.7203 - Accuracy: 0.7656


  0%|          | 0/22 [00:00<?, ?it/s]

Val Loss: 2.1873 - Val Accuracy: 0.8037


Training:   0%|          | 0/175 [00:00<?, ?it/s]

Epoch: 14/14 - Loss: 2.7477 - Accuracy: 0.7620


  0%|          | 0/22 [00:00<?, ?it/s]

[32m[I 2023-11-27 17:24:04,078][0m Trial 3 finished with value: 0.8045454025268555 and parameters: {'loss_learning_rate': 0.005105394341604069, 'learning_rate': 0.011774973968694286, 'weight_decay': 0.01974560386402252, 'epsilon': 2.158051575526208e-09, 'batch_size': 80, 'epochs': 14}. Best is trial 1 with value: 0.8595185875892639.[0m


Val Loss: 2.1685 - Val Accuracy: 0.8045
Learning rate for Loss: 0.002078486291727591
Learning rate: 1.2343587217402633e-05
Weight decay: 0.033664043669104575
Epsilon: 1.57189898824515e-08
Batch size: 145
Number of epochs: 38


Epochs:   0%|          | 0/38 [00:00<?, ?it/s]

Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 1/38 - Loss: 7.7252 - Accuracy: 0.5500


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 2.8246 - Val Accuracy: 0.7692


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 2/38 - Loss: 2.9499 - Accuracy: 0.7442


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 2.1274 - Val Accuracy: 0.7894


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 3/38 - Loss: 2.6714 - Accuracy: 0.7518


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 2.0366 - Val Accuracy: 0.7920


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 4/38 - Loss: 2.5524 - Accuracy: 0.7528


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 2.0009 - Val Accuracy: 0.7942


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 5/38 - Loss: 2.5070 - Accuracy: 0.7570


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9901 - Val Accuracy: 0.7968


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 6/38 - Loss: 2.5489 - Accuracy: 0.7541


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9858 - Val Accuracy: 0.7973


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 7/38 - Loss: 2.5494 - Accuracy: 0.7530


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9802 - Val Accuracy: 0.7963


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 8/38 - Loss: 2.5436 - Accuracy: 0.7523


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9786 - Val Accuracy: 0.7973


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 9/38 - Loss: 2.4984 - Accuracy: 0.7571


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9754 - Val Accuracy: 0.7979


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 10/38 - Loss: 2.5135 - Accuracy: 0.7577


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9698 - Val Accuracy: 0.7979


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 11/38 - Loss: 2.5708 - Accuracy: 0.7540


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9694 - Val Accuracy: 0.7979


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 12/38 - Loss: 2.5355 - Accuracy: 0.7544


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9698 - Val Accuracy: 0.7984


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 13/38 - Loss: 2.5257 - Accuracy: 0.7524


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9707 - Val Accuracy: 0.7989


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 14/38 - Loss: 2.5789 - Accuracy: 0.7552


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9721 - Val Accuracy: 0.7984


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 15/38 - Loss: 2.5563 - Accuracy: 0.7556


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9738 - Val Accuracy: 0.7984


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 16/38 - Loss: 2.5321 - Accuracy: 0.7556


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9761 - Val Accuracy: 0.7979


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 17/38 - Loss: 2.4968 - Accuracy: 0.7591


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9705 - Val Accuracy: 0.7984


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 18/38 - Loss: 2.5635 - Accuracy: 0.7530


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9732 - Val Accuracy: 0.8000


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 19/38 - Loss: 2.5571 - Accuracy: 0.7585


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9721 - Val Accuracy: 0.8005


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 20/38 - Loss: 2.4628 - Accuracy: 0.7578


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9712 - Val Accuracy: 0.8011


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 21/38 - Loss: 2.4603 - Accuracy: 0.7598


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9704 - Val Accuracy: 0.8011


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 22/38 - Loss: 2.5141 - Accuracy: 0.7603


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9698 - Val Accuracy: 0.8016


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 23/38 - Loss: 2.5716 - Accuracy: 0.7586


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9653 - Val Accuracy: 0.8021


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 24/38 - Loss: 2.5697 - Accuracy: 0.7588


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9650 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 25/38 - Loss: 2.5445 - Accuracy: 0.7598


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9608 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 26/38 - Loss: 2.5265 - Accuracy: 0.7590


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9606 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 27/38 - Loss: 2.4937 - Accuracy: 0.7594


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9604 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 28/38 - Loss: 2.5681 - Accuracy: 0.7566


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9602 - Val Accuracy: 0.8032


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 29/38 - Loss: 2.5578 - Accuracy: 0.7562


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9598 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 30/38 - Loss: 2.5371 - Accuracy: 0.7567


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9598 - Val Accuracy: 0.8027


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 31/38 - Loss: 2.5385 - Accuracy: 0.7569


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9598 - Val Accuracy: 0.8037


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 32/38 - Loss: 2.5271 - Accuracy: 0.7598


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9598 - Val Accuracy: 0.8037


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 33/38 - Loss: 2.5656 - Accuracy: 0.7598


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9600 - Val Accuracy: 0.8037


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 34/38 - Loss: 2.4671 - Accuracy: 0.7554


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9640 - Val Accuracy: 0.8032


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 35/38 - Loss: 2.5276 - Accuracy: 0.7595


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9640 - Val Accuracy: 0.8032


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 36/38 - Loss: 2.5413 - Accuracy: 0.7590


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9640 - Val Accuracy: 0.8037


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 37/38 - Loss: 2.5194 - Accuracy: 0.7656


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 1.9641 - Val Accuracy: 0.8037


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 38/38 - Loss: 2.5121 - Accuracy: 0.7607


  0%|          | 0/13 [00:00<?, ?it/s]

[32m[I 2023-11-27 18:16:13,472][0m Trial 4 finished with value: 0.8037135601043701 and parameters: {'loss_learning_rate': 0.002078486291727591, 'learning_rate': 1.2343587217402633e-05, 'weight_decay': 0.033664043669104575, 'epsilon': 1.57189898824515e-08, 'batch_size': 145, 'epochs': 38}. Best is trial 1 with value: 0.8595185875892639.[0m


Val Loss: 1.9641 - Val Accuracy: 0.8037
Learning rate for Loss: 0.0004705276131049822
Learning rate: 6.938102103951862e-05
Weight decay: 0.0023123231673494145
Epsilon: 2.4452158562365622e-08
Batch size: 118
Number of epochs: 48


Epochs:   0%|          | 0/48 [00:00<?, ?it/s]

Training:   0%|          | 0/119 [00:00<?, ?it/s]

Epoch: 1/48 - Loss: 15.4977 - Accuracy: 0.2711


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2023-11-27 18:17:37,457][0m Trial 5 pruned. [0m


Val Loss: 4.8425 - Val Accuracy: 0.6447
Learning rate for Loss: 0.0001327558072769619
Learning rate: 0.08117958145129886
Weight decay: 0.012568233465178961
Epsilon: 2.3381741711891608e-07
Batch size: 145
Number of epochs: 17


Epochs:   0%|          | 0/17 [00:00<?, ?it/s]

Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 1/17 - Loss: 4.8292 - Accuracy: 0.7289


  0%|          | 0/13 [00:00<?, ?it/s]

Val Loss: 2.5963 - Val Accuracy: 0.8000


Training:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch: 2/17 - Loss: 2.9486 - Accuracy: 0.7484


  0%|          | 0/13 [00:00<?, ?it/s]

[32m[I 2023-11-27 18:20:22,517][0m Trial 6 pruned. [0m


Val Loss: 2.1971 - Val Accuracy: 0.7719
Learning rate for Loss: 0.00021864433464278268
Learning rate: 0.09280151879826815
Weight decay: 0.003976174639929885
Epsilon: 1.186486500543241e-09
Batch size: 216
Number of epochs: 23


Epochs:   0%|          | 0/23 [00:00<?, ?it/s]

Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 1/23 - Loss: 6.0000 - Accuracy: 0.6925


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.9426 - Val Accuracy: 0.8144


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 2/23 - Loss: 3.4219 - Accuracy: 0.7915


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.3568 - Val Accuracy: 0.8302


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 3/23 - Loss: 2.8051 - Accuracy: 0.7902


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0201 - Val Accuracy: 0.8092


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 4/23 - Loss: 2.6641 - Accuracy: 0.7504


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.2151 - Val Accuracy: 0.7652


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 5/23 - Loss: 2.9448 - Accuracy: 0.7094


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.3273 - Val Accuracy: 0.7599


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 6/23 - Loss: 2.7763 - Accuracy: 0.7658


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0357 - Val Accuracy: 0.8362


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 7/23 - Loss: 2.6837 - Accuracy: 0.7805


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.1192 - Val Accuracy: 0.8264


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 8/23 - Loss: 2.6434 - Accuracy: 0.7748


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 1.9960 - Val Accuracy: 0.8086


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 9/23 - Loss: 2.6944 - Accuracy: 0.7698


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0161 - Val Accuracy: 0.7964


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 10/23 - Loss: 2.6991 - Accuracy: 0.7625


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 1.9363 - Val Accuracy: 0.8239


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 11/23 - Loss: 2.6387 - Accuracy: 0.7737


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0050 - Val Accuracy: 0.8187


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 12/23 - Loss: 2.7392 - Accuracy: 0.7633


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0289 - Val Accuracy: 0.8177


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 13/23 - Loss: 2.7311 - Accuracy: 0.7722


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0172 - Val Accuracy: 0.8233


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 14/23 - Loss: 2.7281 - Accuracy: 0.7834


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 1.9654 - Val Accuracy: 0.8429


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 15/23 - Loss: 2.8331 - Accuracy: 0.7853


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.2262 - Val Accuracy: 0.8226


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 16/23 - Loss: 2.7927 - Accuracy: 0.7936


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.1730 - Val Accuracy: 0.8217


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 17/23 - Loss: 2.7149 - Accuracy: 0.8056


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.0930 - Val Accuracy: 0.8420


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 18/23 - Loss: 2.7520 - Accuracy: 0.8054


  0%|          | 0/9 [00:00<?, ?it/s]

Val Loss: 2.1365 - Val Accuracy: 0.8370


Training:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch: 19/23 - Loss: 2.7455 - Accuracy: 0.8004


  0%|          | 0/9 [00:00<?, ?it/s]

[32m[I 2023-11-27 18:46:31,916][0m Trial 7 pruned. [0m


Val Loss: 1.9965 - Val Accuracy: 0.8364
Learning rate for Loss: 0.0048218331409497495
Learning rate: 0.00018752660636600846
Weight decay: 0.0020822059185875976
Epsilon: 3.550395771074807e-08
Batch size: 183
Number of epochs: 32


Epochs:   0%|          | 0/32 [00:00<?, ?it/s]

Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 1/32 - Loss: 6.8267 - Accuracy: 0.7108


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 4.3433 - Val Accuracy: 0.8348


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 2/32 - Loss: 4.9714 - Accuracy: 0.8029


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 4.1280 - Val Accuracy: 0.8256


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 3/32 - Loss: 4.8144 - Accuracy: 0.7969


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.9718 - Val Accuracy: 0.8294


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 4/32 - Loss: 4.8179 - Accuracy: 0.7961


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.8708 - Val Accuracy: 0.8331


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 5/32 - Loss: 4.5019 - Accuracy: 0.7984


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.7899 - Val Accuracy: 0.8337


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 6/32 - Loss: 4.4878 - Accuracy: 0.8008


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.7228 - Val Accuracy: 0.8331


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 7/32 - Loss: 4.4549 - Accuracy: 0.7948


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.6660 - Val Accuracy: 0.8331


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 8/32 - Loss: 4.2058 - Accuracy: 0.7995


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.6150 - Val Accuracy: 0.8331


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 9/32 - Loss: 4.2495 - Accuracy: 0.7961


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.5744 - Val Accuracy: 0.8326


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 10/32 - Loss: 4.1821 - Accuracy: 0.7978


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.5328 - Val Accuracy: 0.8321


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 11/32 - Loss: 4.2070 - Accuracy: 0.7983


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.4954 - Val Accuracy: 0.8316


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 12/32 - Loss: 4.1710 - Accuracy: 0.7956


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.4605 - Val Accuracy: 0.8305


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 13/32 - Loss: 3.9759 - Accuracy: 0.8012


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.4331 - Val Accuracy: 0.8299


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 14/32 - Loss: 3.9591 - Accuracy: 0.7981


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.4115 - Val Accuracy: 0.8305


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 15/32 - Loss: 3.9687 - Accuracy: 0.8008


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.3875 - Val Accuracy: 0.8310


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 16/32 - Loss: 3.9467 - Accuracy: 0.7980


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.3613 - Val Accuracy: 0.8310


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 17/32 - Loss: 3.9815 - Accuracy: 0.8000


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.3353 - Val Accuracy: 0.8310


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 18/32 - Loss: 3.8425 - Accuracy: 0.8002


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.3104 - Val Accuracy: 0.8310


Training:   0%|          | 0/77 [00:00<?, ?it/s]

Epoch: 19/32 - Loss: 3.8855 - Accuracy: 0.7997


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2023-11-27 19:12:54,803][0m Trial 8 pruned. [0m


Val Loss: 3.2870 - Val Accuracy: 0.8310
Learning rate for Loss: 0.008460081114259922
Learning rate: 0.005506471740230944
Weight decay: 0.0023352866812973586
Epsilon: 7.115835655333167e-08
Batch size: 193
Number of epochs: 33


Epochs:   0%|          | 0/33 [00:00<?, ?it/s]

Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 1/33 - Loss: 6.2427 - Accuracy: 0.7518


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.8656 - Val Accuracy: 0.8429


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 2/33 - Loss: 4.7446 - Accuracy: 0.8114


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.4337 - Val Accuracy: 0.8517


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 3/33 - Loss: 4.5210 - Accuracy: 0.8136


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.2481 - Val Accuracy: 0.8538


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 4/33 - Loss: 4.2664 - Accuracy: 0.8147


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 3.0911 - Val Accuracy: 0.8543


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 5/33 - Loss: 3.9788 - Accuracy: 0.8173


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.9479 - Val Accuracy: 0.8543


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 6/33 - Loss: 3.8283 - Accuracy: 0.8195


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.8634 - Val Accuracy: 0.8569


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 7/33 - Loss: 3.7267 - Accuracy: 0.8214


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.7864 - Val Accuracy: 0.8564


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 8/33 - Loss: 3.7256 - Accuracy: 0.8199


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.7348 - Val Accuracy: 0.8590


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 9/33 - Loss: 3.5967 - Accuracy: 0.8199


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.6883 - Val Accuracy: 0.8585


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 10/33 - Loss: 3.4290 - Accuracy: 0.8233


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.6267 - Val Accuracy: 0.8574


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 11/33 - Loss: 3.3761 - Accuracy: 0.8240


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.6156 - Val Accuracy: 0.8569


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 12/33 - Loss: 3.4078 - Accuracy: 0.8186


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.5794 - Val Accuracy: 0.8616


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 13/33 - Loss: 3.2269 - Accuracy: 0.8265


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.5662 - Val Accuracy: 0.8631


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 14/33 - Loss: 3.2941 - Accuracy: 0.8241


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.5426 - Val Accuracy: 0.8746


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 15/33 - Loss: 3.2967 - Accuracy: 0.8260


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.5337 - Val Accuracy: 0.8725


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 16/33 - Loss: 3.1050 - Accuracy: 0.8280


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.5166 - Val Accuracy: 0.8741


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 17/33 - Loss: 3.2314 - Accuracy: 0.8275


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4951 - Val Accuracy: 0.8720


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 18/33 - Loss: 3.2188 - Accuracy: 0.8241


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4937 - Val Accuracy: 0.8689


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 19/33 - Loss: 3.2473 - Accuracy: 0.8215


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4714 - Val Accuracy: 0.8699


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 20/33 - Loss: 3.1486 - Accuracy: 0.8236


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4607 - Val Accuracy: 0.8746


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 21/33 - Loss: 3.1593 - Accuracy: 0.8236


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4473 - Val Accuracy: 0.8731


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 22/33 - Loss: 3.0333 - Accuracy: 0.8283


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4382 - Val Accuracy: 0.8705


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 23/33 - Loss: 3.1054 - Accuracy: 0.8252


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4353 - Val Accuracy: 0.8710


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 24/33 - Loss: 3.1612 - Accuracy: 0.8237


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4169 - Val Accuracy: 0.8705


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 25/33 - Loss: 3.0831 - Accuracy: 0.8229


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.4077 - Val Accuracy: 0.8715


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 26/33 - Loss: 3.0322 - Accuracy: 0.8228


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3923 - Val Accuracy: 0.8720


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 27/33 - Loss: 3.0597 - Accuracy: 0.8276


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3817 - Val Accuracy: 0.8725


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 28/33 - Loss: 3.0261 - Accuracy: 0.8238


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3811 - Val Accuracy: 0.8751


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 29/33 - Loss: 2.9526 - Accuracy: 0.8310


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3833 - Val Accuracy: 0.8746


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 30/33 - Loss: 3.0965 - Accuracy: 0.8245


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3813 - Val Accuracy: 0.8741


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 31/33 - Loss: 2.9405 - Accuracy: 0.8303


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3699 - Val Accuracy: 0.8720


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 32/33 - Loss: 2.9861 - Accuracy: 0.8245


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3730 - Val Accuracy: 0.8720


Training:   0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 33/33 - Loss: 3.1101 - Accuracy: 0.8186


  0%|          | 0/10 [00:00<?, ?it/s]

Val Loss: 2.3584 - Val Accuracy: 0.8725
Saving best model...


[32m[I 2023-11-27 19:58:30,086][0m Trial 9 finished with value: 0.8725388646125793 and parameters: {'loss_learning_rate': 0.008460081114259922, 'learning_rate': 0.005506471740230944, 'weight_decay': 0.0023352866812973586, 'epsilon': 7.115835655333167e-08, 'batch_size': 193, 'epochs': 33}. Best is trial 9 with value: 0.8725388646125793.[0m



Study statistics: 
  Number of finished trials:  10
  Number of pruned trials:  4
  Number of complete trials:  6


In [19]:
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value:  0.8725388646125793
  Params: 
    batch_size: 193
    epochs: 33
    epsilon: 7.115835655333167e-08
    learning_rate: 0.005506471740230944
    loss_learning_rate: 0.008460081114259922
    weight_decay: 0.0023352866812973586


In [None]:
# ViT P12-S8 CosFace

Best trial:
Value:  0.8658874034881592
Params: 
batch_size: 194
epochs: 29
epsilon: 1.36747634297886e-09
learning_rate: 0.0004065644431783593
loss_learning_rate: 0.00311170833117293
weight_decay: 0.004542522877662855

In [None]:
# ViT P12-S8 ArcFace

Best trial:
Value:  0.8725388646125793
Params: 
batch_size: 193
epochs: 33
epsilon: 7.115835655333167e-08
learning_rate: 0.005506471740230944
loss_learning_rate: 0.008460081114259922
weight_decay: 0.0023352866812973586