## Imports

In [1]:
import os
import torch
import torch.nn as nn
import wandb
import numpy as np
import torchvision.transforms

from mmcv_csn import ResNet3dCSN
from csn import csn50
from i3d_head import I3DHead
from autoencoder import EncoderDecoder
from depth_head import DepthHead
from scheduler import GradualWarmupScheduler
from mmaction.datasets import build_dataset

# Move the working directory up one level; the relative paths used later in this
# notebook ('data/...', 'work_dirs/...') are resolved against the new directory.
# NOTE(review): a relative chdir is not idempotent - re-running this cell in the
# same kernel moves up one more directory each time.
os.chdir('../')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start the Weights & Biases run that the training loop's wandb.log calls report to.
wandb.init(entity="cares", project="autoencoder",
           group="wlasl-10", name="depth")

[34m[1mwandb[0m: Currently logged in as: [33msttaseen[0m ([33mcares[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Device Agnostic Code

In [3]:
# Set up device agnostic code.
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU. The MPS backend is
# looked up with getattr so torch builds that lack `torch.backends.mps` fall through
# cleanly instead of relying on a bare `except`.
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

## Dataset

In [4]:
# Training dataset description handed to build_dataset(): raw-frame clips plus
# the augmentation / formatting pipeline applied to every sample.
train_cfg = {
    'type': 'RawframeDataset',
    'ann_file': 'data/wlasl10/train_annotations.txt',
    'data_prefix': 'data/wlasl10/rawframes',
    'pipeline': [
        # Frame sampling and decoding
        {
            'type': 'SampleFrames',
            'clip_len': 32,
            'frame_interval': 2,
            'num_clips': 1,
        },
        {'type': 'RawFrameDecode'},
        # Spatial augmentation, ending at a fixed 224x224 input
        {'type': 'Resize', 'scale': (-1, 256)},
        {'type': 'RandomResizedCrop'},
        {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
        {'type': 'Flip', 'flip_ratio': 0.5},
        # Per-channel normalisation
        {
            'type': 'Normalize',
            'mean': [123.675, 116.28, 103.53],
            'std': [58.395, 57.12, 57.375],
            'to_bgr': False,
        },
        # Layout + packaging of the keys consumed by the training loop
        {'type': 'FormatShape', 'input_format': 'NCTHW'},
        {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
        {'type': 'ToTensor', 'keys': ['imgs', 'label']},
    ],
}


# Evaluation dataset description handed to build_dataset(): same frame sampling
# as training (with test_mode=True) but a fixed centre crop instead of random
# augmentation.
test_cfg = {
    'type': 'RawframeDataset',
    'ann_file': 'data/wlasl10/test_annotations.txt',
    'data_prefix': 'data/wlasl10/rawframes',
    'pipeline': [
        # Frame sampling and decoding
        {
            'type': 'SampleFrames',
            'clip_len': 32,
            'frame_interval': 2,
            'num_clips': 1,
            'test_mode': True,
        },
        {'type': 'RawFrameDecode'},
        # Fixed spatial preprocessing
        {'type': 'Resize', 'scale': (-1, 256)},
        {'type': 'CenterCrop', 'crop_size': 224},
        # Per-channel normalisation (same statistics as the training config)
        {
            'type': 'Normalize',
            'mean': [123.675, 116.28, 103.53],
            'std': [58.395, 57.12, 57.375],
            'to_bgr': False,
        },
        # Layout + packaging; only 'imgs' goes through ToTensor here
        {'type': 'FormatShape', 'input_format': 'NCTHW'},
        {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
        {'type': 'ToTensor', 'keys': ['imgs']},
    ],
}

In [5]:
# Directory that receives the checkpoints written by the training loop.
# The trailing slash matters: the loop builds checkpoint paths from this string.
work_dir = 'work_dirs/wlasl10-depth-dataset/'

# exist_ok=True keeps the cell safe to re-run.
os.makedirs(work_dir, exist_ok=True)

In [6]:
# Building the datasets

batch_size = 2

train_dataset = build_dataset(train_cfg)
test_dataset = build_dataset(test_cfg)

# Setting up dataloaders
# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=4,
                                    pin_memory=True)

# Evaluation iterates one sample at a time in a fixed order: shuffling gives no
# benefit for metrics averaged over the whole set and only makes per-sample
# debugging and run-to-run comparison harder.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                    batch_size=1,
                                    shuffle=False,
                                    num_workers=4,
                                    pin_memory=True)

## Model

In [7]:
# Create a CSN model
# Encoder: ResNet3dCSN backbone configured with a checkpoint URL; batch-norm
# related flags (norm_eval / bn_frozen) are switched on.
encoder = ResNet3dCSN(
    pretrained2d=False,
    # pretrained=None,
    pretrained='https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth',
    depth=50,
    with_pool2=False,
    bottleneck_mode='ir',
    norm_eval=True,
    zero_init_residual=False,
    bn_frozen=True
)

# Presumably loads the `pretrained` checkpoint configured above - TODO confirm
# against mmcv_csn.ResNet3dCSN.init_weights.
encoder.init_weights()

# Auxiliary head whose output is trained against the MiDaS depth estimate.
depth_head = DepthHead()

# Classification head.
# NOTE(review): the dataset paths and W&B group refer to "wlasl10", yet the head
# is built with num_classes=400 (the first logged loss_cls of ~6.0 is about
# ln(400)). Confirm whether 400 is intentional before changing it, since it
# alters the checkpoint layout.
decoder = I3DHead(num_classes=400,
                 in_channels=2048,
                 spatial_type='avg',
                 dropout_ratio=0.5,
                 init_std=0.01)

decoder.init_weights()

# Combined model; the training loop unpacks its output as (cls_score, predicted_depth).
model = EncoderDecoder(encoder, decoder, depth_head)

### Setup MiDaS

In [8]:
# Set up MiDaS depth model
# Exactly one model_type line should be active; the alternatives are kept for reference.
model_type = "DPT_Large"     # MiDaS v3 - Large     (highest accuracy, slowest inference speed)
# model_type = "DPT_Hybrid"   # MiDaS v3 - Hybrid    (medium accuracy, medium inference speed)
# model_type = "MiDaS_small"  # MiDaS v2.1 - Small   (lowest accuracy, highest inference speed)

# Fetches the model through torch.hub (uses the local hub cache when present).
midas = torch.hub.load("intel-isl/MiDaS", model_type)
midas.to(device)
# Inference mode for the depth model. As the last expression of the cell this
# also echoes the full module repr into the cell output.
midas.eval()

Using cache found in /home/sadat/.cache/torch/hub/intel-isl_MiDaS_master


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (act): GELU()
            (drop1): Dropout(p=0

In [9]:
def estimate_depth(images):
    """Produce per-frame depth targets for a batch of clips using MiDaS.

    The clip length and spatial size are read from ``images`` instead of being
    fixed to 32 frames of 224x224, so the function keeps working if the
    dataset pipeline is changed.

    Args:
        images (torch.Tensor): Clips laid out as (N, C, T, H, W), matching the
            'NCTHW' FormatShape step of the dataset pipeline. MiDaS is fed
            (N*T, C, H, W) frames, so C is expected to be 3.
    Returns:
        torch.Tensor: Depth maps of shape (N, 1, T, H, W), computed without
        gradient tracking.
    """
    batch, channels, frames, height, width = images.shape

    with torch.no_grad():
        # Fold time into the batch axis so MiDaS sees ordinary 2D images.
        flat_frames = images.permute(0, 2, 1, 3, 4).reshape(
            -1, channels, height, width)
        depth = midas(flat_frames)

        # Resample the prediction to the input resolution. Only the channel
        # axis added by unsqueeze(1) is squeezed back out, so a batch or
        # spatial axis of size 1 is never dropped by accident.
        depth = torch.nn.functional.interpolate(
            depth.unsqueeze(1),
            size=(height, width),
            mode="bicubic",
            align_corners=False,
        ).squeeze(1)

    return depth.reshape(batch, 1, frames, height, width)

## Optimizer

In [10]:
# Specify optimizer
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.000125, momentum=0.9, weight_decay=0.00001)

# Specify Loss
# Cross-entropy for the classification output, MSE for the depth output; the
# training and validation code combines them with weights 0.8 / 0.2.
loss_cls = nn.CrossEntropyLoss()
loss_depth = nn.MSELoss()

# Specify total epochs
epochs = 100

# Specify learning rate scheduler
# NOTE(review): `lr_scheduler` is not referenced anywhere else in the visible
# notebook; the training loop steps `scheduler` below. Confirm it can be removed.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=120, gamma=0.1)

# Schedule actually used: warmup wrapper (total_epoch=16) that hands over to a
# MultiStepLR with decay factor 0.1 at its milestones.
scheduler_steplr = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[34, 84], gamma=0.1)
scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=16, after_scheduler=scheduler_steplr)

# As the last expression of the cell this also echoes the model repr into the output.
model.to(device)

EncoderDecoder(
  (encoder): ResNet3dCSN(
    (conv1): ConvModule(
      (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
      (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (activate): ReLU(inplace=True)
    )
    (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
    (pool2): MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=0, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): CSNBottleneck3d(
        (conv1): ConvModule(
          (conv): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (activate): ReLU(inplace=True)
        )
        (conv2): Sequential(
          (0): ConvModule(
            (conv): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=64, bi

In [11]:
# x0=torch.rand((1,64,32,112,112)).to(device)
# x1=torch.rand((1,256,32,56,56)).to(device)
# x2=torch.rand((1,512,16,28,28)).to(device)
# x3=torch.rand((1,1024,8,14,14)).to(device)
# x4=torch.rand((1,2048,4,7,7)).to(device)
# x = (x0,x1,x2,x3,x4)

In [12]:
# depth_head(x).squeeze().shape

## Train Loop

In [13]:
# Setup wandb
# wandb.watch(model, log_freq=10)

def top_k_accuracy(scores, labels, topk=(1, )):
    """Compute the fraction of samples whose label is among the k best scores.

    Args:
        scores (list[np.ndarray]): One row of class scores per sample.
        labels (list[int]): Ground-truth class index per sample.
        topk (tuple[int]): The k values to evaluate. Default: (1, ).
    Returns:
        np.ndarray: One accuracy value per entry of ``topk``, in the same order.
    """
    # Column vector so it broadcasts against the (num_samples, k) prediction block.
    label_column = np.array(labels)[:, np.newaxis]
    # Ascending sort of class indices by score; the best classes sit at the end.
    ranked = np.argsort(scores, axis=1)

    accuracies = np.zeros(len(topk))
    for slot, k in enumerate(topk):
        best_k = ranked[:, -k:][:, ::-1]
        hits = np.logical_or.reduce(best_k == label_column, axis=1)
        accuracies[slot] = hits.sum() / hits.shape[0]

    return accuracies


def train_one_epoch(epoch_index, interval=5):
    """Run one training epoch over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``loss_cls``, ``loss_depth``, ``device`` and ``estimate_depth``.

    Args:
        epoch_index (int): Current epoch; only used in the log line.
        interval (int): Number of batches between log lines.
    Returns:
        tuple: ``(last_loss, lr)``. ``last_loss`` is the mean combined loss of
        the most recent complete window of ``interval`` batches (0. when no
        window completed); it is not an average over the whole epoch.
        ``lr`` is ``scheduler.get_last_lr()[0]``.
    """
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(train_loader) instead of
    # iter(train_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, x in enumerate(train_loader):
        # Every data instance is an input + label pair
        images, targets = x['imgs'].to(device), x['label'].to(device)
        # Merge the two leading axes of the loaded tensor into one
        # (presumably batch x num_clips - TODO confirm against the dataset).
        images = images.reshape((-1, ) + images.shape[2:])
        targets = targets.reshape(-1, )
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        cls_score, predicted_depth = model(images)
        
        # Estimate depth using MiDaS
        # (target for the depth output; estimate_depth runs under no_grad)
        depth = estimate_depth(images)
        
        # Get losses
        # Combined objective: 0.8 * classification + 0.2 * depth regression
        loss_cls_score = loss_cls(cls_score, targets)
        loss_depth_score = loss_depth(predicted_depth, depth)
        loss = 0.8 * loss_cls_score + 0.2 * loss_depth_score
        
        # Compute the loss and its gradients
        loss.backward()

        # Gradient Clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=40, norm_type=2.0)

        # Adjust learning weights
        optimizer.step()
        

        # Gather data and report
        running_loss += loss.item()
        if i % interval == interval-1:
            last_loss = running_loss / interval  # loss per batch
            # loss_cls / depth_loss are the values of the current batch only;
            # `loss` is the mean over the last `interval` batches.
            print(
                f'Epoch [{epoch_index}][{i+1}/{len(train_loader)}], loss_cls: {loss_cls_score.item():.5}, depth_loss: {loss_depth_score.item():.5} lr: {scheduler.get_last_lr()[0]:.5e}, loss: {last_loss:.5}')
            running_loss = 0.

    return last_loss, scheduler.get_last_lr()[0]


def validate():
    """Evaluate the model on every batch of ``test_loader``.

    Uses the notebook-level ``model``, ``loss_cls``, ``loss_depth``,
    ``device``, ``estimate_depth`` and ``top_k_accuracy``.

    Returns:
        tuple: ``(avg_vloss, top1_acc, top5_acc)`` where ``avg_vloss`` is the
        combined loss (0.8 * classification + 0.2 * depth) averaged over all
        batches as a Python float, and the accuracies are per-batch top-1 /
        top-5 accuracies averaged over all batches, in decimal.
    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    running_vloss = 0.0
    running_vacc = np.zeros(2)
    num_batches = 0

    print('Evaluating top_k_accuracy...')

    with torch.inference_mode():
        for x in test_loader:
            vimages, vtargets = x['imgs'].to(device), x['label'].to(device)
            # Merge the two leading axes, mirroring the training loop.
            vimages = vimages.reshape((-1, ) + vimages.shape[2:])
            vtargets = vtargets.reshape(-1, )

            # Make predictions for this batch
            cls_score, predicted_depth = model(vimages)

            # Estimate depth using MiDaS
            depth = estimate_depth(vimages)

            # Get losses
            loss_cls_score = loss_cls(cls_score, vtargets)
            loss_depth_score = loss_depth(predicted_depth, depth)
            vloss = 0.8 * loss_cls_score + 0.2 * loss_depth_score

            # .item() keeps the accumulator a plain float instead of a tensor
            # living on the training device, so callers can compare, format
            # and log the result without touching the device again.
            running_vloss += vloss.item()

            running_vacc += top_k_accuracy(cls_score.detach().cpu().numpy(),
                                           vtargets.detach().cpu().numpy(), topk=(1, 5))
            num_batches += 1

    if num_batches == 0:
        raise ValueError('test_loader produced no batches; nothing to validate')

    # One divisor for both the loss and the accuracies.
    avg_vloss = running_vloss / num_batches

    acc = running_vacc / num_batches
    top1_acc = acc[0].item()
    top5_acc = acc[1].item()

    return (avg_vloss, top1_acc, top5_acc)


# Train Loop
# Start from +inf so the first epoch always produces a checkpoint, whatever the
# scale of the loss.
best_vloss = float('inf')

# NOTE(review): the captured log shows lr: 0.00000e+00 for every batch of epoch 1,
# i.e. with this warmup configuration the first epoch runs at zero learning rate.
# Confirm whether the warmup should start from a non-zero value.
for epoch in range(epochs):
    # Training mode for the optimisation pass
    model.train()
    avg_loss, learning_rate = train_one_epoch(epoch+1)

    # Evaluation mode for reporting
    model.eval()

    avg_vloss, top1_acc, top5_acc = validate()
    # Plain float for comparison, formatting and logging, even if validate()
    # hands back a 0-dim tensor.
    avg_vloss = float(avg_vloss)

    print(
        f'top1_acc: {top1_acc:.4}, top5_acc: {top5_acc:.4}, train_loss: {avg_loss:.5}, val_loss: {avg_vloss:.5}')

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        # os.path.join works whether or not work_dir ends with a separator.
        model_path = os.path.join(work_dir, f'epoch_{epoch+1}.pth')
        print(f'Saving checkpoint at {epoch+1} epochs...')
        torch.save(model.state_dict(), model_path)

    # Adjust learning rate (once per epoch, after the optimizer steps)
    scheduler.step()

    # Track wandb
    wandb.log({'train/loss': avg_loss,
               'train/learning_rate': learning_rate,
               'val/loss': avg_vloss,
               'val/top1_accuracy': top1_acc,
               'val/top5_accuracy': top5_acc})

Epoch [1][5/100], loss_cls: 5.957, depth_loss: 185.78 lr: 0.00000e+00, loss: 61.723
Epoch [1][10/100], loss_cls: 6.0782, depth_loss: 247.7 lr: 0.00000e+00, loss: 49.425
Epoch [1][15/100], loss_cls: 6.0302, depth_loss: 149.94 lr: 0.00000e+00, loss: 60.489
Epoch [1][20/100], loss_cls: 5.8484, depth_loss: 188.43 lr: 0.00000e+00, loss: 53.803
Epoch [1][25/100], loss_cls: 6.1516, depth_loss: 315.83 lr: 0.00000e+00, loss: 65.932
Epoch [1][30/100], loss_cls: 6.1372, depth_loss: 263.03 lr: 0.00000e+00, loss: 64.128
Epoch [1][35/100], loss_cls: 6.1178, depth_loss: 199.56 lr: 0.00000e+00, loss: 59.029
Epoch [1][40/100], loss_cls: 5.7811, depth_loss: 298.41 lr: 0.00000e+00, loss: 63.43
Epoch [1][45/100], loss_cls: 5.9172, depth_loss: 311.47 lr: 0.00000e+00, loss: 70.877
Epoch [1][50/100], loss_cls: 5.9634, depth_loss: 166.38 lr: 0.00000e+00, loss: 49.024
Epoch [1][55/100], loss_cls: 5.9322, depth_loss: 218.07 lr: 0.00000e+00, loss: 62.312
Epoch [1][60/100], loss_cls: 6.1725, depth_loss: 334.26 lr

Epoch [5][55/100], loss_cls: 3.2985, depth_loss: 121.6 lr: 3.12500e-05, loss: 23.588
Epoch [5][60/100], loss_cls: 3.2606, depth_loss: 101.36 lr: 3.12500e-05, loss: 21.682
Epoch [5][65/100], loss_cls: 2.7699, depth_loss: 64.207 lr: 3.12500e-05, loss: 21.874
Epoch [5][70/100], loss_cls: 2.3693, depth_loss: 87.796 lr: 3.12500e-05, loss: 19.021
Epoch [5][75/100], loss_cls: 2.1037, depth_loss: 49.692 lr: 3.12500e-05, loss: 15.965
Epoch [5][80/100], loss_cls: 2.4505, depth_loss: 93.658 lr: 3.12500e-05, loss: 20.284
Epoch [5][85/100], loss_cls: 4.1476, depth_loss: 44.358 lr: 3.12500e-05, loss: 20.281
Epoch [5][90/100], loss_cls: 2.4698, depth_loss: 34.601 lr: 3.12500e-05, loss: 12.501
Epoch [5][95/100], loss_cls: 3.0408, depth_loss: 66.134 lr: 3.12500e-05, loss: 17.334
Epoch [5][100/100], loss_cls: 1.7856, depth_loss: 46.679 lr: 3.12500e-05, loss: 14.633
Evaluating top_k_accuracy...
top1_acc: 0.1143, top5_acc: 0.6286, train_loss: 14.633, val_loss: 11.365
Saving checkpoint at 5 epochs...
Epoch

top1_acc: 0.1429, top5_acc: 0.4857, train_loss: 6.8287, val_loss: 5.8968
Saving checkpoint at 9 epochs...
Epoch [10][5/100], loss_cls: 1.5058, depth_loss: 24.909 lr: 7.03125e-05, loss: 5.9367
Epoch [10][10/100], loss_cls: 1.5175, depth_loss: 29.17 lr: 7.03125e-05, loss: 6.1134
Epoch [10][15/100], loss_cls: 2.2007, depth_loss: 33.926 lr: 7.03125e-05, loss: 7.8609
Epoch [10][20/100], loss_cls: 2.9843, depth_loss: 15.989 lr: 7.03125e-05, loss: 7.2094
Epoch [10][25/100], loss_cls: 2.7221, depth_loss: 39.553 lr: 7.03125e-05, loss: 6.7311
Epoch [10][30/100], loss_cls: 2.4088, depth_loss: 17.656 lr: 7.03125e-05, loss: 5.8487
Epoch [10][35/100], loss_cls: 3.2663, depth_loss: 26.673 lr: 7.03125e-05, loss: 8.4801
Epoch [10][40/100], loss_cls: 1.906, depth_loss: 24.622 lr: 7.03125e-05, loss: 6.7431
Epoch [10][45/100], loss_cls: 2.8371, depth_loss: 23.992 lr: 7.03125e-05, loss: 6.9391
Epoch [10][50/100], loss_cls: 0.71914, depth_loss: 23.455 lr: 7.03125e-05, loss: 6.2521
Epoch [10][55/100], loss_c

Epoch [14][45/100], loss_cls: 1.6939, depth_loss: 11.77 lr: 1.01563e-04, loss: 6.3929
Epoch [14][50/100], loss_cls: 3.3997, depth_loss: 31.633 lr: 1.01563e-04, loss: 6.9439
Epoch [14][55/100], loss_cls: 2.132, depth_loss: 14.638 lr: 1.01563e-04, loss: 6.5775
Epoch [14][60/100], loss_cls: 1.5259, depth_loss: 55.927 lr: 1.01563e-04, loss: 7.6575
Epoch [14][65/100], loss_cls: 0.0015866, depth_loss: 19.757 lr: 1.01563e-04, loss: 5.9938
Epoch [14][70/100], loss_cls: 4.3467, depth_loss: 11.743 lr: 1.01563e-04, loss: 5.0883
Epoch [14][75/100], loss_cls: 4.1739, depth_loss: 26.133 lr: 1.01563e-04, loss: 8.1704
Epoch [14][80/100], loss_cls: 1.5411, depth_loss: 41.423 lr: 1.01563e-04, loss: 5.7956
Epoch [14][85/100], loss_cls: 2.4287, depth_loss: 43.748 lr: 1.01563e-04, loss: 9.3122
Epoch [14][90/100], loss_cls: 1.0251, depth_loss: 27.085 lr: 1.01563e-04, loss: 5.0756
Epoch [14][95/100], loss_cls: 5.7041, depth_loss: 34.195 lr: 1.01563e-04, loss: 7.5132
Epoch [14][100/100], loss_cls: 1.5301, dep

Epoch [18][90/100], loss_cls: 4.7135, depth_loss: 31.027 lr: 1.25000e-04, loss: 7.9469
Epoch [18][95/100], loss_cls: 2.9992, depth_loss: 18.535 lr: 1.25000e-04, loss: 5.8633
Epoch [18][100/100], loss_cls: 3.1691, depth_loss: 41.909 lr: 1.25000e-04, loss: 7.4427
Evaluating top_k_accuracy...
top1_acc: 0.3714, top5_acc: 0.9143, train_loss: 7.4427, val_loss: 4.7797
Epoch [19][5/100], loss_cls: 3.8984, depth_loss: 19.905 lr: 1.25000e-04, loss: 6.2731
Epoch [19][10/100], loss_cls: 2.4005, depth_loss: 17.954 lr: 1.25000e-04, loss: 4.8124
Epoch [19][15/100], loss_cls: 0.0080468, depth_loss: 36.803 lr: 1.25000e-04, loss: 4.9173
Epoch [19][20/100], loss_cls: 2.0369, depth_loss: 17.396 lr: 1.25000e-04, loss: 5.9393
Epoch [19][25/100], loss_cls: 1.3523, depth_loss: 27.371 lr: 1.25000e-04, loss: 4.1416
Epoch [19][30/100], loss_cls: 2.2661, depth_loss: 10.254 lr: 1.25000e-04, loss: 4.7529
Epoch [19][35/100], loss_cls: 2.0465, depth_loss: 67.635 lr: 1.25000e-04, loss: 7.8822
Epoch [19][40/100], loss_

Epoch [23][30/100], loss_cls: 0.0031927, depth_loss: 17.74 lr: 1.25000e-04, loss: 4.0174
Epoch [23][35/100], loss_cls: 2.2993, depth_loss: 35.114 lr: 1.25000e-04, loss: 7.6215
Epoch [23][40/100], loss_cls: 1.2019, depth_loss: 20.958 lr: 1.25000e-04, loss: 5.8886
Epoch [23][45/100], loss_cls: 0.0087104, depth_loss: 12.222 lr: 1.25000e-04, loss: 4.0089
Epoch [23][50/100], loss_cls: 2.1741, depth_loss: 37.749 lr: 1.25000e-04, loss: 6.1134
Epoch [23][55/100], loss_cls: 2.0446, depth_loss: 26.205 lr: 1.25000e-04, loss: 5.9352
Epoch [23][60/100], loss_cls: 1.1954, depth_loss: 9.8482 lr: 1.25000e-04, loss: 5.8075
Epoch [23][65/100], loss_cls: 0.11137, depth_loss: 15.175 lr: 1.25000e-04, loss: 5.2311
Epoch [23][70/100], loss_cls: 1.6779, depth_loss: 17.928 lr: 1.25000e-04, loss: 4.3752
Epoch [23][75/100], loss_cls: 1.673, depth_loss: 28.967 lr: 1.25000e-04, loss: 6.6553
Epoch [23][80/100], loss_cls: 0.040073, depth_loss: 29.786 lr: 1.25000e-04, loss: 5.0371
Epoch [23][85/100], loss_cls: 0.6781

Epoch [27][75/100], loss_cls: 0.30047, depth_loss: 16.621 lr: 1.25000e-04, loss: 3.6252
Epoch [27][80/100], loss_cls: 3.0388, depth_loss: 12.62 lr: 1.25000e-04, loss: 4.9626
Epoch [27][85/100], loss_cls: 0.53315, depth_loss: 30.639 lr: 1.25000e-04, loss: 3.9128
Epoch [27][90/100], loss_cls: 0.0, depth_loss: 17.083 lr: 1.25000e-04, loss: 4.5771
Epoch [27][95/100], loss_cls: 0.1318, depth_loss: 7.9498 lr: 1.25000e-04, loss: 4.0028
Epoch [27][100/100], loss_cls: 0.26272, depth_loss: 15.392 lr: 1.25000e-04, loss: 3.308
Evaluating top_k_accuracy...
top1_acc: 0.6857, top5_acc: 0.9429, train_loss: 3.308, val_loss: 3.4711
Epoch [28][5/100], loss_cls: 0.38823, depth_loss: 26.878 lr: 1.25000e-04, loss: 4.2351
Epoch [28][10/100], loss_cls: 1.6172, depth_loss: 12.714 lr: 1.25000e-04, loss: 3.6093
Epoch [28][15/100], loss_cls: 0.39687, depth_loss: 15.944 lr: 1.25000e-04, loss: 4.1846
Epoch [28][20/100], loss_cls: 0.73271, depth_loss: 16.063 lr: 1.25000e-04, loss: 4.3666
Epoch [28][25/100], loss_cls

Epoch [32][15/100], loss_cls: 0.4448, depth_loss: 22.128 lr: 1.25000e-04, loss: 3.6094
Epoch [32][20/100], loss_cls: 0.57122, depth_loss: 22.011 lr: 1.25000e-04, loss: 3.9097
Epoch [32][25/100], loss_cls: 1.0237, depth_loss: 22.431 lr: 1.25000e-04, loss: 4.9831
Epoch [32][30/100], loss_cls: 1.5649, depth_loss: 18.895 lr: 1.25000e-04, loss: 3.1251
Epoch [32][35/100], loss_cls: 5.5399, depth_loss: 10.096 lr: 1.25000e-04, loss: 5.3715
Epoch [32][40/100], loss_cls: 0.94701, depth_loss: 12.815 lr: 1.25000e-04, loss: 5.1202
Epoch [32][45/100], loss_cls: 0.53716, depth_loss: 22.434 lr: 1.25000e-04, loss: 4.0505
Epoch [32][50/100], loss_cls: 3.889, depth_loss: 18.2 lr: 1.25000e-04, loss: 4.9014
Epoch [32][55/100], loss_cls: 0.77809, depth_loss: 21.455 lr: 1.25000e-04, loss: 4.7031
Epoch [32][60/100], loss_cls: 1.042, depth_loss: 51.84 lr: 1.25000e-04, loss: 4.696
Epoch [32][65/100], loss_cls: 1.3104, depth_loss: 9.5162 lr: 1.25000e-04, loss: 3.9251
Epoch [32][70/100], loss_cls: 0.724, depth_lo

Epoch [36][55/100], loss_cls: 0.010228, depth_loss: 20.679 lr: 1.25000e-04, loss: 4.3693
Epoch [36][60/100], loss_cls: 2.9626, depth_loss: 18.024 lr: 1.25000e-04, loss: 4.9009
Epoch [36][65/100], loss_cls: 0.40098, depth_loss: 18.87 lr: 1.25000e-04, loss: 4.7365
Epoch [36][70/100], loss_cls: 0.84873, depth_loss: 28.42 lr: 1.25000e-04, loss: 4.7354
Epoch [36][75/100], loss_cls: 0.018322, depth_loss: 14.841 lr: 1.25000e-04, loss: 2.84
Epoch [36][80/100], loss_cls: 0.73949, depth_loss: 8.9467 lr: 1.25000e-04, loss: 3.6211
Epoch [36][85/100], loss_cls: 0.41313, depth_loss: 35.455 lr: 1.25000e-04, loss: 3.6223
Epoch [36][90/100], loss_cls: 0.26518, depth_loss: 6.3294 lr: 1.25000e-04, loss: 3.1239
Epoch [36][95/100], loss_cls: 0.001281, depth_loss: 13.33 lr: 1.25000e-04, loss: 3.4804
Epoch [36][100/100], loss_cls: 0.56828, depth_loss: 6.5558 lr: 1.25000e-04, loss: 5.6815
Evaluating top_k_accuracy...
top1_acc: 0.7714, top5_acc: 1.0, train_loss: 5.6815, val_loss: 3.8299
Epoch [37][5/100], loss

Epoch [40][100/100], loss_cls: 1.6832, depth_loss: 15.941 lr: 1.25000e-04, loss: 4.4773
Evaluating top_k_accuracy...
top1_acc: 0.8, top5_acc: 0.9714, train_loss: 4.4773, val_loss: 3.3352
Epoch [41][5/100], loss_cls: 0.038611, depth_loss: 19.606 lr: 1.25000e-04, loss: 3.3086
Epoch [41][10/100], loss_cls: 0.98954, depth_loss: 24.888 lr: 1.25000e-04, loss: 3.7292
Epoch [41][15/100], loss_cls: 0.031393, depth_loss: 12.466 lr: 1.25000e-04, loss: 3.2346
Epoch [41][20/100], loss_cls: 1.844, depth_loss: 23.536 lr: 1.25000e-04, loss: 3.2911
Epoch [41][25/100], loss_cls: 0.20531, depth_loss: 22.776 lr: 1.25000e-04, loss: 6.6536
Epoch [41][30/100], loss_cls: 0.42943, depth_loss: 8.8557 lr: 1.25000e-04, loss: 2.7872
Epoch [41][35/100], loss_cls: 0.07552, depth_loss: 19.889 lr: 1.25000e-04, loss: 2.1942
Epoch [41][40/100], loss_cls: 0.0058587, depth_loss: 14.703 lr: 1.25000e-04, loss: 3.8145
Epoch [41][45/100], loss_cls: 0.0098062, depth_loss: 13.838 lr: 1.25000e-04, loss: 3.1295
Epoch [41][50/100]

Epoch [45][35/100], loss_cls: 7.2951e-05, depth_loss: 18.585 lr: 1.25000e-04, loss: 2.5783
Epoch [45][40/100], loss_cls: 0.067118, depth_loss: 13.537 lr: 1.25000e-04, loss: 3.084
Epoch [45][45/100], loss_cls: 0.035441, depth_loss: 18.687 lr: 1.25000e-04, loss: 4.6583
Epoch [45][50/100], loss_cls: 0.0026531, depth_loss: 21.814 lr: 1.25000e-04, loss: 4.1173
Epoch [45][55/100], loss_cls: 0.32674, depth_loss: 29.362 lr: 1.25000e-04, loss: 4.4839
Epoch [45][60/100], loss_cls: 2.8914, depth_loss: 15.835 lr: 1.25000e-04, loss: 4.4765
Epoch [45][65/100], loss_cls: 1.4484e-05, depth_loss: 7.8362 lr: 1.25000e-04, loss: 2.6254
Epoch [45][70/100], loss_cls: 3.2186e-06, depth_loss: 19.001 lr: 1.25000e-04, loss: 3.7945
Epoch [45][75/100], loss_cls: 1.5843, depth_loss: 22.447 lr: 1.25000e-04, loss: 5.4972
Epoch [45][80/100], loss_cls: 0.8683, depth_loss: 17.801 lr: 1.25000e-04, loss: 2.6153
Epoch [45][85/100], loss_cls: 0.00032874, depth_loss: 32.737 lr: 1.25000e-04, loss: 5.2296
Epoch [45][90/100], 

Epoch [49][75/100], loss_cls: 0.096894, depth_loss: 8.926 lr: 1.25000e-04, loss: 3.9166
Epoch [49][80/100], loss_cls: 1.2954, depth_loss: 14.85 lr: 1.25000e-04, loss: 4.7167
Epoch [49][85/100], loss_cls: 1.8596e-05, depth_loss: 23.595 lr: 1.25000e-04, loss: 2.5723
Epoch [49][90/100], loss_cls: 0.064701, depth_loss: 18.801 lr: 1.25000e-04, loss: 4.6926
Epoch [49][95/100], loss_cls: 0.0, depth_loss: 24.522 lr: 1.25000e-04, loss: 3.4999
Epoch [49][100/100], loss_cls: 0.0022952, depth_loss: 14.969 lr: 1.25000e-04, loss: 2.9591
Evaluating top_k_accuracy...
top1_acc: 0.7714, top5_acc: 0.9714, train_loss: 2.9591, val_loss: 3.109
Epoch [50][5/100], loss_cls: 1.0049, depth_loss: 16.674 lr: 1.25000e-04, loss: 2.871
Epoch [50][10/100], loss_cls: 2.044, depth_loss: 20.91 lr: 1.25000e-04, loss: 3.9998
Epoch [50][15/100], loss_cls: 0.0055266, depth_loss: 26.357 lr: 1.25000e-04, loss: 3.6795
Epoch [50][20/100], loss_cls: 0.00038984, depth_loss: 8.6447 lr: 1.25000e-04, loss: 4.1053
Epoch [50][25/100],

Epoch [54][10/100], loss_cls: 0.00084812, depth_loss: 7.8044 lr: 1.25000e-05, loss: 2.0141
Epoch [54][15/100], loss_cls: 0.95886, depth_loss: 18.697 lr: 1.25000e-05, loss: 3.6439
Epoch [54][20/100], loss_cls: 5.0068e-06, depth_loss: 12.201 lr: 1.25000e-05, loss: 2.1327
Epoch [54][25/100], loss_cls: 0.0016625, depth_loss: 11.725 lr: 1.25000e-05, loss: 3.1742
Epoch [54][30/100], loss_cls: 0.00050313, depth_loss: 12.959 lr: 1.25000e-05, loss: 2.1921
Epoch [54][35/100], loss_cls: 0.96749, depth_loss: 11.592 lr: 1.25000e-05, loss: 2.2028
Epoch [54][40/100], loss_cls: 0.24686, depth_loss: 13.272 lr: 1.25000e-05, loss: 2.0767
Epoch [54][45/100], loss_cls: 0.011055, depth_loss: 21.078 lr: 1.25000e-05, loss: 2.4033
Epoch [54][50/100], loss_cls: 4.345e-05, depth_loss: 9.8942 lr: 1.25000e-05, loss: 1.8253
Epoch [54][55/100], loss_cls: 0.010467, depth_loss: 10.997 lr: 1.25000e-05, loss: 2.8469
Epoch [54][60/100], loss_cls: 0.00039463, depth_loss: 9.7057 lr: 1.25000e-05, loss: 3.3621
Epoch [54][65/

Epoch [58][50/100], loss_cls: 0.00055687, depth_loss: 13.484 lr: 1.25000e-05, loss: 2.5501
Epoch [58][55/100], loss_cls: 0.00011038, depth_loss: 10.628 lr: 1.25000e-05, loss: 4.3178
Epoch [58][60/100], loss_cls: 1.6212e-05, depth_loss: 17.454 lr: 1.25000e-05, loss: 2.9097
Epoch [58][65/100], loss_cls: 0.017458, depth_loss: 9.1222 lr: 1.25000e-05, loss: 2.0974
Epoch [58][70/100], loss_cls: 2.688, depth_loss: 15.282 lr: 1.25000e-05, loss: 2.8827
Epoch [58][75/100], loss_cls: 0.012625, depth_loss: 8.7055 lr: 1.25000e-05, loss: 2.125
Epoch [58][80/100], loss_cls: 0.0059511, depth_loss: 9.0063 lr: 1.25000e-05, loss: 1.9673
Epoch [58][85/100], loss_cls: 9.5897e-05, depth_loss: 11.131 lr: 1.25000e-05, loss: 2.2158
Epoch [58][90/100], loss_cls: 0.16175, depth_loss: 24.561 lr: 1.25000e-05, loss: 3.9873
Epoch [58][95/100], loss_cls: 0.076782, depth_loss: 11.33 lr: 1.25000e-05, loss: 2.3201
Epoch [58][100/100], loss_cls: 0.22257, depth_loss: 14.939 lr: 1.25000e-05, loss: 2.9294
Evaluating top_k_a

Epoch [62][90/100], loss_cls: 0.0008533, depth_loss: 13.825 lr: 1.25000e-05, loss: 2.8695
Epoch [62][95/100], loss_cls: 0.0034314, depth_loss: 11.749 lr: 1.25000e-05, loss: 4.5167
Epoch [62][100/100], loss_cls: 0.003151, depth_loss: 8.9075 lr: 1.25000e-05, loss: 1.9677
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 1.9677, val_loss: 2.7655
Epoch [63][5/100], loss_cls: 0.0026104, depth_loss: 7.9113 lr: 1.25000e-05, loss: 2.7219
Epoch [63][10/100], loss_cls: 0.00069229, depth_loss: 7.768 lr: 1.25000e-05, loss: 1.7591
Epoch [63][15/100], loss_cls: 0.0011543, depth_loss: 11.314 lr: 1.25000e-05, loss: 1.9628
Epoch [63][20/100], loss_cls: 0.05955, depth_loss: 11.285 lr: 1.25000e-05, loss: 2.2863
Epoch [63][25/100], loss_cls: 5.126e-06, depth_loss: 6.6252 lr: 1.25000e-05, loss: 1.9198
Epoch [63][30/100], loss_cls: 0.018501, depth_loss: 13.179 lr: 1.25000e-05, loss: 2.1088
Epoch [63][35/100], loss_cls: 0.0012618, depth_loss: 9.2989 lr: 1.25000e-05, loss: 2.6242
Ep

Epoch [67][20/100], loss_cls: 0.0043005, depth_loss: 11.007 lr: 1.25000e-05, loss: 1.9134
Epoch [67][25/100], loss_cls: 0.015867, depth_loss: 25.701 lr: 1.25000e-05, loss: 3.432
Epoch [67][30/100], loss_cls: 0.00018212, depth_loss: 14.521 lr: 1.25000e-05, loss: 2.6165
Epoch [67][35/100], loss_cls: 0.0583, depth_loss: 12.843 lr: 1.25000e-05, loss: 3.6304
Epoch [67][40/100], loss_cls: 5.0841e-05, depth_loss: 12.001 lr: 1.25000e-05, loss: 2.5555
Epoch [67][45/100], loss_cls: 0.067847, depth_loss: 10.543 lr: 1.25000e-05, loss: 2.0846
Epoch [67][50/100], loss_cls: 0.095532, depth_loss: 9.645 lr: 1.25000e-05, loss: 2.0185
Epoch [67][55/100], loss_cls: 0.00083466, depth_loss: 10.362 lr: 1.25000e-05, loss: 2.6069
Epoch [67][60/100], loss_cls: 0.31205, depth_loss: 14.283 lr: 1.25000e-05, loss: 3.5126
Epoch [67][65/100], loss_cls: 0.0090601, depth_loss: 7.3383 lr: 1.25000e-05, loss: 3.2387
Epoch [67][70/100], loss_cls: 0.13298, depth_loss: 4.9063 lr: 1.25000e-05, loss: 2.3786
Epoch [67][75/100],

Epoch [71][55/100], loss_cls: 0.0035828, depth_loss: 10.405 lr: 1.25000e-05, loss: 2.9392
Epoch [71][60/100], loss_cls: 0.10845, depth_loss: 9.4838 lr: 1.25000e-05, loss: 2.0137
Epoch [71][65/100], loss_cls: 0.0030402, depth_loss: 8.2444 lr: 1.25000e-05, loss: 2.6574
Epoch [71][70/100], loss_cls: 4.5299e-06, depth_loss: 9.111 lr: 1.25000e-05, loss: 2.2491
Epoch [71][75/100], loss_cls: 1.9211, depth_loss: 12.644 lr: 1.25000e-05, loss: 3.6772
Epoch [71][80/100], loss_cls: 2.3424e-05, depth_loss: 14.008 lr: 1.25000e-05, loss: 3.6422
Epoch [71][85/100], loss_cls: 0.50375, depth_loss: 29.297 lr: 1.25000e-05, loss: 3.2012
Epoch [71][90/100], loss_cls: 3.8147e-06, depth_loss: 5.8344 lr: 1.25000e-05, loss: 2.4896
Epoch [71][95/100], loss_cls: 1.4901e-05, depth_loss: 7.961 lr: 1.25000e-05, loss: 1.9779
Epoch [71][100/100], loss_cls: 5.0901e-05, depth_loss: 7.124 lr: 1.25000e-05, loss: 2.198
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 2.198, val_loss: 2.6969
Epoc

Epoch [75][95/100], loss_cls: 0.0008215, depth_loss: 8.9846 lr: 1.25000e-05, loss: 2.5507
Epoch [75][100/100], loss_cls: 0.0020626, depth_loss: 9.8648 lr: 1.25000e-05, loss: 1.6809
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 1.6809, val_loss: 2.6863
Epoch [76][5/100], loss_cls: 7.0507e-05, depth_loss: 8.1622 lr: 1.25000e-05, loss: 2.7029
Epoch [76][10/100], loss_cls: 0.15482, depth_loss: 8.535 lr: 1.25000e-05, loss: 4.7702
Epoch [76][15/100], loss_cls: 0.0043115, depth_loss: 8.7074 lr: 1.25000e-05, loss: 2.8482
Epoch [76][20/100], loss_cls: 0.00049569, depth_loss: 5.5473 lr: 1.25000e-05, loss: 2.3379
Epoch [76][25/100], loss_cls: 0.0029741, depth_loss: 6.3151 lr: 1.25000e-05, loss: 1.8153
Epoch [76][30/100], loss_cls: 5.9605e-08, depth_loss: 16.205 lr: 1.25000e-05, loss: 2.8808
Epoch [76][35/100], loss_cls: 0.00026416, depth_loss: 5.6086 lr: 1.25000e-05, loss: 1.9478
Epoch [76][40/100], loss_cls: 0.068474, depth_loss: 10.653 lr: 1.25000e-05, loss: 2.005

Epoch [80][25/100], loss_cls: 4.0531e-06, depth_loss: 7.9365 lr: 1.25000e-05, loss: 1.9785
Epoch [80][30/100], loss_cls: 4.3511e-06, depth_loss: 8.5307 lr: 1.25000e-05, loss: 1.9959
Epoch [80][35/100], loss_cls: 0.13922, depth_loss: 7.8189 lr: 1.25000e-05, loss: 2.875
Epoch [80][40/100], loss_cls: 0.00016716, depth_loss: 9.1286 lr: 1.25000e-05, loss: 2.2409
Epoch [80][45/100], loss_cls: 1.2156, depth_loss: 13.384 lr: 1.25000e-05, loss: 3.1723
Epoch [80][50/100], loss_cls: 3.6287, depth_loss: 6.4857 lr: 1.25000e-05, loss: 2.5983
Epoch [80][55/100], loss_cls: 0.0021141, depth_loss: 14.328 lr: 1.25000e-05, loss: 3.0757
Epoch [80][60/100], loss_cls: 2.4438e-06, depth_loss: 7.0236 lr: 1.25000e-05, loss: 2.1253
Epoch [80][65/100], loss_cls: 0.00080022, depth_loss: 6.514 lr: 1.25000e-05, loss: 1.7312
Epoch [80][70/100], loss_cls: 0.0003093, depth_loss: 5.2744 lr: 1.25000e-05, loss: 2.2002
Epoch [80][75/100], loss_cls: 0.014956, depth_loss: 10.82 lr: 1.25000e-05, loss: 3.252
Epoch [80][80/100]

Epoch [84][60/100], loss_cls: 0.018004, depth_loss: 11.299 lr: 1.25000e-05, loss: 2.7194
Epoch [84][65/100], loss_cls: 0.45388, depth_loss: 5.9812 lr: 1.25000e-05, loss: 3.4728
Epoch [84][70/100], loss_cls: 0.00022879, depth_loss: 7.0064 lr: 1.25000e-05, loss: 2.8467
Epoch [84][75/100], loss_cls: 1.2981, depth_loss: 7.4675 lr: 1.25000e-05, loss: 2.5909
Epoch [84][80/100], loss_cls: 4.5715e-05, depth_loss: 11.367 lr: 1.25000e-05, loss: 2.8168
Epoch [84][85/100], loss_cls: 0.54589, depth_loss: 20.342 lr: 1.25000e-05, loss: 2.5406
Epoch [84][90/100], loss_cls: 0.0028644, depth_loss: 5.913 lr: 1.25000e-05, loss: 1.485
Epoch [84][95/100], loss_cls: 1.5001, depth_loss: 10.202 lr: 1.25000e-05, loss: 3.0482
Epoch [84][100/100], loss_cls: 4.0531e-06, depth_loss: 8.5726 lr: 1.25000e-05, loss: 2.0705
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 2.0705, val_loss: 2.6109
Saving checkpoint at 84 epochs...
Epoch [85][5/100], loss_cls: 0.00014173, depth_loss: 5.5484 lr:

Epoch [88][95/100], loss_cls: 0.61735, depth_loss: 9.3129 lr: 1.25000e-05, loss: 2.4265
Epoch [88][100/100], loss_cls: 0.0017603, depth_loss: 15.478 lr: 1.25000e-05, loss: 3.0308
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 3.0308, val_loss: 2.6973
Epoch [89][5/100], loss_cls: 0.0030994, depth_loss: 8.5538 lr: 1.25000e-05, loss: 1.9543
Epoch [89][10/100], loss_cls: 0.28669, depth_loss: 15.187 lr: 1.25000e-05, loss: 2.4544
Epoch [89][15/100], loss_cls: 0.76941, depth_loss: 16.075 lr: 1.25000e-05, loss: 1.9111
Epoch [89][20/100], loss_cls: 0.00013571, depth_loss: 9.0497 lr: 1.25000e-05, loss: 1.9338
Epoch [89][25/100], loss_cls: 7.8081e-06, depth_loss: 6.8521 lr: 1.25000e-05, loss: 1.8326
Epoch [89][30/100], loss_cls: 0.00067159, depth_loss: 15.901 lr: 1.25000e-05, loss: 2.2147
Epoch [89][35/100], loss_cls: 1.8835e-05, depth_loss: 10.204 lr: 1.25000e-05, loss: 3.4738
Epoch [89][40/100], loss_cls: 0.16756, depth_loss: 11.059 lr: 1.25000e-05, loss: 3.6777
Ep

Epoch [93][25/100], loss_cls: 0.00072974, depth_loss: 11.787 lr: 1.25000e-05, loss: 1.5308
Epoch [93][30/100], loss_cls: 4.1723e-07, depth_loss: 7.2311 lr: 1.25000e-05, loss: 2.8042
Epoch [93][35/100], loss_cls: 0.006242, depth_loss: 10.108 lr: 1.25000e-05, loss: 3.0967
Epoch [93][40/100], loss_cls: 5.4836e-06, depth_loss: 13.143 lr: 1.25000e-05, loss: 1.9388
Epoch [93][45/100], loss_cls: 0.0026677, depth_loss: 6.8645 lr: 1.25000e-05, loss: 2.3685
Epoch [93][50/100], loss_cls: 1.3485, depth_loss: 19.69 lr: 1.25000e-05, loss: 2.6409
Epoch [93][55/100], loss_cls: 0.016438, depth_loss: 16.952 lr: 1.25000e-05, loss: 2.2846
Epoch [93][60/100], loss_cls: 0.57794, depth_loss: 20.051 lr: 1.25000e-05, loss: 3.1067
Epoch [93][65/100], loss_cls: 0.0005427, depth_loss: 6.5175 lr: 1.25000e-05, loss: 1.6373
Epoch [93][70/100], loss_cls: 0.01983, depth_loss: 6.1187 lr: 1.25000e-05, loss: 1.7452
Epoch [93][75/100], loss_cls: 1.8358e-05, depth_loss: 6.8162 lr: 1.25000e-05, loss: 2.1225
Epoch [93][80/10

Epoch [97][60/100], loss_cls: 0.0011539, depth_loss: 18.441 lr: 1.25000e-05, loss: 2.8249
Epoch [97][65/100], loss_cls: 1.1412, depth_loss: 22.95 lr: 1.25000e-05, loss: 4.2271
Epoch [97][70/100], loss_cls: 0.006056, depth_loss: 7.6871 lr: 1.25000e-05, loss: 2.2675
Epoch [97][75/100], loss_cls: 0.00023127, depth_loss: 7.0918 lr: 1.25000e-05, loss: 2.0532
Epoch [97][80/100], loss_cls: 0.00011026, depth_loss: 9.387 lr: 1.25000e-05, loss: 2.2011
Epoch [97][85/100], loss_cls: 3.0577e-05, depth_loss: 5.7686 lr: 1.25000e-05, loss: 2.5085
Epoch [97][90/100], loss_cls: 4.4821e-05, depth_loss: 13.868 lr: 1.25000e-05, loss: 2.2358
Epoch [97][95/100], loss_cls: 2.7656e-05, depth_loss: 12.461 lr: 1.25000e-05, loss: 2.4304
Epoch [97][100/100], loss_cls: 0.71404, depth_loss: 19.784 lr: 1.25000e-05, loss: 1.995
Evaluating top_k_accuracy...
top1_acc: 0.8286, top5_acc: 0.9714, train_loss: 1.995, val_loss: 2.6627
Epoch [98][5/100], loss_cls: 0.013766, depth_loss: 8.0234 lr: 1.25000e-05, loss: 2.7161
Epoc