In [1]:
import os
import torch
import torch.nn as nn
import wandb
import numpy as np

from dataset.dataset import MultiModalDataset
from mmcv_model.mmcv_csn import ResNet3dCSN
from mmcv_model.scheduler import GradualWarmupScheduler

from model.average_model import MultiModalModel
from model.multimodal_neck import MultiModalNeck
from model.simple_head import SimpleHead
from model.flow_autoencoder import FlowAutoencoder



In [2]:
def top_k_accuracy(scores, labels, topk=(1, )):
    """Calculate top k accuracy score.
    Args:
        scores (list[np.ndarray]): Prediction scores for each class.
        labels (list[int]): Ground truth labels.
        topk (tuple[int]): K value for top_k_accuracy. Default: (1, ).
    Returns:
        list[float]: Top k accuracy score for each k.
    """
    res = np.zeros(len(topk))
    labels = np.array(labels)[:, np.newaxis]
    for i, k in enumerate(topk):
        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res[i] = topk_acc_score

    return res

In [3]:
def validate():
    """Run one epoch for validation.
    Returns:
        avg_vloss (float): Validation loss value for the last batch.
        top1_acc (float): Top-1 accuracy in decimal.
        top5_acc (float): Top-5 accuracy in decimal.
    """
    running_vloss = 0.0
    running_vacc = np.zeros(2)

    print('Evaluating top_k_accuracy...')

    model.eval()
    with torch.inference_mode():
        for i, results in enumerate(test_loader):
            rgb = results['rgb']
            flow = results['flow']
            vtargets = results['label']

            vtargets = vtargets.reshape(-1, )

            rgb, flow, vtargets = rgb.to(device), flow.to(device), vtargets.to(device)

            voutputs = model(rgb=rgb,
                             flow=flow)

            vloss = loss_fn(voutputs, vtargets)
            running_vloss += vloss

            running_vacc += top_k_accuracy(voutputs.detach().cpu().numpy(),
                                           vtargets.detach().cpu().numpy(), topk=(1, 5))

    avg_vloss = running_vloss / (i + 1)

    acc = running_vacc/len(test_loader)
    top1_acc = acc[0].item()
    top5_acc = acc[1].item()

    return (avg_vloss, top1_acc, top5_acc)

In [4]:
print('Loading rgb backbone checkpoint...')
rgb_checkpoint = torch.load('flow_encoder_rgb_wlasl100_100e.pth')
print('Loading flow backbone checkpoint...')
flow_checkpoint = torch.load('flow_encoder_flow_wlasl100_100e.pth')

os.chdir('../../..')

Loading rgb backbone checkpoint...
Loading flow backbone checkpoint...


In [5]:
# wandb.init(entity="cares", project="jack-slr",
#            group="average", name="late-one-fc")

# Set up device agnostic code
device = 'cuda'

# Configs
work_dir = 'work_dirs/7sees-late-fusion-v1/'
batch_size = 1

os.makedirs(work_dir, exist_ok=True)

train_dataset = MultiModalDataset(ann_file='data/wlasl/train_annotations.txt',
                                  root_dir='data/wlasl/rawframes',
                                  clip_len=32,
                                  modalities=('rgb', 'flow'),
                                  resolution=224,
                                  frame_interval=1,
                                  input_resolution=256,
                                  num_clips=1
                                  )

test_dataset = MultiModalDataset(ann_file='data/wlasl/test_annotations.txt',
                                 root_dir='data/wlasl/rawframes',
                                 clip_len=32,
                                 resolution=224,
                                 modalities=('rgb', 'flow'),
                                 test_mode=True,
                                 frame_interval=1,
                                 input_resolution=256,
                                 num_clips=1
                                 )


# Setting up dataloaders
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=4,
                                           pin_memory=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=1,
                                          shuffle=True,
                                          num_workers=4,
                                          pin_memory=True)

# Custom multimodal model
rgb_backbone = ResNet3dCSN(
    pretrained2d=False,
    pretrained=None,
    depth=50,
    with_pool2=False,
    bottleneck_mode='ir',
    norm_eval=True,
    zero_init_residual=False,
    bn_frozen=True
)

rgb_neck = MultiModalNeck()
rgb_head = SimpleHead(num_classes=400,
                  in_channels=2048,
                  dropout_ratio=0.5,
                  init_std=0.01)

rgb_stream = FlowAutoencoder(rgb_backbone=rgb_backbone,
                        neck=rgb_neck,
                        head=rgb_head)

rgb_stream.load_state_dict(rgb_checkpoint)
del rgb_checkpoint

    # Custom multimodal model
flow_backbone = ResNet3dCSN(
    pretrained2d=False,
    pretrained=None,
    depth=50,
    with_pool2=False,
    bottleneck_mode='ir',
    norm_eval=True,
    zero_init_residual=False,
    bn_frozen=True
)

flow_neck = MultiModalNeck()
flow_head = SimpleHead(num_classes=400,
                  in_channels=2048,
                  dropout_ratio=0.5,
                  init_std=0.01)


flow_stream = FlowAutoencoder(flow_backbone=flow_backbone,
                        neck=flow_neck,
                        head=flow_head)

flow_stream.load_state_dict(flow_checkpoint)
del flow_checkpoint

print('Backbones loaded successfully.')

# Freeze the backbones
for name, para in rgb_backbone.named_parameters():
    para.requires_grad = False

for name, para in flow_backbone.named_parameters():
    para.requires_grad = False

model = MultiModalModel(rgb_stream=rgb_stream,
                        flow_stream=flow_stream)

model.to(device)

Backbones loaded successfully.


MultiModalModel(
  (rgb_stream): FlowAutoencoder(
    (rgb_backbone): ResNet3dCSN(
      (conv1): ConvModule(
        (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
        (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (activate): ReLU(inplace=True)
      )
      (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
      (pool2): MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=0, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): CSNBottleneck3d(
          (conv1): ConvModule(
            (conv): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (activate): ReLU(inplace=True)
          )
          (conv2): Sequential(
            (0): ConvModule(
              (conv): Conv3d(64, 64

In [6]:
validate()

Evaluating top_k_accuracy...


NameError: name 'loss_fn' is not defined

In [None]:
iter_ = iter(test_loader)
results = next(iter_)

In [7]:
rgb_stream.rgb_backbone(rgb)

NameError: name 'rgb' is not defined

In [None]:
rgb = results['rgb']
flow = results['flow']
vtargets = results['label']

vtargets = vtargets.reshape(-1, )

rgb, flow, vtargets = rgb.to(device), flow.to(device), vtargets.to(device)

voutputs = model(rgb=rgb,
                 flow=flow)

vloss = loss_fn(voutputs, vtargets)

In [None]:
# 0.5/0.5 = 0.8256, top5_acc: 0.9186, val_loss: 1.4156
# 0.8/0.3 = top1_acc: 0.8333, top5_acc: 0.9302, val_loss: 1.467
