In [1]:
from model.multistream_backbone import MultiStreamBackbone



In [2]:
multistream = MultiStreamBackbone(rgb_checkpoint='rgb.pth',
                                 flow_checkpoint='flow.pth',
                                 depth_checkpoint='depth.pth',
                                 skeleton_checkpoint='skeleton.pth',
                                 face_checkpoint='face.pth',
                                 left_hand_checkpoint='left_hand.pth',
                                 right_hand_checkpoint='right_hand.pth'
                                 )

In [14]:
from dataset.dataset import MultiModalDataset
import os
import torch
import torch.nn as nn
import numpy as np

In [4]:
def top_k_accuracy(scores, labels, topk=(1, )):
    """Calculate top k accuracy score.
    Args:
        scores (list[np.ndarray]): Prediction scores for each class.
        labels (list[int]): Ground truth labels.
        topk (tuple[int]): K value for top_k_accuracy. Default: (1, ).
    Returns:
        list[float]: Top k accuracy score for each k.
    """
    res = np.zeros(len(topk))
    labels = np.array(labels)[:, np.newaxis]
    for i, k in enumerate(topk):
        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res[i] = topk_acc_score

    return res

In [31]:
def validate():
    """Run one epoch for validation.
    Returns:
        avg_vloss (float): Validation loss value for the last batch.
        top1_acc (float): Top-1 accuracy in decimal.
        top5_acc (float): Top-5 accuracy in decimal.
    """
    running_vloss = 0.0
    running_vacc = np.zeros(2)

    print('Evaluating top_k_accuracy...')

    model.eval()
    with torch.inference_mode():
        for i, results in enumerate(test_loader):
            rgb = results['rgb']
            flow = results['flow']
            depth = results['depth']
            face = results['face']
            skeleton = results['skeleton']
            right_hand = results['right_hand']
            left_hand = results['left_hand']
            vtargets = results['label']

            vtargets = vtargets.reshape(-1, )

            rgb, flow, vtargets = rgb.to(device), flow.to(device), vtargets.to(device)
            depth, face, skeleton = depth.to(device), face.to(device), skeleton.to(device)
            left_hand, right_hand = left_hand.to(device), right_hand.to(device)

            voutputs = model(rgb=rgb,
                             flow=flow,
                            depth=depth,
                            left_hand=left_hand,
                            right_hand=right_hand,
                            face=face,
                            skeleton=skeleton)


            running_vacc += top_k_accuracy(voutputs.detach().cpu().numpy(),
                                           vtargets.detach().cpu().numpy(), topk=(1, 5))

    acc = running_vacc/len(test_loader)
    top1_acc = acc[0].item()
    top5_acc = acc[1].item()

    return (top1_acc, top5_acc)


In [6]:
os.chdir('../../')

In [32]:
# Set up device agnostic code
device = 'cuda'

# Configs
work_dir = 'work_dirs/7sees-late-fusion-v1/'
batch_size = 1

os.makedirs(work_dir, exist_ok=True)

train_dataset = MultiModalDataset(ann_file='data/wlasl/train_annotations.txt',
                                  root_dir='data/wlasl/rawframes',
                                  clip_len=32,
                                  modalities=('rgb',
                                              'flow',
                                              'depth',
                                              'pose',
                                              'skeleton',
                                              'face',
                                              'left_hand',
                                              'right_hand'
                                             ),
                                  resolution=224,
                                  frame_interval=1,
                                  input_resolution=256,
                                  num_clips=1
                                  )

test_dataset = MultiModalDataset(ann_file='data/wlasl/test_annotations.txt',
                                 root_dir='data/wlasl/rawframes',
                                 clip_len=32,
                                 resolution=224,
                                 modalities=('rgb',
                                              'flow',
                                              'depth',
                                              'pose',
                                              'skeleton',
                                              'face',
                                              'left_hand',
                                              'right_hand'
                                            ),
                                 test_mode=True,
                                 frame_interval=1,
                                 input_resolution=256,
                                 num_clips=1
                                 )

# Setting up dataloaders
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=4,
                                           pin_memory=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=1,
                                          shuffle=True,
                                          num_workers=4,
                                          pin_memory=True)

In [33]:
for name, para in multistream.named_parameters():
        para.requires_grad = False

In [37]:
class Sees7(nn.Module):
    """Multimodal Model Seven-Sees.
    """

    def __init__(self,
                multistream_backbone=None,
                head=None):
        
        super(Sees7, self).__init__()
        self.multistream_backbone = multistream_backbone
        self.head = head
        
    def forward(self,
                rgb=None,
                flow=None,
                depth=None,
                skeleton=None,
                face=None,
                right_hand=None,
                left_hand=None
                ):
        
        stream = self.multistream_backbone(rgb=rgb,
                                           flow=flow,
                                           depth=depth,
                                           skeleton=skeleton,
                                           face=face,
                                           right_hand=right_hand,
                                           left_hand=left_hand)
        

        if self.head is None:
            cls_score = (1/7)*stream['rgb'] + (1/7)*stream['flow'] + (1/7)*stream['depth']
            + (1/7)*stream['skeleton'] + (1/7)*stream['face'] + (1/7)*stream['left_hand']
            + (1/7)*stream['right_hand']
        
        cls_score = self.head(stream)

        return cls_score

In [38]:
model = Sees7(multistream_backbone=multistream)
model.to(device)

Sees7(
  (multistream_backbone): MultiStreamBackbone(
    (rgb_stream): FlowAutoencoder(
      (rgb_backbone): ResNet3dCSN(
        (conv1): ConvModule(
          (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
          (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (activate): ReLU(inplace=True)
        )
        (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
        (pool2): MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=0, dilation=1, ceil_mode=False)
        (layer1): Sequential(
          (0): CSNBottleneck3d(
            (conv1): ConvModule(
              (conv): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
              (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
              (activate): ReLU(inplace=True)
            )
            (conv2): Seque

In [39]:
top1_acc, top5_acc = validate()
print(f'top1_acc: {top1_acc:.4}, top5_acc: {top5_acc:.4}')

Evaluating top_k_accuracy...
top1_acc: 0.7907, top5_acc: 0.9419
