In [1]:
import os
import torch
import wandb
import torch.nn as nn
import numpy as np

from model.multistream_backbone import MultiStreamBackbone
from model.sees7 import Sees7
from model.one_neuron_head import OneNeuronHead
from dataset.dataset import MultiModalDataset
from mmcv_model.scheduler import GradualWarmupScheduler



In [2]:
def top_k_accuracy(scores, labels, topk=(1, )):
    """Calculate top k accuracy score.
    Args:
        scores (list[np.ndarray]): Prediction scores for each class.
        labels (list[int]): Ground truth labels.
        topk (tuple[int]): K value for top_k_accuracy. Default: (1, ).
    Returns:
        list[float]: Top k accuracy score for each k.
    """
    res = np.zeros(len(topk))
    labels = np.array(labels)[:, np.newaxis]
    for i, k in enumerate(topk):
        max_k_preds = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
        match_array = np.logical_or.reduce(max_k_preds == labels, axis=1)
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res[i] = topk_acc_score

    return res

def validate():
    """Run one epoch for validation.
    Returns:
        avg_vloss (float): Validation loss value for the last batch.
        top1_acc (float): Top-1 accuracy in decimal.
        top5_acc (float): Top-5 accuracy in decimal.
    """
    running_vloss = 0.0
    running_vacc = np.zeros(2)

    print('Evaluating top_k_accuracy...')

    model.eval()
    with torch.inference_mode():
        for i, results in enumerate(test_loader):
            rgb = results['rgb']
            flow = results['flow']
            depth = results['depth']
            face = results['face']
            skeleton = results['skeleton']
            right_hand = results['right_hand']
            left_hand = results['left_hand']
            vtargets = results['label']

            vtargets = vtargets.reshape(-1, )

            rgb, flow, vtargets = rgb.to(device), flow.to(device), vtargets.to(device)
            depth, face, skeleton = depth.to(device), face.to(device), skeleton.to(device)
            left_hand, right_hand = left_hand.to(device), right_hand.to(device)

            voutputs = model(rgb=rgb,
                             flow=flow,
                            depth=depth,
                            left_hand=left_hand,
                            right_hand=right_hand,
                            face=face,
                            skeleton=skeleton)


            running_vacc += top_k_accuracy(voutputs.detach().cpu().numpy(),
                                           vtargets.detach().cpu().numpy(), topk=(1, 5))

    acc = running_vacc/len(test_loader)
    top1_acc = acc[0].item()
    top5_acc = acc[1].item()

    return (top1_acc, top5_acc)

In [3]:
device = 'cuda'


# Building the model
multistream = MultiStreamBackbone(rgb_checkpoint='./rgb.pth',
                                flow_checkpoint='./flow.pth',
                                depth_checkpoint='./depth.pth',
                                skeleton_checkpoint='./skeleton.pth',
                                face_checkpoint='./face.pth',
                                left_hand_checkpoint='./left_hand.pth',
                                right_hand_checkpoint='./right_hand.pth'
                                )

one_neuron_head = OneNeuronHead(num_modalities=7)

# Freeze the backbones
for name, para in multistream.named_parameters():
    para.requires_grad = False

model = Sees7(multistream_backbone=multistream)
model.to(device)

Skeleton checkpoint loaded successfully...


Sees7(
  (multistream_backbone): MultiStreamBackbone(
    (rgb_stream): FlowAutoencoder(
      (rgb_backbone): ResNet3dCSN(
        (conv1): ConvModule(
          (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
          (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (activate): ReLU(inplace=True)
        )
        (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
        (pool2): MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=0, dilation=1, ceil_mode=False)
        (layer1): Sequential(
          (0): CSNBottleneck3d(
            (conv1): ConvModule(
              (conv): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
              (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
              (activate): ReLU(inplace=True)
            )
            (conv2): Seque

In [4]:
# Build the dataloaders
os.chdir('../../')
work_dir = 'work_dirs/sees7/'
batch_size = 9

train_dataset = MultiModalDataset(ann_file='data/wlasl/train_annotations.txt',
                                root_dir='data/wlasl/rawframes',
                                clip_len=32,
                                resolution=224,
                                modalities=('rgb',
                                            'flow',
                                            'depth',
                                            'pose',
                                            'skeleton',
                                            'face',
                                            'left_hand',
                                            'right_hand'
                                            ),
                                test_mode=False,
                                frame_interval=1,
                                input_resolution=256,
                                num_clips=1
                                )

test_dataset = MultiModalDataset(ann_file='data/wlasl/test_annotations.txt',
                                root_dir='data/wlasl/rawframes',
                                clip_len=32,
                                resolution=224,
                                modalities=('rgb',
                                            'flow',
                                            'depth',
                                            'pose',
                                            'skeleton',
                                            'face',
                                            'left_hand',
                                            'right_hand'
                                            ),
                                test_mode=True,
                                frame_interval=1,
                                input_resolution=256,
                                num_clips=1
                                )

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=4,
                                        pin_memory=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                        batch_size=1,
                                        shuffle=True,
                                        num_workers=4,
                                        pin_memory=True)

epochs = 100

# Specify optimizer
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.000125, momentum=0.9, weight_decay=0.00001)

# Specify learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=120, gamma=0.1)

scheduler_steplr = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[34, 84], gamma=0.1)
scheduler = GradualWarmupScheduler(
    optimizer, multiplier=1, total_epoch=16, after_scheduler=scheduler_steplr)

# Specify Loss
loss_fn = nn.CrossEntropyLoss()

In [5]:
# for epoch in range(epochs):
#     # Turn on gradient tracking and do a forward pass
#     model.train(True)
#     avg_loss, learning_rate = train_one_epoch(epoch+1)

#     # Turn off  gradients for reporting
#     model.train(False)

#     top1_acc, top5_acc = validate()

#     print(
#         f'top1_acc: {top1_acc:.4}, top5_acc: {top5_acc:.4}, train_loss: {avg_loss:.5}')

#     # Track best performance, and save the model's state
#     model_path = work_dir + f'epoch_{epoch+1}.pth'
#     print(f'Saving checkpoint at {epoch+1} epochs...')
#     torch.save(model.state_dict(), model_path)

#     # Adjust learning rate
#     scheduler.step()

In [6]:
def train_one_epoch(epoch_index, interval=5):
    """Run one epoch for training.
    Args:
        epoch_index (int): Current epoch.
        interval (int): Frequency at which to print logs.
    Returns:
        last_loss (float): Loss value for the last batch.
    """
    running_loss = 0.
    last_loss = 0.

    for i, results in enumerate(train_loader):
        rgb = results['rgb']
        flow = results['flow']
        depth = results['depth']
        face = results['face']
        skeleton = results['skeleton']
        right_hand = results['right_hand']
        left_hand = results['left_hand']
        targets = results['label']

        targets = targets.reshape(-1, )

        rgb, flow, targets = rgb.to(device), flow.to(device), targets.to(device)
        depth, face, skeleton = depth.to(device), face.to(device), skeleton.to(device)
        left_hand, right_hand = left_hand.to(device), right_hand.to(device)

            

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(rgb=rgb,
                            flow=flow,
                            depth=depth,
                            left_hand=left_hand,
                            right_hand=right_hand,
                            face=face,
                            skeleton=skeleton)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, targets)
        loss.backward()

        # Gradient Clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=40, norm_type=2.0)

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % interval == interval-1:
            last_loss = running_loss / interval  # loss per batch
            print(
                f'Epoch [{epoch_index}][{i+1}/{len(train_loader)}], lr: {scheduler.get_last_lr()[0]:.5e}, loss: {last_loss:.5}')
            running_loss = 0.

    return last_loss, scheduler.get_last_lr()[0]

In [7]:
# Evaluate
top1_acc, top5_acc = validate()
print(f'top1_acc: {top1_acc:.4}, top5_acc: {top5_acc:.4}')

Evaluating top_k_accuracy...
top1_acc: 0.7752, top5_acc: 0.9225


In [8]:
iter_ = iter(train_loader)
results = next(iter_)

In [9]:
rgb = results['rgb']
flow = results['flow']
depth = results['depth']
face = results['face']
skeleton = results['skeleton']
right_hand = results['right_hand']
left_hand = results['left_hand']
vtargets = results['label']

vtargets = vtargets.reshape(-1, )

rgb, flow, vtargets = rgb.to(device), flow.to(device), vtargets.to(device)
depth, face, skeleton = depth.to(device), face.to(device), skeleton.to(device)
left_hand, right_hand = left_hand.to(device), right_hand.to(device)

stream_output = model.multistream_backbone(rgb=rgb,
                 flow=flow,
                depth=depth,
                left_hand=left_hand,
                right_hand=right_hand,
                face=face,
                skeleton=skeleton)

In [10]:
stream_output['flow'].shape

torch.Size([9, 400])

In [11]:
vtargets.shape

torch.Size([9])

In [12]:
fc_cls = nn.Linear(7, 1)

In [13]:
stream = stream_output

In [14]:
list_ = []

for modality in stream:
    list_.append(stream[modality])

In [15]:
len(list_)

7

In [16]:
list_[0].shape

torch.Size([9, 400])

In [17]:
x = torch.cat(list_)

In [18]:
x.shape

torch.Size([63, 400])

In [19]:
multistream.to(device)

MultiStreamBackbone(
  (rgb_stream): FlowAutoencoder(
    (rgb_backbone): ResNet3dCSN(
      (conv1): ConvModule(
        (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
        (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (activate): ReLU(inplace=True)
      )
      (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
      (pool2): MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=0, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): CSNBottleneck3d(
          (conv1): ConvModule(
            (conv): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (bn): BatchNorm3d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (activate): ReLU(inplace=True)
          )
          (conv2): Sequential(
            (0): ConvModule(
              (conv): Conv3d(64

In [20]:
multistream.rgb_stream(results['rgb'].to(device)).shape

torch.Size([9, 400])

In [21]:

for key in stream:
    print(key)

print(list_[0].shape)

x = torch.cat(list_, dim=0)

return self.fc_cls(x.permute(1,0)).permute(1,0)

rgb
flow
depth
skeleton
face
left_hand
right_hand
torch.Size([9, 400])


SyntaxError: 'return' outside function (3479812261.py, line 8)

In [22]:
one_neuron_head(stream_output)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)