## Imports

In [1]:
import os
import torch
import torch.nn as nn
import wandb
import numpy as np

from torchvision import transforms
from model.mmcv_csn import ResNet3dCSN
from model.cls_head import ClassifierHead
from model.pose_encoder import PoseEncoder
from model.scheduler import GradualWarmupScheduler
from model.multimodal_neck import MultiModalNeck
from mmaction.datasets import build_dataset
from dataset.dataset import MultiModalDataset
from dataset.transforms import transform
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, so it is probed
# with getattr instead of hiding every possible error behind a bare `except`.
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [3]:
# Run configuration: training batch size and the working directory for this run.
batch_size = 2
work_dir = 'work_dirs/wlasl-dataset/'

# Create the working directory up front; an already existing one is fine.
os.makedirs(work_dir, exist_ok=True)

In [4]:
# NOTE: this rebinds `transforms`, shadowing the `torchvision.transforms`
# module imported at the top of the notebook. The name is kept so that any
# cell referring to `transforms` keeps working.
transforms = transform()

# Settings shared by both splits; only the annotation file differs.
dataset_kwargs = dict(
    root_dir='data/wlasl10/rawframes',
    clip_len=32,
    resolution=224,
    transforms=transforms,
    frame_interval=1,
    num_clips=1,
)

train_dataset = MultiModalDataset(ann_file='data/wlasl10/train_annotations.txt',
                                  **dataset_kwargs)

test_dataset = MultiModalDataset(ann_file='data/wlasl10/test_annotations.txt',
                                 **dataset_kwargs)

# Setting up dataloaders
loader_kwargs = dict(num_workers=4, pin_memory=True)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           **loader_kwargs)

# The evaluation split is iterated in a fixed order so that runs are
# reproducible and directly comparable; shuffling is only useful for training.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=1,
                                          shuffle=False,
                                          **loader_kwargs)

In [5]:
# Pull a single batch from the test loader and unpack its nine fields.
# The second field is not used and is discarded into `_`.
sample_batch = next(iter(test_loader))
rgb, _, face, left_hand, right_hand, depth, flow, pose, label = sample_batch

256 256
186 186
186 186
186 186
256 256
256 256
256 256
256 256
256 256
256 256
256 256256 
256
256 256
203 203
203 203
203 203
256 256
207 207
207 207
207 207
256 256
216 216
256 256
216 216
203 204216
 216
203 204
203 204
256 256
215 214
215 214
215 214
256 256
180 181
180 181
180 181


In [6]:
# Imported here rather than in the imports cell at the top of the notebook.
from model.seven_seas_net import SevenSeesNet

# Build the network with its default constructor arguments, then let the model
# initialise its own weights (the cell's log output shows CSN checkpoints being
# loaded from an HTTP URL during this call).
model = SevenSeesNet()
model.init_weights()

2023-02-12 12:13:30,667 - model - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2023-02-12 12:13:30,668 - model - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2023-02-12 12:13:30,699 - model - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2023-02-12 12:13:30,700 - model - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2023-02-12 12:13:30,726 - model - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2023-02-12 12:13:30,726 - model - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scrat

In [7]:
# Forward one batch through the model, passing each modality by keyword.
model_inputs = dict(
    rgb=rgb,
    depth=depth,
    flow=flow,
    face=face,
    left_hand=left_hand,
    right_hand=right_hand,
    pose=pose,
)
x = model(**model_inputs)

In [8]:
# Inspect the shape of the forward-pass output `x`.
x.shape

torch.Size([1, 400])

## Training Loop

In [9]:
# Specify optimizer
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.000125, momentum=0.9, weight_decay=0.00001)

# Specify loss. A single criterion instance is created; `loss_cls` is kept as
# an alias of it so that both names remain usable.
loss_fn = nn.CrossEntropyLoss()
loss_cls = loss_fn

# Specify total epochs
epochs = 100

# Specify learning rate schedule. Exactly one scheduler chain is attached to
# the optimizer: a MultiStepLR that multiplies the LR by 0.1 at epochs 34 and
# 84, wrapped by GradualWarmupScheduler (presumably a 16-epoch warm-up that
# then hands over to `after_scheduler` -- confirm against model/scheduler.py).
scheduler_steplr = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[34, 84], gamma=0.1)
scheduler = GradualWarmupScheduler(
    optimizer, multiplier=1, total_epoch=16, after_scheduler=scheduler_steplr)

In [10]:
def top_k_accuracy(scores, labels, topk=(1, )):
    """Calculate top k accuracy score.
    Args:
        scores (list[np.ndarray]): Prediction scores for each class, one row
            per sample.
        labels (list[int]): Ground truth labels, one per sample.
        topk (tuple[int]): K values for top_k_accuracy. Default: (1, ).
    Returns:
        np.ndarray: Top k accuracy score for each k, in the order of ``topk``.
    Raises:
        ValueError: If any k is smaller than 1.
    """
    # A k of 0 would slice `[-0:]`, i.e. every class, and silently report 100%.
    if any(k < 1 for k in topk):
        raise ValueError(f'topk values must be >= 1, got {topk}')

    labels = np.array(labels)[:, np.newaxis]

    # Rank the classes once, best first; each k only needs a different-sized
    # prefix of the same ranking.
    ranked_classes = np.argsort(scores, axis=1)[:, ::-1]

    res = np.zeros(len(topk))
    for i, k in enumerate(topk):
        # A sample is a hit when its label appears among its k best classes.
        hits = (ranked_classes[:, :k] == labels).any(axis=1)
        res[i] = hits.sum() / hits.shape[0]

    return res


def train_one_epoch(epoch_index, interval=5):
    """Run one epoch for training.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``,
    ``scheduler``, ``train_loader`` and ``device``.

    Args:
        epoch_index (int): Current epoch, used in the log line.
        interval (int): Number of batches averaged per printed log line.
    Returns:
        tuple[float, float]: ``(last_loss, lr)``. ``last_loss`` is the mean
        loss of the most recent logging window; a trailing window shorter
        than ``interval`` is averaged over the batches it contains, so the
        value is meaningful even when the epoch has fewer than ``interval``
        batches. ``lr`` is ``scheduler.get_last_lr()[0]``.
    """
    running_loss = 0.
    last_loss = 0.
    window_batches = 0  # batches accumulated since the last report

    # enumerate() is used so the batch index is available for the
    # intra-epoch progress report.
    for i, (rgb, _, face, left_hand, right_hand, depth, flow, pose, targets) in enumerate(train_loader):
        rgb, face, left_hand, right_hand, depth, flow, pose, targets = (
            tensor.to(device)
            for tensor in (rgb, face, left_hand, right_hand, depth, flow, pose, targets)
        )

        targets = targets.reshape(-1, )

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(rgb=rgb,
                        depth=depth,
                        flow=flow,
                        face=face,
                        left_hand=left_hand,
                        right_hand=right_hand,
                        pose=pose)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, targets)
        loss.backward()

        # Gradient Clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=40, norm_type=2.0)

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        window_batches += 1
        if window_batches == interval:
            last_loss = running_loss / interval  # loss per batch
            print(
                f'Epoch [{epoch_index}][{i+1}/{len(train_loader)}], lr: {scheduler.get_last_lr()[0]:.5e}, loss: {last_loss:.5}')
            running_loss = 0.
            window_batches = 0

    # Batches left over after the last full window would otherwise be dropped,
    # leaving `last_loss` stale (or 0.0 if no full window ever completed).
    if window_batches:
        last_loss = running_loss / window_batches

    return last_loss, scheduler.get_last_lr()[0]


def validate():
    """Run one epoch for validation.

    Uses the notebook-level ``model``, ``loss_fn``, ``test_loader`` and
    ``device``. The function does not switch the model into eval mode itself.

    Returns:
        avg_vloss (float): Validation loss averaged over all batches.
        top1_acc (float): Top-1 accuracy in decimal, averaged over batches.
        top5_acc (float): Top-5 accuracy in decimal, averaged over batches.
    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    num_batches = len(test_loader)
    if num_batches == 0:
        raise ValueError('test_loader is empty; nothing to validate')

    running_vloss = 0.0
    running_vacc = np.zeros(2)

    print('Evaluating top_k_accuracy...')

    with torch.inference_mode():
        for rgb, _, face, left_hand, right_hand, depth, flow, pose, targets in test_loader:
            rgb, face, left_hand, right_hand, depth, flow, pose, targets = (
                tensor.to(device)
                for tensor in (rgb, face, left_hand, right_hand, depth, flow, pose, targets)
            )

            # Inputs are passed to the model exactly as the loader yields
            # them, the same way train_one_epoch feeds it; only the targets
            # are flattened.
            targets = targets.reshape(-1, )

            outputs = model(rgb=rgb,
                            depth=depth,
                            flow=flow,
                            face=face,
                            left_hand=left_hand,
                            right_hand=right_hand,
                            pose=pose)

            # .item() keeps the accumulator a plain Python float instead of a
            # tensor living on the device.
            running_vloss += loss_fn(outputs, targets).item()

            running_vacc += top_k_accuracy(outputs.detach().cpu().numpy(),
                                           targets.detach().cpu().numpy(), topk=(1, 5))

    avg_vloss = running_vloss / num_batches

    acc = running_vacc / num_batches
    top1_acc = acc[0].item()
    top5_acc = acc[1].item()

    return (avg_vloss, top1_acc, top5_acc)

In [11]:
# Walk through every training batch and move each tensor to the device.
# The model is not called here, so this only exercises the data pipeline.
for i, batch in enumerate(train_loader):
    rgb, _, face, left_hand, right_hand, depth, flow, pose, targets = batch
    rgb, face, left_hand, right_hand, depth, flow, pose, targets = (
        tensor.to(device)
        for tensor in (rgb, face, left_hand, right_hand, depth, flow, pose, targets)
    )

256 256
256 256256
 256
218 204217 
204
202218  202217

204 204
202 202
218 217204
 204
202 202
256 256
210 210
210 210
210 210
256 256
191 191
256 191256
 191
189 191189 
191
189 189
189 189
256 256
229 229
229 229
229 229
256 256
202 203
202 203
202 203
256 256
163 162
163 162
256 256
189 189
189 189
189 189
256 256
212 211
212 211
212 211
256 256
205 205
205 205
205 205
256 256
196 196
196 196
196 196
256 256
256 256
256 256
256 256
256 256
188 188
188 188
256 256188
 188
204 205
204 205
204 205
256 256
256 256
198 198213 
212
213 198212


ValueError: Caught ValueError in DataLoader worker process 3.
Original Traceback (most recent call last):
  File "/home/sadat/miniconda3/envs/dataloader/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/sadat/miniconda3/envs/dataloader/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/sadat/miniconda3/envs/dataloader/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/sadat/Desktop/seven-sees-net/dataset/dataset.py", line 204, in __getitem__
    results = self.transforms(results)
  File "/home/sadat/Desktop/seven-sees-net/dataset/transforms.py", line 172, in __call__
    frames = self.crop(frames, 'right_hand')
  File "/home/sadat/Desktop/seven-sees-net/dataset/transforms.py", line 156, in crop
    padimage[x0:x1,y0:y1] = img
ValueError: could not broadcast input array from shape (163,162,3) into shape (162,162,3)


 198
213 212
198 198
256 256
186 186
186 186
186 186
256 256
235 236
235 236
235 236
256 256
198 199
198 199
198 199
256 256
207 206
207 206
207 206
256 256
206 207
206 207
206 207
256 256
183 183
183 183
183 183
256 256
256 256
199 199209
 208
199 199
209 208
199 199
209 208
256 256
197 196
197 196
197 196
256 256
189 189
189 189
189 189
256 256
192 192
192 192
192 192
256 256
188 187
188 187
188 187
256 256
198 198
198 198
198 198


In [None]:
# Train Loop
# `best_vloss` is only consumed by the checkpointing logic that is currently
# commented out below.
best_vloss = 1_000_000.

# Transfer model to device
model.to(device)

for epoch in range(epochs):
    # Training mode for the optimisation pass (epochs are reported 1-based)
    model.train()
    avg_loss, learning_rate = train_one_epoch(epoch + 1)

    # Back to evaluation mode for reporting
    model.eval()

    # Validation, reporting and best-checkpoint saving are disabled for now:
    # avg_vloss, top1_acc, top5_acc = validate()
    #
    # print(
    #     f'top1_acc: {top1_acc:.4}, top5_acc: {top5_acc:.4}, train_loss: {avg_loss:.5}, val_loss: {avg_vloss:.5}')
    #
    # # Track best performance, and save the model's state
    # if avg_vloss < best_vloss:
    #     best_vloss = avg_vloss
    #     model_path = work_dir + f'epoch_{epoch+1}.pth'
    #     print(f'Saving checkpoint at {epoch+1} epochs...')
    #     torch.save(model.state_dict(), model_path)

    # Adjust learning rate once per epoch
    scheduler.step()

In [None]:
# Shape of whichever `rgb` batch is currently bound in the kernel.
rgb.shape

In [None]:
# Shape of the last element returned by the model's RGB encoder for `rgb`.
model.rgb_encoder(rgb)[-1].shape

In [None]:
        # Reshape experiment on the first training batch only (note the `break`).
        # Each visual stream has its first two dimensions merged into one by the
        # reshape; `pose` and `targets` are left as loaded.
        for i, (rgb, _, face, left_hand, right_hand, depth, flow, pose, targets)  in enumerate(train_loader):
            rgb, face, left_hand, right_hand, depth, flow, pose, targets = rgb.to(device), face.to(device), left_hand.to(device), right_hand.to(device), depth.to(device), flow.to(device), pose.to(device), targets.to(device)
            # The merged rgb goes into a separate name, `rgbt`, so `rgb` keeps its
            # loaded shape; the other streams are overwritten in place.
            rgbt = rgb.reshape((-1, ) + rgb.shape[2:])
            face = face.reshape((-1, ) + face.shape[2:])
            flow = flow.reshape((-1, ) + flow.shape[2:])
            left_hand = left_hand.reshape((-1, ) + left_hand.shape[2:])
            right_hand = right_hand.reshape((-1, ) + right_hand.shape[2:])
            depth = depth.reshape((-1, ) + depth.shape[2:])
            break

In [None]:
# Shape of the un-reshaped `rgb` batch, for comparison with `rgbt`.
rgb.shape

In [None]:
# Shape of `rgbt`, the copy of `rgb` whose first two dimensions were merged.
rgbt.shape