In [17]:
# imports
import argparse
import logging
import time
from tqdm import tqdm
import numpy as np
import torch
from vector_cv_tools import datasets
from vector_cv_tools import transforms as T

from vector_cv_tools import utils

import albumentations as A
from torch.utils.data import DataLoader

import torchvision

In [18]:
def to_loader(dataset, num_workers, batch_size):
    """
    Loads the dataset into a loader for testing
    """
    return DataLoader(dataset,
                      num_workers=num_workers,
                      batch_size=batch_size,
                      collate_fn=utils.collate_video_fn,
                      shuffle=True)


In [19]:
# define spatial transforms
spatial_transforms = [A.Resize(128, 128), A.ToFloat(max_value=255)]
spatial_transforms = T.Compose_Video_Spatial_transform(spatial_transforms)

# define temporal transforms
temporal_transforms = [ T.video_transforms.Crop(size=64, 
                                                pad_if_needed=True,
                                                padding_mode = "wrap"), 
                        T.video_transforms.ToTensor()]

temporal_transforms = T.Compose_Video_Temporal_transform(temporal_transforms)

print("Spatial transforms: \n{}".format(spatial_transforms))
print("Temporal transforms: \n{}".format(temporal_transforms))


Spatial transforms: 
Compose_Video_Spatial_transform(
    Resize(always_apply=False, p=1, height=128, width=128, interpolation=1)
    ToFloat(always_apply=False, p=1.0, max_value=255)
)
Temporal transforms: 
Compose_Video_Temporal_transform(
    Crop(size=64, padding=None)
    ToTensor()
)


In [20]:
# create dataset
data = datasets.KineticsDataset(
        fps=10,
        max_frames=128,
        annotation_path = "/scratch/ssd002/datasets/kinetics/kinetics700/train.json",
        data_path = "/scratch/ssd002/datasets/kinetics/train",
        class_filter = ["push_up", "pull_ups"],
        spatial_transforms=spatial_transforms,
        temporal_transforms=temporal_transforms,)


In [12]:
# convert data to loader
num_workers = 4
batch_size = 8
loader = to_loader(data, num_workers, batch_size)
labels = data.metadata.labels

print("Looping through the dataset, {} labels, {} data points in total".
        format(data.num_classes, len(loader)))
for label, info in labels.items():
    print("{:<40} ID: {} size: {} {}".
        format(label, info["id"], len(info["indexes"]), len(info["indexes"])//20 * "|"))

Looping through the dataset, 2 labels, 237 data points in total
push_up                                  ID: 0 size: 964 ||||||||||||||||||||||||||||||||||||||||||||||||
pull_ups                                 ID: 1 size: 929 ||||||||||||||||||||||||||||||||||||||||||||||


In [21]:
data_point, label = data[0]
print(data_point.shape)
print(label)
vid = (data_point.numpy() * 255).astype(np.uint8)
utils.create_GIF("TestImage.gif", vid)

torch.Size([64, 128, 128, 3])
{'label_ids': [0], 'label_names': ['push_up'], 'sampled_fps': 10}


In [14]:
# get a pre-trained model
num_classes = data.num_classes
model = torchvision.models.video.r3d_18(pretrained=True, progress=True, num_classes=400)
print(model)

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [23]:
# freeze the layers except for the last one
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
            
feature_extract = True
set_parameter_requires_grad(model, feature_extract)

model.fc = torch.nn.Linear(in_features=512, out_features=num_classes, bias=True)
print(model)

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [24]:
# optimizers and loss
optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.002)
criterion = torch.nn.CrossEntropyLoss()


In [None]:
# train the model
model.train()
num_epochs = 50
device = torch.device("cuda:0")
model.to(device)
losses = []
acc = []
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    start = time.time()
    total = running_corrects = 0
    total_loss = 0
    for idx, (d, l) in enumerate(loader):
        if idx % 100 == 0:
            name = "KINETICS_VID_{}_{}_LABEL_{}.gif".format(epoch, idx, l[0]["label_names"][0])
            print("Creating GIF:", name)
            vid = (d[0].numpy() * 255).astype(np.uint8)
            utils.create_GIF(name, vid)

        ########### Tweak input ##########
        # depending on what your model wants, tensor shapes may require a permute
        inputs = d.to(device).permute(0, 4, 2, 3, 1)
        
        # for single class, we just use the 0th element in the label
        labels = [li["label_ids"][0] for li in l]
        labels = torch.tensor(labels).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels)
        total += len(labels)

    duration = time.time() - start
    accuracy = running_corrects / total
    loss =  total_loss / len(loader)
    print("Epoch took {:10.2f}s".format(duration))
    print("Average time per batch {}".format(duration/len(loader)))
    print("Accuracy: ", accuracy)
    print("Epoch Loss: ",loss)
    losses.append(loss)
    acc.append(accuracy)

Epoch 0/49
----------
Creating GIF: KINETICS_VID_0_0_LABEL_pull_ups.gif
Creating GIF: KINETICS_VID_0_100_LABEL_pull_ups.gif
Creating GIF: KINETICS_VID_0_200_LABEL_push_up.gif
Epoch took    2335.47s
Average time per batch 9.854308808403177
Accuracy:  tensor(0.6498, device='cuda:0')
Epoch Loss:  0.6156831582387289
Epoch 1/49
----------
Creating GIF: KINETICS_VID_1_0_LABEL_pull_ups.gif
Creating GIF: KINETICS_VID_1_100_LABEL_pull_ups.gif
Creating GIF: KINETICS_VID_1_200_LABEL_push_up.gif
Epoch took    2332.47s
Average time per batch 9.84164924561223
Accuracy:  tensor(0.7338, device='cuda:0')
Epoch Loss:  0.5394372684165898
Epoch 2/49
----------
Creating GIF: KINETICS_VID_2_0_LABEL_push_up.gif
Creating GIF: KINETICS_VID_2_100_LABEL_pull_ups.gif
Creating GIF: KINETICS_VID_2_200_LABEL_pull_ups.gif
Epoch took    2362.48s
Average time per batch 9.968257386976154
Accuracy:  tensor(0.7512, device='cuda:0')
Epoch Loss:  0.49784591221859686
Epoch 3/49
----------
Creating GIF: KINETICS_VID_3_0_LABEL