# About the environment - "vector_cv_project"

In [75]:
# Environment sanity check: show which Python interpreter is first on PATH and
# the environment variables that control where packages, shared libraries and
# executables are looked up.
!which python
# Extra module search path used by Python imports.
!echo $PYTHONPATH
# Search path for shared libraries.
!echo $LD_LIBRARY_PATH
# Search path for executables.
!echo $PATH

/pkgs/anaconda3/bin/python
/pkgs/vector_cv_project/lib/python3.6/site-packages
/pkgs/nccl_2.8.3-1+cuda10.1_x86_64/lib:/pkgs/cudnn-10.1-v7.6.4.38/lib64:/pkgs/cuda-10.1/lib64
/pkgs/vector_cv_project/lib/python3.6/site-packages/bin:/pkgs/cuda-10.1/bin:/pkgs/anaconda3/bin:/pkgs/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/h/xinli/.local/bin


In [77]:
# imports
import argparse
import logging
import time
from tqdm import tqdm
import numpy as np
import torch
from vector_cv_tools import datasets
from vector_cv_tools import transforms as T

from vector_cv_tools import utils

import albumentations
import torchvision
from torch.utils.data import DataLoader

# Ask PyTorch whether a CUDA device is usable; as the last expression of the
# cell, the boolean result is displayed as the cell output.
torch.cuda.is_available()

True

In [None]:
# Locations of the Kinetics-700 training annotations (a JSON file) and of the
# training data directory. Both paths are relative ("./..."), so they depend on
# the directory the notebook is run from.
kinetics_annotation_path = "./datasets/kinetics/kinetics700/train.json"
kinetics_data_path = "./datasets/kinetics/train"

# A basic, untransformed Kinetics dataset


In [78]:
# Minimal pipelines for the "raw" dataset: the spatial stage only rescales
# pixel values with ToFloat(max_value=255), and the temporal stage only
# applies the video ToTensor transform.
to_unit_float = albumentations.ToFloat(max_value=255)
base_spatial_transforms = T.ComposeVideoSpatialTransform([to_unit_float])

video_to_tensor = T.video_transforms.ToTensor()
base_temporal_transforms = T.ComposeVideoTemporalTransform([video_to_tensor])

# Raw dataset: Kinetics clips restricted to the two classes listed in
# class_filter, with nothing but the base transforms applied.
data_raw = datasets.KineticsDataset(
    # where the annotations and the data live
    annotation_path=kinetics_annotation_path,
    data_path=kinetics_data_path,
    # which classes to keep
    class_filter=["push_up", "pull_ups"],
    # frame sampling options
    # NOTE(review): max_frames presumably caps the frames per clip and
    # round_source_fps controls rounding of the source frame rate; confirm
    # against the KineticsDataset documentation.
    fps=10,
    max_frames=128,
    round_source_fps=False,
    # per-clip processing
    spatial_transforms=base_spatial_transforms,
    temporal_transforms=base_temporal_transforms,
)


In [79]:
# Summarise the dataset: overall counts first, then one row per class showing
# its ID, its number of samples and a crude text histogram.
labels = data_raw.metadata.labels

summary_template = "Looping through the dataset, {} labels, {} data points in total"
print(summary_template.format(data_raw.num_classes, len(data_raw)))

row_template = "{:<40} ID: {} size: {} {}"
for label, info in labels.items():
    class_size = len(info["indexes"])
    # One "|" tick for every 20 samples of the class.
    histogram_bar = "|" * (class_size // 20)
    print(row_template.format(label, info["id"], class_size, histogram_bar))

Looping through the dataset, 2 labels, 1893 data points in total
push_up                                  ID: 0 size: 964 ||||||||||||||||||||||||||||||||||||||||||||||||
pull_ups                                 ID: 1 size: 929 ||||||||||||||||||||||||||||||||||||||||||||||


In [80]:
# Pull the first sample out of the raw dataset and report its tensor shape and
# its label dictionary, one per line.
data_point, label = data_raw[0]
print(data_point.shape, label, sep="\n")

# The base spatial transform is ToFloat(max_value=255), so multiply by 255 and
# cast to uint8 before handing the frames to the GIF writer.
frames_as_float = data_point.numpy()
vid = (frames_as_float * 255).astype(np.uint8)
utils.create_GIF("raw_img.gif", vid)

torch.Size([100, 480, 272, 3])
{'label_ids': [0], 'label_names': ['push_up'], 'sampled_fps': 10}


# A dataset with video transformations

In [105]:
# ---------------------------------------------------------------------------
# VideoTransforms showcase: wrapped third-party transforms combined with the
# library's own spatial and temporal video transforms.
# ---------------------------------------------------------------------------

# Third-party transforms, adapted through the library's wrappers.
transform1 = T.from_torchvision(torchvision.transforms.ColorJitter())  # default arguments
transform2 = T.from_torchvision(torchvision.transforms.functional.hflip)  # plain function
transform3 = T.from_albumentation(albumentations.VerticalFlip(p=1))

# In-house spatial crops.
transform4 = T.RandomResizedSpatialCrop((280, 280), scale=(0, 1))
# NOTE: transform5 is constructed but is not part of the pipeline below.
transform5 = T.RandomSpatialCrop((480, 480))

# In-house temporal transforms, followed by the tensor conversion.
transform6 = T.RandomTemporalCrop(size=50, pad_if_needed=True, padding_mode="wrap")
transform7 = T.SampleEveryNthFrame(2)
transform8 = T.ToTensor()

# The spatial pipeline is simply the base one defined earlier.
spatial_transforms = base_spatial_transforms

# Everything above except transform5 is chained, in this order, inside a single
# ComposeVideoTemporalTransform.
temporal_transforms = T.ComposeVideoTemporalTransform([
    transform1,
    transform2,
    transform3,
    transform4,
    transform6,
    transform7,
    transform8,
])

# Print both pipelines so their configuration is recorded in the output.
for report_template, pipeline in (
    ("Spatial transforms: \n{}", spatial_transforms),
    ("Temporal transforms: \n{}", temporal_transforms),
):
    print(report_template.format(pipeline))


Spatial transforms: 
ComposeVideoSpatialTransform(
    ToFloat(always_apply=False, p=1.0, max_value=255)
)
Temporal transforms: 
ComposeVideoTemporalTransform(
    TorchvisionWrapper for ColorJitter(brightness=None, contrast=None, saturation=None, hue=None)
    TorchvisionWrapper for <function hflip at 0x7f7fa28611e0>
    AlbumentationWrapper for VerticalFlip(always_apply=False, p=1)
    RandomResizedSpatialCrop(size=(280, 280), scale=(0, 1), ratio=(0.75, 1.3333), interpolation=PIL.Image.BILINEAR)
    RandomTemporalCrop(size=50, padding=None)
    SampleEveryNthFrame(n=2)
    ToTensor()
)


In [106]:
# create a dataset with transformations
data_transformed = datasets.KineticsDataset(
        fps=10,
        max_frames=128,
        round_source_fps=False,
        annotation_path = kinetics_annotation_path,
        data_path = kinetics_data_path,
        class_filter = ["push_up", "pull_ups"],
        spatial_transforms=spatial_transforms,
        temporal_transforms=temporal_transforms,)




In [107]:
# Pull the first sample out of the transformed dataset and report its tensor
# shape and its label dictionary, one per line.
# NOTE(review): the pipeline contains random crops and no seed is set in this
# notebook, so the exact frames may differ between runs.
data_point, label = data_transformed[0]
print(data_point.shape, label, sep="\n")

# The spatial pipeline starts with ToFloat(max_value=255), so multiply by 255
# and cast to uint8 before handing the frames to the GIF writer.
frames_as_float = data_point.numpy()
vid = (frames_as_float * 255).astype(np.uint8)
utils.create_GIF("transformed_img.gif", vid)

torch.Size([25, 280, 280, 3])
{'label_ids': [0], 'label_names': ['push_up'], 'sampled_fps': 10}
