# Basic Imports and paths

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from fastai.distributed import *
from fastai.vision.all import *
from pathlib import Path

module_path = os.path.abspath(os.path.join('../../baselines'))
if module_path not in sys.path: sys.path.append(module_path)

In [3]:
device = torch.device('cuda:1')
core_pth = Path('/home/agrawalp2/prepositions'); assert core_pth.exists()

vidvrd_path = core_pth/Path('real_world_data/vidvrd/vidvrd-dataset'); assert vidvrd_path.exists()
encoder_path = core_pth/Path('experiments/baselines/models/encoder/GoogleNews-vectors-negative300.bin.gz'); assert encoder_path.exists()

In [4]:
n_frames = 3

# Language only Model

## imports

In [None]:
from dataloaders.vidvrd.languageOnlyDataset import languageOnlyDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd
from models.dynamic.language_only import SimpleLanguageOnlyModel

## data

In [None]:
train_ds = languageOnlyDataset(vidvrd_path/'train',
                              encoder_path = encoder_path,
                              y_tfms = [map_vidvrd_to_stupd])

valid_ds = languageOnlyDataset(vidvrd_path/'test',
                              encoder_path = encoder_path, 
                              y_tfms = [map_vidvrd_to_stupd])

len(train_ds), len(valid_ds)

In [None]:
train_dl = DataLoader(train_ds, batch_size =64 , shuffle = True, drop_last = True)
valid_dl = DataLoader(valid_ds, batch_size = 128 , shuffle = True, drop_last = True)

## training

In [None]:
dls = DataLoaders(train_dl, valid_dl)
dls.n_inp = 2
model = SimpleLanguageOnlyModel(word_embedding_dim=300, feature_dim=512, c=train_ds.c).cuda()
learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [None]:
learn.fit(5)

# 2D Only

In [5]:
from dataloaders.vidvrd.coordinate2dOnlyDataset import coordinate2D_OnlyDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd
from models.dynamic.coordinate_2d_only import coordinateOnlyModel

In [6]:
train_ds = coordinate2D_OnlyDataset(annotations_directory_path = vidvrd_path/'train',
                                    n_frames = n_frames,
                                    x_tfms = None, 
                                    y_tfms = [map_vidvrd_to_stupd])

valid_ds = coordinate2D_OnlyDataset(annotations_directory_path = vidvrd_path/'test',
                                    n_frames = n_frames,
                                    x_tfms = None, 
                                    y_tfms = [map_vidvrd_to_stupd])

len(train_ds),len(valid_ds)

(20968, 4205)

In [7]:
train_dl = DataLoader(train_ds, batch_size =128, shuffle = True, drop_last = True)
valid_dl = DataLoader(valid_ds, batch_size = 256, shuffle = True, drop_last = True)

## model

In [8]:
dls = DataLoaders(train_dl, valid_dl)
model = coordinateOnlyModel(10*n_frames, 64, train_ds.c).cuda()
learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [9]:
learn.fit(5)

epoch,train_loss,valid_loss,accuracy,balanced_accuracy_score,time
0,1.366492,1.363307,0.513672,0.157923,00:02


KeyboardInterrupt: 

# DRNet 

takes 7+ hours /iteration to train. Reasons. We have to read mp4 videos rather than images. We have to individually process, then stack multiple frames.

In [None]:
# from torchvision.io import read_video
# pth = vidvrd_path/('videos/ILSVRC2015_train_00169002.mp4'); assert pth.exists()

In [None]:
# frames, _,_ = read_video(str(pth), output_format = 'TCHW')
# frames

In [None]:
from dataloaders.vidvrd.drnetDataset import drnetDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd

from models.dynamic.drnet import DRNet

import torchvision.transforms as transforms


In [None]:
train_ds = drnetDataset(annotations_directory_path = vidvrd_path/'train',
                        video_path = vidvrd_path/'videos',
                        encoder_path = encoder_path,
                        n_frames = n_frames,
                        y_category_tfms = [map_vidvrd_to_stupd],
                        x_img_tfms =     [transforms.ToPILImage("RGB"),
                                        transforms.RandomResizedCrop(224, scale=(0.75, 0.85)),
                                        transforms.ColorJitter(0.1, 0.1, 0.1, 0.05)],
                        
                        bbox_mask_tfms = [transforms.ToPILImage("RGB"),
                                            transforms.Pad(4, padding_mode="edge"),
                                            transforms.RandomResizedCrop(32, scale=(0.75, 0.85))]
                         )

valid_ds = drnetDataset(annotations_directory_path = vidvrd_path/'test',
                        video_path = vidvrd_path/'videos',
                        encoder_path = encoder_path, 
                        n_frames = n_frames,
                        y_category_tfms = [map_vidvrd_to_stupd],
                        x_img_tfms =     [transforms.ToPILImage("RGB"),
                                            transforms.CenterCrop(224)],
                        
                        bbox_mask_tfms = [transforms.ToPILImage("RGB"),
                                            transforms.Pad(4, padding_mode="edge"),
                                            transforms.CenterCrop(32)]
                         )

len(train_ds),len(valid_ds)

In [None]:
train_dl = DataLoader(train_ds, batch_size = 64 , shuffle = True, num_workers = 0)
valid_dl = DataLoader(valid_ds, batch_size = 128 , shuffle = True, num_workers = 0)

In [None]:
dls = DataLoaders(train_dl, valid_dl)
dls.n_inp = 4

model = DRNet(word_embedding_dim = 300, 
              feature_dim = 512, 
              n_frames = n_frames,
              num_classes = train_ds.c, 
              num_layers = 3,
              pretrained = False).cuda()

learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [None]:
learn.fit(5)

# ViPCNN

In [10]:
from dataloaders.vidvrd.vipcnnDataset import vipcnnDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd
from models.dynamic.vipcnn import VipCNN
import torchvision.transforms as transforms

In [11]:
train_ds = vipcnnDataset(annotations_directory_path = vidvrd_path/'train',
                        video_path = vidvrd_path/'videos',
#                          split='train',
                         n_frames = n_frames, 
                         x_tfms = [transforms.ToPILImage("RGB"),
                                   transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
                                  ],
                         y_category_tfms = [map_vidvrd_to_stupd],
                        )

valid_ds = vipcnnDataset(annotations_directory_path = vidvrd_path/'test',
                        video_path = vidvrd_path/'videos',
#                          split='train',
                         n_frames = n_frames,
                         x_tfms = [transforms.ToPILImage("RGB")],
                         y_category_tfms = [map_vidvrd_to_stupd])

len(train_ds), len(valid_ds)

(20968, 4205)

In [12]:
train_dl = DataLoader(train_ds, batch_size =32 , shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = 64 , shuffle = True)

In [13]:
dls = DataLoaders(train_dl, valid_dl)
dls.n_inp = 3

model = VipCNN(roi_size = 6, num_classes = train_ds.c, pretrained = False).cuda()

learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [14]:
learn.fit(5)

epoch,train_loss,valid_loss,accuracy,balanced_accuracy_score,time


OutOfMemoryError: CUDA out of memory. Tried to allocate 626.00 MiB (GPU 0; 23.70 GiB total capacity; 21.76 GiB already allocated; 108.81 MiB free; 22.20 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# PPRFCN

In [None]:
from dataloaders.vidvrd.pprfcnDataset import pprfcnDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd
from models.dynamic.pprfcn import PPRFCN
import torchvision.transforms as transforms

In [None]:
train_ds = pprfcnDataset(annotations_directory_path = vidvrd_path/'train',
                        video_path = vidvrd_path/'videos',
#                          split='train',
                         n_frames = n_frames, 
                         x_tfms = [transforms.ToPILImage("RGB"),
                                   transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
                                  ],
                         y_category_tfms = [map_vidvrd_to_stupd],
                        )

valid_ds = pprfcnDataset(annotations_directory_path = vidvrd_path/'test',
                        video_path = vidvrd_path/'videos',
#                          split='train',
                         n_frames = n_frames,
                         x_tfms = [transforms.ToPILImage("RGB")],
                         y_category_tfms = [map_vidvrd_to_stupd])

len(train_ds), len(valid_ds)

In [None]:
train_dl = DataLoader(train_ds, batch_size =32 , shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = 64 , shuffle = True)

In [None]:
dls = DataLoaders(train_dl, valid_dl)
dls.n_inp = 3

model = PPRFCN(train_ds.c, pretrained = False).cuda()
learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [None]:
learn.fit(5)

# VTransE

In [39]:
from dataloaders.vidvrd.vtranseDataset import vtranseDataset
from dataloaders.vidvrd.utils import  map_vidvrd_to_stupd
from models.dynamic.vtranse import VtransE
import torchvision.transforms as transforms

In [40]:
train_ds = vtranseDataset(annotations_directory_path = vidvrd_path/'train',
                          video_path = vidvrd_path/'videos',
                          encoder_path = encoder_path, 
                          n_frames = n_frames,
                          x_category_tfms = None,
                          x_tfms = [transforms.ToPILImage("RGB"),
                                   transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
                                  ],
                         y_category_tfms = [map_vidvrd_to_stupd],
                        )

valid_ds = vtranseDataset(annotations_directory_path = vidvrd_path/'test',
                          video_path = vidvrd_path/'videos',
                          encoder_path = encoder_path, 
                          n_frames = n_frames, 
                          x_category_tfms = None,
                          x_tfms = [transforms.ToPILImage("RGB")],
                          y_category_tfms = [map_vidvrd_to_stupd])

len(train_ds), len(valid_ds)

(20968, 4205)

In [44]:
train_dl = DataLoader(train_ds, batch_size = 64 , shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = 128 , shuffle = True)

In [45]:
dls = DataLoaders(train_dl, valid_dl)
dls.n_inp = 7

model = VtransE(word_embedding_dim = 300, num_classes = train_ds.c, visual_feature_size=3, pretrained = False).cuda()
learn = Learner(dls, model = model, loss_func = CrossEntropyLossFlat(), metrics = [accuracy,BalancedAccuracy()])

In [None]:
learn.fit(5)

epoch,train_loss,valid_loss,accuracy,balanced_accuracy_score,time
