In [1]:
from build_dataset import create_dataloader
from models import *
from main import get_args
import torch.nn as nn
import torch
import numpy as np

from transformers import GPT2Tokenizer, GPT2Model, GPT2Config

from utils import convert_device, load_checkpoint

import matplotlib.pyplot as plt
import seaborn as sns

from glob import glob
import os

In [10]:
# Build the project's argument object and point it at the captioning YAML config.
# NOTE(review): the meaning of the positional `True` is not visible in this
# notebook — confirm against main.get_args.
args = get_args(True)
# Relative path, so it resolves against the notebook's working directory.
args.yaml_file = '../config/captioning_config.yaml'

In [11]:
# Load the pretrained 'gpt2' tokenizer and register '[CLS]' as its cls_token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Last expression of the cell, so its return value is what the cell displays
# (the saved output is 1).
tokenizer.add_special_tokens({'cls_token':'[CLS]'})

1

# Load a Dataloader

In [4]:
# Override selected options on `args` for this experiment. The dict keeps
# insertion order, so attributes are assigned in the order listed.
for _option, _value in {
    'batch_size': 1,
    'num_img_queries': 16,
    'use_frame_position': True,
    'use_saved_frame': True,
    'use_seg_features': True,
    'use_tsn_features': True,
    'use_temporal_pairwise_difference': True,
    'aggregation_frames_method': 'aggregation_frames_method2',
    'use_lora': True,
}.items():
    setattr(args, _option, _value)

In [5]:
# Build a dataloader for the 'train' split from `args` and the tokenizer.
# NOTE(review): the variable is named `validloader` but the split passed is
# 'train' — confirm which one is intended.
validloader = create_dataloader(args, 'train', tokenizer)

In [6]:
# Take the first batch from a fresh iterator over the loader; each batch
# unpacks into six fields.
boundary_ids, captions, frames, seg_features, tsn_features, labels = next(iter(validloader))

In [7]:
# Print the tensor shapes stored under each key of the batch dictionaries.
for segment in ('boundary', 'before', 'after'):
    print(f'frame {segment}: ', frames[segment].shape)
    print(f'seg_features {segment}: ', seg_features[segment].shape)

    # tsn_features is only indexed for the non-boundary keys.
    if segment != 'boundary':
        print(f'tsn_features {segment}: ', tsn_features[segment].shape)

    print()
    

frame boundary:  torch.Size([1, 3, 224, 224])
seg_features boundary:  torch.Size([1, 1, 224, 224])

frame before:  torch.Size([1, 10, 3, 224, 224])
seg_features before:  torch.Size([1, 10, 1, 224, 224])
tsn_features before:  torch.Size([1, 2048])

frame after:  torch.Size([1, 10, 3, 224, 224])
seg_features after:  torch.Size([1, 10, 1, 224, 224])
tsn_features after:  torch.Size([1, 2048])



# Build a VideoBoundaryCoCa

In [8]:
# Device used by this section; the model is moved to args.device in the next cell.
args.device = 'cpu'

In [9]:
# Instantiate the model from `args` and the tokenizer.
# NOTE(review): create_model is not imported by name — presumably it comes from
# the `from models import *` star import; confirm.
model = create_model(args, tokenizer)
# Move the model to the configured device. As the cell's last expression, the
# value returned by .to() is displayed (the module repr in the saved output).
model.to(args.device)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.8.crossattention.masked_bias', 'h.4.crossattention.bias', 'h.9.crossattention.c_attn.bias', 'h.5.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.6.crossattention.bias', 'h.2.crossattention.masked_bias', 'h.1.crossattention.masked_bias', 'h.2.ln_cross_attn.bias', 'h.4.crossattention.c_attn.weight', 'h.1.crossattention.bias', 'h.0.crossattention.masked_bias', 'h.7.crossattention.masked_bias', 'h.2.crossattention.bias', 'h.5.crossattention.c_proj.weight', 'h.4.ln_cross_attn.weight', 'h.7.ln_cross_attn.bias', 'h.6.crossattention.c_proj.bias', 'h.0.ln_cross_attn.bias', 'h.8.crossattention.q_attn.bias', 'h.3.crossattention.bias', 'h.7.crossattention.c_proj.bias', 'h.11.ln_cross_attn.bias', 'h.9.crossattention.q_attn.bias', 'h.8.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.3.crossattention.c_proj.weight', 'h.8.crossattention.q_attn.weight', 'h.0.

VideoBoudnaryCoCa(
  (image_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU()
          (drop1): Dropout(p=0.0, inplace=False)
          (fc2): Linear(in_features=5120, out_features=1280, bias=

In [14]:
# Total number of elements across the unimodal decoder's parameters that
# have requires_grad set, i.e. the trainable-parameter count.
param = sum(
    weight.numel()
    for weight in model.unimodal_decoder.parameters()
    if weight.requires_grad
)

In [18]:
# Display the trainable-parameter count accumulated in `param`.
param

38893056

In [17]:
# Element count of the decoder's `wte` weight, for comparison with the
# trainable total held in `param`.
# NOTE(review): going by the saved outputs (38598144 here vs 38893056 in
# `param`), all but 294912 of the trainable elements would sit in this weight
# if it is itself trainable — confirm.
model.unimodal_decoder.wte.weight.numel()

38598144

In [12]:
%%time 

# Run one forward pass with loss computation on the batch loaded earlier.
# The inputs are collected first so the call itself stays on one line.
batch_kwargs = dict(
    captions=captions,
    frames=frames,
    seg_features=seg_features,
    tsn_features=tsn_features,
    labels=labels,
)
acc1, acc5, caption_loss, contrastive_loss = model(**batch_kwargs, return_loss=True)

CPU times: user 16.3 s, sys: 2.79 s, total: 19.1 s
Wall time: 14.6 s




In [13]:
# Display the four values returned by the forward pass as a tuple.
# NOTE(review): the saved output below is a single tensor, not a 4-tuple, so it
# appears to come from a different version of this cell — re-run to refresh.
acc1, acc5, caption_loss, contrastive_loss

tensor(16.1398, grad_fn=<AddBackward0>)

# Generation

In [14]:
# Beam-search generation (2 beams), seeded with the first token id that the
# tokenizer produces for the string 'Subject'.
# NOTE(review): the inference section further down passes a dict of
# frames / seg_features / tsn_features to generate(), while this call passes
# `frames` alone — confirm which input the current model expects.
subject_token_id = tokenizer.encode('Subject')[0]
output = model.generate(
    frames,
    max_length=args.gen_max_length,
    decoder_start_token_id=subject_token_id,
    num_beams=2,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
# Decode the generated sequences back into strings.
tokenizer.batch_decode(output)

['Subject\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"\n\n\n\n\n\n\n\n\n\n\n\n\n\n".\n\n\n\n\n\n\n\n\n\n\n\n\n\n"----The-\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 'Subject\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe,,,,,-,,--------------------------------------------------------------------------------------------']

# Inference

In [4]:
# Override selected options on `args` for the inference run. The dict keeps
# insertion order, so attributes are assigned in the order listed.
for _option, _value in {
    'batch_size': 1,
    'num_img_queries': 16,
    'use_frame_position': True,
    'use_saved_frame': True,
    'use_seg_features': True,
    'use_tsn_features': True,
    'use_temporal_pairwise_difference': True,
    'aggregation_frames_method': 'aggregation_frames_method2',
}.items():
    setattr(args, _option, _value)

In [5]:
# Build a dataloader with test_mode=True; its batches are unpacked into four
# fields (ids, frames, seg_features, tsn_features) further down.
# NOTE(review): this still reads the 'train' split although it is used for
# inference — confirm that is intended.
testloader = create_dataloader(args, 'train', tokenizer, test_mode=True)

In [6]:
# Prefer the GPU for inference, but fall back to the CPU when CUDA is not
# available so the cells that move tensors and the model to args.device still
# run on a machine without a GPU.
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Take the first test-mode batch and split it into its four fields.
inputs = next(iter(testloader))
boundary_ids, frames, seg_features, tsn_features = inputs

# Move each feature group to the target device; boundary_ids is left as is.
frames = convert_device(frames, args.device)
seg_features = convert_device(seg_features, args.device)
tsn_features = convert_device(tsn_features, args.device)

In [12]:
# Instantiate the model from `args` and the tokenizer for the inference run.
model = create_model(args, tokenizer)
# Move the model to args.device. As the cell's last expression, the value
# returned by .to() is displayed (the module repr in the saved output).
model.to(args.device)

Some weights of MultiModalDecoder were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.crossattention.c_proj.bias', 'h.4.ln_cross_attn.weight', 'h.1.ln_cross_attn.weight', 'h.5.crossattention.bias', 'h.5.ln_cross_attn.bias', 'h.9.crossattention.q_attn.bias', 'h.8.crossattention.c_attn.weight', 'h.4.crossattention.bias', 'h.0.crossattention.q_attn.bias', 'h.6.crossattention.q_attn.bias', 'h.8.crossattention.q_attn.bias', 'h.11.crossattention.c_proj.weight', 'h.0.ln_cross_attn.weight', 'h.6.crossattention.masked_bias', 'h.2.crossattention.c_attn.weight', 'h.6.crossattention.c_proj.weight', 'h.7.crossattention.c_proj.weight', 'h.9.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.8.crossattention.c_proj.weight', 'h.11.ln_cross_attn.bias', 'h.3.crossattention.q_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.7.crossattention.masked_bias', 'h.11.crossattention.c_attn.bias', 'h.7.ln_cross_attn.weight

VideoBoudnaryCoCa(
  (image_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU()
          (drop1): Dropout(p=0.0, inplace=False)
          (fc2): Linear(in_features=5120, out_features=1280, bias=

In [None]:
%%time
# Bundle the three feature groups under the keys passed to generate().
# This rebinds `inputs`, which previously held the raw batch tuple.
inputs = dict(
    frames=frames,
    seg_features=seg_features,
    tsn_features=tsn_features,
)

# Beam-search generation (5 beams), seeded with the first token id that the
# tokenizer produces for the string 'Subject'.
output = model.generate(
    inputs,
    max_length=args.gen_max_length,
    decoder_start_token_id=tokenizer.encode('Subject')[0],
    num_beams=5,
)

In [None]:
# Inspect the raw value returned by model.generate.
output

# Show the boundary frame of the first sample in the batch.
# permute(1, 2, 0) moves the leading axis to the end before converting to NumPy.
# NOTE(review): assumes the frame is (C, H, W), as the training-loader printout
# torch.Size([1, 3, 224, 224]) suggests — confirm for the test loader.
img = frames['boundary'][0].permute(1, 2, 0).detach().cpu().numpy()

# Min-max rescale to floats in [0, 1], a range plt.imshow can draw directly.
# Fix: the previous version applied `.astype(np.uint8)` to the denominator
# only (method call binds tighter than `/`), which truncated the value range
# to an integer — and to 0 (division by zero) whenever the range was below 1.
img_min = img.min()
img_range = img.max() - img_min
if img_range > 0:
    img = (img - img_min) / img_range
else:
    # Constant image: avoid dividing by zero and show a blank frame instead.
    img = np.zeros_like(img)
plt.imshow(img)

# Identifiers of the boundaries in the current batch.
boundary_ids

# Look up the annotation of the first sample: the first 11 characters of its
# id select the entry, and the last character, cast to int, indexes within it.
# NOTE(review): presumably the id is "<11-char video id>...<boundary index>";
# using [-1] assumes a single-digit index — confirm against the dataset.
testloader.dataset.annotation[boundary_ids[0][:11]][int(boundary_ids[0][-1])]

# Decode the generated sequences back into strings.
tokenizer.batch_decode(output)