In [1]:
from build_dataset import create_dataloader
from models import *
from main import get_args
import torch.nn as nn
import torch

from transformers import GPT2Tokenizer, GPT2Model, GPT2Config

from utils import convert_device, agg_inputs_to_batch, load_checkpoint

In [2]:
# Build the run configuration with the project's get_args helper, then point
# it at the captioning YAML config (a relative path).
# NOTE(review): the meaning of the positional True is defined in main.get_args,
# which is not visible here; also confirm that assigning yaml_file after
# get_args() has returned is early enough for the YAML to actually be read.
args = get_args(True)
args.yaml_file = '../config/captioning_config.yaml'

In [3]:
# Start from the pretrained GPT-2 tokenizer and register '[CLS]' as its
# cls_token. add_special_tokens returns the number of tokens it added, which
# is the value this cell displays.
# NOTE(review): a newly added token enlarges the vocabulary, so the decoder's
# token embeddings must cover it — presumably create_model handles the
# resize; confirm.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'cls_token':'[CLS]'})

1

# Load a Dataloader

In [4]:
# Validation-split loader built by the project helper. The tokenizer is passed
# through, presumably so captions can be encoded — confirm in build_dataset.
validloader = create_dataloader(args, 'val', tokenizer)

In [5]:
# Pull a single batch from the validation loader, then let the project helper
# split it into the four pieces the following cells work with.
first_val_batch = next(iter(validloader))
boundary_ids, captions, frames, labels = agg_inputs_to_batch(first_val_batch)

# Build a VideoBoundaryCoCa

In [6]:
# Prefer the GPU, but fall back to CPU rather than hard-coding a device that
# may not exist on the machine running the notebook.
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
# Instantiate the model through the project factory (create_model is not
# imported by name, so it presumably comes from `from models import *`).
# The warning printed below lists decoder weights — cross-attention and
# ln_cross_attn parameters — that are absent from the 'gpt2' checkpoint and
# are therefore newly initialised.
model = create_model(args, tokenizer)

Some weights of MultiModalDecoder were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.7.crossattention.c_attn.weight', 'h.9.crossattention.c_attn.weight', 'h.9.crossattention.c_attn.bias', 'h.6.ln_cross_attn.weight', 'h.8.ln_cross_attn.bias', 'h.8.ln_cross_attn.weight', 'h.9.ln_cross_attn.weight', 'h.0.ln_cross_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.0.crossattention.c_attn.weight', 'h.6.crossattention.q_attn.weight', 'h.2.crossattention.c_proj.bias', 'h.3.ln_cross_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.6.crossattention.q_attn.bias', 'h.2.crossattention.q_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.bias', 'h.11.crossattention.masked_bias', 'h.10.crossattention.c_proj.bias', 'h.9.crossattention.masked_bias', 'h.1.crossattention.c_attn.bias', 'h.9.crossattention.q_attn.bias', 'h.4.crossattention.c_proj.bias', 'h.5.crossattention.c_proj.weight', 'h.3.crossattention.bias', 'h.5.crossattention.q_attn.

In [12]:
# Forward pass on the validation batch, requesting the loss via
# return_loss=True; the result is inspected in the next cell.
# NOTE(review): execution counts in this notebook are not sequential (this
# cell is 12, while cells further down are 8-12), so the kernel state this
# cell saw may not match a fresh top-to-bottom run — re-run with
# Restart & Run All to confirm the number.
loss = model(captions=captions, frames=frames, labels=labels, return_loss=True)

In [13]:
# Display the loss tensor; the grad_fn in the output shows it is still
# attached to the autograd graph.
loss

tensor(16.1398, grad_fn=<AddBackward0>)

# Generation

In [14]:
# Beam-search generation (2 beams) for the validation batch.
# Generation is inference only, so autograd is disabled to avoid building a
# graph that is never back-propagated through.
# The decoder is seeded with the first token id the tokenizer produces for
# the string 'Subject'.
with torch.no_grad():
    output = model.generate(
        frames,
        max_length             = args.gen_max_length,
        decoder_start_token_id = tokenizer.encode('Subject')[0],
        num_beams              = 2,
    )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
# Turn the generated token ids back into strings, one per sequence.
# Special tokens are kept, because skip_special_tokens is left at its default.
tokenizer.batch_decode(output)

['Subject\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"\n\n\n\n\n\n\n\n\n\n\n\n\n\n".\n\n\n\n\n\n\n\n\n\n\n\n\n\n"----The-\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 'Subject\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe,,,,,-,,--------------------------------------------------------------------------------------------']

# Inference

In [8]:
# Test-split loader; test_mode=True is forwarded to the project helper.
# NOTE(review): presumably this yields batches without captions/labels (the
# next cell unpacks only boundary ids and frames) — confirm in build_dataset.
testloader = create_dataloader(args, 'test', tokenizer, test_mode=True)

In [9]:
# Take one test batch; in test mode the helper's result is unpacked into
# boundary ids and frames only.
# NOTE: this rebinds `boundary_ids` and `frames` from the validation cell, so
# re-running the earlier validation cells afterwards will see the test frames.
boundary_ids, frames = agg_inputs_to_batch(next(iter(testloader)), test_mode=True)
# Use the device configured in args.device instead of repeating the literal,
# so the device is chosen in exactly one place.
frames = convert_device(frames, args.device)

In [10]:
# Place the model on the configured device (args.device) rather than a
# hard-coded literal, then restore the weights saved by the test run.
model.to(args.device)
model = load_checkpoint('./output/test/test_step4.pt', model)
# Switch to evaluation mode before generating: otherwise train-time layers
# such as dropout stay active and make the generated captions
# non-deterministic. Assigned back so the cell does not print the module repr.
model = model.eval()

load checkpoint from ./output/test/test_step4.pt


In [11]:
# Beam-search generation (2 beams) for the test batch using the restored
# checkpoint.
# Generation is inference only, so autograd is disabled to avoid building a
# graph that is never back-propagated through.
# The decoder is seeded with the first token id the tokenizer produces for
# the string 'Subject'.
with torch.no_grad():
    output = model.generate(
        frames,
        max_length             = args.gen_max_length,
        decoder_start_token_id = tokenizer.encode('Subject')[0],
        num_beams              = 2,
    )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
# Decode the test-batch generations back into strings, one per sequence.
# Special tokens are kept, because skip_special_tokens is left at its default.
tokenizer.batch_decode(output)

['Subject\n\n\n\n\n\n\n\n\n\n\n\n\n.................................................................................................................-',
 'Subject\n\n\n\n\n\n\n\n\n\n.........................................................................................-To.To-ToTheToToToToToToToTheTo-The-ToThe-The-The-The-']