In [20]:
import torch
from torch.utils.data import DataLoader
from models.transformer import Transformer, create_look_ahead_mask
from dataset_extracted import ExtractedFeatureDataset
import sentencepiece as spm
import transforms
from evaluate import Captioner
import os

In [29]:
# Build the list of feature files for the test split: every non-blank line of
# the listing file is stripped of surrounding whitespace and turned into
# '<test_feature_path>/<line>.npy'.
test_feature_path = '../MVAD/I3D_rgb_kinetics/test'
with open('../MVAD/test_fine') as f:
    files = f.readlines()
# Blank lines (for example a trailing empty line) are skipped; they would
# otherwise produce a bogus '<test_feature_path>/.npy' entry.
feature_files = [
    os.path.join(test_feature_path, name + '.npy')
    for name in (line.strip() for line in files)
    if name
]

In [30]:
# ---- Input artefacts --------------------------------------------------------
test_corpus_file = '../MVAD/corpus_M-VAD_test.txt'
tokenizer_file = 'tokenizer.model'
model_weight_file = '../checkpoint/20190812135219/20'

# ---- Sequence lengths -------------------------------------------------------
# Handed to the padding transforms and the dataset (and, for the target length,
# to the captioner) in the cells below.
inp_max_seq_length = tar_max_seq_length = 50
max_seq_length = 80  # For positional encoding

# ---- Transformer hyper-parameters -------------------------------------------
tar_vocab_size, d_model, dff = 5000, 1024, 2048
num_heads = 8
# NOTE(review): num_layers is not referenced by the cells visible in this
# notebook; the model is built from the encoder/decoder counts below.
num_layers = 6
encoder_num_layers, decoder_num_layers = 2, 4
dropout = 0.1

In [31]:
# Load the SentencePiece model from `tokenizer_file`. The resulting `sp`
# processor is passed to the caption transform, the dataset and the captioner
# in the cells below.
sp = spm.SentencePieceProcessor()
# Kept as the cell's last expression so the return value of Load is displayed
# (the recorded run shows True).
sp.Load(tokenizer_file)

True

In [32]:
# Transform pipelines handed to the dataset.
# Feature side: a single FeaturePadding step configured with inp_max_seq_length
# (presumably the padded length; confirm in the transforms module).
feature_padding = transforms.FeaturePadding(inp_max_seq_length)
feature_transform = transforms.Compose([feature_padding])

# Caption side: a single CaptionPadding step configured with tar_max_seq_length
# and the id that the tokenizer assigns to the '<PAD>' piece.
pad_token_id = sp.PieceToId('<PAD>')
caption_padding = transforms.CaptionPadding(tar_max_seq_length, pad_token_id)
caption_transform = transforms.Compose([caption_padding])

In [33]:
# Test-split dataset built from the explicit list of feature files.
# NOTE(review): the first positional argument is None here; presumably it is
# the feature directory, unused when feature_files is given. Confirm against
# ExtractedFeatureDataset.
dataset = ExtractedFeatureDataset(
    None,
    test_corpus_file,
    inp_max_seq_length,
    tar_max_seq_length,
    sp,
    feature_transform=feature_transform,
    caption_transform=caption_transform,
    feature_files=feature_files,
)

# One sample per batch, visited in shuffled order.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
)

In [34]:
# Rebuild the network from the hyper-parameters in the configuration cell.
model = Transformer(
    tar_vocab_size,
    d_model,
    num_heads,
    encoder_num_layers,
    decoder_num_layers,
    dff,
    dropout,
    max_seq_length,
)

# Restore the checkpoint onto the CPU. torch.load unpickles the file, so only
# point model_weight_file at checkpoints you trust.
state_dict = torch.load(model_weight_file, map_location='cpu')
model_weights = state_dict['model_state_dict']
# strict=True: any missing or unexpected parameter name raises instead of
# being silently ignored.
model.load_state_dict(model_weights, strict=True)

# Switch to inference mode; left as the last expression so the module summary
# is displayed.
model.eval()

Transformer(
  (pe): PositionalEncoder()
  (encoder): TransformerEncoder(
    (encoder_layers): ModuleList(
      (encoder_layer1): TransformerEncoderLayer(
        (mha): MultiheadAttention(
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ff): FeedForward(
          (linear1): Linear(in_features=1024, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (an1): AddNorm(
          (dropout): Dropout(p=0.1)
          (layernorm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
        )
        (an2): AddNorm(
          (dropout): Dropout(p=0.1)
          (layernorm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
        )
      )
      (encoder_layer2): TransformerEncoderLayer(
        (mha): MultiheadAttention(
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )

In [35]:
# Generate captions for a handful of test clips with the restored model.
# The DataLoader is created with shuffle=True, so without a fixed seed every
# run captions a different set of clips and the displayed output cannot be
# reproduced. Seed torch's global RNG right before iterating so that
# Restart & Run All shows the same clips each time.
torch.manual_seed(0)
captioner = Captioner(model, sp, tar_max_seq_length)
captioner.caption_video_from_dataloader(dataloader, count=10)

video: ('THE_BOUNTY_HUNTER_DVS278.avi',) caption origin: He drives up and taps the back of the pedicab with his bumper. caption predict: SOMEONE looks on underneath extk of mirror blows le around the  regard lowering under Tas.
video: ('FRIENDS_WITH_BENEFITS_DVS287.avi',) caption origin: An airborne view orbits the huge white letters. SOMEONE and SOMEONE dangle their legs from the second O. caption predict: Now spa road in a black and his face with the wrwayator. SOMEONE's SUV breath wife and take a few d table underling a cannai before' costume two swirl one S sc surface.
video: ('SPARKLE_2012_DVS578.avi',) caption origin: SOMEONE shifts her gaze, then looks to her mother. caption predict: SOMEONE looks onary.
video: ('THE_DESCENDANTS_DVS677.avi',) caption origin: He offers a pen. caption predict: SOMEONE looks onary.
video: ('UGLY_TRUTH_DVS239.avi',) caption origin: She flips open the lid of a pastry box and kisses his cheek. caption predict: Now spays over the passenger, in theirous

In [8]:
# Create an explicit iterator over the DataLoader so a later cell can pull
# batches from it. The iterator is single-pass: once a loop has consumed it
# (or it has failed part-way), re-run this cell to get a fresh one.
dataiter = iter(dataloader)

In [16]:
# Walk the remaining batches of `dataiter` and report the distinct shapes of
# the first element of each batch (presumably the feature tensor; confirm
# against the dataset).
# A bare `sample[0].size()` inside a loop body is evaluated and discarded, so
# nothing was ever displayed; collect the shapes and show them instead.
# NOTE(review): the recorded run of this cell fails with
# "batch must contain tensors ... found <class 'NoneType'>", i.e. the dataset
# returned None for some sample. That has to be fixed in the dataset or with a
# custom collate function; it is not addressed here.
batch_shapes = set()
for sample in dataiter:
    batch_shapes.add(tuple(sample[0].size()))
batch_shapes

TypeError: batch must contain tensors, numbers, dicts or lists; found <class 'NoneType'>