In [1]:
import torch
from torch.utils.data import DataLoader
from models.transformer import Transformer, create_look_ahead_mask
from dataset_extracted import ExtractedFeatureDataset
import sentencepiece as spm
import transforms
from evaluate import Captioner

In [2]:
# --- Input locations (all relative to the notebook's working directory) ---
test_feature_path = '../MVAD/I3D_rgb/test'
test_corpus_file = '../MVAD/corpus_M-VAD_test.txt'
tokenizer_file = 'tokenizer.model'
model_weight_file = '../checkpoint/20190812135219/2'

# --- Sequence-length limits ---
# Input (feature) and target (caption) lengths handed to the padding
# transforms and the dataset.
inp_max_seq_length, tar_max_seq_length = 256, 360
max_seq_length = 512  # For positional encoding

# --- Transformer hyperparameters ---
# The checkpoint is loaded with strict=True, so these presumably have to match
# the values the checkpoint was trained with.
tar_vocab_size = 5000
d_model = 1024
num_heads = 8
dff = 2048
dropout = 0.1
encoder_num_layers, decoder_num_layers = 2, 4
# NOTE(review): num_layers is not referenced by the cells visible here (the
# model is built from encoder_num_layers / decoder_num_layers); kept in case
# something further down relies on it.
num_layers = 6

In [3]:
# Load the SentencePiece model from `tokenizer_file`; the resulting processor
# is reused below by the caption padding transform, the dataset and the
# captioner.
sp = spm.SentencePieceProcessor()
# Kept as the cell's final expression so the notebook echoes Load's result.
sp.Load(tokenizer_file)

True

In [4]:
# Feature sequences are padded using the input length limit.
feature_transform = transforms.Compose([transforms.FeaturePadding(inp_max_seq_length)])

# Captions are padded using the target length limit and the id that the
# tokenizer returns for the '<PAD>' piece.
# NOTE(review): assumes '<PAD>' exists in the tokenizer vocabulary — confirm.
pad_token_id = sp.PieceToId('<PAD>')
caption_transform = transforms.Compose([transforms.CaptionPadding(tar_max_seq_length, pad_token_id)])

In [5]:
# Test-split dataset built from the feature directory and the caption corpus,
# with the padding transforms applied to features and captions.
dataset = ExtractedFeatureDataset(
    test_feature_path,
    test_corpus_file,
    inp_max_seq_length,
    tar_max_seq_length,
    sp,
    feature_transform=feature_transform,
    caption_transform=caption_transform,
)

# One sample per batch, drawn in random order, so each run of the notebook
# may show a different selection of clips.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
)

In [6]:
# Build the network from the hyperparameters in the configuration cell.
model = Transformer(
    tar_vocab_size,
    d_model,
    num_heads,
    encoder_num_layers,
    decoder_num_layers,
    dff,
    dropout,
    max_seq_length,
)

# map_location='cpu' lets a checkpoint that was saved from GPU tensors be
# restored on a machine without CUDA; the model above is created on the CPU,
# so the loaded weights end up identical either way.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load(model_weight_file, map_location='cpu')

# The checkpoint is a dict; the weights live under 'model_state_dict'.
# strict=True makes any missing or unexpected parameter name an error.
model.load_state_dict(state_dict['model_state_dict'], strict=True)

# Switch to inference mode (e.g. disables dropout). Kept as the final
# expression so the notebook still echoes the module structure.
model.eval()

Transformer(
  (pe): PositionalEncoder()
  (encoder): TransformerEncoder(
    (encoder_layers): ModuleList(
      (encoder_layer1): TransformerEncoderLayer(
        (mha): MultiheadAttention(
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ff): FeedForward(
          (linear1): Linear(in_features=1024, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (an1): AddNorm(
          (dropout): Dropout(p=0.1)
          (layernorm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
        )
        (an2): AddNorm(
          (dropout): Dropout(p=0.1)
          (layernorm): LayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
        )
      )
      (encoder_layer2): TransformerEncoderLayer(
        (mha): MultiheadAttention(
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )

In [7]:
# Number of clips to caption in this preview run.
num_preview_clips = 10

# Wrap the loaded model and tokenizer, then caption clips drawn from the test
# dataloader; the captioner is given the target length limit.
captioner = Captioner(model, sp, tar_max_seq_length)
captioner.caption_video_from_dataloader(dataloader, count=num_preview_clips)

video: ('UGLY_TRUTH_DVS540.avi',) caption origin: SOMEONE gives SOMEONE a sultry look as they embrace. Their faces draw close together. caption predict: SOMEONE', SOMEONE' find his her head at the appear other of asts.
video: ('BIG_MOMMAS_LIKE_FATHER_LIKE_SON_DVS441.avi',) caption origin: A short distance down a street, a sedan turns on its headlights and crawls toward the promenade. caption predict: SOMEONE freezes, SOMEONE, SOMEONE and SOMEONE, and SOMEONE' other SOMEONE and SOMEONE's heads his her eyes. wide at the other head. as he turns to the ground.
video: ('FRIENDS_WITH_BENEFITS_DVS141.avi',) caption origin: SOMEONE totters naked to the bathroom with one hand cupped over his crotch. He leaves the door open as he steps out of view. In bed, SOMEONE puffs out a breath, then frowns. caption predict: SOMEONE shifts in his.. A man twotwee Ms a photo from the floor.
video: ('SPARKLE_2012_DVS480.avi',) caption origin: SOMEONE leaps on his back. SOMEONE grabs a poker from the fireplace.