In [1]:
import random
from data import ImageDetectionsField, TextField, RawField
from data import COCO, DataLoader
import evaluation
from models.transformer import Transformer, MemoryAugmentedEncoder, MeshedDecoder, ScaledDotProductAttentionMemory
import torch
from tqdm import tqdm
import argparse
import pickle
import numpy as np

random.seed(1234)
torch.manual_seed(1234)
np.random.seed(1234)

ImportError: cannot import name 'MemoryAugmentedEncoder'

In [15]:
%reload_ext autoreload
%autoreload 2

### predict the test set of artpedia dataset

In [4]:
def predict_captions(model, dataloader, text_field, mode = "multiple"):
    import itertools
    model.eval()
    gen = {}
    gts = {}
    with tqdm(desc='Evaluation', unit='it', total=len(dataloader)) as pbar:
        for it, (images, caps_gt) in enumerate(iter(dataloader)):
            print('-'*10 + f"the {it}the iteration" + '-'*10)
            images = torch.rand(1,50,2048)
            images = images.to(device)
            with torch.no_grad():
                out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
            caps_gen = text_field.decode(out, join_words=False)
            for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
                gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                gen['%d_%d' % (it, i)] = [gen_i.strip(), ]
                if mode == "single":
                    gts['%d_%d' % (it, i)] = [gts_i[0]]
                elif mode == "multiple":
                    gts['%d_%d' % (it, i)] = gts_i
            pbar.update()
    
    gts = evaluation.PTBTokenizer.tokenize(gts)
    gen = evaluation.PTBTokenizer.tokenize(gen)
    scores, _ = evaluation.compute_scores(gts, gen)

    return scores

In [5]:

device = torch.device('cuda')

# parser = argparse.ArgumentParser(description='Meshed-Memory Transformer')
# parser.add_argument('--batch_size', type=int, default=10)
# parser.add_argument('--workers', type=int, default=0)
# parser.add_argument('--features_path', type=str)
# parser.add_argument('--annotation_folder', type=str)
# args = parser.parse_args()
batch_size = 1
workers = 0
features_path = "../Dataset/coco/coco_detection.hdf5"
annotation_folder = "../Dataset/coco/annotations/"

print('Meshed-Memory Transformer Evaluation')

Meshed-Memory Transformer Evaluation


In [6]:
# Pipeline for text
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy',
                       remove_punctuation=True, nopoints=False)

print("Creating pipeline: done !")

Creating pipeline: done !


In [7]:
                  # paired dataset
text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))

print("Creating dataset: done !")

Creating dataset: done !


In [8]:
# Model and dataloaders
encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': 40})
decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])
model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

print("Initialise model: done !")

Initialise model: done !


In [9]:
data = torch.load('meshed_memory_transformer.pth')
model.load_state_dict(data['state_dict'])

print("Load model parameters: done !")

Load model parameters: done !


In [10]:
dict_dataset_test = test_dataset.image_dictionary({'image': image_field, 'text': RawField()})          # dictionary dataset
dict_dataloader_test = DataLoader(dict_dataset_test, batch_size=batch_size, num_workers=workers)

print("Dataloader: done !")

Dataloader: done !


In [1]:
# scratch model
scores = predict_captions(model, dict_dataloader_test, text_field)
print(scores)

### describe one image

In [1]:
import h5py
import os
import random
from data import ImageDetectionsField, TextField, RawField
from data import COCO, DataLoader
import evaluation
from models.transformer import Transformer, MemoryAugmentedEncoder, MeshedDecoder, ScaledDotProductAttentionMemory
import torch
from tqdm import tqdm
import argparse
import pickle
import numpy as np

random.seed(1234)
torch.manual_seed(1234)
np.random.seed(1234)
device = torch.device('cuda')

In [2]:
text_field = TextField(init_token='<bos>', eos_token='<eos>', lower=True, tokenize='spacy',
                       remove_punctuation=True, nopoints=False)
text_field.vocab = pickle.load(open('vocab.pkl', 'rb'))

encoder = MemoryAugmentedEncoder(3, 0, attention_module=ScaledDotProductAttentionMemory,
                                 attention_module_kwargs={'m': 40})
decoder = MeshedDecoder(len(text_field.vocab), 54, 3, text_field.vocab.stoi['<pad>'])

model = Transformer(text_field.vocab.stoi['<bos>'], encoder, decoder).to(device)

data = torch.load('saved_models/artpedia_finetune_singlecap.pth')
model.load_state_dict(data['state_dict'])

print("Initialise model: done !")

Initialise model: done !


In [5]:
# choose an image
images = torch.tensor(np.load("../Dataset/artpedia/artpedia_features/100.npz")['x']).to(device)
images = images[:50].unsqueeze(dim=0)

beam_size = 1
out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], beam_size, out_size=1, is_sample=True)
print(out)
" ".join(text_field.decode(out, join_words=False)[0])

generation by top-k sampling
tensor([[  28, 2953,    0,    9,    4,    0,   12,   10,   23,    8,    4,    0,
         2473,  756,   50,    0,    0,    0,    0,    3]], device='cuda:0')


'it depicts <unk> with a <unk> man and woman in a <unk> oil painting by <unk> <unk> <unk> <unk>'

In [10]:
for file_name in os.listdir("../Dataset/artpedia/artpedia_features/")[:10]:
    print("\n------------")
    print(file_name)
    file_path = "../Dataset/artpedia/artpedia_features/" + file_name
    images = torch.tensor(np.load(file_path)['x']).to(device)
    images = images[:50].unsqueeze(dim=0)
    with torch.no_grad():
        out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
        print(out)
        caps_gen = text_field.decode(out, join_words=False)
        print(" ".join(caps_gen[0]))


------------
0.npz
generation by beam search with beam size 5
tensor([[   7,  756, 2953,    7,    0,    6,    4,    0,    0,    3,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]], device='cuda:0')
the painting depicts the <unk> of a <unk> <unk>

------------
1.npz
generation by beam search with beam size 5
tensor([[  28, 2953,    7,  756,    6,    7,    0,    6,    7,    0,    6,    0,
            3,    0,    0,    0,    0,    0,    0,    0]], device='cuda:0')
it depicts the painting of the <unk> of the <unk> of <unk>

------------
10.npz
generation by beam search with beam size 5
tensor([[  28, 2953,    7,    0,    6,    7,  756,    6,    0, 3584,    0,    0,
            0,    0,    0,    0,    0,    0, 3505,   10]], device='cuda:0')
it depicts the <unk> of the painting of <unk> ( <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> ) and

------------
100.npz
generation by beam search with beam size 5
tensor([[   7,  756, 2953,    4,   51,   12,   10,   23,   14,   

In [11]:
for file_name in os.listdir("../Dataset/artpedia/artpedia_features/")[:10]:
    print("\n------------")
    print(file_name)
    file_path = "../Dataset/artpedia/artpedia_features/" + file_name
    images = torch.tensor(np.load(file_path)['x']).to(device)
    images = images[:50].unsqueeze(dim=0)
    with torch.no_grad():
        out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 1, out_size=1, is_sample=True)
        print(out)
        caps_gen = text_field.decode(out, join_words=False)
        print(" ".join(caps_gen[0]))


------------
0.npz
generation by top-k sampling
tensor([[   7,    0,   11,    0,   13,  338,   15, 2473,    5, 3848,  756,    0,
            0,    8,    0,   10,    7,    0,    3,    0]], device='cuda:0')
the <unk> is <unk> to be an oil on canvas painting <unk> <unk> in <unk> and the <unk>

------------
1.npz
generation by top-k sampling
tensor([[  28, 2953,    0, 3584, 3505,    0,    8,    0,    0,    0,    0, 3505,
           11,    7,    0,  756,    7, 4485,    6,    7]], device='cuda:0')
it depicts <unk> ( ) <unk> in <unk> <unk> <unk> <unk> ) is the <unk> painting the artist of the

------------
10.npz
generation by top-k sampling
tensor([[  28, 2953,   16,  191,  315,  449,    0,    0,    0, 3584,    0,    0,
            0, 3505,  122,  575,  449,  509,    0,   50]], device='cuda:0')
it depicts two women who have <unk> <unk> <unk> ( <unk> <unk> <unk> ) as they have been <unk> by

------------
100.npz
generation by top-k sampling
tensor([[  28,  550,    0,  129,    0,    0,    0, 

top-k sampling will generate more unk token