In [1]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
import time
import sys

from Preprocess import load_captions
from data_loader import DataLoader
from data_loader import get_loader 
from Vocabulary import Vocabulary
from model import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
# public_directory = 'COCO'
local_directory = 'test'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Shared NLTK smoothing helper; the BLEU cells pick one of its methods.
smoothing = SmoothingFunction()

In [3]:
# Checkpoint files restored into the encoder / decoder further down.
encoder_path = 'models/encoder-1-200.ckpt'
decoder_path = 'models/decoder-1-200.ckpt'

# Location handed to load_captions() to build the vocabulary.
vocab_path = 'train'
# Location handed to DataLoader for the images that are evaluated.
image_dir = local_directory

# caption_path = public_directory+'/annotations/captions_train2014.json'

# Model dimensions passed to EncoderCNN / DecoderRNN.
# NOTE(review): presumably these must match the values used when the
# checkpoints above were trained — confirm before changing them.
embed_size = 512
hidden_size = 4096
num_layers = 1
# NOTE(review): crop_size is not referenced by any visible cell; the transform
# cell hard-codes 224 instead.
crop_size = 224

# A detailed progress block is printed every `log_step` batches.
log_step = 50
# The evaluation loop only scores element 0 of each batch, so keep this at 1.
batch_size = 1
num_workers = 2

# Passed to Vocabulary(); presumably a minimum word-count cut-off — TODO confirm.
threshold = 10

In [4]:
# Load the captions found under `vocab_path`; the result is indexed by image
# file name in the next cell.
captions_dict = load_captions(vocab_path)

In [5]:
# Sanity check: number of captions stored for one sample image.
len(captions_dict['2513260012_03d33305cf.jpg'])

5

In [6]:
# Build the vocabulary from the loaded captions.
# NOTE(review): `threshold` is presumably a minimum word frequency — confirm in Vocabulary.
vocab = Vocabulary(captions_dict, threshold)
# `vocab.index` is used as the vocabulary size when constructing the decoder.
vocab_size = vocab.index

In [7]:
# Show the vocabulary size that the decoder will be built with.
print(vocab_size)

1809


In [8]:
# Image preprocessing: resize to 224x224, convert to a tensor, then normalise
# every channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])
# image = transform(Image.open(i))

In [9]:
# embedding_dim = 512
# vocab_size = vocab.index
# hidden_dim = 512
# model_name = model
# cnn = get_cnn(architecture = model_name, embedding_dim = embedding_dim)
# lstm = RNN(embedding_dim = embedding_dim, hidden_dim = hidden_dim, vocab_size = vocab_size)

In [10]:
# NOTE: `DataLoader` here is the project class imported from `data_loader`,
# not `torch.utils.data.DataLoader`.
dataloader = DataLoader(image_dir, vocab, transform)
# gen_data() returns the three collections that get_loader() consumes in the next cell.
imagenumbers, captiontotal, imagetotal= dataloader.gen_data()

In [11]:
# Wrap the generated data in a batched loader; the evaluation loop unpacks each
# item as (images, captions, lengths).
# NOTE(review): shuffle=True presumably randomises the iteration order, which
# would make the per-index scores printed by the loop differ between runs —
# confirm whether shuffling is wanted for evaluation.
data_loader = get_loader(imagenumbers, captiontotal, imagetotal, batch_size,
                         shuffle=True, num_workers=num_workers) 

In [12]:
# Build the CNN encoder, move it to the selected device and switch it to
# eval mode (batchnorm uses moving mean/variance).
encoder = EncoderCNN(embed_size).to(device).eval()
# print(encoder)

Encoder Model:  resnet152


In [13]:
# Build the RNN decoder directly on the selected device and show its layers.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
print(decoder)

DecoderRNN(
  (embed): Embedding(1809, 512)
  (lstm): LSTM(512, 4096, batch_first=True)
  (linear): Linear(in_features=4096, out_features=1809, bias=True)
)


In [14]:
# Load the trained model parameters
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))

In [20]:
# Number of batches the loader yields; used only for progress reporting.
total_step = len(data_loader)

# List to store the per-image BLEU scores
bleu_scores = []

In [21]:
def _ids_to_words(word_ids):
    """Translate word ids to tokens via `vocab.id2word`.

    Translation stops after the first '<end>' token, which is kept in the
    returned list.
    """
    words = []
    for word_id in word_ids:
        word = vocab.id2word[word_id]
        words.append(word)
        if word == '<end>':
            break
    return words


def _strip_markers(words):
    """Return `words` without a leading '<start>' and a trailing '<end>'.

    Each marker is removed only when it is actually present, so a caption
    that was cut off before '<end>' keeps its last real word.
    """
    if words and words[0] == '<start>':
        words = words[1:]
    if words and words[-1] == '<end>':
        words = words[:-1]
    return words


# Score every image with unigram BLEU (weights=(1, 0, 0, 0)) against its
# reference caption. Only element 0 of each batch is scored, so this loop
# expects batch_size == 1.
for i, (images, captions, lengths) in enumerate(data_loader):

    # Set mini-batch dataset
    images = images.to(device)

    # Generate a caption from the image. No gradients are needed for
    # evaluation, so skip building the autograd graph.
    with torch.no_grad():
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = _ids_to_words(sampled_ids)
    output = ' '.join(sampled_caption)

    # Convert target word_ids to words
    captions = captions[0].cpu().numpy()
    target_caption = _ids_to_words(captions)
    target = ' '.join(target_caption)

    # Convert string to a list and ignore <start> <end>
    target_list = _strip_markers(target.split())
    output_list = _strip_markers(output.split())

    score = sentence_bleu([target_list],
                          output_list,
                          weights=(1, 0, 0, 0),
                          smoothing_function=smoothing.method3)
    bleu_scores.append(score)

    print('{}:{:.4f}  '.format(i, score), end="")

    # Print log info
    if i % log_step == 0:
        print('\n---------------------------------------------------')
        print('Target: ', target)
        print('Output: ', output)
        print('\n')
        print('Finish [{}/{}], Current BLEU Score: {:.4f}'
              .format(i, total_step, np.mean(bleu_scores)))
        print('---------------------------------------------------\n')

# Persist [per-image scores, mean score]. The two entries have different
# shapes, so they are stored in an explicit object array; handing the ragged
# list straight to np.save raises ValueError on recent NumPy releases.
# Reading the file back requires np.load(..., allow_pickle=True).
results = np.empty(2, dtype=object)
results[0] = bleu_scores
results[1] = np.mean(bleu_scores)
np.save("tests.npy", results)

0:0.3834  
---------------------------------------------------
Target:  <start> two men <unk> a sign that says `` <unk> or <unk> `` . <end>
Output:  <start> a man in a red shirt and a <unk> <unk> <unk> . <end>


Finish [0/5000], Current BLEU Score: 0.3834
---------------------------------------------------

1:0.3894  2:0.2022  3:0.4167  4:0.3583  5:0.2388  6:0.3894  7:0.4167  8:0.2007  9:0.0556  10:0.3033  11:0.2500  12:0.2577  13:0.5000  14:0.3333  15:0.2500  16:0.2500  17:0.1298  18:0.3791  19:0.3033  20:0.2747  21:0.1667  22:0.0985  23:0.4232  24:0.5000  25:0.1580  26:0.2921  27:0.4167  28:0.1765  29:0.2500  30:0.2527  31:0.2000  32:0.3527  33:0.2500  34:0.3834  35:0.3527  36:0.1667  37:0.2084  38:0.3333  39:0.4600  40:0.2084  41:0.1667  42:0.3436  43:0.2022  44:0.1667  45:0.5000  46:0.1623  47:0.3846  48:0.3583  49:0.3583  50:0.1667  
---------------------------------------------------
Target:  <start> the two children swinging together on a swing . <end>
Output:  <start> a man in 

KeyboardInterrupt: 

In [32]:
# NOTE(review): leftover scratch cell — it fails with the NameError recorded
# below. None of `cnn`, `lstm`, `image`, `plt`, `Image`, `batch_bleu_4`,
# `caption_word_list` or `predicted_word_list` is defined or imported in the
# visible notebook, and `i` is the integer loop index left over from the
# evaluation loop rather than an image path.
cnn_out = cnn(image)
ids_list = lstm.greedy(cnn_out)

plt.imshow(Image.open(i))
plt.show()
print(vocab.get_sentence(ids_list))

batch_bleu_4 += sentence_bleu([caption_word_list], 
                              predicted_word_list,
                              smoothing_function=smoothing.method1)

NameError: name 'cnn' is not defined

In [33]:
# Inspect the shape of the last image batch left over from the evaluation loop.
images.shape

torch.Size([1, 3, 224, 224])

In [35]:
# Inspect the shape of the encoder output for that same leftover batch.
encoder(images).shape

torch.Size([1, 512])

In [58]:
# Unigram BLEU for the caption pair left over from the last loop iteration,
# this time smoothed with method7.
sentence_bleu(
    [target_list],
    output_list,
    weights=(1, 0, 0, 0),
    smoothing_function=smoothing.method7,
)

0.5562862736224321

In [48]:
# Same unigram BLEU for the leftover caption pair, without any smoothing
# function.
sentence_bleu(
    [target_list],
    output_list,
    weights=(1, 0, 0, 0),
)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.38940039153570244