In [17]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
import time
import sys

from Preprocess import load_captions
from data_loader import DataLoader
from data_loader import get_loader 
from Vocabulary import Vocabulary
from model import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [18]:
# Directory containing the images to evaluate. The public COCO location is
# kept below for reference only.
# public_directory = 'COCO'
local_directory = 'test'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NLTK smoothing helper; one of its methods is handed to sentence_bleu later.
smoothing = SmoothingFunction()

In [19]:
# Checkpoints holding the trained encoder / decoder weights.
encoder_path = 'models/encoder-10-200.ckpt'
decoder_path = 'models/decoder-10-200.ckpt'

# Location passed to load_captions() to build the vocabulary
# (presumably the training split, so ids match the trained decoder — confirm).
vocab_path = 'train'
# Directory with the images to caption.
image_dir = local_directory

# caption_path = public_directory+'/annotations/captions_train2014.json'

# Model dimensions; these must agree with the values used when the
# checkpoints above were trained, otherwise load_state_dict will not match.
embed_size = 512
hidden_size = 512
num_layers = 1
# Side length (pixels) the images are resized to; keep in sync with the transform.
crop_size = 224

# Print a detailed log entry every `log_step` samples.
log_step = 50
# The evaluation loop only reads the first sample of each batch, so keep this at 1.
batch_size = 1
num_workers = 2

# Second argument to Vocabulary (presumably a minimum word count — confirm).
threshold = 20

In [20]:
# Load the captions used to build the vocabulary; the result is looked up by
# image file name in the following cell.
captions_dict = load_captions(vocab_path)

In [21]:
# Spot check on one image: length of its entry in captions_dict
# (presumably the number of reference captions for that image).
len(captions_dict['2513260012_03d33305cf.jpg'])

5

In [22]:
# Build the vocabulary from the loaded captions, with `threshold` as the
# second argument (presumably a minimum word frequency — confirm in Vocabulary).
vocab = Vocabulary(captions_dict, threshold)
# vocab.index is used as the vocabulary size when constructing the decoder.
vocab_size = vocab.index

In [23]:
# Image preprocessing: resize to a crop_size x crop_size square, convert to a
# tensor, then normalise every channel with mean 0.5 and std 0.5.
# The size comes from the config cell instead of a second hard-coded 224.
# NOTE(review): this should mirror the preprocessing used at training time — confirm.
transform = transforms.Compose([
    transforms.Resize((crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5),
                         (0.5, 0.5, 0.5)),
])

In [24]:
# embedding_dim = 512
# vocab_size = vocab.index
# hidden_dim = 512
# model_name = model
# cnn = get_cnn(architecture = model_name, embedding_dim = embedding_dim)
# lstm = RNN(embedding_dim = embedding_dim, hidden_dim = hidden_dim, vocab_size = vocab_size)

In [25]:
# Project DataLoader over the image directory, using the vocabulary and the
# image transform defined above.
dataloader = DataLoader(image_dir, vocab, transform)
# gen_data() returns three values that are handed to get_loader() in the next cell.
imagenumbers, captiontotal, imagetotal= dataloader.gen_data()

In [26]:
# Batched loader for evaluation. Shuffling is turned off: it does not change
# the mean BLEU score, but a fixed order makes the logged samples and the
# order of the saved per-sample scores reproducible between runs.
data_loader = get_loader(imagenumbers, captiontotal, imagetotal, batch_size,
                         shuffle=False, num_workers=num_workers)

In [27]:
# Build the image encoder, switch it to inference mode (batch-norm layers then
# use their moving mean/variance) and move it to the selected device.
encoder = EncoderCNN(embed_size)
encoder = encoder.eval().to(device)
# print(encoder)

In [28]:
# Build the caption decoder. It is put in eval mode like the encoder so that
# any train-only layers (e.g. dropout, if the model has them) are disabled
# during evaluation.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).eval()
decoder = decoder.to(device)
print(decoder)

DecoderRNN(
  (embed): Embedding(1072, 512)
  (lstm): LSTM(512, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=1072, bias=True)
)


In [29]:
# Load the trained model parameters
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))

In [30]:
# Number of batches in the loader; used as the denominator in the progress log.
total_step = len(data_loader)

# List to store the per-sample BLEU scores
bleu_scores = []

In [None]:
def _ids_to_tokens(word_ids):
    """Translate word ids into vocabulary tokens.

    Each id is looked up in ``vocab.id2word``. Translation stops right after
    the first '<end>' token, which is kept in the returned list.
    """
    tokens = []
    for word_id in word_ids:
        token = vocab.id2word[word_id]
        tokens.append(token)
        if token == '<end>':
            break
    return tokens


def _strip_markers(tokens):
    """Drop a leading '<start>' and a trailing '<end>' when they are present.

    A fixed ``[1:-1]`` slice would also throw away the last real word of a
    caption that ends without '<end>' (for example if sampling stops before
    the end token is produced).
    """
    first = 1 if tokens and tokens[0] == '<start>' else 0
    last = len(tokens)
    if last > first and tokens[-1] == '<end>':
        last -= 1
    return tokens[first:last]


for i, (images, captions, lengths) in enumerate(data_loader):

    # Only the images go through the model; the reference caption is used
    # for scoring only.
    images = images.to(device)

    # Generate a caption for the image. Gradients are not needed for
    # evaluation, so avoid building the autograd graph.
    with torch.no_grad():
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
    # Only the first sample of the batch is scored (batch_size is 1 here).
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert predicted and reference word ids to space-separated sentences.
    output = ' '.join(_ids_to_tokens(sampled_ids))
    captions = captions[0].cpu().numpy()
    target = ' '.join(_ids_to_tokens(captions))

    # Token lists without the <start>/<end> markers.
    target_list = _strip_markers(target.split())
    output_list = _strip_markers(output.split())

    # weights=(1, 0, 0, 0) scores unigram precision only (BLEU-1).
    score = sentence_bleu([target_list],
                          output_list,
                          weights=(1, 0, 0, 0),
                          smoothing_function=smoothing.method3)
    bleu_scores.append(score)

    print('{}:{:.4f}  '.format(i, score), end="")

    # Print log info
    if i % log_step == 0:
        print('\n---------------------------------------------------')
        print('Target: ', target)
        print('Output: ', output)
        print('\n')
        print('Finish [{}/{}], Current BLEU Score: {:.4f}'
              .format(i, total_step, np.mean(bleu_scores)))
        print('---------------------------------------------------\n')

# Save [per-sample scores, mean score]. The pair is ragged, so it is stored in
# an explicit object array: recent NumPy versions raise ValueError when asked
# to build an array from a ragged nested sequence implicitly.
# Reading the file back requires np.load(..., allow_pickle=True).
results = np.empty(2, dtype=object)
results[0] = bleu_scores
results[1] = np.mean(bleu_scores)
np.save("tests.npy", results)

0:0.3619  
---------------------------------------------------
Target:  <start> a couple walking alongside a <unk> next to a city . <end>
Output:  <start> a group of people are standing on a <unk> . <end>


Finish [0/5000], Current BLEU Score: 0.3619
---------------------------------------------------

1:0.3333  2:0.2143  3:0.7369  4:0.4981  5:0.3636  6:0.0667  7:0.4232  8:0.4615  9:0.1819  10:0.2500  11:0.2883  12:0.0856  13:0.2222  14:0.2098  15:0.3636  16:0.2857  17:0.2778  18:0.1000  19:0.3067  20:0.5966  21:0.2500  22:0.4286  23:0.0000  24:0.4600  25:0.3846  26:0.3957  27:0.2076  28:0.3559  29:0.3846  30:0.1395  31:0.1538  32:0.2007  33:0.2921  34:0.3846  35:0.4444  36:0.1818  37:0.3636  38:0.1757  39:0.1103  40:0.2000  41:0.1429  42:0.2727  43:0.3571  44:0.4777  45:0.4444  46:0.2483  47:0.5000  48:0.3125  49:0.2857  50:0.4548  
---------------------------------------------------
Target:  <start> people are gathered by a tree beside a line of white <unk> . <end>
Output:  <start> a

In [None]:
# NOTE(review): this cell looks like a leftover from a different notebook.
# `cnn`, `lstm`, `image`, `plt`, `Image`, `batch_bleu_4`, `caption_word_list`
# and `predicted_word_list` are not defined or imported in any cell visible
# here (`cnn` / `lstm` only appear in commented-out code), so the cell would
# raise NameError on a fresh kernel. Confirm, then remove it or port it to
# `encoder` / `decoder`.
cnn_out = cnn(image)
ids_list = lstm.greedy(cnn_out)

# NOTE(review): `i` is the integer loop index left over from the evaluation
# loop, not an image path — verify what was meant to be opened here.
plt.imshow(Image.open(i))
plt.show()
print(vocab.get_sentence(ids_list))

# Called without `weights`, so NLTK's default n-gram weights apply
# (unlike the unigram-only score computed in the evaluation loop).
batch_bleu_4 += sentence_bleu([caption_word_list], 
                              predicted_word_list,
                              smoothing_function=smoothing.method1)

In [None]:
# Show the word ids sampled for the last image processed by the evaluation loop.
sampled_ids