In [25]:
from model_arc import EncoderCNN, DecoderRNN
import torch
from PIL import Image

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_size = 512
hidden_size = 512 
import json
with open('/home/vitoupro/code/image_captioning/data/processed/idx2word.json') as f:
    idx2word = json.load(f)
word2idx = {word: int(index) for index, word in idx2word.items()}


In [27]:
# Define the model architecture again
encoder = EncoderCNN(embed_size=embed_size).to(device)
decoder = DecoderRNN(embed_size=embed_size, hidden_size=hidden_size, vocab_size=len(word2idx)).to(device)

# Load the model state
checkpoint = torch.load('captioning_model.pth')
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])

# Set the model to evaluation mode
encoder.eval()
decoder.eval()


DecoderRNN(
  (embed): Embedding(64, 512)
  (lstm): LSTM(512, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=64, bias=True)
)

In [30]:
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from model_arc import EncoderCNN, DecoderRNN

def generate_caption(image_path, encoder, decoder, word2idx, idx2word, max_length=50):
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Image preprocessing
    transform = Compose([
        Resize(256),
        CenterCrop(224),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # Add batch dimension and send to device

    # Generate features from the encoder
    with torch.no_grad():
        features = encoder(image)
        captions = torch.tensor([word2idx['<START>']]).unsqueeze(0).to(device)
        result_caption = []

        for _ in range(max_length):
            outputs = decoder(features, captions)
            outputs = outputs[:, -1, :]  # Get the last word
            _, predicted = outputs.max(1)  # Get the index of the max log-probability
            predicted_word = idx2word[str(predicted.item())]
            result_caption.append(predicted_word)

            if predicted_word == '<END>':
                break

            captions = torch.cat((captions, predicted.unsqueeze(0)), dim=1)

    return ' '.join(result_caption)

# Ensure that the necessary classes and variables (`encoder`, `decoder`, `word2idx`, `idx2word`) are properly initialized and used here
image_path = '/home/vitoupro/code/image_captioning/data/raw/animals/badger/0cf04d0dab.jpg'
encoder = EncoderCNN(embed_size=256)  # Assuming embed_size
decoder = DecoderRNN(embed_size=256, hidden_size=512, vocab_size=len(word2idx))  # Adjust as necessary
caption = generate_caption(image_path, encoder, decoder, word2idx, idx2word)
print("Generated Caption:", caption)


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same