# In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import inception_v3
from nltk.translate.bleu_score import sentence_bleu
from torch.utils.data import DataLoader, Dataset
import numpy as np
from PIL import Image
import nltk

# 1. Data Preprocessing and Vocabulary
class Vocabulary:
    """Bidirectional mapping between word strings and integer ids.

    Ids are assigned consecutively, starting at 0, in the order in which
    words are first added.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = {}  # id -> word
        self.idx = 0        # next id to hand out

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        """Return the id of ``word``, falling back to the id of ``'<unk>'``.

        Raises:
            KeyError: If ``word`` is unknown and ``'<unk>'`` was never added.
        """
        # Look up '<unk>' only on a miss. Passing it as the default argument
        # of dict.get() evaluates it eagerly, which raised KeyError even for
        # known words whenever '<unk>' had not been registered.
        idx = self.word2idx.get(word)
        if idx is None:
            idx = self.word2idx['<unk>']
        return idx

    def __len__(self):
        """Return the number of distinct words in the vocabulary."""
        return len(self.word2idx)

def build_vocab(captions, threshold=5):
    """Build a Vocabulary from an iterable of caption strings.

    The special tokens '<pad>', '<start>', '<end>' and '<unk>' are added
    first, in that order. Captions are lower-cased and split on whitespace;
    every word occurring at least ``threshold`` times is then added in order
    of first appearance.

    Args:
        captions: Iterable of caption strings.
        threshold: Minimum number of occurrences for a word to be kept.

    Returns:
        The populated Vocabulary.
    """
    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Frequency table; dict insertion order keeps first-appearance order.
    frequencies = {}
    for text in captions:
        for token in text.lower().split():
            if token in frequencies:
                frequencies[token] += 1
            else:
                frequencies[token] = 1

    for token in frequencies:
        if frequencies[token] >= threshold:
            vocab.add_word(token)
    return vocab

# 2. Image Feature Extraction with InceptionV3
class ImageEncoder(nn.Module):
    """Frozen, pretrained InceptionV3 used as a fixed image feature extractor.

    Only the final classifier (``fc``) is replaced, by an identity, so the
    network returns the pooled feature vector that would normally feed the
    classifier. All backbone parameters have ``requires_grad`` set to False.

    Attributes:
        inception: The InceptionV3 backbone with its classifier removed.
        feature_size: Width of the returned feature vectors (the input width
            of the removed classifier).
    """

    def __init__(self):
        super(ImageEncoder, self).__init__()
        inception = inception_v3(pretrained=True)
        self.feature_size = inception.fc.in_features
        # Swap out just the classifier. Wrapping ``inception.children()`` in
        # nn.Sequential does not reproduce InceptionV3's forward pass: the
        # auxiliary classifier is one of the children and would be run in the
        # middle of the main path, and the functional steps of the original
        # forward (input transform, flatten) would be dropped.
        inception.fc = nn.Identity()
        for param in inception.parameters():
            param.requires_grad = False
        self.inception = inception
        self.inception.eval()

    def train(self, mode=True):
        """Set the module's mode but keep the frozen backbone in eval mode.

        A parent ``model.train()`` would otherwise re-enable dropout and
        batch-norm statistic updates inside the frozen network and make it
        return auxiliary outputs alongside the features.
        """
        super(ImageEncoder, self).train(mode)
        self.inception.eval()
        return self

    def forward(self, images):
        """Encode a batch of images into flat feature vectors.

        Args:
            images: Image batch; torchvision documents InceptionV3 as
                expecting (N, 3, 299, 299) inputs — TODO confirm the
                preprocessing used by the caller matches.

        Returns:
            Tensor of shape (N, feature_size).
        """
        features = self.inception(images)
        # Defensive: if the backbone was put into training mode directly it
        # returns a (main, aux) tuple; keep only the main output.
        if isinstance(features, tuple):
            features = features[0]
        return features.view(features.size(0), -1)  # Flatten to 1D per image

# 3. LSTM-based Caption Decoder
class CaptionDecoder(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first input step, followed by
    the embedded caption tokens.

    Args:
        embed_size: Width of the token embeddings and of the LSTM input.
        hidden_size: Width of the LSTM hidden state.
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked LSTM layers.
        feature_size: Width of the incoming image features that need to be
            projected to ``embed_size``. Features that already have width
            ``embed_size`` are used unchanged.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, feature_size=2048):
        super(CaptionDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Created last so the initialisation of the layers above is unchanged
        # for a given random seed.
        self.feature_proj = nn.Linear(feature_size, embed_size)

    def _image_step(self, features):
        """Turn (batch, width) image features into a (batch, 1, embed_size) step."""
        # Image features and token embeddings are concatenated along the time
        # axis, so their last dimension must agree; project when it does not.
        if features.size(-1) != self.embed.embedding_dim:
            features = self.feature_proj(features)
        return features.unsqueeze(1)

    def forward(self, features, captions):
        """Compute vocabulary scores for every step (teacher forcing).

        Args:
            features: Image features of shape (batch, width).
            captions: Token ids of shape (batch, T).

        Returns:
            Scores of shape (batch, T + 1, vocab_size). Step 0 is produced
            from the image feature alone; step ``t + 1`` is produced after
            reading ``captions[:, t]``.
        """
        embeddings = self.embed(captions)
        inputs = torch.cat((self._image_step(features), embeddings), dim=1)
        hiddens, _ = self.lstm(inputs)
        outputs = self.linear(hiddens)
        return outputs

    @torch.no_grad()
    def sample(self, features, max_len=20, start_id=1, end_id=2):
        """Greedily decode a caption for each image feature vector.

        The LSTM state is primed with the image step, then ``start_id`` is
        fed and each most likely token is fed back as the next input.

        Args:
            features: Image features of shape (batch, width).
            max_len: Maximum number of tokens to generate.
            start_id: Id of the start-of-caption token.
            end_id: Id of the end-of-caption token. Decoding stops early once
                every sequence in the batch has produced it.

        Returns:
            LongTensor of shape (batch, L) with L <= max_len. The start token
            is not included; rows may contain tokens after ``end_id`` when
            other rows finished later, so callers should cut at ``end_id``.
        """
        batch_size = features.size(0)
        _, states = self.lstm(self._image_step(features))
        tokens = torch.full((batch_size, 1), start_id, dtype=torch.long, device=features.device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=features.device)
        sampled = []
        for _ in range(max_len):
            hiddens, states = self.lstm(self.embed(tokens), states)
            tokens = self.linear(hiddens[:, -1]).argmax(dim=1, keepdim=True)
            sampled.append(tokens)
            finished |= tokens.squeeze(1) == end_id
            if finished.all():
                break
        if not sampled:
            return torch.empty((batch_size, 0), dtype=torch.long, device=features.device)
        return torch.cat(sampled, dim=1)

# 4. Full Captioning Model (Encoder + Decoder)
class ImageCaptioningModel(nn.Module):
    """Chains an image encoder and a caption decoder into a single module."""

    def __init__(self, encoder, decoder):
        super().__init__()
        # Tuple assignment binds left to right, so the encoder is still
        # registered before the decoder.
        self.encoder, self.decoder = encoder, decoder

    def forward(self, images, captions):
        """Encode ``images`` and return the decoder's output for ``captions``."""
        return self.decoder(self.encoder(images), captions)

# 5. Training Procedure
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    """Train ``model`` with teacher forcing and print the mean loss per epoch.

    Args:
        model: Module called as ``model(images, captions)`` that returns
            scores of shape (batch, steps, vocab_size).
        dataloader: Iterable yielding ``(images, captions)`` tensor pairs,
            where ``captions`` holds token ids of shape (batch, T).
        criterion: Loss taking (N, vocab_size) scores and (N,) target ids.
        optimizer: Optimizer over the model's trainable parameters.
        num_epochs: Number of passes over ``dataloader``.
    """
    # Run on whatever device the model lives on instead of depending on a
    # module-level ``device`` global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for images, captions in dataloader:
            images, captions = images.to(target_device), captions.to(target_device)
            targets = captions[:, 1:]  # Exclude <start> for target
            outputs = model(images, captions[:, :-1])  # Exclude <end> token for input
            # The decoder prepends an image step, so it returns one more time
            # step than there are targets. Keep only the trailing steps so
            # that step t is scored against the token following input t.
            outputs = outputs[:, outputs.size(1) - targets.size(1):]
            loss = criterion(outputs.reshape(-1, outputs.size(2)), targets.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / num_batches}')
        else:
            # An empty loader used to fail on the unbound ``loss`` variable.
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: n/a (no batches)')

# 6. Evaluation with BLEU Score
def evaluate_model(model, dataloader, vocab):
    """Return the mean sentence-level BLEU of greedily decoded captions.

    Every sample of every batch is scored. '<pad>', '<start>' and '<end>' are
    removed from both reference and hypothesis, and the hypothesis is cut at
    its first '<end>', so the special tokens do not distort n-gram overlap.

    Args:
        model: Object exposing ``encoder(images)`` and
            ``decoder.sample(features)``; the latter is expected to return
            one sequence of token ids per image.
        dataloader: Iterable yielding ``(images, captions)`` tensor pairs.
        vocab: Object with ``word2idx`` and ``idx2word`` mappings.

    Returns:
        The mean BLEU score as a float, or 0.0 when ``dataloader`` yields no
        samples.
    """
    # Run on whatever device the model lives on instead of depending on a
    # module-level ``device`` global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    special_ids = {
        vocab.word2idx[token]
        for token in ('<pad>', '<start>', '<end>')
        if token in vocab.word2idx
    }
    end_id = vocab.word2idx.get('<end>')

    model.eval()
    scores = []
    with torch.no_grad():
        for images, captions in dataloader:
            features = model.encoder(images.to(target_device))
            sampled_ids = model.decoder.sample(features)
            for predicted, truth in zip(sampled_ids, captions):
                hypothesis = []
                for token in predicted:
                    token = int(token)
                    if token == end_id:
                        break  # Anything generated after <end> is not part of the caption.
                    if token not in special_ids:
                        hypothesis.append(vocab.idx2word[token])
                reference = [vocab.idx2word[idx] for idx in truth.tolist() if idx not in special_ids]
                scores.append(sentence_bleu([reference], hypothesis))
    # np.mean([]) would return nan and emit a RuntimeWarning.
    return float(np.mean(scores)) if scores else 0.0

# Example Usage
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_size = 256   # passed to CaptionDecoder as embed_size
hidden_size = 512  # passed to CaptionDecoder as hidden_size
# NOTE(review): `captions` is not defined anywhere in this file; it has to be
# provided before this line runs.
vocab = build_vocab(captions)  # Assume captions is a list of all image captions in the dataset
vocab_size = len(vocab)  # includes the four special tokens added by build_vocab

# Initialize model, loss function, optimizer
encoder = ImageEncoder().to(device)
decoder = CaptionDecoder(embed_size, hidden_size, vocab_size).to(device)
model = ImageCaptioningModel(encoder, decoder).to(device)
# Positions whose target is '<pad>' are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])
# model.parameters() also covers the encoder, whose parameters are created
# with requires_grad=False in ImageEncoder.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Example DataLoader (use actual DataLoader for COCO/Flickr8k)
# Placeholder only: an empty list means the loader yields no batches.
dataloader = DataLoader([])  # Replace with your image-caption dataset

# Training and Evaluation
# NOTE(review): evaluation reuses the training dataloader, so the reported
# BLEU is measured on training data — use a held-out split for a meaningful score.
train_model(model, dataloader, criterion, optimizer)
bleu_score = evaluate_model(model, dataloader, vocab)
print(f"BLEU Score: {bleu_score}")
