In [1]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [2]:
!pip install -q torch torchvision nltk tqdm kagglehub

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model, vocabulary and cached
# feature files used later in this notebook live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import os

# Folder on the mounted Drive that holds every artifact produced by this notebook.
DRIVE_ROOT = "/content/drive/MyDrive/FYP_Models/Image_Captioning"

# Artifact locations: trained weights, saved vocabulary, cached CNN features.
MODEL_PATH, VOCAB_PATH, FEATURE_PATH = (
    os.path.join(DRIVE_ROOT, filename)
    for filename in ("resnet50_model.pth", "vocab.pt", "resnet50_features.pt")
)

# Create the folder if it is missing; a no-op when it already exists.
os.makedirs(DRIVE_ROOT, exist_ok=True)

print("Drive directory ready:", DRIVE_ROOT)

Drive directory ready: /content/drive/MyDrive/FYP_Models/Image_Captioning


In [5]:
import kagglehub

# Fetch the Flickr8k dataset through kagglehub; `path` is the local directory
# returned by the download call and is used below to locate images and captions.
path = kagglehub.dataset_download("adityajn105/flickr8k")
print("Dataset path:", path)

Using Colab cache for faster access to the 'flickr8k' dataset.
Dataset path: /kaggle/input/flickr8k


In [6]:
import os

# Locations inside the downloaded dataset directory: the image folder and the
# captions file.
IMAGE_DIR, CAPTION_FILE = (
    os.path.join(path, entry) for entry in ("Images", "captions.txt")
)

# Quick sanity check that both locations are present and populated.
print(f"{len(os.listdir(IMAGE_DIR))} images found")
print(f"Caption file exists: {os.path.exists(CAPTION_FILE)}")

8091 images found
Caption file exists: True


In [7]:
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
from tqdm import tqdm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

In [8]:
# Download the NLTK "punkt" tokenizer data into the runtime's nltk_data folder.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
import csv  # csv.reader copes with quoted caption fields that contain commas

# Maps image file name -> list of caption strings wrapped in <start>/<end> markers.
captions = {}

# newline="" is what the csv module asks for so quoted fields and line endings
# are parsed correctly; the explicit encoding avoids relying on the platform default.
with open(CAPTION_FILE, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header line; the default keeps an empty file from raising
    for row in reader:
        if len(row) >= 2:
            # First column is the image name, second column is the caption text.
            img_name, caption_text = row[0], row[1]

            # Lower-case the caption and add sequence boundary markers.
            caption_text = "<start> " + caption_text.lower() + " <end>"
            captions.setdefault(img_name, []).append(caption_text)
        else:
            # Rows with fewer than two columns (including blank lines) are reported and skipped.
            print(f"Skipping malformed row: {row}")

print("Total images with captions:", len(captions))

Total images with captions: 8091


In [10]:
import nltk
nltk.download('punkt_tab', quiet=True)

# Token frequencies over every caption of every image.
# NOTE(review): NLTK's word_tokenize appears to split angle brackets into their
# own tokens, so "<start>" / "<end>" probably enter the vocabulary as
# "<", "start", ">" / "<", "end", ">" rather than as single markers — confirm,
# and keep any decoding code consistent with whatever tokens actually appear.
word_counter = Counter(
    token
    for image_captions in captions.values()
    for caption in image_captions
    for token in word_tokenize(caption)
)

# Keep tokens seen at least 5 times, in first-seen order.
vocab = [token for token, count in word_counter.items() if count >= 5]

# Indices start at 1; index 0 is reserved for the padding token.
word2idx = dict(zip(vocab, range(1, len(vocab) + 1)))
word2idx["<pad>"] = 0
idx2word = {index: token for token, index in word2idx.items()}

vocab_size = len(word2idx)
print(f"Vocabulary size: {vocab_size}")

# Persist both lookup tables to Google Drive as a single (word2idx, idx2word) tuple.
torch.save((word2idx, idx2word), VOCAB_PATH)
print(f"Vocabulary saved to Drive: {VOCAB_PATH}")

Vocabulary size: 3004
Vocabulary saved to Drive: /content/drive/MyDrive/FYP_Models/Image_Captioning/vocab.pt


In [11]:
# Per-channel mean/std handed to Normalize (the standard ImageNet statistics
# documented for torchvision's pretrained models).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Image preprocessing: resize to 224x224, convert to a tensor, then normalise.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
transform = transforms.Compose(preprocessing_steps)

In [12]:
# Load the pretrained ResNet-50 through the `weights=` API; the `pretrained=True`
# flag is deprecated in current torchvision. IMAGENET1K_V1 is the checkpoint that
# `pretrained=True` resolved to, so features cached from earlier runs stay valid.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Drop the last child module (the classification layer) so the network outputs
# pooled features instead of class scores.
resnet = nn.Sequential(*list(resnet.children())[:-1])
# Inference mode: fixes batch-norm statistics for deterministic feature extraction.
resnet.eval()

# The CNN is only a fixed feature extractor, so none of its weights need gradients.
for p in resnet.parameters():
    p.requires_grad = False



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 171MB/s]


In [13]:
# Image features are cached on Drive so the CNN pass over the dataset runs only once.
if os.path.exists(FEATURE_PATH):
    print("Loading image features from Drive...")
    # map_location="cpu" makes loading independent of the device the cache was
    # written from; tensors are moved to the training device later.
    features = torch.load(FEATURE_PATH, map_location="cpu")
else:
    print("Extracting image features (one-time)...")
    features = {}

    with torch.no_grad():
        for img_name in tqdm(captions.keys()):
            img_path = os.path.join(IMAGE_DIR, img_name)
            # The context manager closes the image's file handle once the RGB
            # copy has been preprocessed, instead of leaving thousands open.
            with Image.open(img_path) as img_file:
                image = transform(img_file.convert("RGB")).unsqueeze(0)
            # squeeze() removes the size-1 dimensions of the single-image output,
            # leaving one flat feature vector per image.
            feature = resnet(image).squeeze()
            features[img_name] = feature

    torch.save(features, FEATURE_PATH)
    print("Image features saved to Drive")


Loading image features from Drive...


In [14]:
def caption_to_seq(caption):
    """Convert a caption string into a list of vocabulary indices.

    The caption is split with ``word_tokenize``; each token is looked up in
    ``word2idx``, and tokens that are not in the vocabulary are encoded as 0.
    """
    indices = []
    for token in word_tokenize(caption):
        indices.append(word2idx.get(token, 0))
    return indices

In [15]:
class CaptionModel(nn.Module):
    """LSTM caption decoder whose initial hidden state is computed from image features.

    Args:
        vocab_size: number of vocabulary entries (embedding rows and output logits).
        feature_size: dimensionality of the image feature vector.
        embed_size: dimensionality of the token embeddings.
        hidden_size: dimensionality of the LSTM hidden state.
    """

    def __init__(self, vocab_size, feature_size=2048, embed_size=256, hidden_size=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Projects image features to the LSTM hidden size to seed the decoder.
        self.feature_fc = nn.Linear(feature_size, hidden_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Return per-step vocabulary logits for the given caption tokens.

        Args:
            features: image features, either one vector ``[feature_size]``
                (treated as a batch of one) or a batch ``[batch, feature_size]``.
            captions: token indices of shape ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, seq_len, vocab_size]``.
        """
        # Promote a single feature vector to a batch of one so the same code
        # path serves both un-batched and batched callers.
        if features.dim() == 1:
            features = features.unsqueeze(0)

        # nn.LSTM expects initial states shaped [num_layers, batch, hidden].
        h0 = self.feature_fc(features).unsqueeze(0)
        c0 = torch.zeros_like(h0)

        embeddings = self.embedding(captions)
        outputs, _ = self.lstm(embeddings, (h0, c0))
        outputs = self.fc(outputs)
        return outputs

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tunable training settings.
SEED = 42
LEARNING_RATE = 1e-4

# Seed before building the model so weight initialisation is repeatable across runs.
torch.manual_seed(SEED)

model = CaptionModel(vocab_size).to(device)
# Targets equal to 0 are excluded from the loss: 0 is the "<pad>" index and is
# also what caption_to_seq emits for out-of-vocabulary tokens.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [17]:
EPOCHS = 50

# Tokenise, index and move every (image feature, caption) pair to the device once,
# instead of repeating that work on each epoch. The order matches a nested
# loop over `captions`, so the sequence of optimisation steps is unchanged.
training_pairs = []
for img, caps in captions.items():
    feature = features[img].to(device)
    for c in caps:
        seq = torch.tensor(caption_to_seq(c), dtype=torch.long).unsqueeze(0)
        # A sequence needs at least two tokens to form an (input, target) pair.
        if seq.size(1) < 2:
            continue
        training_pairs.append((feature, seq.to(device)))

if not training_pairs:
    raise ValueError("No training captions available")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    step_count = 0

    for feature, seq in training_pairs:
        # Teacher forcing: predict token t+1 from tokens up to t.
        inputs = seq[:, :-1]
        targets = seq[:, 1:]

        outputs = model(feature, inputs)
        # Flatten to [steps, vocab] vs [steps] as CrossEntropyLoss expects.
        loss = criterion(
            outputs.reshape(-1, vocab_size),
            targets.reshape(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        step_count += 1

    avg_loss = total_loss / step_count
    print(f"Epoch {epoch+1}/{EPOCHS} | Avg Loss: {avg_loss:.4f}")

Epoch 1/50 | Avg Loss: 2.4964
Epoch 2/50 | Avg Loss: 2.1198
Epoch 3/50 | Avg Loss: 1.9782
Epoch 4/50 | Avg Loss: 1.8790
Epoch 5/50 | Avg Loss: 1.7965
Epoch 6/50 | Avg Loss: 1.7239
Epoch 7/50 | Avg Loss: 1.6573
Epoch 8/50 | Avg Loss: 1.5955
Epoch 9/50 | Avg Loss: 1.5368
Epoch 10/50 | Avg Loss: 1.4816
Epoch 11/50 | Avg Loss: 1.4283
Epoch 12/50 | Avg Loss: 1.3796
Epoch 13/50 | Avg Loss: 1.3341
Epoch 14/50 | Avg Loss: 1.2913
Epoch 15/50 | Avg Loss: 1.2534
Epoch 16/50 | Avg Loss: 1.2146
Epoch 17/50 | Avg Loss: 1.1804
Epoch 18/50 | Avg Loss: 1.1489
Epoch 19/50 | Avg Loss: 1.1175
Epoch 20/50 | Avg Loss: 1.0927
Epoch 21/50 | Avg Loss: 1.0666
Epoch 22/50 | Avg Loss: 1.0419
Epoch 23/50 | Avg Loss: 1.0184
Epoch 24/50 | Avg Loss: 0.9992
Epoch 25/50 | Avg Loss: 0.9782
Epoch 26/50 | Avg Loss: 0.9595
Epoch 27/50 | Avg Loss: 0.9461
Epoch 28/50 | Avg Loss: 0.9330
Epoch 29/50 | Avg Loss: 0.9170
Epoch 30/50 | Avg Loss: 0.9038
Epoch 31/50 | Avg Loss: 0.8897
Epoch 32/50 | Avg Loss: 0.8747
Epoch 33/50 | Avg

In [18]:
# Persist only the model's state_dict (its parameters) to Drive.
torch.save(model.state_dict(), MODEL_PATH)
print("Model saved to Google Drive:", MODEL_PATH)

Model saved to Google Drive: /content/drive/MyDrive/FYP_Models/Image_Captioning/resnet50_model.pth


In [19]:
# Reload the saved parameters into the existing model instance; map_location
# places the loaded tensors on the current device.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Switch to evaluation mode.
model.eval()
print("Model loaded from Drive")

Model loaded from Drive
