In [195]:
import torch
import os
import pandas as pd
from kagglehub import dataset_load
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader,dataset
from PIL import Image
import spacy

In [196]:
import spacy
import os
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset

# It's good practice to load the spacy model once globally
spacy_eng = spacy.load("en_core_web_sm")

class Vocabulary:
    def __init__(self, frequency_threshold):
        # Initial tokens
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>",}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3,}
        self.frequency_threshold = frequency_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4  # Start index after special tokens

        # Step 1: Count all word frequencies across all sentences
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

        # Step 2: Add words to vocab if they meet the frequency threshold
        for word, count in frequencies.items():
            if count >= self.frequency_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [197]:
class FlickerDataset(Dataset):
    def __init__(self, root_dir, caption_file, vocab, transform=None):
        self.root_dir = root_dir
        self.df = pd.read_csv(caption_file)
        self.transform = transform

        # Use the vocabulary passed as an argument
        self.vocab = vocab

        self.images = self.df["images"]
        self.captions = self.df["caption"]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        try:
            img_path = os.path.join(self.root_dir, self.df.iloc[index, 0])
            caption = self.df.iloc[index, 1]
            image = Image.open(img_path).convert("RGB")

            if self.transform:
                image = self.transform(image)

            numericalized_caption = [self.vocab.stoi["<SOS>"]]
            numericalized_caption += self.vocab.numericalize(caption)
            numericalized_caption.append(self.vocab.stoi["<EOS>"])

            return image,torch.tensor(numericalized_caption)

        except Exception as e:
            print(f"❌ Error at index {index} — {self.df.iloc[index, 0]}")
            raise e


In [198]:
df.head()


Unnamed: 0,images,caption
1,1000092795.jpg,Two young guys with shaggy hair look at their...
2,1000092795.jpg,"Two young , White males are outside near many..."
3,1000092795.jpg,Two men in green shirts are standing in a yard .
4,1000092795.jpg,A man in a blue shirt standing in a garden .
5,1000092795.jpg,Two friends enjoy time spent together .


In [199]:
df.to_csv("captions.csv", index=False)


In [200]:
voab=Vocabulary(10)

In [201]:
voab.build_vocabulary(caption_list)

In [202]:
caption_list=df["caption"].dropna().astype(str).tolist()

In [203]:
len(caption_list)

158914

In [204]:
voab.itos

{0: '<PAD>',
 1: '<SOS>',
 2: '<EOS>',
 3: '<UNK>',
 4: ' ',
 5: 'two',
 6: 'young',
 7: 'guys',
 8: 'with',
 9: 'shaggy',
 10: 'hair',
 11: 'look',
 12: 'at',
 13: 'their',
 14: 'hands',
 15: 'while',
 16: 'hanging',
 17: 'out',
 18: 'in',
 19: 'the',
 20: 'yard',
 21: '.',
 22: ',',
 23: 'white',
 24: 'males',
 25: 'are',
 26: 'outside',
 27: 'near',
 28: 'many',
 29: 'bushes',
 30: 'men',
 31: 'green',
 32: 'shirts',
 33: 'standing',
 34: 'a',
 35: 'man',
 36: 'blue',
 37: 'shirt',
 38: 'garden',
 39: 'friends',
 40: 'enjoy',
 41: 'time',
 42: 'together',
 43: 'several',
 44: 'hard',
 45: 'hats',
 46: 'operating',
 47: 'giant',
 48: 'pulley',
 49: 'system',
 50: 'workers',
 51: 'down',
 52: 'from',
 53: 'up',
 54: 'above',
 55: 'on',
 56: 'piece',
 57: 'of',
 58: 'equipment',
 59: 'working',
 60: 'machine',
 61: 'wearing',
 62: 'four',
 63: 'top',
 64: 'tall',
 65: 'structure',
 66: 'three',
 67: 'large',
 68: 'rig',
 69: 'child',
 70: 'pink',
 71: 'dress',
 72: 'is',
 73: 'climbing

In [205]:
from torchvision import transforms

In [206]:
transforms= transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
        transforms.Normalize(
        mean=[0.485, 0.456, 0.406],   # ImageNet mean
        std=[0.229, 0.224, 0.225]     # ImageNet std
    )
])

In [207]:
dataset = FlickerDataset(
    root_dir=r"C:\Users\swaya\Desktop\Timepass\python\archive\flickr30k_images\flickr30k_images",  # folder of .jpgs
    caption_file="captions.csv",
    vocab=voab,
    transform=transforms
)


In [208]:
img, caption = dataset[0]

print(f"Image shape: {img.shape}")
print(f"Caption indices: {caption}")
print(f"Decoded: {[voab.itos[token.item()] for token in caption]}")


Image shape: torch.Size([3, 224, 224])
Caption indices: tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21,  2])
Decoded: ['<SOS>', ' ', 'two', 'young', 'guys', 'with', 'shaggy', 'hair', 'look', 'at', 'their', 'hands', 'while', 'hanging', 'out', 'in', 'the', 'yard', '.', '<EOS>']


In [209]:
img=df['images']

In [210]:
img

1         1000092795.jpg
2         1000092795.jpg
3         1000092795.jpg
4         1000092795.jpg
5         1000092795.jpg
               ...      
158911     998845445.jpg
158912     998845445.jpg
158913     998845445.jpg
158914     998845445.jpg
158915     998845445.jpg
Name: images, Length: 158915, dtype: object

In [211]:
df

Unnamed: 0,images,caption
1,1000092795.jpg,Two young guys with shaggy hair look at their...
2,1000092795.jpg,"Two young , White males are outside near many..."
3,1000092795.jpg,Two men in green shirts are standing in a yard .
4,1000092795.jpg,A man in a blue shirt standing in a garden .
5,1000092795.jpg,Two friends enjoy time spent together .
...,...,...
158911,998845445.jpg,A man in shorts and a Hawaiian shirt leans ov...
158912,998845445.jpg,"A young man hanging over the side of a boat ,..."
158913,998845445.jpg,A man is leaning off of the side of a blue an...
158914,998845445.jpg,"A man riding a small boat in a harbor , with ..."


In [212]:
from torch.nn.utils.rnn import pad_sequence

class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        # batch = list of (image, caption_tensor)
        images = [item[0].unsqueeze(0) for item in batch]  # shape: [1, 3, H, W]
        captions = [item[1] for item in batch]  # shape: variable lengths

        images = torch.cat(images, dim=0)  # Now shape: [batch_size, 3, H, W]
        captions = pad_sequence(captions, batch_first=True, padding_value=self.pad_idx)

        return images, captions


In [213]:
pad_idx=voab.stoi["<PAD>"]

In [214]:
pad_idx

0

In [215]:
Data_loader=DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=MyCollate(pad_idx=pad_idx)
)

In [223]:
import torch
import torch.nn as nn

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """
        Initializes the EncoderCNN.
        The input size for the fully connected layer is determined automatically.
        """
        super(EncoderCNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )

        # Create a dummy input tensor to pass through the CNN
        # to determine the shape of the output.
        # The input is (batch_size, channels, height, width)
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 224, 224)
            cnn_output = self.cnn(dummy_input)

        # Calculate the number of features after the conv layers
        # cnn_output.shape will be (1, 64, 28, 28)
        # We flatten this to get the input size for the linear layer
        flattened_size = cnn_output.view(1, -1).size(1)

        # Now, initialize the fully connected layer with the correct input size
        self.fc = nn.Linear(flattened_size, embed_size)

    def forward(self, images):
        """The forward pass for the encoder."""
        # 1. Pass images through convolutional layers
        features = self.cnn(images)

        # 2. Flatten the features. The -1 infers the size from other dimensions.
        # Shape changes from (batch_size, 64, 28, 28) to (batch_size, 50176)
        features = features.view(features.size(0), -1)

        # 3. Pass flattened features through the fully connected layer
        features = self.fc(features)

        return features

# --- How to use it ---
# embed_size = 256
# encoder = EncoderCNN(embed_size)
# print(encoder)

# # Test with a dummy batch of images
# dummy_images = torch.randn(32, 3, 224, 224) # batch_size=32
# output_features = encoder(dummy_images)
# print(f"Output shape: {output_features.shape}") # Should be (32, 256)

In [224]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions[:, :-1])  # remove <EOS>
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs


In [225]:
embed_size = 256
hidden_size = 512
vocab_size = len(voab)
num_layers = 1

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

criterion = nn.CrossEntropyLoss(ignore_index=voab.stoi["<PAD>"])
params = list(decoder.parameters()) + list(encoder.parameters())
optimizer = optim.Adam(params, lr=0.001)


In [226]:
for epoch in range(3):
    for idx, (imgs, captions) in enumerate(Data_loader):
        features = encoder(imgs)
        # Slice the outputs to match the captions' length before reshaping
        outputs = outputs[:, :-1, :]

        loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions[:, 1:].reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if idx % 100 == 0:
            print(f"Epoch [{epoch}], Step [{idx}], Loss: {loss.item():.4f}")


ValueError: Expected input batch_size (736) to match target batch_size (928).

In [227]:
dummy_input = torch.randn(1, 3, 224, 224)
encoder = EncoderCNN(embed_size=256)
output = encoder(dummy_input)
print(output.shape)


torch.Size([1, 256])


In [188]:
encoder = EncoderCNN(embed_size=256)
dummy_input = torch.randn(1, 3, 224, 224)
output = encoder(dummy_input)
print(output.shape)  # should be [1, 256]


torch.Size([1, 256])


In [183]:
flattened = output.view(1, -1)
print("Flattened shape:", flattened.shape)


Flattened shape: torch.Size([1, 50176])
