<a href="https://colab.research.google.com/github/PhkhakadzeJumber/deep-learning-final/blob/main/data_training_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there (the caption
# data used further down lives under /content/drive/MyDrive) are readable
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

# Disabled: change into the notebook folder on Drive.
#cd /content/drive/MyDrive/Colab\ Notebooks/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create a working folder on the Colab VM and make it the current directory.
# `-p` keeps mkdir quiet when the folder already exists.
%mkdir -p /content/my_repo
%cd /content/my_repo

# Clone the project repository into the current directory ('.').
# NOTE(review): the recorded output of this cell shows the clone failing with
# "could not read Username for 'https://github.com'", i.e. git asked for
# credentials that a notebook cell cannot supply -- confirm the repository is
# public or provide a token. Also, `git clone <url> .` refuses to run in a
# non-empty directory, so this cell is not re-runnable after a successful clone.
!git clone https://github.com/PhkhakadzeJumber/deep-learning-final.git .


/content/my_repo
Cloning into '.'...
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
# ==========================================
# 1. SETUP & IMPORTS
# ==========================================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

import pandas as pd
import numpy as np
import random
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from PIL import Image
import torchvision.transforms as transforms
from collections import defaultdict

In [None]:
# Reproducibility: seed every random number generator the notebook relies on
# (Python's `random`, NumPy's legacy global RNG, and PyTorch on CPU and CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Explicitly seed all CUDA devices as well; this call is silently ignored
# when CUDA is not available.
torch.cuda.manual_seed_all(SEED)

# cuDNN: request deterministic algorithms AND switch off the autotuner.
# With benchmark mode left on, cuDNN may pick different convolution
# algorithms from run to run, which undermines `deterministic = True`.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Device configuration (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
import csv

import sentencepiece as spm

file_path = '/content/drive/MyDrive/caption_data/captions.txt'

# Read the caption file, dropping blank lines and surrounding whitespace.
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Remove header
lines = lines[1:]

# Parse every "<image>,<caption>" record into a lower-cased (image, caption)
# pair. Each line is parsed on its own with the csv module so that a caption
# wrapped in double quotes (the CSV convention for fields containing commas)
# is unquoted instead of keeping literal '"' characters, which would
# otherwise leak into the tokenizer's training corpus.
img_caption_pairs = []

for line in lines:
    row = next(csv.reader([line]))
    if len(row) < 2:
        raise ValueError(
            f"Malformed caption line (expected 'image,caption'): {line!r}"
        )
    img = row[0]
    # Only the first comma separates image from caption; re-join any further
    # fields so commas inside an unquoted caption are preserved.
    caption = ','.join(row[1:])
    img_caption_pairs.append((img, caption.lower()))

print("First (image, caption) pair:")
print(img_caption_pairs[0])

# Save only the captions, one per line, as the tokenizer's training corpus.
# NOTE(review): this corpus contains every caption, including those of images
# that the later train/val split assigns to validation -- confirm that is
# acceptable for the evaluation protocol.
captions_file = '/content/captions_clean.txt'

with open(captions_file, 'w', encoding='utf-8') as f:
    for _, caption in img_caption_pairs:
        f.write(caption + '\n')

# Train a BPE SentencePiece model; it is written to /content/spm.model.
# The special-token ids are pinned explicitly so the rest of the notebook can
# rely on them: <pad>=0, <unk>=1, <s>=2, </s>=3.
spm.SentencePieceTrainer.train(
    input=captions_file,
    model_prefix='/content/spm',
    vocab_size=8000,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)

# Load the freshly trained tokenizer.
sp = spm.SentencePieceProcessor()
sp.load('/content/spm.model')

# Build a piece -> id lookup covering the whole vocabulary.
vocab = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}

print("Vocabulary size:", len(vocab))
print("Special tokens:")
print({k: v for k, v in vocab.items() if k in ["<pad>", "<unk>", "<s>", "</s>"]})

# Example: subword tokenization of the first caption, as pieces and as ids.
first_caption = img_caption_pairs[0][1]

subword_tokens = sp.encode(first_caption, out_type=str)
subword_ids = sp.encode(first_caption, out_type=int)

print("\nFirst caption:")
print(first_caption)

print("\nSubword tokens:")
print(subword_tokens)

print("\nSubword token IDs:")
print(subword_ids)


First (image, caption) pair:
('1000268201_693b08cb0e.jpg', 'a child in a pink dress is climbing up a set of stairs in an entry way .')
Vocabulary size: 8000
Special tokens:
{'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3}

First caption:
a child in a pink dress is climbing up a set of stairs in an entry way .

Subword tokens:
['▁a', '▁child', '▁in', '▁a', '▁pink', '▁dress', '▁is', '▁climbing', '▁up', '▁a', '▁set', '▁of', '▁stairs', '▁in', '▁an', '▁ent', 'ry', '▁way', '▁.']

Subword token IDs:
[4, 128, 15, 4, 325, 270, 40, 414, 207, 4, 719, 46, 1045, 15, 135, 1879, 715, 1603, 7]


In [None]:
from PIL import Image
import torchvision.transforms as transforms

class ImageCaptionDataset(Dataset):
    """Dataset of (image, tokenized caption, caption length) samples.

    Args:
        img_caption_pairs: indexable collection of ``(image_name, caption)``
            pairs; one sample is produced per pair.
        sp: tokenizer exposing ``bos_id()``, ``eos_id()`` and
            ``encode(text, out_type=int)``.
        image_root: directory the image names are resolved against.
        transform: optional callable applied to the loaded RGB image.
    """

    def __init__(self, img_caption_pairs, sp, image_root, transform=None):
        self.data = img_caption_pairs
        self.sp = sp
        self.image_root = image_root
        self.transform = transform

        # Look the sentence-boundary ids up once instead of per sample.
        self.bos_id = sp.bos_id()
        self.eos_id = sp.eos_id()

    def __len__(self):
        """Number of (image, caption) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, caption_tensor, caption_length)`` for sample ``idx``.

        The caption tensor is a 1-D ``torch.long`` tensor holding the token
        ids wrapped in the BOS and EOS ids; the length counts those two ids.
        """
        img_name, caption = self.data[idx]

        # Image: open from disk, force three channels, then optional transform.
        image = Image.open(f"{self.image_root}/{img_name}").convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Caption: subword ids framed by BOS ... EOS.
        token_ids = [self.bos_id, *self.sp.encode(caption, out_type=int), self.eos_id]
        caption_tensor = torch.tensor(token_ids, dtype=torch.long)

        return image, caption_tensor, caption_tensor.size(0)

# Preprocessing applied to every image before batching: resize to a fixed
# 256x256 so images can be stacked, then convert the PIL image to a tensor.
_preprocess_steps = [
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
]
image_transform = transforms.Compose(_preprocess_steps)

In [None]:
def collate_fn(batch, pad_id=None):
    """Merge dataset samples into one mini-batch with padded captions.

    Args:
        batch: sequence of ``(image, caption, length)`` triples, where the
            images are equally shaped tensors and each caption is a 1-D
            tensor of token ids.
        pad_id: value used to right-pad captions up to the longest caption
            in the batch. When ``None`` (the default, which is what a
            ``DataLoader`` calling ``collate_fn(batch)`` gets) the id is read
            from the notebook-level tokenizer ``sp``; passing it explicitly
            removes that dependency on global state.

    Returns:
        ``(images, captions_padded, lengths)``: the images stacked along a
        new leading batch dimension, the captions as a
        ``(batch, max_len)`` tensor, and the caption lengths as a 1-D
        ``torch.long`` tensor.
    """
    if pad_id is None:
        pad_id = sp.pad_id()

    images, captions, lengths = zip(*batch)

    images = torch.stack(images, dim=0)

    captions_padded = pad_sequence(
        list(captions),
        batch_first=True,
        padding_value=pad_id
    )

    lengths = torch.tensor(lengths, dtype=torch.long)

    return images, captions_padded, lengths


In [None]:
# Train/validation split, done per image rather than per caption so that all
# captions of one image end up on the same side of the split.

# Group the captions under the image they describe.
img_to_captions = defaultdict(list)
for img_name, text in img_caption_pairs:
    img_to_captions[img_name].append(text)

all_images = list(img_to_captions)

# 80/20 split of the image names, seeded for repeatability.
train_images, val_images = train_test_split(
    all_images, test_size=0.2, random_state=SEED
)

# Expand each image list back into flat (image, caption) pairs.
train_pairs = [
    (img_name, text)
    for img_name in train_images
    for text in img_to_captions[img_name]
]
val_pairs = [
    (img_name, text)
    for img_name in val_images
    for text in img_to_captions[img_name]
]

In [None]:
image_root = '/content/drive/MyDrive/caption_data/Images'

# Both splits share the tokenizer, the image folder and the preprocessing.
train_dataset = ImageCaptionDataset(train_pairs, sp, image_root, transform=image_transform)
val_dataset = ImageCaptionDataset(val_pairs, sp, image_root, transform=image_transform)

# Loader settings common to both splits; only shuffling differs
# (on for training, off for validation).
shared_loader_args = dict(batch_size=64, num_workers=2, collate_fn=collate_fn)

train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

# Sanity check: pull one training batch and inspect its shapes.
images, captions, lengths = next(iter(train_loader))

print("Images:", images.shape)        # (B, 3, 256, 256)
print("Captions:", captions.shape)    # (B, max_len)
print("Lengths:", lengths)

Images: torch.Size([64, 3, 256, 256])
Captions: torch.Size([64, 24])
Lengths: tensor([17,  9, 13, 10, 17, 17, 14,  9, 12, 16,  9, 12, 12, 12, 20, 16, 11, 20,
        16, 22, 11, 10, 19, 21, 12, 11, 20, 17, 14, 10, 12, 15, 24, 10, 12, 14,
        16, 17,  7, 21, 19, 10, 12, 14, 15, 14, 11, 20, 12,  9, 15,  9, 15, 15,
        19,  9, 13, 12, 21, 18, 18, 13, 14, 12])
