In [2]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'  # Drive contents appear under this directory
drive.mount(DRIVE_MOUNT_POINT)

# Optional: change into the notebook folder (kept disabled)
#cd /content/drive/MyDrive/Colab\ Notebooks/


Mounted at /content/drive


In [3]:
# ==========================================
# 1. SETUP & IMPORTS
# ==========================================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

import pandas as pd
import numpy as np
import random
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# BEST PRACTICE: seed every random number generator the notebook relies on,
# so a Restart & Run All reproduces the same numbers.
SEED = 42
random.seed(SEED)         # Python's built-in RNG
np.random.seed(SEED)      # NumPy's legacy global RNG
torch.manual_seed(SEED)   # PyTorch RNG

# Request deterministic cuDNN kernels AND keep the cuDNN autotuner off:
# with benchmarking enabled, cuDNN may select a different algorithm from
# one run to the next, which defeats the deterministic flag.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Device configuration (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [20]:
import sentencepiece as spm

file_path = '/content/drive/MyDrive/caption_data/captions.txt'

# Read the captions file, keeping only non-blank lines (whitespace-trimmed).
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# The first non-blank line is treated as the header row and dropped.
lines = lines[1:]

# Each remaining row is "<image>,<caption>". Split on the FIRST comma only,
# so commas inside the caption text are preserved.
# NOTE(review): this assumes fields are not CSV-quoted -- confirm against the
# file; if they are, parse with the csv module instead.
img_caption_pairs = []
for row_number, line in enumerate(lines, start=2):  # header is non-blank row 1
    img, sep, caption = line.partition(',')
    if not sep:
        # Fail with a message that points at the offending row instead of an
        # opaque tuple-unpacking error.
        raise ValueError(
            f"{file_path}: non-blank row {row_number} has no ',' separator: {line!r}"
        )
    # Trim whitespace adjacent to the comma; captions are lower-cased.
    img_caption_pairs.append((img.strip(), caption.strip().lower()))

# Everything downstream indexes the first pair, so stop early with a clear
# error when the file held no data rows.
if not img_caption_pairs:
    raise ValueError(f"No (image, caption) rows found in {file_path}")

print("First (image, caption) pair:")
print(img_caption_pairs[0])

# Dump the bare captions, one per line, to the file the tokenizer trains on.
captions_file = '/content/captions_clean.txt'

with open(captions_file, 'w', encoding='utf-8') as corpus_out:
    corpus_out.writelines(caption + '\n' for _, caption in img_caption_pairs)

# Train a BPE SentencePiece model on the captions file. The four special
# tokens are pinned to fixed ids: pad=0, unk=1, bos=2, eos=3.
trainer_options = dict(
    input=captions_file,
    model_prefix='/content/spm',
    vocab_size=8000,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**trainer_options)

# Load the trained model into a processor used for encoding below.
sp = spm.SentencePieceProcessor()
sp.load('/content/spm.model')

# Map every piece string known to the tokenizer to its integer id.
vocab = {}
for piece_id in range(sp.get_piece_size()):
    vocab[sp.id_to_piece(piece_id)] = piece_id

print("Vocabulary size:", len(vocab))
print("Special tokens:")
special_pieces = ["<pad>", "<unk>", "<s>", "</s>"]
print({piece: idx for piece, idx in vocab.items() if piece in special_pieces})

# Demo: encode the first caption both as subword pieces and as ids.
first_caption = img_caption_pairs[0][1]

subword_tokens = sp.encode(first_caption, out_type=str)
subword_ids = sp.encode(first_caption, out_type=int)

# Each heading and its value go on separate lines (sep="\n").
print("\nFirst caption:", first_caption, sep="\n")
print("\nSubword tokens:", subword_tokens, sep="\n")
print("\nSubword token IDs:", subword_ids, sep="\n")


First (image, caption) pair:
('1000268201_693b08cb0e.jpg', 'a child in a pink dress is climbing up a set of stairs in an entry way .')
Vocabulary size: 8000
Special tokens:
{'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3}

First caption:
a child in a pink dress is climbing up a set of stairs in an entry way .

Subword tokens:
['▁a', '▁child', '▁in', '▁a', '▁pink', '▁dress', '▁is', '▁climbing', '▁up', '▁a', '▁set', '▁of', '▁stairs', '▁in', '▁an', '▁ent', 'ry', '▁way', '▁.']

Subword token IDs:
[4, 128, 15, 4, 325, 270, 40, 414, 207, 4, 719, 46, 1045, 15, 135, 1879, 715, 1603, 7]
