# Note
All paths are set up for Google Colab; adjust them if you are running locally.

# Using Kaggle to download the dataset

In [None]:
from google.colab import files
import kagglehub

# Prompt for the Kaggle API token file (kaggle.json).
# files.upload() returns a dict mapping each uploaded filename to its raw
# bytes. Binding it to a name keeps it from being the cell's displayed value,
# which would otherwise write the API key into the notebook's saved output.
uploaded = files.upload()

In [None]:
# Copy the uploaded kaggle.json from the working directory into ~/.kaggle.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Download the Flickr30k dataset; per the kagglehub docs the return value is
# the local directory that holds the downloaded files.
path = kagglehub.dataset_download("eeshawn/flickr30k")

# Mounting Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the Drive output paths used later in
# this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

# Processing captions
Requires `preprocess_captions.py` to be present in `/content` (it is imported in the next cell).

In [None]:
import sys
# The helper script is expected in /content, so that directory is added to the
# module search path before importing from it.
sys.path.append('/content')
from preprocess_captions import preprocess_and_save

In [None]:
# Build the captions path from the directory kagglehub returned in `path`
# instead of a hard-coded cache location, so it stays valid if the dataset
# version or the home directory differs.
csv_path = f"{path}/captions.txt" # or point this at your own copy of captions.txt
local_out_caption = "/content/processed_captions" # local output
drive_out_caption = "/content/drive/MyDrive/captioneer/processed_captions" # optional for saving to google drive

In [None]:
# Run the caption preprocessing script's entry point on the raw captions file,
# writing its outputs to the local output directory, then show the "meta"
# entry of what it returned.
result = preprocess_and_save(
    csv_path,
    local_out_caption,
    vocab_size=5000,
    max_len=30,
)
caption_meta = result["meta"]
print(caption_meta)

{'vocab_size': 4998, 'num_images': 31783, 'max_len': 30, 'split_counts': {'train': 25426, 'val': 3178, 'test': 3179}}


# Checking captions

## Checking vocab

In [None]:
import json
import os  # must be imported here: this is the first cell that uses it

# Load the word -> index vocabulary written by the caption preprocessing step.
vocab_path = os.path.join(local_out_caption, "vocab_word2idx.json")
with open(vocab_path, "r") as f:
    word2idx = json.load(f)

# Sanity check: vocabulary size and the first few entries in file order.
print("Vocabulary size:", len(word2idx))
print("Some sample words:", list(word2idx.keys())[:20])

Vocabulary size: 4998
Some sample words: ['<PAD>', '<UNK>', '<START>', '<END>', 'a', 'in', 'the', 'on', 'man', 'and', 'is', 'of', 'with', 'woman', 'two', 'are', 'people', 'to', 'at', 'an']


## Checking captions

In [None]:
# Imported in this cell so it runs on a fresh kernel without depending on
# imports made in other cells.
import json
import os
import random

# Load the cleaned captions, keyed by image filename.
with open(os.path.join(local_out_caption, "cleaned_captions.json"), "r") as f:
    cleaned_captions = json.load(f)

# Pick up to 5 random images and print their captions. Capping the sample size
# avoids a ValueError from random.sample when fewer than 5 images are present.
image_names = list(cleaned_captions.keys())
sample_imgs = random.sample(image_names, min(5, len(image_names)))

for img in sample_imgs:
    print("\nImage:", img)
    for cap in cleaned_captions[img]:
        print(" ", cap)



Image: 8063954905.jpg
  <START> a young girl in a traditional white karate uniform demonstrates her skill <END>
  <START> a little girl is practicing karate with people in the background <END>
  <START> young girl performing martial arts very passionately <END>
  <START> a girl is performing karate in front of people <END>
  <START> a young girl in a karate competition <END>

Image: 4824548067.jpg
  <START> a blond woman giving a piggyback ride to a blond girl in a pink coat and headphones <END>
  <START> a woman carries a child wearing green earphones on her back <END>
  <START> a woman holding her daughter on her shoulders walking <END>
  <START> a chile in a pink jacket on a woman shoulders <END>
  <START> a woman carries a young girl on her shoulders <END>

Image: 4677416815.jpg
  <START> a woman with long hair and wearing high heels is carrying something as she is walking beside a fence <END>
  <START> a woman wearing high heels and a long white screen printed tshirt walks by car

## Checking sequences

In [None]:
# Imported in this cell so it runs on a fresh kernel without depending on
# imports made in other cells.
import json
import os

# Load the training-split sequences, keyed by image filename.
with open(os.path.join(local_out_caption, "sequences_train.json"), "r") as f:
    sequences_train = json.load(f)

# Show first image sequence. next(iter(...)) takes the first key directly
# instead of building a list of every key just to index element 0.
first_img = next(iter(sequences_train))
print("\nFirst training image:", first_img)
print("Sequences (word indices):", sequences_train[first_img])



First training image: 3328495660.jpg
Sequences (word indices): [[5, 45, 15, 571, 4065, 9, 4, 1, 27, 80, 15, 2956, 4, 1, 114, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 16, 93, 1, 4, 861, 30, 7, 1500, 402, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 18, 17, 402, 259, 4, 95, 2836, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 49, 156, 137, 83, 3011, 1056, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 49, 156, 7, 4, 1500, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


# Saving to drive

In [None]:
import os
import shutil

# Mirror the local caption outputs to Drive. Any copy left there by an earlier
# run is deleted first, so the Drive folder ends up matching the local one.
stale_copy_present = os.path.exists(drive_out_caption)
if stale_copy_present:
    shutil.rmtree(drive_out_caption)

shutil.copytree(local_out_caption, drive_out_caption)
print("Saved preprocessed captions to Drive at:", drive_out_caption)

# Processing Images
Requires `preprocess_images.py` to be present in `/content` (it is imported in the next cell).

In [None]:
from preprocess_images import preprocess_images

# Locate the images under the directory kagglehub returned in `path` instead of
# a hard-coded cache location, so it stays valid if the dataset version or the
# home directory differs.
img_dir = f"{path}/flickr30k_images" # or point this at your own image folder
local_out_img = '/content/processed_images' # local output
drive_out_img = '/content/drive/MyDrive/captioneer/processed_images' # optional for saving to google drive

# Run the image preprocessing script's entry point and report how many images
# it says it handled.
result = preprocess_images(img_dir, local_out_img, batch_size=32)
print(f"Extracted features for {result['num_images']} images")


Extracting features:   0%|          | 0/994 [00:00<?, ?it/s]

Extracted features for 31783 images


# Checking Features

In [None]:
import os

import numpy as np

# Spot-check a single saved feature file by loading it and printing its shape.
sample_file = os.path.join(local_out_img, '1000092795.npy')
feat = np.load(sample_file)
feat_shape = feat.shape  # should be (2048,)
print("Feature shape:", feat_shape)


Feature shape: (2048,)


# Saving to drive

In [None]:
import os
import shutil

# shutil.copytree raises FileExistsError when the destination already exists,
# so a second run of this cell used to fail. Remove any previous copy first,
# the same way the captions are saved to Drive earlier in this notebook.
if os.path.exists(drive_out_img):
    shutil.rmtree(drive_out_img)

shutil.copytree(local_out_img, drive_out_img)
print(f"Copied features to Drive at {drive_out_img}")

# Saving Scripts To Drive

In [None]:
import os
import shutil

# Helper scripts to back up, and the Drive folder that receives them.
captions_script_src = '/content/preprocess_captions.py'
images_script_src = '/content/preprocess_images.py'
scripts_dir = '/content/drive/MyDrive/captioneer/scripts'

os.makedirs(scripts_dir, exist_ok=True)

for script_path in (captions_script_src, images_script_src):
    target = os.path.join(scripts_dir, os.path.basename(script_path))
    # Drop any earlier copy so the file on Drive is replaced outright.
    already_there = os.path.exists(target)
    if already_there:
        os.remove(target)
    shutil.copy(script_path, target)

print(f"Scripts copied to {scripts_dir} (existing files replaced)")