# Note
Paths are set for Google Colab; adjust them if you are running locally.

# Using Kaggle to download the dataset

In [None]:
import os

from google.colab import files
import kagglehub

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns the uploaded file contents, and echoing that value
# would write the Kaggle API key into the saved cell output.
uploaded = files.upload()
print("Uploaded files:", list(uploaded))

# The credential-setup cell copies a file named exactly "kaggle.json" from the
# working directory, so fail early with a clear message if it is not there.
if not os.path.isfile("kaggle.json"):
    raise FileNotFoundError(
        "kaggle.json was not found in the working directory; upload your Kaggle API token under that name."
    )

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle and restrict it to owner read/write.
# NOTE(review): presumably this is where kagglehub looks for credentials - confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the Flickr30k dataset; `path` receives kagglehub's return value
# (presumably the local directory that holds the dataset files - confirm).
path = kagglehub.dataset_download("eeshawn/flickr30k")

# Mounting Drive

In [None]:
from google.colab import drive

# Mount Google Drive; the Drive output paths used later live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Processing captions
Requires `preprocess_captions.py` to be importable (the cell below adds `/content` to the import path, so place the file there).

In [None]:
import sys

# Add /content to the module search path so a preprocess_captions.py placed
# there can be imported.
helper_module_dir = '/content'
sys.path.append(helper_module_dir)

from preprocess_captions import preprocess_and_save

In [None]:
import os

# Resolve the captions file relative to the directory kagglehub returned for the
# download (`path`) rather than hard-coding the cache location and dataset
# version, which breaks when either one changes.
csv_path = os.path.join(path, "captions.txt") # update if your dataset lives elsewhere
local_out_caption = "/content/processed_captions" # local output
drive_out_caption = "/content/drive/MyDrive/captioneer/processed_captions" # optional for saving to google drive

In [None]:
# Run the caption preprocessing and show the metadata it reports.
result = preprocess_and_save(
    csv_path,
    local_out_caption,
    vocab_size=5000,
    max_len=30,
)
caption_meta = result["meta"]
print(caption_meta)

{'vocab_size': 4998, 'num_images': 31783, 'max_len': 30, 'split_counts': {'train': 25426, 'val': 3178, 'test': 3179}}


# Checking captions

## Checking vocab

In [None]:
import json
import os

# Load the word -> index mapping from the caption output directory.
vocab_path = os.path.join(local_out_caption, "vocab_word2idx.json")
with open(vocab_path, "r") as vocab_file:
    word2idx = json.load(vocab_file)

# Sanity check: vocabulary size and its first 20 entries.
vocab_words = list(word2idx)
print("Vocabulary size:", len(word2idx))
print("Some sample words:", vocab_words[:20])

Vocabulary size: 4998
Some sample words: ['<PAD>', '<UNK>', '<START>', '<END>', 'a', 'in', 'the', 'on', 'man', 'and', 'is', 'of', 'with', 'woman', 'two', 'are', 'people', 'to', 'at', 'an']


## Checking captions

In [None]:
import json
import os
import random

with open(os.path.join(local_out_caption, "cleaned_captions.json"), "r") as f:
    cleaned_captions = json.load(f)

# Spot-check a handful of images. A dedicated, seeded generator makes the
# sample reproducible across runs without touching the global random state.
NUM_SAMPLE_IMAGES = 5
SAMPLE_SEED = 42
image_names = list(cleaned_captions.keys())
sample_rng = random.Random(SAMPLE_SEED)
# Cap the request at the number of images available; random.sample raises
# ValueError when asked for more items than the population holds.
sample_imgs = sample_rng.sample(image_names, min(NUM_SAMPLE_IMAGES, len(image_names)))

for img in sample_imgs:
    print("\nImage:", img)
    for cap in cleaned_captions[img]:
        print(" ", cap)



Image: 7173939318.jpg
  <START> amidst a busy dock comes a red and white ship with a landscape of mountains and possibly middleeastern territory <END>
  <START> pier with groups of people some with fishing poles boats in water and buildings and monuments in background <END>
  <START> a beautiful seaside scene with people on a dock and a large white building in the background <END>
  <START> a boat pear with people boarding and disembarking some boats <END>
  <START> a group of people on a dock by a city boarding various boats <END>

Image: 3014615499.jpg
  <START> a man in a red shirt is being filmed by another man on a sports field with a huge audience of people watching and cheering <END>
  <START> a man in athletic attire walks across a green grassy sports field while cameramen look on <END>
  <START> a lone soccer player being spectated by many out on the field <END>
  <START> a guy is running on a soccer field by a filmier <END>
  <START> the crowd cheers a player on the field <E

## Checking sequences

In [None]:
# Load the training sequences from the caption output directory.
train_seq_path = os.path.join(local_out_caption, "sequences_train.json")
with open(train_seq_path, "r") as seq_file:
    sequences_train = json.load(seq_file)

# Show the sequences stored for the first image in the file.
first_img = list(sequences_train)[0]
first_img_sequences = sequences_train[first_img]
print("\nFirst training image:", first_img)
print("Sequences (word indices):", first_img_sequences)



First training image: 3328495660.jpg
Sequences (word indices): [[5, 45, 15, 571, 4065, 9, 4, 1, 27, 80, 15, 2956, 4, 1, 114, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 16, 93, 1, 4, 861, 30, 7, 1500, 402, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 18, 17, 402, 259, 4, 95, 2836, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 49, 156, 137, 83, 3011, 1056, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 47, 49, 156, 7, 4, 1500, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


# Saving to drive

In [None]:
import os
import shutil

# Drop any earlier copy on Drive first: copytree (with default arguments)
# refuses to copy into a directory that already exists.
drive_copy_exists = os.path.exists(drive_out_caption)
if drive_copy_exists:
    shutil.rmtree(drive_out_caption)

shutil.copytree(local_out_caption, drive_out_caption)
print("Saved preprocessed captions to Drive at:", drive_out_caption)

Saved preprocessed captions to Drive at: /content/drive/MyDrive/captioneer/processed_captions


# Processing Images
Requires `preprocess_images.py` to be importable (for example, placed in `/content`, which is on the import path).

In [None]:
import os

from preprocess_images import preprocess_images

# Locate the image folder relative to the directory kagglehub returned for the
# download (`path`) rather than hard-coding the cache location and dataset
# version, which breaks when either one changes.
img_dir = os.path.join(path, 'flickr30k_images') # update if your dataset lives elsewhere
local_out_img = '/content/processed_images' # local output
drive_out_img = '/content/drive/MyDrive/captioneer/processed_images' # optional for saving to google drive

# Run the image preprocessing and report how many images it handled.
result = preprocess_images(img_dir, local_out_img)
print(f"Extracted features for {result['num_images']} images")


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 176MB/s]


Extracted features for 31783 images


# Checking Features

In [None]:
import os

import numpy as np

# Load one saved feature file and report its shape as a sanity check.
sample_name = '1000092795.npy'
sample_file = os.path.join(local_out_img, sample_name)
feat = np.load(sample_file)
feat_shape = feat.shape
print("Feature shape:", feat_shape)  # should be (2048,)


Feature shape: (2048,)


# Saving to drive

In [None]:
import os
import shutil

# shutil.copytree raises FileExistsError when the destination already exists,
# so re-running this cell would fail. Remove any previous copy first, matching
# how the processed captions are saved to Drive.
if os.path.exists(drive_out_img):
    shutil.rmtree(drive_out_img)

shutil.copytree(local_out_img, drive_out_img)
print(f"Copied features to Drive at {drive_out_img}")

Copied features to Drive at /content/drive/MyDrive/captioneer/processed_images


# Creating tar.gz file
Note: copying the archive from Colab storage to Drive can be unreliable. Sometimes the archive appears in Drive when viewed through the Colab file browser but not on the Google Drive website. If this happens, download the archive from Colab and upload it to Drive manually (~225 MB).

In [None]:
import os
import shutil

local_images_dir = "/content/processed_images"     # local folder with .npy files
drive_base = "/content/drive/MyDrive/captioneer"   # change if your project folder is in a different location
# Build the Drive paths from drive_base. The previous string literals were
# missing the "/" after "captioneer", so everything landed in
# MyDrive/captioneerprocessed_images* instead of inside the project folder.
drive_images_dir = os.path.join(drive_base, "processed_images")
# make_archive appends ".tar.gz" to its base name for the "gztar" format, so
# derive both names from one value to keep them in sync.
archive_base = "/content/processed_images"
archive_local = archive_base + ".tar.gz"
archive_drive = os.path.join(drive_base, "processed_images.tar.gz")
# NOTE(review): this flag is not read anywhere in this cell - confirm whether a later cell uses it.
overwrite_files_on_drive = False   # set True to overwrite existing .npy files on Drive, False to skip existing

os.makedirs(drive_base, exist_ok=True)
os.makedirs(drive_images_dir, exist_ok=True)

# remove any existing local archive first
if os.path.exists(archive_local):
    os.remove(archive_local)

print("Creating local archive")
shutil.make_archive(base_name=archive_base, format="gztar", root_dir=local_images_dir)
print("Local archive created at:", archive_local)

# move archive to Drive (overwrite if exists)
if os.path.exists(archive_drive):
    print("Existing archive found on Drive; it will be overwritten.")
    os.remove(archive_drive)
shutil.move(archive_local, archive_drive)
print("Moved archive to Drive at:", archive_drive)
# NOTE(review): if the archive does not show up on the Drive website, the write may
# not have been synced yet; google.colab's drive.flush_and_unmount() may help - confirm.