In [1]:
# imports

from pathlib import Path
import json
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm
from rich.console import Console
import re

# Rich console used for the colour-marked status lines printed further down.
console = Console()

# Root logger: INFO and above, each record prefixed with timestamp and level.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

logger = logging.getLogger(__name__)

In [2]:
# define base paths

# NOTE: "../" is resolved against the kernel's working directory, so this
# assumes the notebook is run from a direct subfolder of the project root.
PROJECT_ROOT = Path("../").resolve()
DATA_DIR = PROJECT_ROOT / "data" / "COCO"

TRAIN_IMG_DIR = DATA_DIR / "train2014"
ANNOTATIONS_DIR = DATA_DIR / "annotations"

CAPTIONS_TRAIN = ANNOTATIONS_DIR / "captions_train2014.json"

# Validate the inputs first, and say which one is missing, before anything
# is created on disk.
assert TRAIN_IMG_DIR.is_dir(), f"Image directory not found: {TRAIN_IMG_DIR}"
assert CAPTIONS_TRAIN.is_file(), f"Captions file not found: {CAPTIONS_TRAIN}"

OUTPUT_DIR = PROJECT_ROOT / "data" / "processed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# load and merge

# Read the annotation JSON as UTF-8 explicitly instead of relying on the
# platform's default text encoding.
with open(CAPTIONS_TRAIN, encoding="utf-8") as f:
    train_data = json.load(f)

images_df = pd.DataFrame(train_data["images"])
annotations_df = pd.DataFrame(train_data["annotations"])

# One row per annotation, with the image's file name attached.
# validate="many_to_one" makes pandas raise if an image id occurs more than
# once on the right side, which would otherwise silently duplicate rows.
# NOTE(review): if the annotations also carry an "id" column, pandas will
# suffix the two as id_x / id_y; left as-is to keep the saved CSV schema.
df = annotations_df.merge(
    images_df[["id", "file_name"]],
    left_on="image_id",
    right_on="id",
    how="left",
    validate="many_to_one",
)

assert not df.empty, "No annotations were loaded."

# A left join leaves file_name as NaN for annotations whose image_id has no
# matching image; fail here with a clear message rather than in the path join.
unmatched = df["file_name"].isna()
assert not unmatched.any(), (
    f"{int(unmatched.sum())} annotations reference an image_id with no image entry"
)

df["image_path"] = df["file_name"].map(TRAIN_IMG_DIR.joinpath)

# Spot-check only the first row; a later cell checks every path.
assert df["image_path"].iloc[0].exists(), (
    f"First image not found on disk: {df['image_path'].iloc[0]}"
)
logger.info("Merge successful.")

2026-02-19 11:14:17,784 - INFO - Merge successful.


In [4]:
# clean captions

def clean_caption(text):
    """Normalise a caption string.

    Lower-cases the text, removes every character that is not a lowercase
    ASCII letter, digit, whitespace, period, comma or apostrophe, collapses
    each run of whitespace into a single space, and trims the ends.

    Args:
        text: The raw caption (a ``str``).

    Returns:
        The cleaned caption, with no leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s.,']", "", text)
    text = re.sub(r"\s+", " ", text)
    # Trim last: removing characters can expose whitespace at either end
    # (e.g. "a dog !" -> "a dog "), which an up-front strip would miss.
    return text.strip()

# Normalise every caption in place, one value at a time.
df["caption"] = df["caption"].map(clean_caption)

In [5]:
# remove duplicates from same image

before = len(df)

# Flag every repeat of an (image_id, caption) pair after its first occurrence,
# then keep only the unflagged rows.
is_repeat = df.duplicated(subset=["image_id", "caption"], keep="first")
df = df[~is_repeat]

after = len(df)

console.print(f"[bold green]Removed {before - after} duplicate image-caption pairs[/bold green]")

In [6]:
# remove missing images

# True for each row whose image file is present on disk.
image_found = df["image_path"].map(Path.exists)
df = df[image_found]

logger.info(f"Dataset size after image existence check: {len(df)}")

2026-02-19 11:14:19,495 - INFO - Dataset size after image existence check: 413964


In [7]:
# limit caption length

# Longest caption kept, counted in whitespace-separated words.
MAX_CAPTION_WORDS = 40

word_counts = df["caption"].str.split().str.len()
df["caption_length"] = word_counts

df = df[word_counts <= MAX_CAPTION_WORDS]

In [8]:
# Save all samples

# Writes every column of the filtered frame, without the index.
output_path = OUTPUT_DIR.joinpath("coco_full_data.csv")

df.to_csv(path_or_buf=output_path, index=False)

In [9]:
# 20k-image subset (all captions belonging to 20,000 randomly chosen images)

# Seed the global NumPy generator so the same images are drawn on every run.
np.random.seed(42)

unique_images = df["image_id"].unique()
selected_images = np.random.choice(unique_images, size=20000, replace=False)

in_subset = df["image_id"].isin(selected_images)
subset_df = df[in_subset].reset_index(drop=True)

console.print(f"[bold cyan]Subset size:[/bold cyan] {len(subset_df)}")

In [10]:
# save processed data

output_path = OUTPUT_DIR.joinpath("coco_train_20k.csv")

# Only the image path and its caption are exported.
train_pairs = subset_df.loc[:, ["image_path", "caption"]]
train_pairs.to_csv(output_path, index=False)

console.print(f"[bold yellow]Saved to:[/bold yellow] {output_path}")

In [11]:
# 2k-image validation subset

# Candidate pool = every image that was NOT drawn for the 20k training subset.
# Sampling from the full pool again would put some images in both the
# training and validation files.
all_image_ids = df["image_id"].unique()
unique_images_val = all_image_ids[~np.isin(all_image_ids, selected_images)]

np.random.seed(43)
selected_images_val = np.random.choice(unique_images_val, size=2000, replace=False)

assert not np.isin(selected_images_val, selected_images).any(), (
    "Validation images overlap with the training subset"
)

# NOTE: this rebinds subset_df (previously the training subset); the save
# cell that follows reads the validation rows from this name.
subset_df = df[df["image_id"].isin(selected_images_val)].reset_index(drop=True)

console.print(f"[bold cyan]Subset size:[/bold cyan] {len(subset_df)}")

In [12]:
# save validation data

output_path_val = OUTPUT_DIR.joinpath("coco_val_2k.csv")

# Only the image path and its caption are exported.
val_pairs = subset_df.loc[:, ["image_path", "caption"]]
val_pairs.to_csv(output_path_val, index=False)

console.print(f"[bold yellow]Saved to:[/bold yellow] {output_path_val}")