In [6]:
# imports

from pathlib import Path
import json
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm
from rich.console import Console
import re

# Shared rich console; used below for styled status messages.
console = Console()

# Root logging setup: INFO and above, each record prefixed with a timestamp
# and its severity level.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Logger named after the current module / notebook kernel namespace.
logger = logging.getLogger(__name__)

In [7]:
# define base paths
#
# All locations are derived from PROJECT_ROOT, which is resolved relative to
# the kernel's current working directory (one level above it).

PROJECT_ROOT = Path("../").resolve()
DATA_DIR = PROJECT_ROOT / "data" / "COCO"

TRAIN_IMG_DIR = DATA_DIR / "train2014"
VAL_IMG_DIR   = DATA_DIR / "val2014"
ANNOTATIONS_DIR = DATA_DIR / "annotations"

TRAIN_JSON = ANNOTATIONS_DIR / "captions_train2014.json"
VAL_JSON   = ANNOTATIONS_DIR / "captions_val2014.json"
OUTPUT_DIR = PROJECT_ROOT / "data" / "processed"

# Validate the inputs before touching the filesystem, and say which path is
# missing so a wrong working directory is easy to diagnose.
assert TRAIN_IMG_DIR.exists(), f"Missing training image directory: {TRAIN_IMG_DIR}"
assert VAL_IMG_DIR.exists(), f"Missing validation image directory: {VAL_IMG_DIR}"
assert TRAIN_JSON.exists(), f"Missing training annotation file: {TRAIN_JSON}"
assert VAL_JSON.exists(), f"Missing validation annotation file: {VAL_JSON}"

# Only create the output directory once the inputs are known to be present,
# so a misconfigured root does not leave a stray directory tree behind.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
def clean_caption(text):
    """Normalize a caption string.

    Lowercases the text, removes every character other than ``a-z``, ``0-9``,
    whitespace, ``.``, ``,`` and ``'``, collapses each whitespace run into a
    single space, and trims leading/trailing whitespace.

    Args:
        text: Caption string to normalize.

    Returns:
        The normalized caption; may be an empty string if nothing survives.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s.,']", "", text)
    text = re.sub(r"\s+", " ", text)
    # Trim last: deleting a leading/trailing symbol (e.g. "a dog !") can expose
    # whitespace that an earlier strip would have missed.
    return text.strip()

In [10]:
def build_csv(annotation_path, image_dir, output_path, max_caption_words=40):
    """Flatten a captions annotation JSON into an (image_path, caption) CSV.

    The JSON is expected to hold an ``"images"`` list (records with ``id`` and
    ``file_name``) and an ``"annotations"`` list (records with ``image_id`` and
    ``caption``). Captions are normalized with ``clean_caption``, duplicate
    (image, caption) pairs are dropped, and rows are kept only when the image
    file exists under ``image_dir`` and the caption has between 1 and
    ``max_caption_words`` whitespace-separated words.

    Args:
        annotation_path: Path to the annotation JSON file.
        image_dir: Directory that contains the image files.
        output_path: Destination CSV path.
        max_caption_words: Longest caption (in words) to keep. Defaults to 40.

    Returns:
        None. The result is written to ``output_path``.
    """
    annotation_path = Path(annotation_path)
    image_dir = Path(image_dir)

    console.print(f"[bold cyan]Processing {annotation_path.name}[/bold cyan]")

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(annotation_path, encoding="utf-8") as f:
        data = json.load(f)

    # Rename the image key up front so the merge does not produce id_x / id_y.
    image_files = pd.DataFrame(data["images"])[["id", "file_name"]].rename(
        columns={"id": "image_id"}
    )
    df = pd.DataFrame(data["annotations"]).merge(image_files, on="image_id", how="left")

    # A left merge leaves file_name as NaN when an annotation has no matching
    # image record; joining a Path with NaN would raise TypeError further down.
    incomplete = df["file_name"].isna() | df["caption"].isna()
    if incomplete.any():
        console.print(
            f"Skipped {int(incomplete.sum())} annotations without a caption or image record"
        )
        df = df[~incomplete]

    # Clean captions (assign returns a new frame, so no write into a slice).
    df = df.assign(caption=df["caption"].map(clean_caption))

    # Remove duplicates within same image
    before = len(df)
    df = df.drop_duplicates(subset=["image_id", "caption"])
    console.print(f"Removed {before - len(df)} duplicate pairs")

    # Remove missing images: each image usually has several captions, so hit
    # the filesystem once per distinct file name rather than once per row.
    present = {name for name in df["file_name"].unique() if (image_dir / name).exists()}
    has_image = df["file_name"].isin(present)

    # Filter very long captions, and captions that became empty after cleaning
    # (an empty CSV field is read back as NaN by pandas).
    word_counts = df["caption"].str.split().str.len()
    has_valid_length = word_counts.between(1, max_caption_words)

    kept = df[has_image & has_valid_length]
    df = pd.DataFrame(
        {
            "image_path": kept["file_name"].map(lambda name: str(image_dir / name)),
            "caption": kept["caption"],
        }
    ).reset_index(drop=True)

    df.to_csv(output_path, index=False)

    console.print(f"[bold green]Saved:[/bold green] {output_path}")
    console.print(f"Total samples: {len(df)}\n")

In [11]:
# Build the full training CSV from the train2014 annotations and images.

build_csv(
    annotation_path=TRAIN_JSON,
    image_dir=TRAIN_IMG_DIR,
    output_path=OUTPUT_DIR / "coco_train_full.csv",
)

In [12]:
# Build the validation CSV from the val2014 annotations and images.

build_csv(
    annotation_path=VAL_JSON,
    image_dir=VAL_IMG_DIR,
    output_path=OUTPUT_DIR / "coco_val.csv",
)

In [14]:
# Draw a fixed-seed random subset of the validation CSV.
# NOTE(review): the subset holds 5200 rows but the output file is named
# "coco_val_2000.csv". The file name is kept as-is so existing readers keep
# working -- confirm which of the two numbers is intended.
VAL_SUBSET_SIZE = 5200

# Read every column as plain strings and disable NA inference; otherwise a
# caption that is literally "none", "null" or "nan" is parsed as missing and
# written back out as an empty field.
val_df = pd.read_csv(OUTPUT_DIR / "coco_val.csv", dtype=str, keep_default_na=False)
val_subset = val_df.sample(VAL_SUBSET_SIZE, random_state=42)
val_subset.to_csv(OUTPUT_DIR / "coco_val_2000.csv", index=False)