# Build Clotho Development Parquet

This notebook converts the **Clotho v2.x development split** that you downloaded
from Zenodo into a single **Parquet** file with:

- `audio`: a Hugging Face `Audio` column (waveform + sampling rate)
- `caption`: one caption per row
- `file_name`: original audio file name
- `caption_idx`: which caption (1–5)
- `split`: `"development"`

Edit the paths in the config cell below if needed, then run all cells.


In [None]:
from pathlib import Path
from typing import Dict

import pandas as pd
from datasets import Audio, Dataset, load_dataset

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Every location below hangs off `<current working directory>/data`.
# Change these values if your repo is laid out differently.
_DATA_ROOT = Path.cwd() / "data"

# Folder holding the development-split *audio files*
# (e.g. v2_code_base/data/clotho/development).
CLOTHO_DEV_AUDIO_DIR = _DATA_ROOT / "clotho" / "development"

# The development-split *captions CSV*
# (e.g. v2_code_base/data/clotho_captions_development.csv).
CLOTHO_DEV_CAPTIONS_CSV = _DATA_ROOT / "clotho_captions_development.csv"

# Where the Parquet file is written; its parent folder is created up front.
OUTPUT_PARQUET_PATH = _DATA_ROOT / "alignment_offline" / "clotho_development.parquet"
OUTPUT_PARQUET_PATH.parent.mkdir(parents=True, exist_ok=True)

# Sampling rate used for the Audio column (Clotho paper: 44.1 kHz).
TARGET_SR = 44_100

# Echo the resolved locations so a wrong working directory is obvious.
for _label, _location in (
    ("Audio dir :", CLOTHO_DEV_AUDIO_DIR),
    ("Captions  :", CLOTHO_DEV_CAPTIONS_CSV),
    ("Output    :", OUTPUT_PARQUET_PATH),
):
    print(_label, _location.resolve())


In [None]:
def build_clotho_dev_parquet(
    audio_dir: Path,
    captions_csv: Path,
    output_path: Path,
    target_sr: int = TARGET_SR,
) -> Path:
    """Build a Parquet file for the Clotho *development* split.

    The captions CSV is reshaped from wide (one row per audio file) to long
    (one row per audio file / caption pair) and written with columns:
      - audio      : HF Audio column
      - caption    : text
      - file_name  : original wav file name
      - caption_idx: which caption (1..5)
      - split      : 'development'

    If ``output_path`` already exists the build is skipped and the existing
    path is returned. The file is written to a temporary sibling first and
    renamed into place, so an interrupted build never leaves a partial
    Parquet that a later run would mistake for a finished one.

    Args:
        audio_dir: Directory containing the audio files named in the CSV.
        captions_csv: CSV with a ``file_name`` column (any casing) and one or
            more ``caption_<n>`` columns (any casing).
        output_path: Destination Parquet path; missing parent directories
            are created.
        target_sr: Sampling rate passed to the ``Audio`` feature.

    Returns:
        ``output_path`` as a ``Path``.

    Raises:
        AssertionError: If ``audio_dir`` or ``captions_csv`` does not exist.
        ValueError: If the CSV has no ``file_name`` column, no numbered
            caption columns, or no usable (caption, existing audio) rows.
    """
    audio_dir = Path(audio_dir)
    captions_csv = Path(captions_csv)
    output_path = Path(output_path)

    assert audio_dir.exists(), f"Audio directory not found: {audio_dir}"
    assert captions_csv.exists(), f"Captions CSV not found: {captions_csv}"

    if output_path.exists():
        print(f"📂 Parquet already exists, skipping build: {output_path}")
        return output_path

    print(f"\n📥 Loading captions from: {captions_csv}")
    df = pd.read_csv(captions_csv, header=0)

    # Accept the file-name header regardless of casing / stray whitespace
    # (e.g. 'File_name'), mirroring the case-insensitive caption detection.
    if "file_name" not in df.columns:
        lookalikes = [c for c in df.columns if str(c).strip().lower() == "file_name"]
        if not lookalikes:
            raise ValueError(f"Expected 'file_name' column in {captions_csv}, got {df.columns.tolist()}")
        df = df.rename(columns={lookalikes[0]: "file_name"})

    # Map each caption column to its numeric suffix ('Caption_1' -> 1).
    # Caption-prefixed columns without a trailing number are ignored, because
    # they cannot be assigned a caption index.
    caption_idx_by_col = {}
    for col in df.columns:
        name = str(col).strip()
        if not name.lower().startswith("caption"):
            continue
        digits = name[len(name.rstrip("0123456789")):]
        if digits:
            caption_idx_by_col[col] = int(digits)

    caption_cols = list(caption_idx_by_col)
    if not caption_cols:
        raise ValueError(f"No caption columns found in {captions_csv} (columns={df.columns.tolist()})")

    print(f"   Found caption columns: {caption_cols}")

    # Convert to long format: one row per (file_name, caption). Only the
    # needed columns are melted so unrelated CSV columns cannot clash with
    # the 'caption' / 'caption_idx' output names.
    df_long = df[["file_name", *caption_cols]].melt(
        id_vars=["file_name"],
        value_vars=caption_cols,
        var_name="caption_idx",
        value_name="caption",
    )
    df_long["caption_idx"] = df_long["caption_idx"].map(caption_idx_by_col).astype(int)

    # Drop missing captions, then captions that are empty once stripped.
    df_long = df_long.dropna(subset=["caption"])
    df_long = df_long.assign(caption=df_long["caption"].astype(str).str.strip())
    df_long = df_long[df_long["caption"] != ""]

    # Build full audio paths. Existence is checked once per unique file (each
    # file appears once per caption) and must be a regular file, so an empty
    # file name resolving to the directory itself is rejected.
    file_names = df_long["file_name"].astype(str)
    audio_exists = {name: (audio_dir / name).is_file() for name in file_names.unique()}
    df_long = df_long.assign(
        file_name=file_names,
        audio_path=file_names.map(lambda name: str(audio_dir / name)),
        split="development",
    )

    # Filter rows where audio actually exists
    exists_mask = df_long["file_name"].map(audio_exists).astype(bool)
    missing = int((~exists_mask).sum())
    if missing > 0:
        print(f"   ⚠️ {missing} rows refer to missing audio files – dropping them.")
    df_long = df_long[exists_mask]

    print(f"   Final rows in long-form table: {len(df_long):,}")
    if df_long.empty:
        # An empty Parquet would be cached by the exists-check above and hide
        # the real problem (wrong audio_dir or wrong CSV) on every later run.
        raise ValueError(
            f"No caption rows with existing audio under {audio_dir}; "
            f"check that it matches the files listed in {captions_csv}"
        )

    # Build HF Dataset and cast audio
    print("🧱 Creating HuggingFace Dataset...")
    ds = Dataset.from_pandas(
        df_long[["file_name", "caption", "caption_idx", "audio_path", "split"]],
        preserve_index=False,
    )

    print("🎧 Casting 'audio_path' to Audio column (lazy loading from wav files)...")
    ds = ds.cast_column("audio_path", Audio(sampling_rate=target_sr))
    ds = ds.rename_column("audio_path", "audio")

    # NOTE(review): whether `to_parquet` embeds the audio bytes or only stores
    # the file paths may depend on the installed `datasets` version -- confirm
    # the Parquet is self-contained before moving or deleting the wav files.
    print(f"💾 Saving Parquet to: {output_path}")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        ds.to_parquet(str(tmp_path))
        tmp_path.replace(output_path)  # atomic on the same filesystem
    except BaseException:
        # Remove the partial file so the next run rebuilds from scratch.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
    print("✅ Done.")
    return output_path


In [None]:
# ------------------------------------------------------------------
# Run builder + quick sanity check
# ------------------------------------------------------------------
# Builds the Parquet (or reuses an existing one), then reloads it to confirm
# the file is readable and has the expected schema.
parquet_path = build_clotho_dev_parquet(
    audio_dir=CLOTHO_DEV_AUDIO_DIR,
    captions_csv=CLOTHO_DEV_CAPTIONS_CSV,
    output_path=OUTPUT_PARQUET_PATH,
)

# `load_dataset` is imported in the notebook's import cell.
print("\n🔎 Loading back Parquet for sanity check...")
ds = load_dataset("parquet", data_files={"train": str(parquet_path)})["train"]
print(ds)
print("Columns:", ds.column_names)

# Fail with a clear message instead of a KeyError / IndexError further down
# when the Parquet on disk is stale, truncated, or empty.
expected_columns = {"file_name", "caption", "caption_idx", "audio", "split"}
missing_columns = expected_columns - set(ds.column_names)
assert not missing_columns, f"Parquet is missing expected columns: {sorted(missing_columns)}"
assert len(ds) > 0, f"Parquet contains no rows: {parquet_path}"

# Reading the first row decodes its audio, which exercises the Audio column.
example = ds[0]
print("\nExample row:")
print("  file_name  :", example["file_name"])
print("  caption    :", example["caption"])
print("  caption_idx:", example["caption_idx"])
print("  audio sr   :", example["audio"]["sampling_rate"])
print("  audio shape:", example["audio"]["array"].shape)
