
# 00 — Build Local Alignment Datasets (PixMo-Cap & MusicCaps)

This notebook creates small **local Parquet subsets** of the online Hugging Face datasets:

- **Vision–Text:** `allenai/pixmo-cap`
- **Audio–Text:** `google/MusicCaps`

It will:

1. Download each dataset (once),  
2. Select **N samples** for alignment experiments,  
3. Save them as **`.parquet`** files under a local `data/` folder,  
4. (Optionally) show how to load them back via `datasets.load_dataset("parquet", ...)`.

All your alignment / training notebooks can then read from these local Parquet files instead of re-downloading from the hub.


In [10]:

# Standard imports
import os
from pathlib import Path

from datasets import load_dataset

# Optional: pandas only for quick inspection (not required for saving)
import pandas as pd

# ---- CONFIG ----
# Upper bound on the number of rows kept from each dataset.
# NOTE(review): the builders keep the whole split when it holds fewer rows than
# requested, so the number embedded in the filenames below is a ceiling, not a
# guaranteed row count.
N_PIXMO_SAMPLES = 50_000   # adjust based on your GPU/RAM
N_MUSICCAPS_SAMPLES = 25_000

# Project root is the kernel's current working directory. This matches the
# notebook's folder only when the kernel was started from there.
PROJECT_ROOT = Path.cwd()

# All subsets live under <project root>/data/alignment_subsets (created on demand).
DATA_DIR = PROJECT_ROOT.joinpath("data", "alignment_subsets")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Output Parquet files, one per dataset, tagged with the requested sample count.
PIXMO_PARQUET_PATH = DATA_DIR.joinpath(f"pixmocap_train_subset_{N_PIXMO_SAMPLES}.parquet")
MUSICCAPS_PARQUET_PATH = DATA_DIR.joinpath(f"musiccaps_train_subset_{N_MUSICCAPS_SAMPLES}.parquet")

# Echo the resolved locations so the reader can confirm where files will land.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Saving PixMo-Cap subset to: {PIXMO_PARQUET_PATH}",
    f"Saving MusicCaps subset to: {MUSICCAPS_PARQUET_PATH}",
    sep="\n",
)


Project root: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/v2_code_base
Saving PixMo-Cap subset to: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/v2_code_base/data/alignment_subsets/pixmocap_train_subset_50000.parquet
Saving MusicCaps subset to: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/v2_code_base/data/alignment_subsets/musiccaps_train_subset_25000.parquet


In [11]:

%%writefile dataset_builders.py
"""
dataset_builders.py

Utility functions to create local Parquet subsets of the PixMo-Cap
and MusicCaps datasets for alignment experiments.
"""

from pathlib import Path
from typing import Optional

from datasets import load_dataset



def build_pixmocap_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: Optional[int] = None,
    shuffle_seed: int = 42,
) -> None:
    """
    Write a (possibly down-sampled) PixMo-Cap split to a local Parquet file.

    Every column of the source split is preserved. Per the original notes,
    downstream code relies on:
    - `image_url`: consumed by InMemoryImageTextDataset
    - `caption`   : the text field

    Args:
        output_path: Destination Parquet file; parent folders are created.
        split: Name of the Hugging Face split to load (default: "train").
        max_samples: Cap on the number of rows. When given and smaller than
            the split, the split is shuffled and the first `max_samples`
            rows are kept; otherwise the whole split is written.
        shuffle_seed: Seed passed to the shuffle that precedes subsetting.
    """
    # Normalise the destination and make sure its folder exists up front.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"üì• Loading PixMo-Cap split='{split}' from Hugging Face...")
    ds = load_dataset("allenai/pixmo-cap", split=split)
    print(f"   Total available samples: {len(ds):,}")

    # No cap, or a cap that does not bite: keep the split untouched.
    keep_everything = max_samples is None or max_samples >= len(ds)
    if keep_everything:
        print("   Using full split (no subsetting).")
    else:
        print(f"   Shuffling and selecting first {max_samples:,} samples (seed={shuffle_seed})...")
        shuffled = ds.shuffle(seed=shuffle_seed)
        ds = shuffled.select(range(max_samples))

    print(f"üíæ Saving subset to Parquet: {output_path}")
    ds.to_parquet(str(output_path))
    print("‚úÖ Done! PixMo-Cap subset saved.")


def build_musiccaps_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: Optional[int] = None,
    shuffle_seed: int = 42,
) -> None:
    """
    Write a (possibly down-sampled) MusicCaps split to a local Parquet file.

    Every column of the source split is preserved. The original notes expect:
    - `audio`   : an HF Audio column (waveforms + metadata)
    - `caption` : the text field
    NOTE(review): nothing in this function checks that these columns exist;
    confirm the schema against the hub dataset.

    Args:
        output_path: Destination Parquet file; parent folders are created.
        split: Name of the Hugging Face split to load (default: "train").
        max_samples: Cap on the number of rows. When given and smaller than
            the split, the split is shuffled and the first `max_samples`
            rows are kept; otherwise the whole split is written.
        shuffle_seed: Seed passed to the shuffle that precedes subsetting.
    """
    # Normalise the destination and make sure its folder exists up front.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"üì• Loading MusicCaps split='{split}' from Hugging Face...")
    ds = load_dataset("google/MusicCaps", split=split)
    print(f"   Total available samples: {len(ds):,}")

    # No cap, or a cap that does not bite: keep the split untouched.
    keep_everything = max_samples is None or max_samples >= len(ds)
    if keep_everything:
        print("   Using full split (no subsetting).")
    else:
        print(f"   Shuffling and selecting first {max_samples:,} samples (seed={shuffle_seed})...")
        shuffled = ds.shuffle(seed=shuffle_seed)
        ds = shuffled.select(range(max_samples))

    print(f"üíæ Saving subset to Parquet: {output_path}")
    ds.to_parquet(str(output_path))
    print("‚úÖ Done! MusicCaps subset saved.")


Overwriting dataset_builders.py


In [12]:
from pathlib import Path
from typing import Optional

from datasets import load_dataset
from tqdm import tqdm

from multiprocessing import Pool, cpu_count


In [13]:

def _validate_pixmo_index(args):
    """
    Worker function for multiprocessing validation of PixMo-Cap rows.

    Args:
        args: tuple (idx, dataset)

    Returns:
        None if row is OK,
        or (idx, missing_keys) if some fields are missing.
    """
    idx, ds = args
    row = ds[idx]

    missing = []
    if "image_url" not in row:
        missing.append("image_url")
    if "caption" not in row:
        missing.append("caption")

    if missing:
        return idx, missing
    return None


def _log_sample_preview(ds, n_preview=3):
    """Print the first N rows for sanity."""
    print("\nüîç Sample preview:")
    for i in range(min(n_preview, len(ds))):
        print(f"  ‚Ä¢ [{i}] { {k: str(ds[i][k])[:80] for k in ds.column_names} }")
    print()


def _log_progress(i, total, dataset_name, step=5000):
    """Log every fixed interval."""
    if i % step == 0:
        pct = (i / total) * 100
        print(f"[{dataset_name}] Processed {i:,}/{total:,} ({pct:.2f}%)...")



In [None]:
# -----------------------------------------------------------------------------
#                               PIXMO-CAP BUILDER
# -----------------------------------------------------------------------------
# Dataset handle used by the validation workers. It is set once per worker
# process by `_init_pixmo_worker`, so each task only has to carry a row index.
_PIXMO_WORKER_DS = None


def _init_pixmo_worker(ds):
    """Pool initializer: remember the dataset inside the worker process."""
    global _PIXMO_WORKER_DS
    _PIXMO_WORKER_DS = ds


def _validate_pixmo_row(idx):
    """Validate row `idx` of the worker-local dataset via `_validate_pixmo_index`."""
    return _validate_pixmo_index((idx, _PIXMO_WORKER_DS))


def build_pixmocap_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: Optional[int] = None,
    shuffle_seed: int = 42,
) -> None:
    """
    Download PixMo-Cap, optionally subsample it, validate it and save it as Parquet.

    Steps: load the split, print a short preview, shuffle and truncate to
    `max_samples` when that is smaller than the split, run
    `_validate_pixmo_index` over every row in a process pool, then write the
    result to `output_path`. Validation is report-only: rows flagged as
    missing fields are counted and printed but still written to the file.

    Args:
        output_path: Destination Parquet file; parent folders are created.
        split: Hugging Face split to load (default: "train").
        max_samples: If provided and smaller than the split, keep this many
            randomly selected rows; otherwise keep the full split.
        shuffle_seed: Seed for the shuffle that precedes subsetting.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nüì• Loading PixMo-Cap split='{split}' from Hugging Face...")
    ds = load_dataset("allenai/pixmo-cap", split=split)

    total = len(ds)
    print(f"   Total available samples: {total:,}")

    # Preview
    _log_sample_preview(ds)

    # Subset
    if max_samples is not None and max_samples < total:
        print(f"   Shuffling + selecting {max_samples:,} samples (seed={shuffle_seed})...")
        ds = ds.shuffle(seed=shuffle_seed).select(range(max_samples))
    else:
        print("   Using full split.")

    print(f"   Final subset size: {len(ds):,}")

    # ----------------- MULTIPROCESSING VALIDATION -----------------
    print("\nüìù Scanning dataset for basic validation (multiprocessing)...")

    num_workers = min(cpu_count(), 8)  # tune this if needed
    print(f"   Using {num_workers} worker processes")

    missing_count = 0
    total_rows = len(ds)

    # The dataset is handed to each worker once through the Pool initializer.
    # Packing it into every task tuple would re-serialize it with each chunk
    # of tasks sent to the workers.
    with Pool(
        processes=num_workers,
        initializer=_init_pixmo_worker,
        initargs=(ds,),
    ) as pool:
        for j, result in enumerate(
            tqdm(
                pool.imap_unordered(_validate_pixmo_row, range(total_rows), chunksize=512),
                total=total_rows,
                desc="Validating PixMo samples (mp)",
            )
        ):
            # Progress log (every 10k rows)
            if j % 10_000 == 0 and j > 0:
                _log_progress(j, total_rows, "PixMo-Cap", step=10_000)

            # If something is missing, result is (idx, missing_keys)
            if result is not None:
                idx, missing_keys = result
                missing_count += 1
                print(f"‚ö†Ô∏è Missing {missing_keys} at index {idx}")

    print(f"\n‚úÖ Validation complete. Rows with missing fields: {missing_count:,}/{total_rows:,}")

    # ----------------- SAVE TO PARQUET -----------------
    print(f"\nüíæ Saving subset to Parquet: {output_path}")
    ds.to_parquet(str(output_path))

    print("‚úÖ Done! PixMo-Cap subset saved.")
    print("üìå File:", output_path)
    print("-" * 60)


In [15]:
# Build the PixMo-Cap subset with the path and sample budget from the config cell.
print("\n=== Building PixMo-Cap subset ===")
build_pixmocap_parquet(
    PIXMO_PARQUET_PATH,
    split="train",
    max_samples=N_PIXMO_SAMPLES,
    shuffle_seed=42,
)



=== Building PixMo-Cap subset ===

üì• Loading PixMo-Cap split='train' from Hugging Face...
   Total available samples: 717,042

üîç Sample preview:
  ‚Ä¢ [0] {'image_url': 'https://pixmo.s3.us-west-2.amazonaws.com/birds/1491.png', 'caption': 'This photograph depicts a striking black bird, possibly a grackle or similar spe', 'transcripts': '["This is a picture of a long black bird with a lot of iridescent accents. It lo'}
  ‚Ä¢ [1] {'image_url': 'https://i.pinimg.com/736x/a8/d4/30/a8d430e8b24249577d09ea0a9c4bec54.jpg', 'caption': 'This nighttime image captures a dynamic point of view from a motorcycle rider on', 'transcripts': '["This image is outside at night. You can just see the part of the motorcycle th'}
  ‚Ä¢ [2] {'image_url': 'https://i.redd.it/tujlim4fvo2d1.png', 'caption': 'The image showcases a dynamic, animated scene featuring four distinct characters', 'transcripts': '["This is a picture of, I think, four characters, animated characters. And the f'}

   Shuffling + selec

Validating PixMo samples (mp):  20%|‚ñà‚ñâ        | 9871/50000 [00:13<00:26, 1488.47it/s]

[PixMo-Cap] Processed 10,000/50,000 (20.00%)...


Validating PixMo samples (mp):  41%|‚ñà‚ñà‚ñà‚ñà      | 20481/50000 [00:20<00:14, 2007.98it/s]

[PixMo-Cap] Processed 20,000/50,000 (40.00%)...


Validating PixMo samples (mp):  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 29697/50000 [00:24<00:06, 3049.05it/s]

[PixMo-Cap] Processed 30,000/50,000 (60.00%)...


Validating PixMo samples (mp):  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 39937/50000 [00:29<00:05, 1912.78it/s]

[PixMo-Cap] Processed 40,000/50,000 (80.00%)...


Validating PixMo samples (mp): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:32<00:00, 1515.45it/s]


‚úÖ Validation complete. Rows with missing fields: 0/50,000

üíæ Saving subset to Parquet: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/v2_code_base/data/alignment_subsets/pixmocap_train_subset_50000.parquet





Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

‚úÖ Done! PixMo-Cap subset saved.
üìå File: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/v2_code_base/data/alignment_subsets/pixmocap_train_subset_50000.parquet
------------------------------------------------------------


In [None]:

# -----------------------------------------------------------------------------
#                           MUSICCAPS BUILDER
# -----------------------------------------------------------------------------
def build_musiccaps_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: Optional[int] = None,
    shuffle_seed: int = 42,
) -> None:
    """
    Download MusicCaps, optionally subsample it, check its columns and save it as Parquet.

    Steps: load the split, print a short preview, shuffle and truncate to
    `max_samples` when that is smaller than the split, warn about any expected
    column (`audio`, `caption`) that the dataset does not provide, then write
    the result to `output_path`. The column check is report-only; the file is
    written either way.

    NOTE(review): whether the hub dataset actually ships an `audio` column is
    not visible from this notebook; confirm against the dataset card.

    Args:
        output_path: Destination Parquet file; parent folders are created.
        split: Hugging Face split to load (default: "train").
        max_samples: If provided and smaller than the split, keep this many
            randomly selected rows; otherwise keep the full split.
        shuffle_seed: Seed for the shuffle that precedes subsetting.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nüì• Loading MusicCaps split='{split}' from Hugging Face...")
    ds = load_dataset("google/MusicCaps", split=split)

    total = len(ds)
    print(f"   Total available samples: {total:,}")

    # Preview
    _log_sample_preview(ds)

    # Subset
    if max_samples is not None and max_samples < total:
        print(f"   Shuffling + selecting {max_samples:,} samples (seed={shuffle_seed})...")
        ds = ds.shuffle(seed=shuffle_seed).select(range(max_samples))
    else:
        print("   Using full split.")

    print("\nüìù Scanning dataset for audio & caption fields...")
    # Every row exposes the same columns, so the check is done once against
    # the schema. Testing each row would materialize the whole dataset and
    # repeat the same warning for every row.
    available_columns = ds.column_names
    for column in ("audio", "caption"):
        if column not in available_columns:
            print(f"‚ö†Ô∏è Missing '{column}' column; available columns: {available_columns}")

    print(f"\nüíæ Saving subset to Parquet: {output_path}")
    ds.to_parquet(str(output_path))

    print("‚úÖ Done! MusicCaps subset saved.")
    print("üìå File:", output_path)
    print("-" * 60)


In [None]:


# MusicCaps is built on a best-effort basis: a failure is reported instead of
# raised, and the verification cell skips MusicCaps when its file is absent.
print("\n=== Building MusicCaps subset ===")
try:
    build_musiccaps_parquet(
        output_path=MUSICCAPS_PARQUET_PATH,
        split="train",
        max_samples=N_MUSICCAPS_SAMPLES,
        shuffle_seed=42,
    )
except Exception as e:
    # Report the exception class too: str(e) alone can be empty or ambiguous
    # (for example, a KeyError prints only the key).
    print(f"‚ö†Ô∏è MusicCaps build failed: {type(e).__name__}: {e}")



## Quick sanity check (optional)

The following cell shows how to load the saved Parquet subsets back using `datasets.load_dataset`,
and how to plug them into your existing in-memory datasets for alignment.


In [None]:
from datasets import load_dataset

# Reload the PixMo-Cap subset from disk through the generic "parquet" loader
# and show its structure, column names and first row.
print("\n=== Verifying PixMo-Cap Parquet subset ===")
pixmo_local = load_dataset("parquet", data_files=dict(train=os.fspath(PIXMO_PARQUET_PATH)))
print(pixmo_local)
print("  Columns:", pixmo_local["train"].column_names)
print("  Example row:", pixmo_local["train"][0])



=== Verifying PixMo-Cap Parquet subset ===


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image_url', 'caption', 'transcripts'],
        num_rows: 50000
    })
})
  Columns: ['image_url', 'caption', 'transcripts']
  Example row: {'image_url': 'https://objaverse-renders.s3.us-west-2.amazonaws.com/robustness/drawer/89d1fcd0ec0f486c8fc3e5d811a1fe5c-000.png', 'caption': "This digital graphic features a modern, square wooden chest of drawers positioned at the center of the image, which is set against a plain white background. The chest's design is minimalistic with clean lines, highlighted by a slight shadow cast from the top onto the drawers and side, adding depth to the illustration. The chest boasts long, straight white drawer pulls that complement its sleek appearance. A vertical wood grain pattern is evident on the drawers, further emphasizing the natural texture and elegance of the piece. The outer edges of the graphic appear slightly serrated, creating an imperfect, textured finish around the silhouette of the object.

In [None]:


# Reload the MusicCaps subset if the build step produced it; otherwise say so.
print("\n=== Verifying MusicCaps Parquet subset (if available) ===")
if not MUSICCAPS_PARQUET_PATH.exists():
    print("  Skipped: MusicCaps Parquet file not found.")
else:
    musiccaps_local = load_dataset("parquet", data_files=dict(train=os.fspath(MUSICCAPS_PARQUET_PATH)))
    print(musiccaps_local)
    print("  Columns:", musiccaps_local["train"].column_names)
    print("  Example row:", musiccaps_local["train"][0])
