
# 00 — Build Offline Alignment Datasets (PixMo-Cap & MusicCaps)

This notebook **downloads media once** from Hugging Face datasets and saves
everything into **Parquet files**, so that all other notebooks can work
purely from local Parquet without hitting the network or storing thousands of
separate image/audio files.

It will:

1. Build a PixMo-Cap subset where **images are downloaded from `image_url`**
   and stored as **raw bytes inside Parquet**.
2. Build a MusicCaps subset where each clip's audio is downloaded from YouTube
   with `yt-dlp` (which needs `ffmpeg`), trimmed to the captioned segment, and
   stored as a **waveform array directly in Parquet**.


In [1]:

import os
from pathlib import Path
from typing import Optional, Dict, List, Tuple

import json
from urllib.parse import urlparse
from multiprocessing import Pool, cpu_count

import requests
import pandas as pd
from datasets import load_dataset, Audio
from tqdm import tqdm

# ---- CONFIG ----
# How many *successfully downloaded* samples each builder should aim to keep.
N_PIXMO_SAMPLES = 20_000   # moderate on purpose: image bytes make the Parquet large
N_MUSICC_SAMPLES = 10_000  # audio clips are generally heavier than images

# Everything is resolved relative to the directory the notebook is launched from.
PROJECT_ROOT = Path.cwd()

# All offline artefacts live under <project>/data/alignment_offline (created on demand).
DATA_DIR = PROJECT_ROOT.joinpath("data", "alignment_offline")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Output Parquet files; the target sample count is embedded in each file name.
PIXMO_PARQUET_PATH = DATA_DIR / f"pixmocap_offline_{N_PIXMO_SAMPLES}.parquet"
MUSICC_PARQUET_PATH = DATA_DIR / f"musiccaps_offline_{N_MUSICC_SAMPLES}.parquet"

# Echo the resolved locations so a reader can see where files will land.
print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir    : {DATA_DIR}")
print(f"PixMo Parquet   : {PIXMO_PARQUET_PATH}")
print(f"MusicCaps Parquet: {MUSICC_PARQUET_PATH}")


Project root: /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base
Data dir    : /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base/data/alignment_offline
PixMo Parquet   : /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base/data/alignment_offline/pixmocap_offline_20000.parquet
MusicCaps Parquet: /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base/data/alignment_offline/musiccaps_offline_10000.parquet


In [2]:

def _log_sample_preview(ds, n_preview: int = 3):
    """Print the first N rows for sanity."""
    print("\nüîç Sample preview:")
    for i in range(min(n_preview, len(ds))):
        row = ds[i]
        summary = {k: str(row[k])[:80] for k in row.keys()}
        print(f"  ‚Ä¢ [{i}] {summary}")
    print()


def _download_image_bytes(args):
    """Worker: download image bytes from URL.

    Args:
        args: tuple (idx, url)

    Returns:
        (idx, bytes) on success, or (idx, None) on failure.
    """
    idx, url = args
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
        return idx, resp.content
    except Exception:
        return idx, None


## Build PixMo-Cap offline Parquet (images stored as bytes)

In [3]:

def build_pixmocap_offline_parquet(
    output_path: Path,
    split: str = "train",
    max_successful_samples: int = 20_000,
    shuffle_seed: int = 42,
    oversample_factor: float = 1.5,
    chunk_size: int = 5_000,
):
    """
    Build an offline PixMo-Cap subset where images are stored as raw bytes in a Parquet file.

    - Loads the `allenai/pixmo-cap` split.
    - Shuffles deterministically.
    - Iterates through the dataset in chunks, downloading `image_url` via HTTP.
    - Skips any failed URLs.
    - Stops once `max_successful_samples` successful images are collected
      (or the dataset is exhausted).
    - Saves a Parquet with columns:

        - `image_bytes`: raw bytes of the image file
        - `caption`    : text caption
        - `image_url`  : original URL
        - `sample_id`  : ID like "pixmo_0001234", built from the row's index in the
                         shuffled dataset (so it is stable for a given `shuffle_seed`)

    Args:
        output_path: Destination Parquet file; parent directories are created.
        split: Hugging Face split name to load.
        max_successful_samples: Number of successfully downloaded images to collect.
        shuffle_seed: Seed for the one-off dataset shuffle.
        oversample_factor: Currently unused. Every URL in a chunk is attempted and
            failures are skipped, so no explicit oversampling is needed; the
            parameter is kept so existing callers keep working.
        chunk_size: How many dataset rows are scheduled for download at a time.

    Raises:
        ValueError: If `chunk_size` is not positive (the chunk loop could never advance).

    Notes:
        - All collected image bytes are held in memory until the final write, and the
          resulting Parquet can be large. Adjust `max_successful_samples` accordingly.
        - Downloads run in a thread pool. The work is I/O-bound, and unlike
          `multiprocessing.Pool`, threads do not need to re-import the worker function,
          which fails for notebook-defined functions under the "spawn" start method.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nüì• Loading PixMo-Cap split='{split}' from Hugging Face...")
    ds = load_dataset("allenai/pixmo-cap", split=split)
    total = len(ds)
    print(f"   Total available samples in HF: {total:,}")

    # Shuffle once for determinism
    ds = ds.shuffle(seed=shuffle_seed)

    # Optional quick preview
    _log_sample_preview(ds)

    target = max_successful_samples
    print(f"üéØ Target successful samples: {target:,}")

    kept: List[Dict[str, object]] = []
    start_idx = 0
    # Loop-invariant, so computed once rather than per chunk.
    num_workers = min(cpu_count(), 32)

    while len(kept) < target and start_idx < total:
        end_idx = min(start_idx + chunk_size, total)
        print(f"\nüîÅ Processing chunk {start_idx:,}‚Äì{end_idx-1:,} ...")

        batch = ds.select(range(start_idx, end_idx))

        # Attempt every URL in this chunk; failed downloads are skipped and the
        # outer loop pulls further chunks until the target is met.
        download_tasks: List[Tuple[int, str]] = []
        captions: Dict[int, str] = {}
        urls: Dict[int, str] = {}

        for local_idx, row in enumerate(batch):
            global_idx = start_idx + local_idx
            image_url = row.get("image_url", None)
            caption = row.get("caption", "")

            if not image_url:
                print(f"‚ö†Ô∏è Missing image_url at HF index {global_idx}, skipping.")
                continue

            download_tasks.append((global_idx, image_url))
            captions[global_idx] = caption
            urls[global_idx] = image_url

        if not download_tasks:
            print("‚ö†Ô∏è No valid URLs in this chunk, moving to next.")
            start_idx = end_idx
            continue

        print(f"   Starting image downloads with {num_workers} workers "
              f"for {len(download_tasks):,} candidate samples...")

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(_download_image_bytes, task) for task in download_tasks
            ]
            try:
                for future in tqdm(
                    as_completed(futures),
                    total=len(futures),
                    desc="üì∑ Downloading PixMo images",
                ):
                    idx, img_bytes = future.result()
                    if img_bytes is None:
                        # download failed
                        continue

                    kept.append(
                        {
                            "image_bytes": img_bytes,
                            "caption": captions[idx],
                            "image_url": urls[idx],
                            "sample_id": f"pixmo_{idx:07d}",
                        }
                    )

                    if len(kept) >= target:
                        break
            finally:
                # Drop downloads that have not started yet (target reached or an
                # error occurred); cancel() is a no-op for finished futures.
                for future in futures:
                    future.cancel()

        print(f"   ‚úÖ Successful so far: {len(kept):,} / {target:,}")
        start_idx = end_idx

    if len(kept) < target:
        print(
            f"\n‚ö†Ô∏è Only {len(kept):,} successful downloads out of requested {target:,}. "
            f"Proceeding with available samples."
        )

    print(f"\nüíæ Saving {len(kept):,} samples to Parquet: {output_path}")
    # Pin the columns so the file keeps its schema even if nothing was downloaded.
    df = pd.DataFrame(kept, columns=["image_bytes", "caption", "image_url", "sample_id"])
    df.to_parquet(output_path, index=False)

    print("‚úÖ Done! PixMo-Cap offline Parquet saved.")
    print(f"üìå File: {output_path}")
    print("-" * 60)


In [4]:
# print("=== Building PixMo-Cap offline Parquet ===")

# if PIXMO_PARQUET_PATH.exists():
#     print(f"üìÇ PixMo Parquet already exists, skipping build:")
#     print(f"   ‚Üí {PIXMO_PARQUET_PATH}")
# else:
#     print(f"üìÑ PixMo Parquet not found. Building now‚Ä¶")
#     build_pixmocap_offline_parquet(
#         output_path=PIXMO_PARQUET_PATH,
#         split="train",
#         max_successful_samples=N_PIXMO_SAMPLES,
#         shuffle_seed=42,
#     )
#     print("‚úÖ PixMo offline Parquet build complete.")


## Quick sanity check: load back the Parquet files

In [5]:
# from datasets import load_dataset

# if PIXMO_PARQUET_PATH.exists():
#     print("\nüîé Verifying PixMo-Cap Parquet...")
#     pixmo_local = load_dataset(
#         "parquet", data_files={"train": str(PIXMO_PARQUET_PATH)}
#     )
#     print(pixmo_local)
#     print("Columns:", pixmo_local["train"].column_names)
#     print("Example row:", {k: str(v)[:80] for k, v in pixmo_local["train"][0].items()})


## Build MusicCaps offline Parquet (audio downloaded with yt-dlp, stored as waveform arrays)

In [6]:
# --- Imports & Config --------------------------------------------------------
import os
import subprocess
import shutil
from pathlib import Path
from typing import Any, Dict

import numpy as np
import soundfile as sf
from datasets import load_dataset
from tqdm import tqdm

# Output path (final Parquet).
# NOTE(review): this rebinds MUSICC_PARQUET_PATH, which the config cell at the top of the
# notebook already set to DATA_DIR / "musiccaps_offline_<N>.parquet". From this cell on the
# name refers to this waveform file instead, and the path is relative to the current
# working directory rather than built from DATA_DIR.
MUSICC_PARQUET_PATH = Path("data/alignment_offline/musiccaps_waveform.parquet")
# Make sure the destination directory exists; a no-op when it is already there.
MUSICC_PARQUET_PATH.parent.mkdir(parents=True, exist_ok=True)


In [39]:
def _download_musiccaps_clip_to_array(
    ytid: str,
    start_s: float,
    end_s: float,
    tmp_dir: Path,
) -> tuple[np.ndarray, int] | tuple[None, None]:
    """
    Downloads a MusicCaps clip to a temporary wav file, loads waveform into memory,
    deletes file immediately, and returns (waveform, sampling_rate).
    """
    tmp_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = tmp_dir / f"{ytid}.wav"

    url = f"https://www.youtube.com/watch?v={ytid}"
    cmd = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio \
        -o "{tmp_path}" --download-sections "*{start_s}-{end_s}" "{url}"
    """.strip()

    try:
        subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(f"[yt-dlp FAIL] ytid={ytid} | err={e}")
        if tmp_path.exists():
            tmp_path.unlink(missing_ok=True)
        return None, None

    if not tmp_path.exists():
        return None, None

    try:
        waveform, sr = sf.read(tmp_path, always_2d=False)
        waveform = np.asarray(waveform, dtype=np.float32)
    except subprocess.CalledProcessError as e:
        print(f"[yt-dlp FAIL] ytid={ytid}")
        try:
            msg = e.output.decode("utf-8", errors="ignore")
        except Exception:
            msg = str(e)
        print("---- yt-dlp output (truncated) ----")
        print(msg[:500])
        print("---- end ----")
        return None, None


    tmp_path.unlink(missing_ok=True)

    return waveform, sr


In [40]:
import numpy as np
import soundfile as sf
import subprocess
from pathlib import Path
from typing import Any, Dict

def _download_musiccaps_clip_to_array(
    ytid: str,
    start_s: float,
    end_s: float,
    tmp_dir: Path,
) -> tuple[np.ndarray, int] | tuple[None, None]:
    """
    Download full audio for a MusicCaps clip, extract [start_s, end_s] in Python,
    delete the temp file, and return (waveform, sampling_rate).

    Uses yt-dlp to:
      - download best audio
      - extract to wav via ffmpeg
    Then we slice the waveform ourselves.
    """
    tmp_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = tmp_dir / f"{ytid}.wav"

    url = f"https://www.youtube.com/watch?v={ytid}"
    # NOTE: no --download-sections, just extract full audio to wav
    cmd = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio \
        -o "{tmp_path}" "{url}"
    """.strip()

    try:
        subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(f"[yt-dlp FAIL] ytid={ytid}")
        try:
            msg = e.output.decode("utf-8", errors="ignore")
        except Exception:
            msg = str(e)
        print("---- yt-dlp output (truncated) ----")
        print(msg[:500])
        print("---- end ----")
        if tmp_path.exists():
            tmp_path.unlink(missing_ok=True)
        return None, None

    if not tmp_path.exists():
        return None, None

    try:
        waveform, sr = sf.read(tmp_path, always_2d=False)
        waveform = np.asarray(waveform, dtype=np.float32)
    except Exception as e:
        print(f"[sf.read FAIL] ytid={ytid} | err={e}")
        tmp_path.unlink(missing_ok=True)
        return None, None

    # Now trim [start_s, end_s] in samples
    start_idx = int(start_s * sr)
    end_idx = int(end_s * sr)

    # Safety: clip indices to valid range
    start_idx = max(0, min(start_idx, waveform.shape[-1]))
    end_idx = max(start_idx + 1, min(end_idx, waveform.shape[-1]))

    clipped = waveform[start_idx:end_idx]

    tmp_path.unlink(missing_ok=True)

    return clipped, sr


In [41]:
from multiprocessing import cpu_count
from datasets import load_dataset

def build_musiccaps_waveform_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: int | None = None,
    num_workers: int = 8,
    tmp_audio_dir: Path | None = None,
) -> None:
    """
    Build an OFFLINE MusicCaps Parquet with clipped waveforms.

    - Downloads full audio via yt-dlp (audio-only wav).
    - Trims [start_s, end_s] in Python.
    - Stores:
        audio = { "array": np.ndarray(float32), "sampling_rate": int }
      + all original metadata + caption
    - Deletes all temp wavs; only Parquet remains.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if tmp_audio_dir is None:
        tmp_audio_dir = output_path.parent / "musiccaps_tmp"
    tmp_audio_dir = Path(tmp_audio_dir)

    print(f"\nüì• Loading google/MusicCaps split='{split}'...")
    ds = load_dataset("google/MusicCaps", split=split)
    total = len(ds)
    print(f"   Total rows: {total:,}")

    if max_samples is not None and max_samples < total:
        ds = ds.select(range(max_samples))
        print(f"   Selected first {max_samples:,} rows.")
    print(f"   Processing {len(ds):,} rows.")

    def _add_audio_dict(example: Dict[str, Any]) -> Dict[str, Any]:
        ytid = example["ytid"]
        start_s = float(example["start_s"])
        end_s = float(example["end_s"])

        waveform, sr = _download_musiccaps_clip_to_array(
            ytid=ytid,
            start_s=start_s,
            end_s=end_s,
            tmp_dir=tmp_audio_dir,
        )
        ok = waveform is not None
        example["download_ok"] = ok

        if ok:
            example["audio"] = {
                "array": waveform,
                "sampling_rate": int(sr),
            }
        else:
            example["audio"] = None

        return example

    num_workers = min(cpu_count(), num_workers)
    print(f"\nüéß Downloading + loading audio clips using {num_workers} workers...")
    ds = ds.map(
        _add_audio_dict,
        num_proc=num_workers,
        desc="MusicCaps yt-dlp ‚Üí waveform",
    )

    before = len(ds)
    ds = ds.filter(lambda e: e["download_ok"] and e["audio"] is not None)
    after = len(ds)
    print(f"   Successful: {after:,} / {before:,}")

    if after == 0:
        raise RuntimeError("No successful MusicCaps downloads.")

    ds = ds.remove_columns(["download_ok"])

    # Clean tmp dir if any junk
    if tmp_audio_dir.exists():
        try:
            for p in tmp_audio_dir.glob("*.wav"):
                p.unlink()
            tmp_audio_dir.rmdir()
        except OSError:
            pass

    print(f"\nüíæ Saving waveform Parquet to: {output_path}")
    ds.to_parquet(str(output_path))
    print("‚úÖ MusicCaps waveform Parquet complete.")
    print(f"üìå File: {output_path}")
    print("-" * 60)


## Run builders

In [42]:
print("\n=== Building MusicCaps waveform Parquet ===")

# The build downloads thousands of clips, so skip it when the Parquet is already on
# disk. Delete the file to force a rebuild. This keeps "Restart & Run All" affordable.
if MUSICC_PARQUET_PATH.exists():
    print(f"MusicCaps Parquet already exists, skipping build: {MUSICC_PARQUET_PATH}")
else:
    build_musiccaps_waveform_parquet(
        output_path=MUSICC_PARQUET_PATH,
        split="train",
        max_samples=5_000,     # or None for all 5,521
        num_workers=8,
    )

print("Done.")



=== Building MusicCaps waveform Parquet ===

üì• Loading google/MusicCaps split='train'...
   Total rows: 5,521
   Selected first 5,000 rows.
   Processing 5,000 rows.

üéß Downloading + loading audio clips using 8 workers...


MusicCaps yt-dlp ‚Üí waveform (num_proc=8):   0%|          | 0/5000 [00:00<?, ? examples/s]

[yt-dlp FAIL] ytid=l8P2wU-JyI8
---- yt-dlp output (truncated) ----
ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

---- end ----
[yt-dlp FAIL] ytid=BWvKAcOV_co
---- yt-dlp output (truncated) ----
ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

---- end ----
[yt-dlp FAIL] ytid=P4aTFrJws40
---- yt-dlp output (truncated) ----
ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

---- end ----
[yt-dlp FAIL] ytid=WhNun_U3cRU
---- yt-dlp output (truncated) ----
ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

---- end ----
[yt-dlp FAIL] ytid=ctJI7pCbxAo
---- yt-dlp output (truncated) ----
ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

---- end ----
[yt-dlp FAIL] ytid=-0Gj8-vB1q4

Process ForkPoolWorker-46:
Process ForkPoolWorker-47:
Process ForkPoolWorker-48:
Process ForkPoolWorker-50:
Process ForkPoolWorker-52:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-51:
Process ForkPoolWorker-45:
Process ForkPoolWorker-49:
  File "/home/hice1/vchopra37/scratch/projects/edge_glass/edge_glass_env/lib/python3.12/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/hice1/vchopra37/scratch/projects/edge_glass/edge_glass_env/lib/python3.12/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
  File "/home/hice1/vchopra37/scratch/projects/edge_glass/edge_glass_env/lib/python3.12/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/hice1/vchopra37/scratch/projects/edge_glass/edge_glass_env/lib/python3.12/site-packages/multip

TimeoutError: 

## Quick sanity check: load back the Parquet files

In [None]:
# Quick sanity check: reload the MusicCaps Parquet and inspect the first row.
import numpy as np
from datasets import load_dataset

if MUSICC_PARQUET_PATH.exists():
    print("\nüîé Verifying MusicCaps Parquet...")
    musicc_local = load_dataset(
        "parquet", data_files={"train": str(MUSICC_PARQUET_PATH)}
    )["train"]

    print(musicc_local)
    print("Columns:", musicc_local.column_names)

    # Show truncated content of first row
    ex = musicc_local[0]
    print("\nExample row keys:", ex.keys())
    # Rows come back as plain Python containers by default, so the waveform may be a
    # (nested) list with no .shape; convert before inspecting it.
    waveform = np.asarray(ex["audio"]["array"])
    print("Waveform shape:", waveform.shape,
          "SR:", ex["audio"]["sampling_rate"])
    print("Caption:", ex["caption"])
else:
    print(f"MusicCaps Parquet not found, nothing to verify: {MUSICC_PARQUET_PATH}")
