
# MusicCaps → Waveform Parquet Builder (Mac / Local)

This notebook downloads the **Google MusicCaps** dataset audio from YouTube,
extracts the specified `[start_s, end_s]` segment for each clip, and saves
everything into a **single Parquet file** with:

- `audio.array` – the waveform as a float32 numpy array
- `audio.sampling_rate` – sampling rate (Hz)
- all original MusicCaps metadata (caption, ytid, etc.)

You run this **once** on your Mac, then copy the Parquet file to PACE and train
purely offline from that file.


## 1. Install dependencies (run once)

In [1]:

# If needed, install these in your Mac environment (uncomment as needed):
# ! uv pip install yt-dlp soundfile datasets

# On macOS, make sure ffmpeg is installed, e.g.:
#   brew install ffmpeg
#
# And confirm:
# ! ffmpeg -version | head -n 1


## 2. Imports and configuration

In [2]:

import subprocess
from pathlib import Path
from typing import Any, Dict, Tuple

import numpy as np
import soundfile as sf
from datasets import load_dataset
from multiprocessing import cpu_count

# ---- PATHS ----
# All locations are derived from the directory the kernel was started in.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT.joinpath("data")
MUSICC_PARQUET_PATH = DATA_DIR.joinpath("musiccaps_waveform.parquet")
TMP_AUDIO_DIR = DATA_DIR.joinpath("musiccaps_tmp")

# Make sure the data directory exists before anything tries to write into it.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# ---- CONFIG ----
# Row cap for the build: an int (e.g. 5000) for a subset, None for all 5521 rows.
MAX_SAMPLES = None
# Requested number of parallel download workers; tune to your Mac's core count.
NUM_WORKERS = 32

print(f"Project root: {PROJECT_ROOT}")
print(f"Output Parquet: {MUSICC_PARQUET_PATH}")
print(f"Tmp audio dir: {TMP_AUDIO_DIR}")


Project root: /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base
Output Parquet: /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base/data/musiccaps_waveform.parquet
Tmp audio dir: /Users/vedaangchopra/all_data/complete_technical_work/all_projects_implemented/Edge Assistant/code_base/v2_code_base/data/musiccaps_tmp


## 3. Helper – download clip and slice with ffmpeg via yt-dlp

In [3]:
def _download_musiccaps_clip_to_array(
    ytid: str,
    start_s: float,
    end_s: float,
    tmp_dir: Path,
    timeout_s: float = 120.0,
) -> tuple[np.ndarray | None, int | None, str]:
    """
    Download the [start_s, end_s] segment of a YouTube video as WAV via yt-dlp,
    load it into memory, delete the temp file, and return:
        (waveform, sampling_rate, fail_reason)

    Args:
        ytid: YouTube video id; also used as the temp file stem.
        start_s: Segment start, in seconds.
        end_s: Segment end, in seconds.
        tmp_dir: Directory for the temporary ``<ytid>.wav`` (created if missing).
        timeout_s: Per-clip wall-clock limit for the yt-dlp call, in seconds.

    Returns:
        - On success: (np.ndarray of float32, int, "ok")
        - On failure: (None, None, <short_reason_string>) where the reason is one
          of "timeout", "video_terminated", "private", "format_unavailable",
          "yt_dlp_error", "no_output_file" or "sf_read_error".

    This function never raises for download/decoding problems; every failure is
    reported through the returned reason string so a parallel worker keeps going.
    """
    tmp_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = tmp_dir / f"{ytid}.wav"

    url = f"https://www.youtube.com/watch?v={ytid}"

    # Pass an argv list (no shell): values such as ytid and the output path are
    # never interpreted by a shell, and on timeout the kill is delivered to
    # yt-dlp itself rather than to an intermediate shell process.
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "-f", "bestaudio/best",
        "--cookies-from-browser", "chrome",
        "-x",
        "--audio-format", "wav",
        "-o", str(tmp_path),
        "--download-sections", f"*{start_s}-{end_s}",
        url,
    ]

    try:
        # Bounded per-clip timeout so a stuck download cannot hang a worker.
        subprocess.check_output(
            cmd,
            stderr=subprocess.STDOUT,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        print(f"[yt-dlp TIMEOUT] ytid={ytid}")
        tmp_path.unlink(missing_ok=True)
        return None, None, "timeout"
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: yt-dlp ran and exited non-zero.
        # OSError: yt-dlp could not be started at all (e.g. not on PATH); without
        # a shell this surfaces as an exception instead of an exit code.
        raw = getattr(e, "output", None)
        if isinstance(raw, bytes):
            msg = raw.decode("utf-8", errors="ignore")
        else:
            msg = str(raw) if raw else str(e)

        # Categorize using substrings of yt-dlp's combined stdout/stderr.
        if "Video unavailable" in msg and "terminated" in msg:
            reason = "video_terminated"
        elif "Video unavailable" in msg and "private" in msg:
            reason = "private"
        elif "Requested format is not available" in msg:
            reason = "format_unavailable"
        else:
            reason = "yt_dlp_error"

        print(f"[yt-dlp FAIL] ytid={ytid} | reason={reason}")
        print("---- yt-dlp output (truncated) ----")
        print(msg[:300])
        print("---- end ----")

        tmp_path.unlink(missing_ok=True)
        return None, None, reason

    # yt-dlp can exit 0 without producing the expected file.
    if not tmp_path.exists():
        return None, None, "no_output_file"

    try:
        # NOTE(review): with always_2d=False a mono file yields a 1-D array and a
        # multi-channel file a 2-D (frames, channels) array; confirm downstream
        # consumers handle both.
        waveform, sr = sf.read(tmp_path, always_2d=False)
        waveform = np.asarray(waveform, dtype=np.float32)
    except Exception as e:
        print(f"[sf.read FAIL] ytid={ytid} | err={e}")
        tmp_path.unlink(missing_ok=True)
        return None, None, "sf_read_error"

    # Clean up file immediately; the waveform now lives in memory only.
    tmp_path.unlink(missing_ok=True)

    return waveform, sr, "ok"



## 4. Builder – MusicCaps → waveform Parquet

In [4]:
from collections import Counter

def build_musiccaps_waveform_parquet(
    output_path: Path,
    split: str = "train",
    max_samples: int | None = None,
    num_workers: int = 8,
    tmp_audio_dir: Path | None = None,
) -> None:
    """
    Build an OFFLINE MusicCaps Parquet with clipped waveforms.

    - Loads google/MusicCaps metadata.
    - For each row:
        * yt-dlp + ffmpeg extract [start_s, end_s] into a temp .wav.
        * Loads waveform into memory, deletes .wav.
        * Stores `audio` dict and logs `download_ok` + `fail_reason`.
    - Logs summary: total, success, failures by reason.
    - Saves successful rows to a single Parquet file.

    Args:
        output_path: Destination Parquet file; parent directories are created.
        split: Dataset split passed to `load_dataset`.
        max_samples: If given and smaller than the split, keep only the first
            `max_samples` rows.
        num_workers: Requested number of parallel processes; clamped to the
            range [1, cpu_count()].
        tmp_audio_dir: Scratch directory for temporary .wav files. Defaults to
            `<output_path.parent>/musiccaps_tmp`.

    Raises:
        RuntimeError: If not a single clip could be downloaded.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if tmp_audio_dir is None:
        tmp_audio_dir = output_path.parent / "musiccaps_tmp"
    tmp_audio_dir = Path(tmp_audio_dir)

    print(f"\nüì• Loading google/MusicCaps split='{split}'...")
    ds = load_dataset("google/MusicCaps", split=split)
    total = len(ds)
    print(f"   Total rows in MusicCaps: {total:,}")

    if max_samples is not None and max_samples < total:
        ds = ds.select(range(max_samples))
        print(f"   Selected first {max_samples:,} rows.")
    print(f"   Processing {len(ds):,} rows.")

    def _add_audio_dict(example: Dict[str, Any]) -> Dict[str, Any]:
        """Download one row's clip and attach `audio`, `download_ok`, `fail_reason`."""
        waveform, sr, reason = _download_musiccaps_clip_to_array(
            ytid=example["ytid"],
            start_s=float(example["start_s"]),
            end_s=float(example["end_s"]),
            tmp_dir=tmp_audio_dir,
        )

        ok = waveform is not None
        example["download_ok"] = ok
        example["fail_reason"] = None if ok else reason
        example["audio"] = (
            {"array": waveform, "sampling_rate": int(sr)} if ok else None
        )
        return example

    # Never exceed the machine's core count, and never go below one process
    # (a zero or negative request would otherwise reach `map` unchanged).
    num_workers = max(1, min(cpu_count(), num_workers))
    print(f"\nüéß Downloading + slicing audio using {num_workers} workers...")
    ds = ds.map(
        _add_audio_dict,
        num_proc=num_workers,
        desc="MusicCaps yt-dlp ‚Üí waveform",
    )

    # Downloads are finished: remove leftover temp files now so the scratch
    # directory is cleaned up even if the run aborts below with zero successes.
    if tmp_audio_dir.exists():
        try:
            for p in tmp_audio_dir.glob("*.wav"):
                p.unlink()
            tmp_audio_dir.rmdir()
        except OSError as e:
            # Best-effort only (e.g. directory still holds non-.wav files).
            print(f"   Note: could not fully remove {tmp_audio_dir}: {e}")

    # ---- Summary logs BEFORE filtering ----
    total_rows = len(ds)
    download_ok_list = ds["download_ok"]
    fail_reason_list = ds["fail_reason"]

    success_count = sum(bool(x) for x in download_ok_list)
    fail_count = total_rows - success_count

    print("\nüìä Download summary (before filtering):")
    print(f"   Total rows   : {total_rows:,}")
    print(f"   Successful   : {success_count:,}")
    print(f"   Failed       : {fail_count:,}")

    if fail_count > 0:
        # Count reasons for failed rows only; successful rows carry no reason
        # and must not appear in the failure breakdown.
        reason_counts = Counter(
            (reason or "unknown")
            for ok, reason in zip(download_ok_list, fail_reason_list)
            if not ok
        )
        print("   Breakdown by reason:")
        for reason, cnt in sorted(reason_counts.items(), key=lambda x: (-x[1], x[0])):
            print(f"     - {reason:20s}: {cnt:,}")

    # ---- Filter to successful only ----
    # `download_ok` is True exactly when `audio` was set, so filtering on that
    # single column is equivalent and avoids loading every waveform just to
    # evaluate the predicate.
    before = len(ds)
    ds = ds.filter(lambda ok: bool(ok), input_columns=["download_ok"])
    after = len(ds)
    print(f"\n‚úÖ Rows kept for Parquet (successful only): {after:,} / {before:,}")

    if after == 0:
        raise RuntimeError("No successful MusicCaps downloads. Check yt-dlp / ffmpeg / cookies.")

    # We don't need download_ok / fail_reason in the final Parquet
    ds = ds.remove_columns(["download_ok", "fail_reason"])

    print(f"\nüíæ Saving waveform Parquet to: {output_path}")
    ds.to_parquet(str(output_path))
    print("üéâ MusicCaps waveform Parquet complete.")
    print(f"üìå File: {output_path}")


## 5. Run the builder

In [None]:

# print("\n=== Building MusicCaps waveform Parquet ===")
# if MUSICC_PARQUET_PATH.exists():
#     print(f"üìÇ Parquet already exists, skipping build: {MUSICC_PARQUET_PATH}")
# else:
#     build_musiccaps_waveform_parquet(
#         output_path=MUSICC_PARQUET_PATH,
#         split="train",
#         max_samples=MAX_SAMPLES,
#         num_workers=NUM_WORKERS,
#         tmp_audio_dir=TMP_AUDIO_DIR,
#     )
#     print("‚úÖ Build complete.")


## 6. Quick sanity check

In [None]:

from datasets import load_dataset

# Load the Parquet back and print one example to confirm the file is usable.
if MUSICC_PARQUET_PATH.exists():
    print("\nüîé Verifying MusicCaps Parquet...")
    musiccaps_local = load_dataset(
        "parquet",
        data_files={"train": str(MUSICC_PARQUET_PATH)},
    )["train"]

    print(musiccaps_local)
    print("Columns:", musiccaps_local.column_names)

    ex = musiccaps_local[0]
    # In the default output format the nested `array` field comes back as a
    # plain Python list, which has no `.shape`; convert before inspecting it.
    audio_array = np.asarray(ex["audio"]["array"], dtype=np.float32)

    print("\nExample audio:")
    print("  sampling_rate:", ex["audio"]["sampling_rate"])
    print("  array shape  :", audio_array.shape)
    print("  caption      :", ex["caption"])
else:
    # Make the skipped check visible instead of silently doing nothing.
    print(f"Parquet not found at {MUSICC_PARQUET_PATH}; run the builder cell first.")


In [10]:
# # -----------------------------------------------------------------------------
# #                           MECAT‚ÄìCAPTION BUILDER
# # -----------------------------------------------------------------------------
# from pathlib import Path
# from typing import Optional, Dict, Any

# from datasets import load_dataset, Audio
# from tqdm import tqdm


# def build_mecat_caption_parquet(
#     output_path: Path,
#     split: str = "train",
#     max_samples: Optional[int] = None,
#     sampling_rate: int = 16_000,
#     caption_key: str = "long",
# ) -> None:
#     """
#     Build an OFFLINE Parquet for `mispeech/MECAT-Caption`.

#     - Uses the already-packaged audio (`flac`) ‚Üí cast to Audio(sampling_rate).
#     - Adds a flat `caption` column from the nested `json[caption_key]`.
#     - Keeps:
#         * __key__
#         * audio  (Audio column)
#         * json   (all caption variants: long/short/speech/music/sound/environment)
#         * caption (chosen main caption)
#     - Saves everything to a single Parquet file.

#     Args:
#         output_path: Where to save the Parquet.
#         split: HF split to use ("train" or "test"; MECAT-Caption mainly uses "train").
#         max_samples: If provided, keep at most this many rows.
#         sampling_rate: Target sampling rate for the Audio column.
#         caption_key: Which key inside `json` to use as the main flat `caption`
#                      (e.g. "long", "short", "speech", "music", "sound", "environment").
#     """
#     output_path = Path(output_path)
#     output_path.parent.mkdir(parents=True, exist_ok=True)

#     print(f"\nüì• Loading mispeech/MECAT-Caption split='{split}'...")
#     ds = load_dataset("mispeech/MECAT-Caption", split=split)
#     total = len(ds)
#     print(f"   Total rows in MECAT-Caption: {total:,}")

#     # Optional subsetting
#     if max_samples is not None and max_samples < total:
#         ds = ds.select(range(max_samples))
#         print(f"   Selected first {max_samples:,} rows.")
#     print(f"   Working with {len(ds):,} rows.")

#     # Cast 'flac' to proper Audio column
#     print("\nüéß Casting 'flac' column to Audio...")
#     ds = ds.cast_column("flac", Audio(sampling_rate=sampling_rate))

#     # Add a flat `caption` column from json[caption_key]
#     def _add_caption(example: Dict[str, Any]) -> Dict[str, Any]:
#         j = example.get("json", {}) or {}
#         # Fallback to 'long' if chosen key is missing
#         caption = j.get(caption_key, j.get("long", ""))
#         example["caption"] = caption
#         return example

#     print(f"üìù Adding 'caption' column from json['{caption_key}']...")
#     ds = ds.map(_add_caption, desc="Adding caption")

#     # Rename 'flac' -> 'audio' to match your other datasets (PixMo, MusicCaps)
#     print("üîÅ Renaming 'flac' ‚Üí 'audio'...")
#     ds = ds.rename_column("flac", "audio")

#     # Quick preview
#     print("\nüîé Example row:")
#     ex = ds[0]
#     print("  __key__      :", ex.get("__key__", "")[:80])
#     print("  caption      :", ex.get("caption", "")[:120], "...")
#     print("  audio.sr     :", ex["audio"]["sampling_rate"])
#     print("  audio.shape  :", ex["audio"]["array"].shape)
#     print("  json keys    :", list(ex.get("json", {}).keys()))

#     # Save to Parquet
#     print(f"\nüíæ Saving MECAT-Caption Parquet to: {output_path}")
#     ds.to_parquet(str(output_path))
#     print("‚úÖ MECAT-Caption Parquet saved.")
#     print(f"üìå File: {output_path}")
#     print("-" * 60)


In [9]:
# MECAT_PARQUET_PATH = Path("data/alignment_offline/mecat_caption.parquet")

# if MECAT_PARQUET_PATH.exists():
#     print(f"üìÇ MECAT Parquet already exists, skipping build: {MECAT_PARQUET_PATH}")
# else:
#     build_mecat_caption_parquet(
#         output_path=MECAT_PARQUET_PATH,
#         split="train",
#         max_samples=20_000,
#         sampling_rate=16_000,
#         caption_key="long",
#     )
