# VALOR-32K Online Builder (Images + Audio → Parquet)

This notebook builds a **tri-modal-friendly** dataset from VALOR-32K annotations **without** storing video clips permanently on disk.

For each annotation:

1. Parse `video_id` → `yt_id`, `start`, `end`.
2. Use **yt-dlp** to download *only the segment* to a **temporary file**.
3. Extract:
   - a middle **video frame** → JPEG bytes
   - the full **audio segment** → 16kHz mono WAV bytes
4. Delete the temporary file immediately.
5. Store a row in memory with:
   - `video_id`, `yt_id`, `start`, `end`, `caption`
   - `image_jpeg` (bytes)
   - `audio_wav` (bytes)

Finally, we save the result to a **Parquet** file for use in your alignment notebooks.

> ⚠️ Requirements: `yt-dlp`, `ffmpeg`, `opencv-python`, `pandas`, `pyarrow`, and `tqdm` must be installed in the environment where you run this.

In [None]:
# =============================================================
# Paths & High-Level Config
# =============================================================
from pathlib import Path

# Project root; defaults to the directory the notebook is launched from.
ROOT_DIR = Path.cwd()

# VALOR annotation file to read (the train split here) -- EDIT to match your layout.
# The file holds a JSON list of records shaped like:
#   {"video_id": "x-2Abohj8VY_30.000_40.000", "desc": "With the rumble, ..."}
#   {"video_id": "ILE12hEW5Ck_30.000_40.000", "desc": "In one room, ..."}
ANNOTATIONS_JSON = ROOT_DIR.joinpath("data", "valor_32k", "desc_train.json")

# Destination Parquet file; its parent folder is created up front if missing.
OUTPUT_PARQUET = ROOT_DIR.joinpath(
    "data", "alignment_subsets", "valor_32k_train_online.parquet"
)
OUTPUT_PARQUET.parent.mkdir(exist_ok=True, parents=True)

# Number of parallel workers for download + extraction
# (a value of 1 or less runs everything sequentially).
NUM_WORKERS = 8

# Debug cap on how many annotations are processed; 0 means "use all".
MAX_SAMPLES = 0

# Target frame size in pixels; set either value to 0 to keep the original size.
IMG_WIDTH, IMG_HEIGHT = 224, 224

# Echo the resolved paths so a wrong working directory is spotted early.
for _label, _value in (
    ("ROOT_DIR:", ROOT_DIR),
    ("Annotations JSON:", ANNOTATIONS_JSON),
    ("Output Parquet:", OUTPUT_PARQUET),
):
    print(_label, _value)

In [None]:
# =============================================================
# Imports & Environment Checks
# =============================================================
import json
import math
import multiprocessing as mp
import os
import shutil
import subprocess
import sys
import tempfile
from functools import partial
from typing import Any, Dict, Optional, Tuple

import cv2
import pandas as pd
from tqdm.auto import tqdm

print("Python version:", os.sys.version)

# Check yt-dlp
def _check_binary(name: str) -> bool:
    from shutil import which
    return which(name) is not None

print("yt-dlp available:", _check_binary("yt-dlp"))
print("ffmpeg available:", _check_binary("ffmpeg"))

if not _check_binary("yt-dlp"):
    print("‚ö†Ô∏è yt-dlp not found in PATH. Please install it before running this notebook.")
if not _check_binary("ffmpeg"):
    print("‚ö†Ô∏è ffmpeg not found in PATH. Please install it before running this notebook.")

In [None]:
# =============================================================
# Helper Functions: parse video_id, download clip, extract media
# =============================================================
def parse_video_id(video_id: str) -> Tuple[str, float, float]:
    """Decompose a VALOR clip identifier into its YouTube id and time window.

    The identifier is split on its last two underscores, so a YouTube id that
    itself contains underscores stays intact.

    Example: "x-2Abohj8VY_30.000_40.000" -> ("x-2Abohj8VY", 30.0, 40.0)

    Raises:
        ValueError: if the identifier has fewer than two underscores, or if
            either time field is not a valid float.
    """
    fields = video_id.rsplit("_", maxsplit=2)
    if len(fields) == 3:
        clip_id, begin_text, finish_text = fields
        return clip_id, float(begin_text), float(finish_text)
    raise ValueError(f"Unexpected video_id format: {video_id}")


def download_clip_to_temp(yt_id: str, start: float, end: float) -> Optional[Path]:
    """Download only the [start, end] section of a YouTube video to a temp file.

    A fresh temporary directory is created for every call and the clip is
    written to ``clip.mp4`` inside it.

    Args:
        yt_id: YouTube video id (the ``v=`` query parameter).
        start: Section start, in seconds.
        end: Section end, in seconds; must be greater than ``start``.

    Returns:
        Path to the downloaded clip, or None on any failure (non-positive
        duration, yt-dlp missing or failing, or no output file produced).
        On success the caller owns the file and must delete both it and its
        parent directory; on failure the temporary directory is removed here.
    """
    url = f"https://www.youtube.com/watch?v={yt_id}"
    duration = max(0.0, end - start)
    if duration <= 0:
        print(f"[ERROR] Invalid duration {duration} for {yt_id}")
        return None

    tmp_dir = Path(tempfile.mkdtemp(prefix="valor_clip_"))
    out_path = tmp_dir / "clip.mp4"

    # Per yt-dlp's --download-sections syntax, a leading "*" marks a time range.
    section = f"*{start}-{end}"

    cmd = [
        "yt-dlp",
        "-f", "mp4",
        "--download-sections", section,
        "-o", str(out_path),
        # NOTE(review): takes cookies from a local Chrome profile, so Chrome
        # must be present on the machine -- confirm for headless setups.
        "--cookies-from-browser", "chrome",
        url,
    ]

    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] yt-dlp failed for {yt_id}:")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
    except OSError as e:
        # Raised when the yt-dlp executable itself cannot be started
        # (e.g. it is not installed); report it like any other failure.
        print(f"[ERROR] Could not run yt-dlp for {yt_id}: {e}")
    else:
        if out_path.exists():
            return out_path
        print(f"[ERROR] Clip file not created for {yt_id}")

    # Every failure path ends here: do not leak the temporary directory
    # (or any partial download left inside it).
    shutil.rmtree(tmp_dir, ignore_errors=True)
    return None

def extract_middle_frame(video_path: Path, resize: Optional[Tuple[int, int]] = None) -> Optional[bytes]:
    """Grab the frame at the midpoint of a video and encode it as JPEG.

    Args:
        video_path: Video file to read.
        resize: Optional ``(width, height)`` to scale the frame to before
            encoding; None keeps the decoded frame size.

    Returns:
        JPEG bytes, or None if the video cannot be opened, reports no frames,
        the middle frame cannot be read, or JPEG encoding fails.
    """
    capture = cv2.VideoCapture(str(video_path))
    if not capture.isOpened():
        return None

    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    grabbed, image = False, None
    if total_frames > 0:
        # Seek to the midpoint frame index, then decode a single frame.
        capture.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
        grabbed, image = capture.read()
    capture.release()

    if not grabbed or image is None:
        return None

    if resize is not None:
        target_w, target_h = resize
        image = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_AREA)

    encoded_ok, jpeg = cv2.imencode(".jpg", image)
    return jpeg.tobytes() if encoded_ok else None


def extract_audio_wav(video_path: Path, sr: int = 16000) -> Optional[bytes]:
    """Extract mono WAV audio at the given sample rate from a video.

    Runs ``ffmpeg`` and captures the WAV stream it writes to stdout.

    Args:
        video_path: Video file to read the audio track from.
        sr: Output sample rate in Hz.

    Returns:
        WAV bytes, or None on failure (ffmpeg exits non-zero, or the ffmpeg
        executable cannot be started at all).
    """
    # NOTE(review): the WAV goes to a non-seekable pipe, so ffmpeg presumably
    # cannot back-patch the RIFF/data length fields in the header -- confirm
    # that downstream readers tolerate this.
    cmd = [
        "ffmpeg",
        "-i", str(video_path),
        "-vn",              # no video
        "-ac", "1",         # mono
        "-ar", str(sr),     # sample rate
        "-f", "wav",
        "-loglevel", "quiet",
        "pipe:1",           # write to stdout
    ]
    try:
        proc = subprocess.run(
            cmd,
            check=True,
            # Keep ffmpeg off the parent's stdin so concurrent workers in a
            # notebook cannot have it read (or block on) interactive input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing/unstartable ffmpeg binary, which should be
        # reported as "no audio" rather than crash the worker.
        return None
    return proc.stdout


In [None]:
# =============================================================
# Worker Function for Multiprocessing
# =============================================================
import time

def _cleanup_clip(clip_path: Path) -> None:
    """Remove the temporary directory that holds *clip_path*, logging the outcome."""
    print("[INFO] Cleaning up temporary files...")
    try:
        # The clip lives in its own mkdtemp() directory, so removing the whole
        # directory also gets rid of any stray files left beside the clip.
        shutil.rmtree(clip_path.parent)
        print("[OK] Cleanup complete")
    except OSError as e:
        print(f"[WARN] Cleanup issue: {e}")


def process_annotation(entry: Dict[str, Any], resize: Optional[Tuple[int, int]] = None) -> Optional[Dict[str, Any]]:
    """
    Convert one VALOR entry → Parquet row with logs.

    Args:
        entry: Annotation dict; must carry ``video_id`` and may carry the
            caption under ``desc`` or ``caption``.
        resize: Optional ``(width, height)`` passed on to frame extraction.

    Returns:
        A dict with ``video_id``, ``yt_id``, ``start``, ``end``, ``caption``,
        ``image_jpeg`` and ``audio_wav``, or None if any step fails. The
        temporary clip is removed on both the success and the failure path.

    Logs:
      - Started processing
      - Download success/failure
      - Frame extraction success/failure
      - Audio extraction success/failure
      - Cleanup status
      - Final success/failure
    """
    video_id = entry.get("video_id")
    caption = entry.get("desc") or entry.get("caption") or ""
    if not video_id:
        print(f"[WARN] Missing video_id in entry: {entry}")
        return None

    try:
        yt_id, start, end = parse_video_id(video_id)
    except Exception as e:
        print(f"[ERROR] Failed to parse video_id={video_id}: {e}")
        return None

    print(f"\n--- Processing {video_id} ---")
    print(f"[INFO] yt_id={yt_id}  start={start}  end={end}")

    t0 = time.time()

    print("[INFO] Downloading clip segment...")
    clip_path = download_clip_to_temp(yt_id, start, end)
    if clip_path is None:
        print(f"[ERROR] Failed to download segment for {video_id}")
        return None
    print(f"[OK] Downloaded to: {clip_path}")

    # Extract frame + audio. The broad catch is deliberate: this is the worker
    # boundary, and one bad clip must not abort the whole run.
    try:
        print("[INFO] Extracting middle frame...")
        frame_bytes = extract_middle_frame(clip_path, resize=resize)
        if frame_bytes is None:
            print("[ERROR] Failed to extract middle frame")
            raise RuntimeError("frame extraction failure")
        print("[OK] Frame extracted")

        print("[INFO] Extracting audio wav...")
        audio_bytes = extract_audio_wav(clip_path, sr=16000)
        if audio_bytes is None:
            print("[ERROR] Failed to extract audio")
            raise RuntimeError("audio extraction failure")
        print(f"[OK] Audio extracted: {len(audio_bytes)} bytes")
    except Exception as e:
        print(f"[ERROR] Extraction failed for {video_id}: {e}")
        return None
    finally:
        # Single cleanup point for success and failure alike.
        _cleanup_clip(clip_path)

    dt = time.time() - t0
    print(f"[SUCCESS] {video_id} processed in {dt:.2f}s")

    return {
        "video_id": video_id,
        "yt_id": yt_id,
        "start": start,
        "end": end,
        "caption": caption,
        "image_jpeg": frame_bytes,
        "audio_wav": audio_bytes,
    }


In [None]:
# =============================================================
# Build VALOR-32K Parquet (Online Download)
# =============================================================
# Validate with an explicit raise: an `assert` is stripped when Python runs
# with -O, which would let a missing file surface later as a less clear error.
if not ANNOTATIONS_JSON.is_file():
    raise FileNotFoundError(f"Annotations JSON not found: {ANNOTATIONS_JSON}")

with ANNOTATIONS_JSON.open("r", encoding="utf-8") as f:
    annotations = json.load(f)

if not isinstance(annotations, list):
    raise ValueError("Annotations JSON must be a list of dicts.")

# Optional debugging cap; 0 (or None) keeps every annotation.
if MAX_SAMPLES and MAX_SAMPLES > 0:
    annotations = annotations[:MAX_SAMPLES]

print(f"Total annotations to process: {len(annotations)}")

# A non-positive width or height disables resizing (frames keep their size).
resize = (IMG_WIDTH, IMG_HEIGHT) if IMG_WIDTH > 0 and IMG_HEIGHT > 0 else None

# Accumulators filled by the processing cell.
results = []
num_failed = 0

In [None]:
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool, not processes

# Run every annotation through process_annotation, collecting the rows that
# succeed in `results` and counting the ones that returned None.
if NUM_WORKERS > 1:
    # Thread pool (better for Jupyter + macOS); rows arrive in completion order.
    worker = partial(process_annotation, resize=resize)
    with ThreadPool(processes=NUM_WORKERS) as pool:
        outcomes = pool.imap_unordered(worker, annotations, chunksize=4)
        progress = tqdm(
            outcomes,
            total=len(annotations),
            desc="Processing VALOR entries (threads)",
        )
        for row in progress:
            if row is not None:
                results.append(row)
            else:
                num_failed += 1
else:
    # Sequential fallback: one annotation at a time in the notebook process.
    for entry in tqdm(annotations, desc="Processing VALOR entries (single process)"):
        row = process_annotation(entry, resize=resize)
        if row is not None:
            results.append(row)
        else:
            num_failed += 1


In [None]:
# Summarize the run before writing anything to disk.
print("\n✅ Finished processing annotations.")
print(f"   Successful rows: {len(results)}")
print(f"   Failed/missed  : {num_failed}")

if not results:
    # Nothing usable was collected, so no Parquet file is written.
    print("⚠️ No valid rows collected. Skipping Parquet save.")
else:
    # One DataFrame row per successfully processed annotation.
    df = pd.DataFrame(results)
    print(df.head())
    df.to_parquet(OUTPUT_PARQUET, index=False)
    print(f"\n💾 Saved VALOR-32K image+audio+caption parquet to: {OUTPUT_PARQUET}")