In [None]:
from datasets import load_dataset, Audio
import os
import json
import soundfile as sf  # pip install soundfile
from tqdm.auto import tqdm

# 0. Root directory: the Parquet shards are read from here, and the WAV
#    folders and JSONL manifests are written next to them.
BASE_OUT = "/external3/databases/ai4bharat_indicvoices/bengali"

# 1. One glob pattern per split, matching that split's Parquet shards
data_files = {
    split_name: os.path.join(BASE_OUT, f"{split_name}-*.parquet")
    for split_name in ("train", "valid")
}

# 2. Load the shards with the generic "parquet" builder, then cast the
#    "audio_filepath" column to an Audio feature at 16 kHz.
ds = load_dataset("parquet", data_files=data_files).cast_column(
    "audio_filepath", Audio(sampling_rate=16000)
)

# 3. Fields copied into every JSONL manifest record, in output order
columns_to_keep = [
    "audio_filepath", "text", "duration",
    "task_name", "gender", "age_group", "state",
]

# 4. Process each split, resuming from earlier runs, with progress bars
def _load_recorded_paths(jsonl_path):
    """Return the ``audio_filepath`` values already present in a manifest.

    The manifest is read line by line. An unterminated final line (left
    behind when a previous run was interrupted mid-write) is trimmed from the
    file so that appended records never get glued onto it; the row it
    belonged to is simply exported again. Lines that are not valid JSON
    objects with an ``audio_filepath`` key are ignored.

    Args:
        jsonl_path: Path of the JSONL manifest; it does not have to exist.

    Returns:
        A set of the recorded ``audio_filepath`` values (empty when the
        manifest does not exist yet).
    """
    recorded = set()
    keep_bytes = 0
    try:
        with open(jsonl_path, "rb+") as fh:
            for raw_line in fh:
                if not raw_line.endswith(b"\n"):
                    # Only the last line can lack a newline: partial write.
                    break
                keep_bytes += len(raw_line)
                try:
                    recorded.add(json.loads(raw_line)["audio_filepath"])
                except (ValueError, KeyError, TypeError):
                    # Blank or malformed line: nothing usable to resume from.
                    continue
            fh.truncate(keep_bytes)
    except FileNotFoundError:
        pass
    return recorded


def _write_wav_atomically(dst, samples, sample_rate):
    """Write ``samples`` to ``dst`` as WAV without exposing a partial file.

    The data goes to a sibling ``.part`` file first and is then moved into
    place with ``os.replace``, so ``dst`` only ever exists once fully written.
    The format is passed explicitly because the temporary name does not end
    in ``.wav``.
    """
    tmp_path = f"{dst}.part"
    sf.write(tmp_path, samples, sample_rate, format="WAV")
    os.replace(tmp_path, dst)


for split in ("train", "valid"):
    out_wav_dir = os.path.join(BASE_OUT, f"wavs_{split}")
    os.makedirs(out_wav_dir, exist_ok=True)

    jsonl_path = os.path.join(BASE_OUT, f"{split}_manifest.jsonl")
    recorded = _load_recorded_paths(jsonl_path)

    new_count = 0
    total = len(ds[split])
    # "a" creates the manifest when it is missing, so no mode switch is needed.
    with open(jsonl_path, "a", encoding="utf-8") as fout:
        progress = tqdm(ds[split], total=total, desc=f"Processing {split}")
        for idx, example in enumerate(progress):
            audio_dict = example["audio_filepath"]
            orig_path = audio_dict["path"]
            if orig_path:
                stem = os.path.splitext(os.path.basename(orig_path))[0]
            else:
                # No source path on this row (presumably audio stored as
                # embedded bytes); fall back to a name derived from the
                # row index.
                stem = f"{split}_{idx:08d}"
            dst = os.path.join(out_wav_dir, f"{stem}.wav")

            # A row is done only when it is BOTH on disk and in the manifest.
            # Checking the disk alone would permanently drop the record of any
            # WAV written just before an earlier run was interrupted.
            is_recorded = dst in recorded
            if is_recorded and os.path.exists(dst):
                continue

            # (Re)write the WAV: a file that exists without a manifest record
            # may be a truncated leftover from an interrupted run.
            _write_wav_atomically(
                dst, audio_dict["array"], audio_dict["sampling_rate"]
            )
            new_count += 1

            if not is_recorded:
                # Build and write the filtered JSON record, pointing
                # "audio_filepath" at the exported WAV.
                record = {
                    k: (dst if k == "audio_filepath" else example[k])
                    for k in columns_to_keep
                }
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                # Flush per record so the manifest stays in step with the
                # WAVs on disk if the run is interrupted.
                fout.flush()
                recorded.add(dst)

    print(f"{split}: +{new_count} new WAVs → {out_wav_dir}/")
    print(f"{split}: metadata appended → {jsonl_path}")
