In [1]:
!wget https://www.openslr.org/resources/47/primewords_md_2018_set1.tar.gz

--2025-05-09 06:52:49--  https://www.openslr.org/resources/47/primewords_md_2018_set1.tar.gz
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://openslr.elda.org/resources/47/primewords_md_2018_set1.tar.gz [following]
--2025-05-09 06:52:50--  https://openslr.elda.org/resources/47/primewords_md_2018_set1.tar.gz
Resolving openslr.elda.org (openslr.elda.org)... 141.94.109.138, 2001:41d0:203:ad8a::
Connecting to openslr.elda.org (openslr.elda.org)|141.94.109.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9057625192 (8.4G) [application/x-gzip]
Saving to: ‘primewords_md_2018_set1.tar.gz’


2025-05-09 06:57:21 (31.9 MB/s) - ‘primewords_md_2018_set1.tar.gz’ saved [9057625192/9057625192]



In [1]:
!tar -xzvf /external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/primewords_md_2018_set1.tar.gz

primewords_md_2018_set1/
primewords_md_2018_set1/set1_transcript.json
primewords_md_2018_set1/audio_files/
primewords_md_2018_set1/audio_files/3/
primewords_md_2018_set1/audio_files/3/36/
primewords_md_2018_set1/audio_files/3/36/366197d9-1152-4226-82ee-94e9296cb48c.wav
primewords_md_2018_set1/audio_files/3/36/362edab9-f744-48a9-8047-2e4389f220c6.wav
primewords_md_2018_set1/audio_files/3/36/36353c44-6add-426c-bba7-846d97518979.wav
primewords_md_2018_set1/audio_files/3/36/36edba3e-a123-40d0-a25a-3dbe7e792c62.wav
primewords_md_2018_set1/audio_files/3/36/36b7791e-c5f5-4752-9f09-623fdff22c4d.wav
primewords_md_2018_set1/audio_files/3/36/36945268-c31d-4e12-9011-a4ee35eefdfb.wav
primewords_md_2018_set1/audio_files/3/36/361cbb4a-ebfa-4115-a036-514c204d55dc.wav
primewords_md_2018_set1/audio_files/3/36/36331eb1-4302-40df-8e35-ffa7043cd60a.wav
primewords_md_2018_set1/audio_files/3/36/36f87cd2-17cf-4666-9281-62b0be31f683.wav
primewords_md_2018_set1/audio_files/3/36/36ba348c-0262-40d6-9beb-4a406caee

In [1]:
#!/usr/bin/env python3
import os
import json
import shutil

# ─── Configuration (hard-coded) ────────────────────────────────────────────────
AUDIO_ROOT      = "/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/primewords_md_2018_set1/audio_files"
INPUT_JSON      = "/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/primewords_md_2018_set1/set1_transcript.json"
WAVS_DIR        = "/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/wavs/"
OUTPUT_MANIFEST = "/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/manifest.jsonl"
# ────────────────────────────────────────────────────────────────────────────────

def build_file_index(audio_root):
    """
    Walk `audio_root` and return a dict mapping base-filename.wav → full path.
    """
    index = {}
    for root, _, files in os.walk(audio_root):
        for fn in files:
            if fn.lower().endswith('.wav'):
                index[fn] = os.path.join(root, fn)
    return index

def main():
    # 1) Load transcript JSON
    with open(INPUT_JSON, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # 2) Build audio lookup
    file_index = build_file_index(AUDIO_ROOT)
    if not file_index:
        raise RuntimeError(f"No .wav files found under {AUDIO_ROOT!r}")

    # 3) Prepare wavs directory
    os.makedirs(WAVS_DIR, exist_ok=True)

    # 4) Open manifest for writing
    with open(OUTPUT_MANIFEST, 'w', encoding='utf-8') as out_f:
        for rec in records:
            # key might be "file" or "audio_filepath"
            fname = rec.get('file') or rec.get('audio_filepath')
            if not fname:
                print("Skipping record without 'file' or 'audio_filepath':", rec)
                continue

            src = file_index.get(fname)
            if src is None:
                print(f"Warning: audio file {fname!r} not found under {AUDIO_ROOT}")
                continue

            # copy to wavs/
            dst = os.path.join(WAVS_DIR, fname)
            shutil.copy2(src, dst)

            # build manifest entry
            entry = {
                "audio_filepath": dst,
                "text": rec.get("text", "").strip()
            }
            # include duration if available
            length = rec.get("length") or rec.get("duration")
            if length is not None:
                try:
                    entry["duration"] = float(length)
                except (TypeError, ValueError):
                    pass

            out_f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"Done! → Manifest: {OUTPUT_MANIFEST!r},  WAVs dir: {WAVS_DIR!r}")

if __name__ == "__main__":
    main()


Done! → Manifest: '/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/manifest.jsonl',  WAVs dir: '/external4/datasets/Mandarin/Primewords_Chinese_Corpus_Set_1/wavs/'
