In [3]:
!wget https://www.openslr.org/resources/18/resource.tgz

--2025-05-09 07:00:05--  https://www.openslr.org/resources/18/resource.tgz


Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://openslr.elda.org/resources/18/resource.tgz [following]
--2025-05-09 07:00:05--  https://openslr.elda.org/resources/18/resource.tgz
Resolving openslr.elda.org (openslr.elda.org)... 141.94.109.138, 2001:41d0:203:ad8a::
Connecting to openslr.elda.org (openslr.elda.org)|141.94.109.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24813708 (24M) [application/x-gzip]
Saving to: ‘resource.tgz’


2025-05-09 07:00:07 (16.8 MB/s) - ‘resource.tgz’ saved [24813708/24813708]



In [6]:
!tar -xzvf /external4/datasets/Mandarin/THCHS-30/test-noise.tgz

test-noise/
test-noise/noise/
test-noise/noise/car/
test-noise/noise/car/car.wav


test-noise/noise/cafe/
test-noise/noise/cafe/cafe.wav
test-noise/noise/white/
test-noise/noise/white/white.wav
test-noise/README
test-noise/utils/
test-noise/utils/add-noise-mod.py
test-noise/utils/produce-noisy-data.sh
test-noise/utils/util.pyx
test-noise/0db/
test-noise/0db/car/
test-noise/0db/car/D12_779.wav
test-noise/0db/car/D31_855.wav
test-noise/0db/car/D13_861.wav
test-noise/0db/car/D4_890.wav
test-noise/0db/car/D6_886.wav
test-noise/0db/car/D4_970.wav
test-noise/0db/car/D12_982.wav
test-noise/0db/car/D13_925.wav
test-noise/0db/car/D32_964.wav
test-noise/0db/car/D4_788.wav
test-noise/0db/car/D31_796.wav
test-noise/0db/car/D6_763.wav
test-noise/0db/car/D21_869.wav
test-noise/0db/car/D8_764.wav
test-noise/0db/car/D7_889.wav
test-noise/0db/car/D4_849.wav
test-noise/0db/car/D31_857.wav
test-noise/0db/car/D32_756.wav
test-noise/0db/car/D8_941.wav
test-noise/0db/car/D12_938.wav
test-noise/0db/car/D8_803.wav
test-noise/0db/car/D7_917.wav
test-noise/0db/car/D4_910.wav
test-noise/0db/ca

In [5]:

import sys, subprocess, importlib, os, glob, json, shutil
import soundfile as sf


# === User configuration ===
# Dataset root; every name in SPLITS is looked up as a sub-directory of it.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
ROOT_DIR          = "/external4/datasets/Mandarin/THCHS-30/data_thchs30"
TRANSCRIPT_SUBDIR = "data"           # where .wav.trn live (under ROOT_DIR)
# Splits to process, in this order; a split whose directory is missing is skipped.
SPLITS            = ["train", "dev", "test"]
# Destination for the per-split wavs_<split>/ copies and manifest_<split>.jsonl files.
OUT_DIR           = "/external4/datasets/Mandarin/THCHS-30/"



def gather_manifest(split_dir, wav_out_dir):
    """
    Copy one split's audio into ``wav_out_dir`` and build its manifest records.

    Every ``*.wav`` directly inside ``split_dir`` is handled in sorted path
    order:

    - the file is copied (with metadata) into ``wav_out_dir``;
    - its transcript is read from ``ROOT_DIR/TRANSCRIPT_SUBDIR/<stem>.wav.trn``
      and only the first line is kept;
    - its duration is measured on the copied file via ``soundfile``.

    A file is skipped, with a ``[!]`` message printed, when the copy fails,
    the transcript is missing, the transcript is empty / whitespace-only, or
    the audio header cannot be read. The copy happens before the other
    checks, so a skipped file may still be present in ``wav_out_dir``.

    Args:
        split_dir: Directory holding the split's ``.wav`` files.
        wav_out_dir: Directory to copy the audio into (created if absent).

    Returns:
        A list of dicts with keys ``audio_filepath`` (absolute path of the
        copy), ``text`` (first transcript line) and ``duration`` (seconds).
    """
    os.makedirs(wav_out_dir, exist_ok=True)
    records = []
    for wav in sorted(glob.glob(os.path.join(split_dir, '*.wav'))):
        base = os.path.basename(wav)
        dest = os.path.join(wav_out_dir, base)
        try:
            shutil.copy2(wav, dest)
        except Exception as e:
            print(f"  [!] failed to copy {wav} → {dest}: {e}")
            continue

        trn = os.path.join(ROOT_DIR, TRANSCRIPT_SUBDIR, os.path.splitext(base)[0] + '.wav.trn')
        if not os.path.isfile(trn):
            print(f"  [!] no transcript for {base}: {trn}")
            continue

        # read Mandarin line only
        with open(trn, 'r', encoding='utf-8') as f:
            lines = f.read().strip().splitlines()
        if not lines:
            # An empty transcript used to raise IndexError and abort the whole
            # split; treat it like any other unusable file instead.
            print(f"  [!] empty transcript for {base}: {trn}")
            continue
        text = lines[0]

        # duration, taken from the header of the copied file
        try:
            info = sf.info(dest)
            duration = info.frames / float(info.samplerate)
        except Exception as e:
            print(f"  [!] cannot read {dest}: {e}")
            continue

        records.append({
            'audio_filepath': os.path.abspath(dest),
            'text': text,
            'duration': duration
        })
    return records

def write_jsonl(records, path):
    """
    Write ``records`` to ``path`` as JSON Lines.

    Each record becomes one line holding its JSON encoding; non-ASCII
    characters are written as-is (``ensure_ascii=False``) in a UTF-8 file.
    An existing file at ``path`` is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(
            f"{json.dumps(record, ensure_ascii=False)}\n" for record in records
        )

def compute_stats(recs):
    """
    Summarise a list of manifest records.

    Returns a dict with the number of records (``utterances``), the summed
    ``duration`` values converted from seconds to hours (``total_hours``),
    and the mean duration in seconds (``avg_seconds``, 0 for an empty list).
    """
    count = len(recs)
    total_seconds = sum([rec['duration'] for rec in recs])
    if count:
        mean_seconds = total_seconds / count
    else:
        mean_seconds = 0
    return dict(
        utterances=count,
        total_hours=total_seconds / 3600,
        avg_seconds=mean_seconds,
    )

def main():
    """
    Build the wav copies and JSONL manifest for every configured split.

    For each name in ``SPLITS`` whose directory exists under ``ROOT_DIR``,
    the audio is gathered into ``OUT_DIR/wavs_<split>``, the records are
    written to ``OUT_DIR/manifest_<split>.jsonl``, and per-split statistics
    are printed. Splits whose directory is missing are reported and skipped.
    A summary table of the processed splits is printed at the end.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    stats_by_split = {}

    for name in SPLITS:
        source_dir = os.path.join(ROOT_DIR, name)
        if os.path.isdir(source_dir):
            print(f"\n>> Processing '{name}' …")
            wav_dir = os.path.join(OUT_DIR, f"wavs_{name}")
            manifest_path = os.path.join(OUT_DIR, f"manifest_{name}.jsonl")

            entries = gather_manifest(source_dir, wav_dir)
            write_jsonl(entries, manifest_path)

            split_stats = compute_stats(entries)
            stats_by_split[name] = split_stats

            print(f"   • Copied & indexed {split_stats['utterances']} files → {wav_dir}")
            print(f"   • Wrote manifest → {manifest_path}")
            print(f"   • Total = {split_stats['total_hours']:.2f} h, avg = {split_stats['avg_seconds']:.2f} s")
        else:
            print(f"[!] skip '{name}': {source_dir} missing")

    print("\n=== Summary ===")
    for name, split_stats in stats_by_split.items():
        print(f" {name:5} | utts: {split_stats['utterances']:5} | hrs: {split_stats['total_hours']:5.2f} | avg: {split_stats['avg_seconds']:4.2f}s")

# Entry point: run the full pipeline only when this code executes as the
# top-level module, not when it is imported.
if __name__ == '__main__':
    main()



>> Processing 'train' …
   • Copied & indexed 10000 files → /external4/datasets/Mandarin/THCHS-30/wavs_train
   • Wrote manifest → /external4/datasets/Mandarin/THCHS-30/manifest_train.jsonl
   • Total = 25.55 h, avg = 9.20 s

>> Processing 'dev' …
   • Copied & indexed 893 files → /external4/datasets/Mandarin/THCHS-30/wavs_dev
   • Wrote manifest → /external4/datasets/Mandarin/THCHS-30/manifest_dev.jsonl
   • Total = 2.30 h, avg = 9.28 s

>> Processing 'test' …
   • Copied & indexed 2495 files → /external4/datasets/Mandarin/THCHS-30/wavs_test
   • Wrote manifest → /external4/datasets/Mandarin/THCHS-30/manifest_test.jsonl
   • Total = 6.31 h, avg = 9.11 s

=== Summary ===
 train | utts: 10000 | hrs: 25.55 | avg: 9.20s
 dev   | utts:   893 | hrs:  2.30 | avg: 9.28s
 test  | utts:  2495 | hrs:  6.31 | avg: 9.11s
