In [None]:
import os

import torch
import torchaudio
from datasets import load_dataset

# Export every utterance of the Hub dataset to disk as a pair of files:
#   hindi_data/text/NNNNN.txt  - the stripped transcript (UTF-8)
#   hindi_data/wavs/NNNNN.wav  - the recording resampled to EXPORT_SR
# NNNNN is the zero-padded position of the sample in the "train" split.

# Sample rate (Hz) of the intermediate WAV files written by this cell.
EXPORT_SR = 22050

dataset = load_dataset("SPRINGLab/IndicTTS-Hindi", split="train")

os.makedirs("hindi_data/wavs", exist_ok=True)
os.makedirs("hindi_data/text", exist_ok=True)

for i, sample in enumerate(dataset):
    audio = sample["audio"]
    # NOTE(review): confirm the transcript column of this dataset is really
    # called "sentence"; a wrong key raises KeyError on the first sample.
    text = sample["sentence"]

    with open(f"hindi_data/text/{i:05d}.txt", "w", encoding="utf-8") as f:
        f.write(text.strip())

    src_path = audio["path"]
    if src_path and os.path.isfile(src_path):
        waveform, sr = torchaudio.load(src_path)
    else:
        # Datasets that embed the audio bytes expose no usable on-disk path
        # (it is None or a bare file name), so fall back to the samples the
        # datasets library has already decoded for us.
        waveform = torch.as_tensor(audio["array"], dtype=torch.float32)
        if waveform.ndim == 1:
            # torchaudio expects a (channels, frames) tensor.
            waveform = waveform.unsqueeze(0)
        sr = audio["sampling_rate"]

    waveform = torchaudio.functional.resample(waveform, sr, EXPORT_SR)
    torchaudio.save(f"hindi_data/wavs/{i:05d}.wav", waveform, EXPORT_SR)

In [None]:
import os
import re
import torchaudio
import pandas as pd
from tqdm import tqdm

# Inputs: the raw per-utterance files produced by the export step.
SRC_WAVS = "hindi_data/wavs"
SRC_TEXT = "hindi_data/text"

# Outputs: the normalised dataset tree (audio, transcripts, metadata file).
DST_ROOT = "hindi_dataset_normalised"
DST_WAVS = os.path.join(DST_ROOT, "wavs")
DST_TEXT = os.path.join(DST_ROOT, "text")

# Creating the leaf directories also creates DST_ROOT itself.
for out_dir in (DST_WAVS, DST_TEXT):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# ASCII digit -> Devanagari digit mapping, built once instead of on every call.
_ASCII_TO_DEVANAGARI_DIGITS = str.maketrans("0123456789", "०१२३४५६७८९")


def normalize_text(text):
    """Reduce a transcript to Devanagari text with minimal punctuation.

    Steps, in order:
      1. ASCII digits 0-9 are mapped to Devanagari digits.
      2. Every run of whitespace (tabs, newlines, no-break spaces, ...) becomes
         a single space.
      3. Every character outside the Devanagari block (U+0900-U+097F, which
         includes the danda and the Devanagari digits), ``,``, ``!``, ``?`` and
         the space is removed.
      4. Runs of spaces are collapsed and the ends are trimmed.

    Args:
        text: The raw transcript string.

    Returns:
        The normalised string; empty if nothing allowed remains.
    """
    text = text.translate(_ASCII_TO_DEVANAGARI_DIGITS)
    # Whitespace must become plain spaces *before* filtering: the filter only
    # keeps " ", so a tab or newline between two words would otherwise be
    # deleted and the words glued together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^ऀ-ॿ।,!?०-९ ]+", "", text)
    # Dropping characters can leave doubled spaces behind (e.g. "क - ख").
    text = re.sub(r" +", " ", text)
    return text.strip()

In [None]:
def normalize_audio(src_path, dst_path, target_sr=16000):
    """Convert one audio file to mono, resample it, peak-normalise it and save it.

    Args:
        src_path: Path of the audio file to read.
        dst_path: Path the processed audio is written to.
        target_sr: Sample rate in Hz of the written file (default 16000).

    Multi-channel input is down-mixed by averaging the channels. The signal is
    then scaled so that its largest absolute sample is 1.0. Empty or completely
    silent input is written unscaled, because dividing by a zero peak would
    fill the file with NaN.
    """
    waveform, sr = torchaudio.load(src_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    # .max() raises on a tensor with no elements, so check for that first.
    if waveform.numel() > 0:
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak
    torchaudio.save(dst_path, waveform, target_sr)

In [None]:
# Walk the transcripts in sorted order. For each one, write the cleaned text
# and the normalised 16 kHz recording into the destination tree, and remember
# one [relative wav path, text] row for the metadata file.
metadata_entries = []

for fname in tqdm(sorted(os.listdir(SRC_TEXT))):
    if fname.endswith(".txt"):
        stem = fname[:-4]  # file name without the ".txt" suffix
        wav_name = f"{stem}.wav"

        with open(os.path.join(SRC_TEXT, fname), "r", encoding="utf-8") as src:
            cleaned = normalize_text(src.read().strip())

        with open(os.path.join(DST_TEXT, fname), "w", encoding="utf-8") as dst:
            dst.write(cleaned)

        normalize_audio(
            os.path.join(SRC_WAVS, wav_name),
            os.path.join(DST_WAVS, wav_name),
            target_sr=16000,
        )

        metadata_entries.append([f"wavs/{stem}.wav", cleaned])

# One "path|text" line per utterance, without header or index column.
meta_df = pd.DataFrame(metadata_entries, columns=["path", "text"])
meta_path = os.path.join(DST_ROOT, "metadata.csv.final.cleaned")
meta_df.to_csv(meta_path, sep="|", header=False, index=False, encoding="utf-8")

print("Done Done Done")

Done Done Done
