# **Data Cleaning and Preprocessing for Hindi/English ASR (LibriSpeech ASR / IndicSpeech)**

### **Imports**

In [1]:
import os
import re
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### **Loading the Dataset**

***Creating Output Directories***

In [2]:
# Create the folders that will hold the resampled audio; existing folders are left as they are.
for audio_dir in ("../data/clean/asr/english_audio", "../data/clean/asr/hindi_audio"):
    os.makedirs(audio_dir, exist_ok=True)

***Loading datasets***

In [5]:
# English
# keep_in_memory=True asks `datasets` to hold the split in memory (intended to speed up access).
dataset_en = load_dataset(
    path="librispeech_asr",
    name="clean",
    split="train.100[:1%]",
    keep_in_memory=True,
)

# Tabular view with only the source file path and transcript; "file" is exposed as "audio_path".
df_en = dataset_en.to_pandas()[["file", "text"]].rename(columns={"file": "audio_path"})

print("✅ Dataset loaded successfully with raw file paths")
print(df_en.head())

✅ Dataset loaded successfully with raw file paths
                                          audio_path  \
0  /home/albert/.cache/huggingface/datasets/downl...   
1  /home/albert/.cache/huggingface/datasets/downl...   
2  /home/albert/.cache/huggingface/datasets/downl...   
3  /home/albert/.cache/huggingface/datasets/downl...   
4  /home/albert/.cache/huggingface/datasets/downl...   

                                                text  
0  CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE B...  
1  MARGUERITE TO BE UNABLE TO LIVE APART FROM ME ...  
2  I WISHED ABOVE ALL NOT TO LEAVE MYSELF TIME TO...  
3  ASSUMED ALL AT ONCE AN APPEARANCE OF NOISE AND...  
4  NOTHING IS SO EXPENSIVE AS THEIR CAPRICES FLOW...  


In [6]:
# Hindi
# NOTE(review): the output recorded under this cell is a DatasetNotFoundError for this
# repo id -- confirm the dataset name / access rights before relying on the Hindi cells below.
dataset_hi = load_dataset(
    path="AI4Bharat/IndicSpeech",
    name="hi",
    split="train[:1%]",
)
print(dataset_hi)
print("Sample:", dataset_hi[0])

DatasetNotFoundError: Dataset 'AI4Bharat/IndicSpeech' doesn't exist on the Hub or cannot be accessed.

### **Data Cleaning**

In [None]:
# English
def clean_english_text(text):
    """Normalize an English transcript.

    The text is lower-cased, every character other than ``a-z``, whitespace
    and the apostrophe is replaced by a space, and runs of whitespace are
    collapsed to a single space with the ends trimmed.

    Args:
        text: Raw transcript.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        letters_only = re.sub(r"[^a-z\s']", " ", text.lower())
        return re.sub(r"\s+", " ", letters_only).strip()
    return ""

# Add a "clean_text" column holding the normalized form of each row's "text".
def _with_clean_english_text(example):
    return {"clean_text": clean_english_text(example["text"])}

dataset_en = dataset_en.map(_with_clean_english_text)
print("Cleaned sample:", dataset_en[0]["clean_text"])


In [None]:
# Hindi
import unicodedata

def clean_hindi_text(text):
    """Normalize a Hindi (Devanagari, possibly code-mixed) transcript.

    Steps, in order:
      1. Unicode NFC normalization.
      2. Zero-width joiner / non-joiner characters are deleted.
      3. Everything except Devanagari characters, ASCII letters and whitespace
         is replaced by a space. The danda and double danda (U+0964, U+0965)
         are punctuation and are replaced as well, although they sit inside
         the Devanagari block.
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: Raw transcript.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFC", text)
    # ZWNJ/ZWJ occur *inside* words; delete them so the filter below does not
    # turn them into a space and split the word in two.
    text = re.sub(r"[\u200c\u200d]", "", text)
    # The kept Devanagari range skips U+0964-U+0965 (danda / double danda) so
    # sentence punctuation is stripped like any other punctuation mark.
    text = re.sub(r"[^\u0900-\u0963\u0966-\u097Fa-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a "clean_text" column holding the normalized form of each row's "sentence".
def _with_clean_hindi_text(example):
    return {"clean_text": clean_hindi_text(example["sentence"])}

dataset_hi = dataset_hi.map(_with_clean_hindi_text)
print("Cleaned Hindi sample:", dataset_hi[0]["clean_text"])


### **Resample and Save Audio**

In [None]:
# English
def process_audio_en(example, out_dir="../data/clean/asr/english_audio"):
    """Resample one example's audio to 16 kHz and save it as a WAV file.

    Args:
        example: Dataset row; ``example["audio"]["path"]`` is read as the
            source audio file path.
        out_dir: Folder that receives the resampled file. The default is the
            English audio folder created at the top of the notebook.

    Returns:
        The same ``example`` with ``"resampled_path"`` set to the path of the
        file that was written.
    """
    target_sr = 16000
    src_path = example["audio"]["path"]
    # Passing ``sr`` makes librosa resample to that rate while loading.
    audio_array, _ = librosa.load(src_path, sr=target_sr)
    # Make sure the destination exists even if the setup cell was skipped.
    os.makedirs(out_dir, exist_ok=True)
    # Swap only the file extension instead of substituting ".flac" anywhere in the name.
    stem, _ext = os.path.splitext(os.path.basename(src_path))
    new_path = os.path.join(out_dir, stem + ".wav")
    sf.write(new_path, audio_array, target_sr)
    example["resampled_path"] = new_path
    return example

print("Resampling English audio files...")
dataset_en = dataset_en.map(process_audio_en)

# Pull just the two columns needed; this avoids materializing full rows
# (audio column included) merely to read two string fields.
df_en = pd.DataFrame({
    "audio_path": list(dataset_en["resampled_path"]),
    "text": list(dataset_en["clean_text"]),
})
# Written under ../data so it sits beside the audio folders created earlier.
english_csv_path = "../data/clean/asr/english_asr_clean.csv"
df_en.to_csv(english_csv_path, index=False)
print(f"Saved English ASR dataset to {english_csv_path}")


In [None]:
# Hindi
def process_audio_hi(example, out_dir="../data/clean/asr/hindi_audio"):
    """Resample one example's audio to 16 kHz and save it as ``<name>_16k.wav``.

    Args:
        example: Dataset row; ``example["audio"]["path"]`` is read as the
            source audio file path.
        out_dir: Folder that receives the resampled file. The default is the
            Hindi audio folder created at the top of the notebook.

    Returns:
        The same ``example`` with ``"resampled_path"`` set to the path of the
        file that was written.
    """
    target_sr = 16000
    src_path = example["audio"]["path"]
    # Passing ``sr`` makes librosa resample to that rate while loading.
    audio_array, _ = librosa.load(src_path, sr=target_sr)
    # Make sure the destination exists even if the setup cell was skipped.
    os.makedirs(out_dir, exist_ok=True)
    # Build the name from the stem so the output always ends in "_16k.wav",
    # whatever the source extension is.
    stem, _ext = os.path.splitext(os.path.basename(src_path))
    new_path = os.path.join(out_dir, stem + "_16k.wav")
    sf.write(new_path, audio_array, target_sr)
    example["resampled_path"] = new_path
    return example

print("Resampling Hindi audio files...")
dataset_hi = dataset_hi.map(process_audio_hi)

# Pull just the two columns needed; this avoids materializing full rows
# (audio column included) merely to read two string fields.
df_hi = pd.DataFrame({
    "audio_path": list(dataset_hi["resampled_path"]),
    "text": list(dataset_hi["clean_text"]),
})
# Written under ../data so it sits beside the audio folders created earlier.
hindi_csv_path = "../data/clean/asr/hindi_asr_clean.csv"
df_hi.to_csv(hindi_csv_path, index=False)
print(f"Saved Hindi ASR dataset to {hindi_csv_path}")

### **Summary**

In [None]:
# Row counts of the two exported tables.
print("English ASR samples:", df_en.shape[0])
print("Hindi ASR samples:", df_hi.shape[0])

# One randomly drawn row from each table as a quick sanity check.
print("\nSample English row:")
print(df_en.sample(n=1))

print("\nSample Hindi row:")
print(df_hi.sample(n=1))