In [1]:
import zipfile
import os

zip_file_path = '/content/hi_test_dataset.zip'
extraction_path = '/content/'

os.makedirs(extraction_path, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

print(f"'{zip_file_path}' extracted to '{extraction_path}'")
print(f"Contents of '{extraction_path}':")
print(os.listdir(extraction_path))

'/content/hi_test_dataset.zip' extracted to '/content/'
Contents of '/content/':
['.config', 'hi_test_dataset.zip', 'hi_test_dataset', 'sample_data']


In [2]:
import os

BASE_DIR = "hi_test_dataset"

print("Base dir exists:", os.path.exists(BASE_DIR))
print("Base dir contents:", os.listdir(BASE_DIR))

SUB_DIR = os.path.join(BASE_DIR, "audio+transcripts")

print("\nSub dir exists:", os.path.exists(SUB_DIR))
print("Sub dir contents (first 15):", os.listdir(SUB_DIR)[:15])


Base dir exists: True
Base dir contents: ['audio+transcripts']

Sub dir exists: True
Sub dir contents (first 15): ['common_voice_hi_27762774.mp3', 'common_voice_hi_26044297.mp3', 'common_voice_hi_24258388.mp3', 'common_voice_hi_26018159.mp3', 'common_voice_hi_27408369.mp3', 'common_voice_hi_27372138.mp3', 'common_voice_hi_27371837.mp3', 'common_voice_hi_24360601.mp3', 'common_voice_hi_27407935.mp3', 'common_voice_hi_32249437.mp3', 'common_voice_hi_26988410.mp3', 'common_voice_hi_26120121.mp3', 'common_voice_hi_26120192.mp3', 'common_voice_hi_24829334.mp3', 'common_voice_hi_27565446.mp3']


In [4]:
pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.9.1


In [5]:
import os
import torchaudio

DATA_DIR = "hi_test_dataset/audio+transcripts"

# pick one mp3 file
mp3_files = [f for f in os.listdir(DATA_DIR) if f.lower().endswith(".mp3")]
test_file = mp3_files[0]

test_path = os.path.join(DATA_DIR, test_file)
print("Testing file:", test_path)

waveform, sr = torchaudio.load(test_path)

print("Loaded successfully")
print("Waveform shape:", waveform.shape)
print("Sample rate:", sr)


Testing file: hi_test_dataset/audio+transcripts/common_voice_hi_27762774.mp3
Loaded successfully
Waveform shape: torch.Size([1, 150912])
Sample rate: 32000


In [6]:
import os
import torchaudio

DATA_DIR = "hi_test_dataset/audio+transcripts"
OUT_DIR = "/content/processed_wav"

os.makedirs(OUT_DIR, exist_ok=True)

test_file = "common_voice_hi_24258388.mp3"
in_path = os.path.join(DATA_DIR, test_file)
out_path = os.path.join(OUT_DIR, "TEST_RESAMPLED.wav")

# load
waveform, sr = torchaudio.load(in_path)

# mono (safety)
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# resample to 16k
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)

# save
torchaudio.save(out_path, waveform, 16000)

print("Saved file exists:", os.path.exists(out_path))
print("Saved at:", out_path)


Saved file exists: True
Saved at: /content/processed_wav/TEST_RESAMPLED.wav


In [8]:
pip install silero-vad

Collecting silero-vad
  Downloading silero_vad-6.2.0-py3-none-any.whl.metadata (9.2 kB)
Collecting onnxruntime>=1.16.1 (from silero-vad)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime>=1.16.1->silero-vad)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.16.1->silero-vad)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading silero_vad-6.2.0-py3-none-any.whl (6.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)


In [9]:
import torch
from silero_vad import load_silero_vad, get_speech_timestamps

# load model
vad_model = load_silero_vad()

# input paths
DATA_DIR = "/content/hi_test_dataset/audio+transcripts"
OUT_DIR = "/content/processed_wav"

test_file = "common_voice_hi_24258388.mp3"
in_path = os.path.join(DATA_DIR, test_file)
out_path = os.path.join(OUT_DIR, "TEST_VAD.wav")

# load & resample
waveform, sr = torchaudio.load(in_path)

if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000

# VERY GENTLE VAD (for short Common Voice clips)
speech_ts = get_speech_timestamps(
    waveform,
    vad_model,
    sampling_rate=16000,
    threshold=0.2,
    min_speech_duration_ms=50,
    min_silence_duration_ms=300
)

print("VAD segments:", speech_ts)

if len(speech_ts) == 0:
    print("No speech detected by VAD")
else:
    speech_audio = torch.cat(
        [waveform[:, s["start"]:s["end"]] for s in speech_ts],
        dim=1
    )

    torchaudio.save(out_path, speech_audio, 16000)
    print("Saved VAD file exists:", os.path.exists(out_path))


VAD segments: [{'start': 7712, 'end': 76768}]
Saved VAD file exists: True


In [10]:
import os
import torch
import torchaudio
from silero_vad import get_speech_timestamps

DATA_DIR = "hi_test_dataset/audio+transcripts"
OUT_DIR = "/content/processed_wav"

os.makedirs(OUT_DIR, exist_ok=True)

mp3_files = [f for f in os.listdir(DATA_DIR) if f.lower().endswith(".mp3")]

print("Total mp3 files:", len(mp3_files))

saved_count = 0
skipped_count = 0

for fname in mp3_files:
    in_path = os.path.join(DATA_DIR, fname)
    out_name = fname.replace(".mp3", ".wav")
    out_path = os.path.join(OUT_DIR, out_name)

    # load
    waveform, sr = torchaudio.load(in_path)

    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
        sr = 16000

    # gentle VAD
    speech_ts = get_speech_timestamps(
        waveform,
        vad_model,
        sampling_rate=16000,
        threshold=0.2,
        min_speech_duration_ms=50,
        min_silence_duration_ms=300
    )

    if len(speech_ts) == 0:
        skipped_count += 1
        print("Skipped (no speech):", fname)
        continue

    speech_audio = torch.cat(
        [waveform[:, s["start"]:s["end"]] for s in speech_ts],
        dim=1
    )

    torchaudio.save(out_path, speech_audio, 16000)
    saved_count += 1
    print("Saved:", out_name)

print("\nSummary:")
print("Saved files:", saved_count)
print("Skipped files:", skipped_count)


Total mp3 files: 100
Saved: common_voice_hi_27762774.wav
Saved: common_voice_hi_26044297.wav
Saved: common_voice_hi_24258388.wav
Saved: common_voice_hi_26018159.wav
Saved: common_voice_hi_27408369.wav
Saved: common_voice_hi_27372138.wav
Saved: common_voice_hi_27371837.wav
Saved: common_voice_hi_24360601.wav
Saved: common_voice_hi_27407935.wav
Saved: common_voice_hi_32249437.wav
Saved: common_voice_hi_26988410.wav
Saved: common_voice_hi_26120121.wav
Saved: common_voice_hi_26120192.wav
Saved: common_voice_hi_24829334.wav
Saved: common_voice_hi_27565446.wav
Saved: common_voice_hi_26044000.wav
Saved: common_voice_hi_24225355.wav
Saved: common_voice_hi_24359434.wav
Saved: common_voice_hi_25215256.wav
Saved: common_voice_hi_26326389.wav
Saved: common_voice_hi_26955587.wav
Saved: common_voice_hi_26040841.wav
Saved: common_voice_hi_26203988.wav
Saved: common_voice_hi_26202875.wav
Saved: common_voice_hi_25288730.wav
Saved: common_voice_hi_25204281.wav
Saved: common_voice_hi_24974881.wav
Saved: 

In [None]:
'''

 All 100 .mp3 files were found
 All were loaded correctly
 Resampling to 16 kHz worked
 Gentle Silero VAD detected speech correctly
 Silence was removed
 Processed files were saved
 Output folder is correct

'''

In [12]:
pip install noisereduce

Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.3


In [13]:
import os
import torch
import torchaudio
import noisereduce as nr

# paths
IN_DIR = "/content/processed_wav"
OUT_DIR = "/content/processed_wav_clean"

os.makedirs(OUT_DIR, exist_ok=True)

wav_files = [f for f in os.listdir(IN_DIR) if f.endswith(".wav")]

print("Total WAV files:", len(wav_files))

saved_count = 0

for fname in wav_files:
    in_path = os.path.join(IN_DIR, fname)
    out_path = os.path.join(OUT_DIR, fname)

    # load
    waveform, sr = torchaudio.load(in_path)

    # --- Noise Reduction (light) ---
    audio_np = waveform.numpy().squeeze()
    clean_np = nr.reduce_noise(
        y=audio_np,
        sr=sr,
        prop_decrease=0.7   # light, safe
    )

    clean_wave = torch.tensor(clean_np).unsqueeze(0)

    # --- Loudness Normalization ---
    max_val = clean_wave.abs().max()
    if max_val > 0:
        clean_wave = clean_wave / max_val * 0.95  # prevent clipping

    # save
    torchaudio.save(out_path, clean_wave, sr)
    saved_count += 1

    if saved_count <= 3:
        print("Saved:", fname)

print("\nSummary:")
print("Cleaned files saved:", saved_count)


Total WAV files: 102
Saved: common_voice_hi_26018159.wav
Saved: common_voice_hi_24969673.wav
Saved: common_voice_hi_23809919.wav

Summary:
Cleaned files saved: 102


In [23]:
# 1. Remove wrong whisper package
!pip uninstall -y whisper

# 2. Install correct OpenAI Whisper
!pip install -U openai-whisper

# 3. Restart import cache (important)
import importlib, sys
sys.modules.pop("whisper", None)

# 4. Test correct import
import whisper
print("Whisper loaded from:", whisper.__file__)

# 5. Test model loading
model = whisper.load_model("small")
print("Whisper model loaded successfully")


Whisper loaded from: /usr/local/lib/python3.12/dist-packages/whisper/__init__.py


100%|███████████████████████████████████████| 461M/461M [00:07<00:00, 61.8MiB/s]


Whisper model loaded successfully


In [24]:
import os
import whisper

# paths
IN_DIR = "/content/processed_wav_clean"
OUT_TXT ="/content/text/predicted_transcripts.txt"

# load whisper model
model = whisper.load_model("small")  # stable for Hindi

wav_files = sorted([f for f in os.listdir(IN_DIR) if f.endswith(".wav")])

print("Total files to transcribe:", len(wav_files))

with open(OUT_TXT, "w", encoding="utf-8") as out_f:
    for i, fname in enumerate(wav_files, 1):
        wav_path = os.path.join(IN_DIR, fname)

        result = model.transcribe(
            wav_path,
            language="hi",
            task="transcribe",
            temperature=0.0,
            beam_size=5
        )

        text = result["text"].strip()

        out_f.write(f"{fname} | {text}\n")

        if i <= 3:
            print(f"Sample {i}: {fname} | {text}")

print("\nSaved transcripts to:", OUT_TXT)


Total files to transcribe: 102
Sample 1: TEST_RESAMPLED.wav | करीना के लाडले बेटे तैमूर को खाने में पसंद है ये खास जीज
Sample 2: TEST_VAD.wav | करीना के लाडले बेटे तैमूर को खाने में पसंद है ये खास जीज
Sample 3: common_voice_hi_23795243.wav | यह साई सब से अची हैं

Saved transcripts to: /content/text/predicted_transcripts.txt


In [26]:
pip install jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [27]:
from jiwer import wer
import re

# ---- PATHS ----
GT_TXT = "/content/hi_test_dataset/audio+transcripts/transcripts.txt"
PRED_TXT = "/content/text/predicted_transcripts.txt"

# ---- HINDI NORMALIZATION ----
def normalize_hi(text):
    text = text.lower()
    text = re.sub(r"[।?,!\"']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ---- LOAD TRANSCRIPTS ----
def load_txt(path):
    data = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if "|" not in line:
                continue
            fname, text = line.strip().split("|", 1)
            data[fname.strip()] = text.strip()
    return data

gt = load_txt(GT_TXT)
pred = load_txt(PRED_TXT)

# ---- WER CALCULATION ----
wers = []

for fname in gt:
    wav_name = fname.replace(".mp3", ".wav")  # important mapping

    if wav_name not in pred:
        continue

    ref = normalize_hi(gt[fname])
    hyp = normalize_hi(pred[wav_name])

    score = wer(ref, hyp)
    wers.append(score)

# ---- FINAL RESULT ----
avg_wer = sum(wers) / len(wers)

print("Total files evaluated:", len(wers))
print("Average WER:", round(avg_wer, 3))


Total files evaluated: 100
Average WER: 0.587


In [29]:
from jiwer import cer
import re

# ---- PATHS ----
GT_TXT = "/content/hi_test_dataset/audio+transcripts/transcripts.txt"
PRED_TXT = "/content/text/predicted_transcripts.txt"

# ---- HINDI NORMALIZATION (light, safe) ----
def normalize_hi(text):
    text = text.lower()
    text = re.sub(r"[।?,!\"']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ---- LOAD TEXT FILES ----
def load_txt(path):
    data = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if "|" not in line:
                continue
            fname, text = line.strip().split("|", 1)
            data[fname.strip()] = text.strip()
    return data

gt = load_txt(GT_TXT)
pred = load_txt(PRED_TXT)

# ---- CER CALCULATION ----
cers = []

for fname in gt:
    wav_name = fname.replace(".mp3", ".wav")  # mapping

    if wav_name not in pred:
        continue

    ref = normalize_hi(gt[fname])
    hyp = normalize_hi(pred[wav_name])

    score = cer(ref, hyp)
    cers.append(score)

# ---- FINAL RESULT ----
avg_cer = sum(cers) / len(cers)

print("Total files evaluated:", len(cers))
print("Average CER:", round(avg_cer, 3))


Total files evaluated: 100
Average CER: 0.317


In [30]:
import re

# ---- PATHS ----
GT_TXT = "/content/hi_test_dataset/audio+transcripts/transcripts.txt"
PRED_TXT = "/content/text/predicted_transcripts.txt"

# ---- HINDI NORMALIZATION ----
def normalize_hi(text):
    text = text.lower()
    text = re.sub(r"[।?,!\"']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ---- LOAD FILES ----
def load_txt(path):
    data = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if "|" not in line:
                continue
            fname, text = line.strip().split("|", 1)
            data[fname.strip()] = text.strip()
    return data

gt = load_txt(GT_TXT)
pred = load_txt(PRED_TXT)

# ---- SER CALCULATION ----
total = 0
wrong = 0

for fname in gt:
    wav_name = fname.replace(".mp3", ".wav")

    if wav_name not in pred:
        continue

    ref = normalize_hi(gt[fname])
    hyp = normalize_hi(pred[wav_name])

    total += 1
    if ref != hyp:
        wrong += 1

ser = wrong / total

print("Total sentences evaluated:", total)
print("Sentence Error Rate (SER):", round(ser, 3))


Total sentences evaluated: 100
Sentence Error Rate (SER): 0.97
