In [15]:
import os
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, cer
import warnings

# Suppress every warning raised from here on (applies process-wide, not only
# to the functions defined below).
warnings.filterwarnings("ignore")

def load_grounds_truth(file_path):
    """Load reference transcriptions from a text file.

    Every non-blank line is either ``<audio path>|<transcription>`` (split on
    the first ``|`` only) or a bare transcription. Bare lines, and lines whose
    path part is empty, are attributed to the default file ``audio.flac``.

    Args:
        file_path: Path of the UTF-8 encoded ground-truth file.

    Returns:
        A dict mapping audio file path to its whitespace-stripped reference
        transcription. The dict is empty when ``file_path`` does not exist.
        When an audio path occurs more than once, the last line wins.
    """
    default_audio = "audio.flac"
    grounds_truths = {}

    if not os.path.exists(file_path):
        return grounds_truths

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark. str.strip() does not remove U+FEFF, so with plain "utf-8" a BOM
    # would end up inside the first audio path or the first transcription.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            name, separator, text = line.partition("|")
            if separator:
                audio_file = name.strip() or default_audio
                transcription = text
            else:
                audio_file = default_audio
                transcription = line

            grounds_truths[audio_file] = transcription.strip()

    return grounds_truths

def process_audio(file_path):
    """Load an audio file and prepare it as 16 kHz, mono, peak-normalised audio.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        The processed waveform tensor (channel dimension kept, so a
        multi-channel file comes back with a single leading channel), or
        ``None`` when loading or processing fails. The failure reason is
        printed so it is not lost.
    """
    try:
        waveform, sr = torchaudio.load(file_path)

        # Convert to mono by averaging the channels
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16kHz
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            waveform = resampler(waveform)

        # Peak-normalise. A fully silent clip has a peak of 0; dividing by it
        # would turn every sample into NaN, so such clips are left unscaled.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        return waveform
    except Exception as exc:
        # Best-effort: callers treat None as "skip this file". Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        print(f"Could not process {file_path}: {exc}")
        return None

def transcribe(processor, model, audio):
    """Transcribe one prepared waveform as Hindi text.

    Args:
        processor: Object used to build the model inputs, the decoder prompt
            ids and to decode the generated ids (a ``WhisperProcessor`` in
            this file).
        model: Object whose ``generate`` produces token ids (a
            ``WhisperForConditionalGeneration`` in this file).
        audio: Waveform tensor; it is squeezed and converted to a NumPy array
            and passed to the processor with a 16 kHz sampling rate.

    Returns:
        The first decoded transcription, stripped of surrounding whitespace.
    """
    samples = audio.squeeze().numpy()
    features = processor(samples, sampling_rate=16000, return_tensors="pt")

    # Pin the decoder prompt to Hindi transcription
    prompt_ids = processor.get_decoder_prompt_ids(language="hindi", task="transcribe")

    generation_options = {
        "forced_decoder_ids": prompt_ids,
        "max_new_tokens": 440,
        "num_beams": 5,
        "do_sample": False,
    }

    # Inference only: no gradients needed
    with torch.no_grad():
        token_ids = model.generate(features["input_features"], **generation_options)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()

def _rate_performance(average_wer):
    """Map an average WER to a coarse quality label (lower WER is better)."""
    if average_wer < 0.2:
        return "Good"
    if average_wer < 0.4:
        return "Fair"
    return "Poor"


def evaluate_asr(model_name="openai/whisper-large-v3",
                 grounds_truth_path="grounds_truth.txt"):
    """Transcribe every audio file in the ground-truth list and report WER/CER.

    Loads the model, reads the reference transcriptions, transcribes each
    audio file that exists and can be loaded, prints the per-file scores and
    finally the averages with a coarse quality label. Files that are missing,
    fail to load or fail to transcribe are reported and skipped.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the processor and the model.
        grounds_truth_path: Path of the ground-truth file read by
            ``load_grounds_truth``.

    Returns:
        None. All results are printed.
    """
    # Load model
    print("Loading Whisper model...")
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model.eval()

    # Load ground truth
    grounds_truths = load_grounds_truth(grounds_truth_path)
    if not grounds_truths:
        print("No grounds truth found!")
        return

    print(f"Processing {len(grounds_truths)} audio file(s)...")

    total_wer = 0
    total_cer = 0
    count = 0

    for audio_file, gt_text in grounds_truths.items():
        if not os.path.exists(audio_file):
            print(f"File not found: {audio_file}")
            continue

        audio = process_audio(audio_file)
        if audio is None:
            print(f"Failed to load: {audio_file}")
            continue

        try:
            predicted = transcribe(processor, model, audio)

            wer_score = wer(gt_text, predicted)
            cer_score = cer(gt_text, predicted)

            # Totals are only updated once both scores were computed, so a
            # failing file never contributes a partial result.
            total_wer += wer_score
            total_cer += cer_score
            count += 1

            print(f"\nFile: {audio_file}")
            print(f"Predicted: {predicted}")
            print(f"Actual: {gt_text}")
            print(f"WER: {wer_score:.3f} | CER: {cer_score:.3f}")

        except Exception as e:
            print(f"Error transcribing {audio_file}: {e}")

    # Results
    if count == 0:
        print("No files processed successfully")
        return

    # Unweighted mean of the per-file scores (every file counts equally,
    # regardless of its length).
    average_wer = total_wer / count
    average_cer = total_cer / count

    print("RESULTS")

    print(f"Average WER: {average_wer:.3f}")
    print(f"Average CER: {average_cer:.3f}")

    # Three distinct tiers; previously the two lower-WER tiers both printed
    # "Good", which made them indistinguishable.
    print(f"Performance: {_rate_performance(average_wer)}")

# Run the evaluation only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    evaluate_asr()

Loading Whisper model...
Processing 1 audio file(s)...

File: audio.flac
Predicted: हलो मेरा नाम मुहमद अथर है मुझे तीन दिन से खासी और जुगाम की शिकायत हो रही है मेरी बॉड़ी में भी पेल हो रहा है और गले में खराशें सी लग रही है साथी साथ सर भी भारी भारी रह रहा है तो क्या मुझे कोविड है या फिर कुछ और बीमारी होने का
Actual: हेलो मेरा नाम मोहम्मद अतर है मुझे तीन दिन से खांसी और ज़ुकाम की शिकायत हो रही है मेरी बॉडी में भी पेन हो रहा है और गले में खराशे सी लग रही हैं साथ ही साथ सर भी भारी भारी रह रहा है तो क्या मुझे कोविड है या फिर कुछ और बीमारी होने का
WER: 0.196 | CER: 0.061
RESULTS
Average WER: 0.196
Average CER: 0.061
Performance: Good
