In [1]:
import os
import zipfile

zip_path = 'train_split.zip'
extract_dir = ''
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(extract_dir)
    print(f"Extracted {zip_path} to {extract_dir}/")
else:
    print(f"Dataset directory already exists: {extract_dir}/")

Extracted train_split.zip to /


In [2]:
pip install openai-whisper pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m800.5/800.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting numba (from openai-whisper)
  Downloading numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba->openai-whisper)
  Downloading llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 

In [7]:
!pip install librosa soundfile

Defaulting to user installation because normal site-packages is not writeable
Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

# Whisper transcription (no-ffmpeg, base model)
    - What it does:
        - Loads whisper (model "base").
        - Reads Kaldi files from train_split/transcripts/{wav.scp, segments, text}.
        - Loads audio via librosa, slices per segments, calls model.transcribe(y_seg, language="hi", fp16=False).
        -Saves CSV ‚Üí outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv with columns: utt_id, reference, whisper_output.

In [2]:
import librosa
import whisper
import pandas as pd
from pathlib import Path

# Load Whisper model
model = whisper.load_model("base")  # or 'medium', 'large'

# Paths
base_path = Path("train_split/transcripts")
output_dir = Path("outputs_noffmpeg")
output_dir.mkdir(exist_ok=True)

# Read mapping files
with open(base_path / "wav.scp") as f:
    wav_scp = dict(line.strip().split() for line in f)

with open(base_path / "text") as f:
    ref_text = {line.split()[0]: " ".join(line.strip().split()[1:]) for line in f}

# Read segments
segments = []
with open(base_path / "segments") as f:
    for line in f:
        utt_id, wav_id, start, end = line.strip().split()
        segments.append({
            "utt_id": utt_id,
            "wav_path": str(base_path / wav_scp[wav_id]),
            "start": float(start),
            "end": float(end)
        })

# Run transcription directly on NumPy audio
results = []
for seg in segments:
    # Load full audio and extract segment
    y, sr = librosa.load(seg["wav_path"], sr=16000)
    start_sample = int(seg["start"] * sr)
    end_sample = int(seg["end"] * sr)
    y_seg = y[start_sample:end_sample]

    # Pass raw audio array to Whisper directly
    result = model.transcribe(y_seg, language="hi", fp16=False)

    results.append({
        "utt_id": seg["utt_id"],
        "reference": ref_text.get(seg["utt_id"], ""),
        "whisper_output": result["text"].strip()
    })

# Save to CSV
df = pd.DataFrame(results)
df.to_csv(output_dir / "whisper_transcriptions_noffmpeg.csv", index=False)
print("‚úÖ Transcription done. File saved at:", output_dir / "whisper_transcriptions_noffmpeg.csv")


  checkpoint = torch.load(fp, map_location=device)


‚úÖ Transcription done. File saved at: outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv


In [3]:
!pip install transformers accelerate

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


# LLM refinement v1 (Qwen 1.5-1.8B) on Whisper CSV
    - What it does:

        - Loads outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv.
        - Creates a prompt: ‚Äúpreserve Hindi‚ÄìEnglish code-switching; don‚Äôt translate; keep Hindi in Devanagari; only light fixes.‚Äù
        - Uses pipeline("text-generation", model="Qwen/Qwen1.5-1.8B", device_map="auto", torch_dtype=float16); refines each whisper_output.
        - Saves ‚Üí outputs_noffmpeg/llm_refined_transcripts.csv.

In [2]:
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# ---------------- CONFIG ----------------
# Path to the output of your Whisper transcription
csv_path = Path("outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv")
output_path = Path("outputs_noffmpeg/llm_refined_transcripts.csv")

# HuggingFace LLM model (you can swap for Mixtral, DeepSeek, etc.)
model_name = "Qwen/Qwen1.5-1.8B"
# ----------------------------------------

# ‚úÖ Load Whisper transcription output
df = pd.read_csv(csv_path)

# ‚úÖ Load LLM with GPU acceleration
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",  # Automatically maps to available GPU
    torch_dtype=torch.float16  # Use float32 if float16 is unsupported
)

# ‚úÖ Define LLM pipeline
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",   # Explicitly ensures GPU usage
    max_new_tokens=100
)

# ‚úÖ Function to prompt and refine text
def refine_text(raw):
    prompt = f"""
Refine the following Hindi-English code-switched sentence. Retain code-switching points and improve fluency and grammar.

Original: {raw}
Refined:"""
    response = llm(prompt, num_return_sequences=1)[0]["generated_text"]
    return response.split("Refined:")[-1].strip()

# ‚úÖ Apply LLM to all rows
df["llm_refined"] = df["whisper_output"].apply(refine_text)

# ‚úÖ Save final result
df.to_csv(output_path, index=False)
print(f"‚úÖ LLM refinement completed and saved to: {output_path}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


‚úÖ LLM refinement completed and saved to: outputs_noffmpeg/llm_refined_transcripts.csv


In [3]:
!pip install jiwer

Defaulting to user installation because normal site-packages is not writeable
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m102.2/102.2 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.1/3.1 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0

# Evaluate (WER/CER) for v1

In [2]:
import pandas as pd
from jiwer import wer, cer

# Load CSV
df = pd.read_csv("outputs_noffmpeg/llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Compute row-wise WER & CER
wer_whisper_scores = []
wer_llm_scores = []
cer_whisper_scores = []
cer_llm_scores = []

for _, row in df.iterrows():
    wer_whisper_scores.append(wer(row["reference"], row["whisper_output"]))
    wer_llm_scores.append(wer(row["reference"], row["llm_refined"]))
    cer_whisper_scores.append(cer(row["reference"], row["whisper_output"]))
    cer_llm_scores.append(cer(row["reference"], row["llm_refined"]))

# Prepare results table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [
        sum(wer_whisper_scores)/len(wer_whisper_scores)*100,
        sum(wer_llm_scores)/len(wer_llm_scores)*100
    ],
    "CER (%)": [
        sum(cer_whisper_scores)/len(cer_whisper_scores)*100,
        sum(cer_llm_scores)/len(cer_llm_scores)*100
    ]
})

# ‚úÖ Print the comparison result
print("\nüìä Evaluation Results:\n")
print(results.to_string(index=False))



üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output 104.828494  90.138387
   LLM Refined 469.010523 636.349431


In [3]:
# View random samples
sample_df = df[["utt_id", "reference", "whisper_output", "llm_refined"]].sample(5)
print(sample_df.to_string(index=False))


                      utt_id                                                                        reference                                                                          whisper_output                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      llm_refined
210672_0jdhCdy6wPFMjRXl_0102                                   ‡§Ø‡§π‡§æ‡§Å ‡§µ‡§ø‡§≠‡§ø‡§®‡•ç‡§® tools ‡§π‡•à‡§Ç ‡§ú‡•ã ‡§π‡§Æ ‡§â‡§™‡§Ø‡•ã‡§ó ‡§ï‡§∞ ‡§∏‡§ï‡§§‡•á ‡§π‡•à‡§Ç                                                ‡§Å‡§µ‡§ï‡•á ‡§µ‡§ø‡§∞ ‡§¨‡•Ç ‡§µ‡§∏ ‡§µ‡§æ‡§∞ ‡§≠‡§æ‡§≠‡•Ä‡§§ ‡§Ü‡§ø‡§è ‡§ï‡§∞ ‡§∏‡

# LLM refinement v1b (Qwen, slightly different prompt text)
    What it does:
        - Reloads Whisper CSV, drops NaNs.
        - Similar ‚Äúpreserve code-switching‚Äù prompt; generates ‚ÄúImproved:‚Äù style output and strips the marker.
        - Saves ‚Üí outputs_noffmpeg/llm_refined_transcripts.csv.   - 

In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# ----------------- CONFIG -----------------
csv_path = "outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv"
output_path = "outputs_noffmpeg/llm_refined_transcripts.csv"
model_name = "Qwen/Qwen1.5-1.8B"  # You can change this to other HF LLMs
# ------------------------------------------

# ‚úÖ Load CSV
df = pd.read_csv(csv_path)
df = df.dropna(subset=["whisper_output"])  # make sure no NaN

# ‚úÖ Load HuggingFace LLM with GPU support
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16
)

llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=100
)

# ‚úÖ Controlled Prompt Function (no hallucinations)
def refine_text(raw):
    prompt = f"""Refine this Hindi-English sentence. Improve grammar and fluency while preserving code-switching points.

Sentence: {raw}
Improved:"""
    output = llm(prompt, num_return_sequences=1)[0]["generated_text"]
    return output.split("Improved:")[-1].strip() if "Improved:" in output else output.strip()

# ‚úÖ Apply LLM refinement
df["llm_refined"] = df["whisper_output"].apply(refine_text)

# ‚úÖ Save refined file
df.to_csv(output_path, index=False)
print(f"‚úÖ Saved refined output to: {output_path}")


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


‚úÖ Saved refined output to: outputs_noffmpeg/llm_refined_transcripts.csv


# Evaluate (WER/CER) for v1b

In [8]:
from jiwer import wer, cer

# Load output
df = pd.read_csv(output_path)
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Optional: Filter clearly broken whisper outputs
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Evaluate
wer_whisper_scores = []
wer_llm_scores = []
cer_whisper_scores = []
cer_llm_scores = []

for _, row in df.iterrows():
    wer_whisper_scores.append(wer(row["reference"], row["whisper_output"]))
    wer_llm_scores.append(wer(row["reference"], row["llm_refined"]))
    cer_whisper_scores.append(cer(row["reference"], row["whisper_output"]))
    cer_llm_scores.append(cer(row["reference"], row["llm_refined"]))

# Results Table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [
        sum(wer_whisper_scores)/len(wer_whisper_scores)*100,
        sum(wer_llm_scores)/len(wer_llm_scores)*100
    ],
    "CER (%)": [
        sum(cer_whisper_scores)/len(cer_whisper_scores)*100,
        sum(cer_llm_scores)/len(cer_llm_scores)*100
    ]
})

# Display
print("\nüìä Evaluation Results:\n")
print(results.to_string(index=False))



üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output 105.408426  84.201110
   LLM Refined 193.219162 178.189485


# ‚ÄúClean‚Äù Qwen refinement (manual generate + decoding)
    What it does:
        - Loads Qwen tokenizer/model directly (AutoModelForCausalLM), filters short utterances.
        - Builds the same style prompt, uses .generate() and careful decoding to strip the prompt.
        - Saves ‚Üí outputs_noffmpeg/llm_refined_transcripts.csv

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# Load data
df = pd.read_csv("outputs_noffmpeg/whisper_transcriptions_noffmpeg.csv")
df = df.dropna(subset=["whisper_output"])
df = df[df["whisper_output"].str.count(r"\w+") > 3]  # Filter low content

# Load model
model_name = "Qwen/Qwen1.5-1.8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# Strict refinement (no hallucination)
def refine_text(prompt_input):
    prompt = f"""Refine this Hindi-English sentence. Keep code-switching unchanged.
                Sentence: {prompt_input}
                Refined:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Extract just the part after "Refined:"
    if "Refined:" in decoded:
        return decoded.split("Refined:")[-1].strip()
    else:
        return decoded.replace(prompt, "").strip()

# Apply clean refinement
df["llm_refined"] = df["whisper_output"].apply(refine_text)

# Save output
df.to_csv("outputs_noffmpeg/llm_refined_transcripts.csv", index=False)
print("‚úÖ Clean LLM refinement done and saved.")


‚úÖ Clean LLM refinement done and saved.


# Evaluate (WER/CER)

In [10]:
from jiwer import wer, cer

# Load output
df = pd.read_csv(output_path)
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Optional: Filter clearly broken whisper outputs
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Evaluate
wer_whisper_scores = []
wer_llm_scores = []
cer_whisper_scores = []
cer_llm_scores = []

for _, row in df.iterrows():
    wer_whisper_scores.append(wer(row["reference"], row["whisper_output"]))
    wer_llm_scores.append(wer(row["reference"], row["llm_refined"]))
    cer_whisper_scores.append(cer(row["reference"], row["whisper_output"]))
    cer_llm_scores.append(cer(row["reference"], row["llm_refined"]))

# Results Table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [
        sum(wer_whisper_scores)/len(wer_whisper_scores)*100,
        sum(wer_llm_scores)/len(wer_llm_scores)*100
    ],
    "CER (%)": [
        sum(cer_whisper_scores)/len(cer_whisper_scores)*100,
        sum(cer_llm_scores)/len(cer_llm_scores)*100
    ]
})

# Display
print("\nüìä Evaluation Results:\n")
print(results.to_string(index=False))



üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output 105.408426  84.201110
   LLM Refined 193.219162 178.189485


In [18]:
pip install resampy

Defaulting to user installation because normal site-packages is not writeable
Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.1/3.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# STEP 1 (Whisper ‚Äúmedium‚Äù) without ffmpeg

Uses soundfile + resampy to read/ensure 16 kHz mono; transcribes segments with whisper model "medium"

In [3]:
# ---------------------------
# STEP 1: Transcribe using Whisper (medium) without FFmpeg
# ---------------------------
import whisper
import os
import soundfile as sf
import resampy
import numpy as np
from pathlib import Path
import pandas as pd

# Paths
DATA_DIR = Path("train_split/transcripts")
OUTPUT_DIR = Path("outputs_medium")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load Whisper model
model = whisper.load_model("medium")

# Load metadata
segments = pd.read_csv(DATA_DIR / "segments", sep='\s+', names=["utt_id", "wav_id", "start", "end"])

# Fix for WAV file loading based on actual disk structure
wav_scp_raw = pd.read_csv(DATA_DIR / "wav.scp", sep='\s+', header=None)
wav_scp = {
    row[0]: str(DATA_DIR / row[1])  # use raw file path directly without adding .wav
    for row in wav_scp_raw.values
}

# Parse 'text' file manually to handle spaces in transcript
utt2text = {}
with open(DATA_DIR / "text", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            utt2text[parts[0]] = parts[1]

results = []

# Loop over segments
for seg in segments.itertuples():
    wav_path = wav_scp.get(seg.wav_id)
    if not wav_path:
        continue

    full_path = Path(wav_path)
    if not full_path.exists():
        print(f"‚ùå File not found: {full_path}")
        continue

    y, sr = sf.read(full_path)
    start_sample = int(seg.start * sr)
    end_sample = int(seg.end * sr)
    y_seg = y[start_sample:end_sample]

    # Convert stereo to mono if needed
    if len(y_seg.shape) > 1:
        y_seg = np.mean(y_seg, axis=1)

    # Ensure sample rate is 16kHz as required by Whisper
    if sr != 16000:
        y_seg = resampy.resample(y_seg, sr, 16000)
        sr = 16000

    # Convert to float32 to avoid dtype mismatch
    y_seg = y_seg.astype(np.float32)

    # Transcribe directly from numpy audio
    result = model.transcribe(y_seg, language="hi", fp16=False)
    results.append({
        "utt_id": seg.utt_id,
        "reference": utt2text.get(seg.utt_id, ""),
        "whisper_output": result['text'].strip()
    })

# Save transcriptions
df = pd.DataFrame(results)
df.to_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv", index=False)
print("\u2705 Whisper transcription complete.")

  checkpoint = torch.load(fp, map_location=device)


‚úÖ Whisper transcription complete.


# STEP 2 (LLM post-processing, Qwen)

Loads the medium CSV; builds preserve-code-switching prompt; runs Qwen 1.5-1.8B; saves ‚Üí outputs_medium/llm_refined_transcripts.csv.

In [17]:
# ---------------------------
# STEP 2: LLM Post-processing (preserve code-switching)
# ---------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen1.5-1.8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# Reload CSV
df = pd.read_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv")
df = df.dropna(subset=["whisper_output"])
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Refine preserving code-switching

def refine_text(prompt_input):
    prompt = (
        "The following sentence contains Hindi-English code-switching. "
        "Refine it by preserving the structure and language switching as it is. "
        "Do not correct grammar or translate to one language. Keep Hindi words in Devanagari script. "
        "Improve intelligibility only where needed, but avoid altering meaning.\n\n"
        f"Sentence: {prompt_input}\n"
        "Refined:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded.split("Refined:")[-1].strip()

# Apply
print("\u23F3 Running LLM refinement (preserving code-switching)...")
df["llm_refined"] = df["whisper_output"].apply(refine_text)
df.to_csv(OUTPUT_DIR / "llm_refined_transcripts.csv", index=False)
print("\u2705 LLM refinement completed.")


‚è≥ Running LLM refinement (preserving code-switching)...




‚úÖ LLM refinement completed.


# STEP 3 (Evaluate medium + Qwen)

In [18]:
# ---------------------------
# STEP 3: Evaluation
# ---------------------------
from jiwer import wer, cer

# Load for evaluation
df = pd.read_csv(OUTPUT_DIR / "llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Evaluation
wer_whisper = [wer(r, h) for r, h in zip(df.reference, df.whisper_output)]
wer_llm = [wer(r, h) for r, h in zip(df.reference, df.llm_refined)]
cer_whisper = [cer(r, h) for r, h in zip(df.reference, df.whisper_output)]
cer_llm = [cer(r, h) for r, h in zip(df.reference, df.llm_refined)]

# Summary table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [sum(wer_whisper)/len(wer_whisper)*100, sum(wer_llm)/len(wer_llm)*100],
    "CER (%)": [sum(cer_whisper)/len(cer_whisper)*100, sum(cer_llm)/len(cer_llm)*100]
})

print("\n\U0001F4CA Evaluation Results:\n")
print(results.to_string(index=False))



üìä Evaluation Results:

         Model    WER (%)     CER (%)
Whisper Output  84.763362   74.156412
   LLM Refined 817.949885 1139.750696


# Combined pipeline (Whisper medium ‚Üí Qwen ‚Üí Eval) in one cell

In [20]:
# ---------------------------
# STEP 1: Transcribe using Whisper (medium) without FFmpeg
# ---------------------------
import whisper
import os
import soundfile as sf
import resampy
import numpy as np
from pathlib import Path
import pandas as pd

# Paths
DATA_DIR = Path("train_split/transcripts")
OUTPUT_DIR = Path("outputs_medium")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load Whisper model
model = whisper.load_model("medium")

# Load metadata
segments = pd.read_csv(DATA_DIR / "segments", sep='\s+', names=["utt_id", "wav_id", "start", "end"])

# Fix for WAV file loading based on actual disk structure
wav_scp_raw = pd.read_csv(DATA_DIR / "wav.scp", sep='\s+', header=None)
wav_scp = {
    row[0]: str(DATA_DIR / row[1])
    for row in wav_scp_raw.values
}

# Parse 'text' file manually to handle spaces in transcript
utt2text = {}
with open(DATA_DIR / "text", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            utt2text[parts[0]] = parts[1]

results = []

# Loop over segments
for seg in segments.itertuples():
    wav_path = wav_scp.get(seg.wav_id)
    if not wav_path:
        continue

    full_path = Path(wav_path)
    if not full_path.exists():
        print(f"‚ùå File not found: {full_path}")
        continue

    y, sr = sf.read(full_path)
    start_sample = int(seg.start * sr)
    end_sample = int(seg.end * sr)
    y_seg = y[start_sample:end_sample]

    # Convert stereo to mono if needed
    if len(y_seg.shape) > 1:
        y_seg = np.mean(y_seg, axis=1)

    # Ensure sample rate is 16kHz as required by Whisper
    if sr != 16000:
        y_seg = resampy.resample(y_seg, sr, 16000)
        sr = 16000

    # Convert to float32 to avoid dtype mismatch
    y_seg = y_seg.astype(np.float32)

    # Transcribe directly from numpy audio
    result = model.transcribe(y_seg, language="hi", fp16=False)
    results.append({
        "utt_id": seg.utt_id,
        "reference": utt2text.get(seg.utt_id, ""),
        "whisper_output": result['text'].strip()
    })

# Save transcriptions
df = pd.DataFrame(results)
df.to_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv", index=False)
print("\u2705 Whisper transcription complete.")


# ---------------------------
# STEP 2: LLM Post-processing (preserve code-switching)
# ---------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen1.5-1.8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# Reload CSV
df = pd.read_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv")
df = df.dropna(subset=["whisper_output"])
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Refine preserving code-switching

def refine_text(prompt_input):
    prompt = (
        "Below is a code-switched Hindi-English sentence. Preserve the exact content and language switching style.\n"
        "Do NOT correct grammar. Do NOT translate. Only enhance understandability minimally.\n"
        "Keep Hindi words in Devanagari script.\n"
        f"Sentence: {prompt_input}\n"
        "Output:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded.split("Output:")[-1].strip()

# Apply
print("\u23F3 Running LLM refinement (preserving code-switching)...")
df["llm_refined"] = df["whisper_output"].apply(refine_text)
df.to_csv(OUTPUT_DIR / "llm_refined_transcripts.csv", index=False)
print("\u2705 LLM refinement completed.")


# ---------------------------
# STEP 3: Evaluation
# ---------------------------
from jiwer import wer, cer

# Load for evaluation
df = pd.read_csv(OUTPUT_DIR / "llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Evaluation
wer_whisper = [wer(r, h) for r, h in zip(df.reference, df.whisper_output)]
wer_llm = [wer(r, h) for r, h in zip(df.reference, df.llm_refined)]
cer_whisper = [cer(r, h) for r, h in zip(df.reference, df.whisper_output)]
cer_llm = [cer(r, h) for r, h in zip(df.reference, df.llm_refined)]

# Summary table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [sum(wer_whisper)/len(wer_whisper)*100, sum(wer_llm)/len(wer_llm)*100],
    "CER (%)": [sum(cer_whisper)/len(cer_whisper)*100, sum(cer_llm)/len(cer_llm)*100]
})

print("\n\U0001F4CA Evaluation Results:\n")
print(results.to_string(index=False))


‚úÖ Whisper transcription complete.
‚è≥ Running LLM refinement (preserving code-switching)...




‚úÖ LLM refinement completed.

üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output  85.921188  76.559156
   LLM Refined 725.098964 969.756419


# Combined pipeline (Whisper medium ‚Üí FLAN-T5 ‚Üí Eval)

In [14]:
# ---------------------------
# STEP 1: Transcribe using Whisper (medium) without FFmpeg
# ---------------------------
import whisper
import os
import soundfile as sf
import resampy
import numpy as np
from pathlib import Path
import pandas as pd

# Paths
DATA_DIR = Path("train_split/transcripts")
OUTPUT_DIR = Path("outputs_medium")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load Whisper model
model = whisper.load_model("medium")

# Load metadata
segments = pd.read_csv(DATA_DIR / "segments", sep='\s+', names=["utt_id", "wav_id", "start", "end"])

# Fix for WAV file loading based on actual disk structure
wav_scp_raw = pd.read_csv(DATA_DIR / "wav.scp", sep='\s+', header=None)
wav_scp = {
    row[0]: str(DATA_DIR / row[1])
    for row in wav_scp_raw.values
}

# Parse 'text' file manually to handle spaces in transcript
utt2text = {}
with open(DATA_DIR / "text", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            utt2text[parts[0]] = parts[1]

results = []

# Loop over segments
for seg in segments.itertuples():
    wav_path = wav_scp.get(seg.wav_id)
    if not wav_path:
        continue

    full_path = Path(wav_path)
    if not full_path.exists():
        print(f"‚ùå File not found: {full_path}")
        continue

    y, sr = sf.read(full_path)
    start_sample = int(seg.start * sr)
    end_sample = int(seg.end * sr)
    y_seg = y[start_sample:end_sample]

    # Convert stereo to mono if needed
    if len(y_seg.shape) > 1:
        y_seg = np.mean(y_seg, axis=1)

    # Ensure sample rate is 16kHz as required by Whisper
    if sr != 16000:
        y_seg = resampy.resample(y_seg, sr, 16000)
        sr = 16000

    # Convert to float32 to avoid dtype mismatch
    y_seg = y_seg.astype(np.float32)

    # Transcribe directly from numpy audio
    result = model.transcribe(y_seg, language="hi", fp16=False)
    results.append({
        "utt_id": seg.utt_id,
        "reference": utt2text.get(seg.utt_id, ""),
        "whisper_output": result['text'].strip()
    })

# Save transcriptions
df = pd.DataFrame(results)
df.to_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv", index=False)
print("\u2705 Whisper transcription complete.")


# ---------------------------
# STEP 2: LLM Post-processing (preserve code-switching) using FLAN-T5
# ---------------------------
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Reload CSV
df = pd.read_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv")
df = df.dropna(subset=["whisper_output"])
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Refine preserving code-switching without translating or correcting grammar
def refine_text(prompt_input):
    prompt = (
        "Keep the Hindi-English code-switching as it is. Do not translate. "
        "Do not correct grammar. Just clean any hallucinations or repetition and ensure readability.\n"
        f"Input: {prompt_input}"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

print("\u23F3 Running LLM refinement (FLAN-T5, preserving code-switching)...")
df["llm_refined"] = df["whisper_output"].apply(refine_text)
df.to_csv(OUTPUT_DIR / "llm_refined_transcripts.csv", index=False)
print("\u2705 LLM refinement completed.")


# ---------------------------
# STEP 3: Evaluation
# ---------------------------
from jiwer import wer, cer

# Load for evaluation
df = pd.read_csv(OUTPUT_DIR / "llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Evaluation
wer_whisper = [wer(r, h) for r, h in zip(df.reference, df.whisper_output)]
wer_llm = [wer(r, h) for r, h in zip(df.reference, df.llm_refined)]
cer_whisper = [cer(r, h) for r, h in zip(df.reference, df.whisper_output)]
cer_llm = [cer(r, h) for r, h in zip(df.reference, df.llm_refined)]

# Summary table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [sum(wer_whisper)/len(wer_whisper)*100, sum(wer_llm)/len(wer_llm)*100],
    "CER (%)": [sum(cer_whisper)/len(cer_whisper)*100, sum(cer_llm)/len(cer_llm)*100]
})

print("\n\U0001F4CA Evaluation Results:\n")
print(results.to_string(index=False))


  checkpoint = torch.load(fp, map_location=device)


‚úÖ Whisper transcription complete.
‚è≥ Running LLM refinement (FLAN-T5, preserving code-switching)...
‚úÖ LLM refinement completed.

üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output  78.701599  71.404357
   LLM Refined 123.244692 113.442642


# Google FLAN-T5 with few-shot examples)

In [17]:
# ---------------------------
# STEP 2: LLM Post-processing (preserve code-switching) using FLAN-T5 with few-shot examples
# ---------------------------
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Reload transcriptions
df = pd.read_csv(OUTPUT_DIR / "whisper_transcriptions_medium.csv")
df = df.dropna(subset=["whisper_output"])
df = df[df["whisper_output"].str.count(r"\w+") > 3]

# Few-shot prompt-based code-switching-aware cleaner
def refine_text(whisper_output):
    prompt = (
        "You are a bilingual speech-to-text cleaner for Hindi-English code-switched transcripts.\n"
        "Your goal is to ONLY remove hallucinations or repeated/fake words from the transcript.\n"
        "DO NOT paraphrase. DO NOT translate. DO NOT correct grammar. DO NOT add new words.\n"
        "Keep English and Hindi words as-is.\n\n"
        "Example 1:\n"
        "Input: spoken tutorial project talktoa teacher project ‡§™‡§∞ ‡§Ü‡§ß‡§æ‡§∞‡§ø‡§§ ‡§π‡•à\n"
        "Output: spoken tutorial project talk to a teacher project ‡§™‡§∞ ‡§Ü‡§ß‡§æ‡§∞‡§ø‡§§ ‡§π‡•à\n\n"
        "Example 2:\n"
        "Input: ‡§Ö‡§¨ monocyclic compound co-bicyclic compound may be the case\n"
        "Output: ‡§Ö‡§¨ monocyclic compound ‡§ï‡•ã bicyclic compound may be the case\n\n"
        "Example 3:\n"
        "Input: carbonic acid or sulphuric acid structures carbonic acid or sulphuric acid structures\n"
        "Output: carbonic acid or sulphuric acid structures\n\n"
        f"Now fix this:\nInput: {whisper_output}\nOutput:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()




print("\u23F3 Running LLM refinement (FLAN-T5 with few-shot)...")
df["llm_refined"] = df["whisper_output"].apply(refine_text)
df.to_csv(OUTPUT_DIR / "llm_refined_transcripts.csv", index=False)
print("\u2705 LLM refinement completed.")


‚è≥ Running LLM refinement (FLAN-T5 with few-shot)...
‚úÖ LLM refinement completed.


Evaluate FLAN-T5 few-shot

In [18]:
# ---------------------------
# STEP 3: Evaluation
# ---------------------------
from jiwer import wer, cer

# Load for evaluation
df = pd.read_csv(OUTPUT_DIR / "llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Evaluation
wer_whisper = [wer(r, h) for r, h in zip(df.reference, df.whisper_output)]
wer_llm = [wer(r, h) for r, h in zip(df.reference, df.llm_refined)]
cer_whisper = [cer(r, h) for r, h in zip(df.reference, df.whisper_output)]
cer_llm = [cer(r, h) for r, h in zip(df.reference, df.llm_refined)]

# Summary table
results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [sum(wer_whisper)/len(wer_whisper)*100, sum(wer_llm)/len(wer_llm)*100],
    "CER (%)": [sum(cer_whisper)/len(cer_whisper)*100, sum(cer_llm)/len(cer_llm)*100]
})

print("\n\U0001F4CA Evaluation Results:\n")
print(results.to_string(index=False))



üìä Evaluation Results:

         Model   WER (%)   CER (%)
Whisper Output 95.176844 93.352028
   LLM Refined 97.436419 98.743169


# Tagging Technique

In [20]:
pip install fasttext

Defaulting to user installation because normal site-packages is not writeable
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m73.4/73.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313499 sha256=b5f9c3552371c295252549cf7d934d12587f2b6f46671898abb4f945bbee9039
  Stored in directory: /home/jovya

In [1]:
# ---------------------------
# STEP 1: Whisper Transcription
# ---------------------------
import whisper
import os
import soundfile as sf
import resampy
import numpy as np
from pathlib import Path
import pandas as pd
import fasttext
import re

# Paths
DATA_DIR = Path("train_split/transcripts")
OUTPUT_DIR = Path("outputs_tagged")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load Whisper model
model = whisper.load_model("medium")

# Load FastText language identification model (download if not present)
if not Path("lid.176.bin").exists():
    import urllib.request
    print("Downloading FastText language model...")
    urllib.request.urlretrieve("https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", "lid.176.bin")

lang_model = fasttext.load_model("lid.176.bin")

# Load metadata
segments = pd.read_csv(DATA_DIR / "segments", sep='\s+', names=["utt_id", "wav_id", "start", "end"])
wav_scp_raw = pd.read_csv(DATA_DIR / "wav.scp", sep='\s+', header=None)
wav_scp = {row[0]: str(DATA_DIR / row[1]) for row in wav_scp_raw.values}

utt2text = {}
with open(DATA_DIR / "text", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(maxsplit=1)
        if len(parts) == 2:
            utt2text[parts[0]] = parts[1]

results = []


  checkpoint = torch.load(fp, map_location=device)


Downloading FastText language model...


# Word-level language tagging

What it does:
- For each Whisper hypothesis, predicts each token‚Äôs language with fastText; prepends <HI> / <EN> tags and joins back.
- Saves ‚Üí outputs_medium/whisper_tagged_transcriptions.csv with columns: utt_id, reference, whisper_output, tagged_output

In [2]:
# ---------------------------
# STEP 2: Transcription and Word-Level Language Tagging
# ---------------------------
def tag_languages(text):
    words = text.strip().split()
    tagged = []
    for word in words:
        lang = lang_model.predict(word.replace("‡•§", "").replace(".", ""))[0][0].replace("__label__", "")
        tag = "<HI>" if lang == "hi" else "<EN>"
        tagged.append(f"{tag} {word}")
    return " ".join(tagged)

for seg in segments.itertuples():
    wav_path = wav_scp.get(seg.wav_id)
    if not wav_path:
        continue

    full_path = Path(wav_path)
    if not full_path.exists():
        print(f"‚ùå File not found: {full_path}")
        continue

    y, sr = sf.read(full_path)
    start_sample = int(seg.start * sr)
    end_sample = int(seg.end * sr)
    y_seg = y[start_sample:end_sample]

    if len(y_seg.shape) > 1:
        y_seg = np.mean(y_seg, axis=1)

    if sr != 16000:
        y_seg = resampy.resample(y_seg, sr, 16000)
        sr = 16000

    y_seg = y_seg.astype(np.float32)

    result = model.transcribe(y_seg, fp16=False)
    output_text = result['text'].strip()
    tagged_text = tag_languages(output_text)

    results.append({
        "utt_id": seg.utt_id,
        "reference": utt2text.get(seg.utt_id, ""),
        "whisper_output": output_text,
        "tagged_output": tagged_text
    })

df = pd.DataFrame(results)
df.to_csv(OUTPUT_DIR / "whisper_tagged_transcriptions.csv", index=False)
print("‚úÖ Whisper transcription and tagging complete.")


‚úÖ Whisper transcription and tagging complete.


# LLM refinement with tags; FLAN-T5

What it does:
- Loads flan-t5-large, builds a prompt that says: ‚ÄúGiven tagged text (<HI>, <EN>), improve readability while preserving tags/code-switch pattern; no translation or meaning change.‚Äù
- Generates refined text per row; saves ‚Üí outputs_medium/llm_refined_transcripts.csv.

In [3]:
# ---------------------------
# STEP 3: LLM Refinement (Preserve Code-switching with Tags)
# ---------------------------
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

print("‚è≥ Running LLM refinement...")

model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
llm = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def refine_with_tags(tagged_text):
    prompt = (
        "You are an expert at processing Hindi-English code-switched transcriptions. "
        "Keep <HI> Hindi and <EN> English tags intact. Only remove repetition or hallucinated words. Do not correct grammar. Do not change languages.\n\n"
        f"Input: {tagged_text}\nOutput:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

df["llm_refined"] = df["tagged_output"].apply(refine_with_tags)
df.to_csv(OUTPUT_DIR / "llm_refined_transcripts.csv", index=False)
print("‚úÖ LLM refinement completed.")


‚è≥ Running LLM refinement...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


‚úÖ LLM refinement completed.


In [4]:
# ---------------------------
# STEP 4: Evaluation (WER/CER)
# ---------------------------
from jiwer import wer, cer

df = pd.read_csv(OUTPUT_DIR / "llm_refined_transcripts.csv")
df = df.dropna(subset=["reference", "whisper_output", "llm_refined"])

# Clean tags before evaluation
def clean_tags(text):
    return re.sub(r"</?EN>|</?HI>", "", text)

wer_whisper = [wer(r, h) for r, h in zip(df.reference, df.whisper_output)]
wer_llm = [wer(r, clean_tags(h)) for r, h in zip(df.reference, df.llm_refined)]
cer_whisper = [cer(r, h) for r, h in zip(df.reference, df.whisper_output)]
cer_llm = [cer(r, clean_tags(h)) for r, h in zip(df.reference, df.llm_refined)]

results = pd.DataFrame({
    "Model": ["Whisper Output", "LLM Refined"],
    "WER (%)": [sum(wer_whisper)/len(wer_whisper)*100, sum(wer_llm)/len(wer_llm)*100],
    "CER (%)": [sum(cer_whisper)/len(cer_whisper)*100, sum(cer_llm)/len(cer_llm)*100]
})

print("\nüìä Evaluation Results:\n")
print(results.to_string(index=False))


üìä Evaluation Results:

         Model    WER (%)    CER (%)
Whisper Output  78.260012  69.075436
   LLM Refined 136.169147 125.845370
