In [1]:
!apt-get update && apt-get install -y ffmpeg  # Fix for TorchCodec/FFmpeg issues
!pip install -q datasets==2.21.0 accelerate peft bitsandbytes jiwer evaluate librosa soundfile transformers

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,130 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.n

In [None]:
!pip install -q librosa soundfile

In [2]:
CSV_PATH          = "/content/drive/MyDrive/nepali_speech_data.csv"
AUDIO_ROOT        = "/content/drive/MyDrive/audio_chunks"   # folder with .wav files
CHECKPOINT_DIR    = "/content/drive/MyDrive/whisper-small-lora-nepali/checkpoint-1000"  # your previous LoRA checkpoint
OUTPUT_DIR        = "/content/drive/MyDrive/whisper_small_nep_parliament"

In [None]:
# DEBUG: Show first 10 raw lines
with open(CSV_PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()[:10]

print("First 10 raw lines from CSV:")
for i, line in enumerate(lines, 1):
    print(f"{i:2d}: {repr(line.strip())}")

First 10 raw lines from CSV:
 1: 'file_name,transcription'
 2: '2079-11-27_1.wav,"महेश कुमार महरा । समय ६ मिनेट । यसपछि माननीय दिल कुमारी'
 3: 'रावल ।"'
 4: '2079-11-27_2.wav,"सम्माननीय अध्यक्ष महोदय, आज म यो सदनमा अलि केही कुरा राख गइरहेको छु । मलाई के लाग्छ भने राज्य भनेको नागरिकको अभिभावक हो र नागरिकपट्टि चैं उत्तरदायी हुनुपर्छ राज्यले । तर त्यसो "'
 5: '2079-11-27_3.wav,"मैले देखिरहेको छैन । एउटा म घटना क्रम यहाँ राख चाहन्छु । यो विभिन्न सोसियल मिडियामा पनि यो निकै भनौं प्रचार भएको छ । एउटा सुरक्षाकर्मी आफ्नो ड्युटी सिदिएर घर जाने क्रममा मोटरसाइकल दुर्घटनामा पर्छन्\u200c । र परिसकेपछि उहाँको उपचारका निमित्त टाउकोमा गहिरो चोट लाग्छ । र न त्यो सङ्गठन तयार छ, न राज्य सुन्छ ।राज्य बेखबर भएको छ । त्यसो भएको कारण"'
 6: "2079-11-27_4.wav,म यो सम्मानित सदनबाट सम्माननीय अध्यक्षमार्फत्\u200c नेपाल सरकारको ध्यानाकर्षण गराउन चाहन्छु । यदि राज्य चैं जिम्मेवार हुँदैन भने एउटा नेपाल प्रहरीको जवान जब त्यो ड्युटी सिद्धिएर घर गइसकेपछि जो घटना भो' त्यो घटनापश्चात्\u200c विभिन्न उपचारको क्रममा चैं हुन

In [None]:
import pandas as pd
from pathlib import Path
import re

# === 1. READ RAW LINES ===
with open(CSV_PATH, 'r', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f.readlines()]

print(f"Total raw lines in CSV: {len(lines)}")

# === 2. SKIP HEADER & PARSE ===
data = []
current_file = None
current_trans = []

for i, line in enumerate(lines):
    line = line.strip()
    if not line:
        continue

    # Does this line start with a .wav filename?
    if re.match(r'^[\w\-.]+\.wav,', line):
        # Save previous entry
        if current_file:
            transcription = " ".join(current_trans).strip().strip('"')
            data.append({"file_name": current_file, "transcription": transcription})
        # Start new entry
        parts = line.split(",", 1)
        current_file = parts[0].strip()
        current_trans = [parts[1].strip().strip('"')] if len(parts) > 1 else []
    else:
        # Continuation line
        cleaned = line.strip().strip('"')
        if cleaned:
            current_trans.append(cleaned)

# Save last entry
if current_file:
    transcription = " ".join(current_trans).strip().strip('"')
    data.append({"file_name": current_file, "transcription": transcription})

df = pd.DataFrame(data)
print(f"Parsed {len(df)} valid audio entries.\n")

# === 3. CLEAN & BUILD PATHS ===
df["file_name"] = df["file_name"].str.strip()
df["audio_path"] = df["file_name"].apply(lambda x: Path(AUDIO_ROOT) / x)

# === 4. FIND BAD PATHS BEFORE .exists() ===
bad_paths = []
valid_paths = []

for idx, path in df["audio_path"].items():
    try:
        # This will fail if path is malformed
        if not path.exists():
            bad_paths.append((df.loc[idx, "file_name"], str(path)))
        else:
            valid_paths.append(str(path))
    except Exception as e:
        bad_paths.append((df.loc[idx, "file_name"], f"ERROR: {e}"))

# === 5. FINAL CLEAN DF ===
df = df[df["audio_path"].apply(lambda p: p.exists())].reset_index(drop=True)

print(f"FINAL DATASET: {len(df)} samples with valid audio files\n")
print("First 3 entries:")
print(df[["file_name", "transcription"]].head(3).to_string(index=False))

# === 6. REPORT PROBLEMS ===
if bad_paths:
    print(f"\nWARNING: {len(bad_paths)} files NOT found or malformed:")
    for name, path in bad_paths[:5]:
        print(f"   • {name} → {path}")
    if len(bad_paths) > 5:
        print(f"   ... and {len(bad_paths) - 5} more")
else:
    print("\nAll audio files found and valid!")

Total raw lines in CSV: 411
Parsed 333 valid audio entries.

FINAL DATASET: 333 samples with valid audio files

First 3 entries:
       file_name                                                                                                                                                                                                                                                                                                                                     transcription
2079-11-27_1.wav                                                                                                                                                                                                                                                                                    महेश कुमार महरा । समय ६ मिनेट । यसपछि माननीय दिल कुमारी रावल ।
2079-11-27_2.wav                                                                                                                                                 

In [None]:
from datasets import Dataset, Audio

# Convert audio_path from Path → string (required!)
df["audio_path"] = df["audio_path"].astype(str)

# Now create HF Dataset
dataset = Dataset.from_pandas(df[["audio_path", "transcription"]])

# Cast to Audio → loads + resamples to 16kHz on-the-fly
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16_000))

# Rename transcription → sentence (standard for ASR)
dataset = dataset.rename_column("transcription", "sentence")

# Preview
print("Dataset created successfully!")
print(dataset)

print("\nExample 0:")
example = dataset[0]
print(f"File: {example['audio_path']['path'].split('/')[-1]}")
print(f"Duration: {len(example['audio_path']['array']) / 16000:.2f} seconds")
print(f"Transcription: {example['sentence'][:120]}...")

Dataset created successfully!
Dataset({
    features: ['audio_path', 'sentence'],
    num_rows: 333
})

Example 0:
File: 2079-11-27_1.wav
Duration: 30.04 seconds
Transcription: महेश कुमार महरा । समय ६ मिनेट । यसपछि माननीय दिल कुमारी रावल ।...


In [None]:
# DIAGNOSE: Are audio arrays actually loaded?
print(f"Total samples: {len(dataset)}")

valid = 0
for i in range(min(5, len(dataset))):
    audio = dataset[i]["audio_path"]
    arr = audio["array"]
    path = audio["path"]
    print(f"\nSample {i}:")
    print(f"   Path: {path.split('/')[-1] if path else 'None'}")
    print(f"   Array type: {type(arr)}")
    print(f"   Array length: {len(arr) if arr is not None else 'None'}")
    if arr is not None and len(arr) > 0:
        valid += 1

print(f"\nValid audio in first 5: {valid}/5")

Total samples: 333

Sample 0:
   Path: 2079-11-27_1.wav
   Array type: <class 'numpy.ndarray'>
   Array length: 480640

Sample 1:
   Path: 2079-11-27_2.wav
   Array type: <class 'numpy.ndarray'>
   Array length: 480640

Sample 2:
   Path: 2079-11-27_3.wav
   Array type: <class 'numpy.ndarray'>
   Array length: 480640

Sample 3:
   Path: 2079-11-27_4.wav
   Array type: <class 'numpy.ndarray'>
   Array length: 480640

Sample 4:
   Path: 2079-11-27_5.wav
   Array type: <class 'numpy.ndarray'>
   Array length: 480640

Valid audio in first 5: 5/5


In [None]:
from datasets import Dataset
import numpy as np
import os

# === RELOAD WITH TRIM TO 30.0s ===
def load_and_trim(row):
    path = row["audio_path"]
    try:
        y, sr = librosa.load(path, sr=16000, mono=True)
        max_samples = int(30.0 * 16000)  # 480,000
        if len(y) > max_samples:
            y = y[:max_samples]
        return {
            "audio": {"array": y, "sampling_rate": 16000, "path": path},
            "sentence": row["transcription"]
        }
    except Exception as e:
        print(f"Failed: {path} → {e}")
        return None

print("Loading + trimming to 30.0s...")
data = [load_and_trim(row) for row in df.to_dict("records") if load_and_trim(row)]
dataset = Dataset.from_list(data)

print(f"Final dataset: {len(dataset)} samples (all ≤ 30.0s)")

# Cache
dataset.save_to_disk("/content/drive/MyDrive/nepali_asr_dataset_cached")
print("Cached!")

# Preview
ex = dataset[0]
print(f"Example: {os.path.basename(ex['audio']['path'])}, {len(ex['audio']['array'])/16000:.2f}s")

Loading + trimming to 30.0s...
Final dataset: 333 samples (all ≤ 30.0s)


Saving the dataset (0/2 shards):   0%|          | 0/333 [00:00<?, ? examples/s]

Cached!
Example: 2079-11-27_1.wav, 30.00s


In [5]:
from datasets import load_from_disk
dataset = load_from_disk("/content/drive/MyDrive/nepali_asr_dataset_cached")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = dataset["train"]
eval_ds  = dataset["test"]
print(f"Train: {len(train_ds)} | Eval: {len(eval_ds)}")

Train: 299 | Eval: 34


In [45]:
# === 1. Preprocessing function (same) ===
def prepare_dataset(batch):
    audio = [b["array"] for b in batch["audio"]]
    input_features = processor.feature_extractor(
        audio, sampling_rate=16000, padding="max_length", max_length=30*16000, truncation=True
    ).input_features

    labels = processor.tokenizer(
        batch["sentence"], padding="max_length", max_length=448, truncation=True
    ).input_ids

    return {"input_features": input_features, "labels": labels}

# === 2. Apply + SAVE ===
preprocessed_dir = "/content/drive/MyDrive/nepali_asr_preprocessed"

print("Preprocessing + saving train...")
train_ds = train_ds.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    remove_columns=train_ds.column_names,
    num_proc=1
)
train_ds.save_to_disk(f"{preprocessed_dir}/train")

print("Preprocessing + saving eval...")
eval_ds = eval_ds.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    remove_columns=eval_ds.column_names,
    num_proc=1
)
eval_ds.save_to_disk(f"{preprocessed_dir}/eval")

print(f"\nPreprocessed datasets saved to: {preprocessed_dir}")
print(f"   Train: {len(train_ds)} samples")
print(f"   Eval:  {len(eval_ds)} samples")

Preprocessing + saving train...


Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/299 [00:00<?, ? examples/s]

Preprocessing + saving eval...


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34 [00:00<?, ? examples/s]


Preprocessed datasets saved to: /content/drive/MyDrive/nepali_asr_preprocessed
   Train: 299 samples
   Eval:  34 samples


In [35]:
from datasets import load_from_disk

# === 1. LOAD PREPROCESSED DATA ===
preprocessed_dir = "/content/drive/MyDrive/nepali_asr_preprocessed"
train_ds = load_from_disk(f"{preprocessed_dir}/train")
eval_ds  = load_from_disk(f"{preprocessed_dir}/eval")

print(f"Loaded preprocessed data:")
print(f"   Train: {len(train_ds)} samples")
print(f"   Eval:  {len(eval_ds)} samples")
print(f"   Features: {train_ds.column_names}")

Loaded preprocessed data:
   Train: 299 samples
   Eval:  34 samples
   Features: ['input_features', 'labels']


In [36]:
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration # Import here
import torch

# Initialize the WhisperProcessor here
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Nepali", task="transcribe")

# === 1. Start fresh: base model + new LoRA config ===
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

# === 2. Apply LoRA config (same as before) ===
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    modules_to_save=[],
)

model = get_peft_model(model, peft_config)

# === 3. Load your checkpoint weights into this new adapter ===
print(f"Loading LoRA weights from: {CHECKPOINT_DIR}")
model.load_adapter(CHECKPOINT_DIR, adapter_name="nepali_lora")

# === 4. Set active adapter ===
model.set_adapter("nepali_lora")
model.print_trainable_parameters()

# Explicitly cast trainable LoRA parameters to float32 for stable fp16 training
for name, param in model.named_parameters():
    if param.requires_grad:
        # Check if it's a LoRA layer and not already float32
        if "lora" in name and param.dtype != torch.float32:
            param.data = param.data.to(torch.float32)
            print(f"Cast LoRA parameter '{name}' to float32 for mixed precision stability.")

# === 5. Final check ===
print(f"Active adapter: {model.active_adapter}")

Loading LoRA weights from: /content/drive/MyDrive/whisper-small-lora-nepali/checkpoint-1000
trainable params: 1,769,472 || all params: 245,273,856 || trainable%: 0.7214
Active adapter: nepali_lora


In [37]:
import torch
from dataclasses import dataclass
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperProcessor, WhisperForConditionalGeneration # Added WhisperProcessor, WhisperForConditionalGeneration
import evaluate
import numpy as np

import torch
from dataclasses import dataclass

# === FINAL DATA COLLATOR (ALL TORCH) ===
@dataclass
class DataCollatorWhisper:
    processor: any
    padding: str = "longest"

    def __call__(self, features):
        # 1. Extract input_features (list of np arrays)
        input_features = [f["input_features"] for f in features]

        # 2. Pad with feature_extractor → returns dict of np arrays
        padded = self.processor.feature_extractor.pad(
            {"input_features": input_features},
            padding=self.padding,
            return_tensors="np"
        )

        # 3. Convert to torch.float32
        input_features = torch.tensor(padded["input_features"], dtype=torch.float32)

        # 4. Labels
        labels = [f["labels"] for f in features]
        padded_labels = self.processor.tokenizer.pad(
            {"input_ids": labels},
            padding=self.padding,
            return_tensors="np"
        )

        # 5. Convert to torch.long
        labels_tensor = torch.tensor(padded_labels["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(padded_labels["attention_mask"], dtype=torch.bool)

        # 6. Mask padding with -100
        labels_tensor = labels_tensor.masked_fill(~attention_mask, -100)

        return {
            "input_features": input_features,
            "labels": labels_tensor
        }

# === RE-INSTANTIATE COLLATOR ===
data_collator = DataCollatorWhisper(processor)

# === 2. WER METRIC ===
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with pad token
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}



In [None]:
# === 1. TRAINING ARGS (FRESH RUN) ===
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,        # effective batch size = 32
    learning_rate=5e-5,
    warmup_steps=30,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=25,                        # eval every ~1/4 epoch
    save_steps=50,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=False, # Changed to False
    predict_with_generate=False,
    generation_max_length=225,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit=2,
    remove_unused_columns=False,
    # NO resume_from_checkpoint → fresh training!
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)

print("Starting training with FULLY CORRECT collator...")
trainer.train()

# Save
final_lora_dir = f"{OUTPUT_DIR}/final_lora_final"
model.save_pretrained(final_lora_dir)
processor.save_pretrained(final_lora_dir)
print(f"Saved to: {final_lora_dir}")

Starting training with FULLY CORRECT collator...


You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
