In [1]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


CUDA Available: True
GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [2]:
# You can run this in terminal OR create a setup script
!pip install  datasets transformers librosa jiwer evaluate


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import os
import torch
import librosa
import shutil
import pandas as pd
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import Dataset, Audio

def load_local_dataset(audio_dir, transcription_dir):
    audio_files = sorted([f for f in os.listdir(audio_dir) if f.endswith('.mp3')])
    transcription_files = sorted([f for f in os.listdir(transcription_dir) if f.endswith('.txt')])
    
    dataset = []

    for audio_file in audio_files:
        base_name = os.path.splitext(audio_file)[0]
        txt_file = base_name + ".txt"
        
        audio_path = os.path.join(audio_dir, audio_file)
        transcription_path = os.path.join(transcription_dir, txt_file)

        if os.path.exists(transcription_path):
            with open(transcription_path, 'r', encoding='utf-8') as f:
                transcription = f.read().strip()
            dataset.append({
                "audio": audio_path,
                "sentence": transcription
            })

    return Dataset.from_list(dataset)

# Load your train dataset
train_audio_dir = r"C:\Users\ASUS\Desktop\Whispher-Finetuning\sindhi_data\sindhi_data\audio"
train_text_dir = r"C:\Users\ASUS\Desktop\Whispher-Finetuning\sindhi_data\sindhi_data\transcriptions"
train_dataset = load_local_dataset(train_audio_dir, train_text_dir)

# Preprocess: convert to Audio column with sampling_rate
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
print("Training samples:", len(train_dataset))


Training samples: 244


In [5]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load Whisper Tiny instead of Whisper Base
model_name = "openai/whisper-tiny"  # This is the correct model for Whisper Tiny

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [None]:
# from transformers import WhisperProcessor, WhisperForConditionalGeneration

# model_name = "openai/whisper-base"

# processor = WhisperProcessor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)

# # Move to GPU if available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)


In [6]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # Audio -> log-Mel features
    batch["input_features"] = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
        language="sd",
        task="transcribe"
    ).input_features[0]

    # Transcription -> tokenized labels
    batch["labels"] = processor.tokenizer(
        batch["sentence"],
        padding="longest",
        return_tensors="pt"
    ).input_ids[0]

    return batch

train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)


Map: 100%|██████████| 244/244 [00:06<00:00, 39.78 examples/s]


In [7]:
from transformers import TrainingArguments, Trainer
from dataclasses import dataclass

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [8]:
training_args = TrainingArguments(
    output_dir=r"C:\Users\ASUS\Desktop\whispher-tiny\input",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    save_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.005,
    logging_dir="./logs",
    logging_steps=2,
    save_total_limit=2,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # using train as eval for now
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

# Start training
print("🚀 Starting fine-tuning...")
trainer.train()


# Save the model & processor
save_path = r"C:\Users\ASUS\Desktop\whispher-tiny\output"
trainer.save_model(save_path)
processor.save_pretrained(save_path)


  trainer = Trainer(


🚀 Starting fine-tuning...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
2,5.4565
4,3.3493
6,2.2648
8,1.8551
10,1.617
12,1.5566
14,1.4312
16,1.0721
18,0.9836
20,0.9705




[]

In [9]:
import os

import torch

import librosa

import pandas as pd

from transformers import WhisperProcessor, WhisperForConditionalGeneration

from evaluate import load
 
# Load fine-tuned model and processor

model_path = r"C:\Users\ASUS\Desktop\Whispher-Finetuning\final-model"

processor = WhisperProcessor.from_pretrained(model_path)

model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
 
# Fix the generation config - remove forced_decoder_ids

model.generation_config.language = "<|sd|>"

model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None  # This is the key fix
 
# Load WER metric

wer_metric = load("wer")
 
# Test data paths

test_audio_dir = r"C:\Users\ASUS\Desktop\Whispher-Finetuning\test_54\audio"

test_text_dir = r"C:\Users\ASUS\Desktop\Whispher-Finetuning\test_54\transcription"
 
# Files to evaluate

test_files = sorted([f for f in os.listdir(test_audio_dir) if f.endswith(".mp3")])
 
# Inference loop

results = []

total_wer = 0
 
for audio_file in test_files:

    audio_path = os.path.join(test_audio_dir, audio_file)

    text_path = os.path.join(test_text_dir, audio_file.replace(".mp3", ".txt"))
 
    # Load audio

    audio, sr = librosa.load(audio_path, sr=16000)

    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(model.device)
 
    # Get reference text

    with open(text_path, "r", encoding="utf-8") as f:

        reference = f.read().strip().lower()
 
    # Generate transcription with explicit generation parameters

    with torch.no_grad():

        predicted_ids = model.generate(

            input_features,

            language="<|sd|>",  # Sindhi language token

            task="transcribe",

            forced_decoder_ids=None,  # Explicitly set to None

            max_length=448,

            num_beams=1,

            do_sample=False

        )

    prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].lower()
 
    # WER calculation

    wer = wer_metric.compute(predictions=[prediction], references=[reference])

    total_wer += wer
 
    print(f"📄 {audio_file} | WER: {wer:.4f}")

    print("   REF:", reference)

    print("   HYP:", prediction, "\n")
 
    results.append({

        "File": audio_file,

        "Reference": reference,

        "Prediction": prediction,

        "WER": round(wer, 4)

    })
 
# Save to CSV

df = pd.DataFrame(results)

df.to_csv("finetuned_whisper_sindhi_results.csv", index=False)

print(f"✅ Saved results to finetuned_whisper_sindhi_results.csv")
 
# Print average WER

average_wer = total_wer / len(results)

print(f"🔥 Average WER on test set: {average_wer:.4f}")
 

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token 

📄 common_voice_sd_41397153.mp3 | WER: 0.4375
   REF: ڇا اهو اسان کي ٻڌائي ٿو ته آمريڪي واپاري قانون ڪم ڪري رهيو آهي، هن چيو
   HYP:  چان اهو اسان کي اڌائي ٿو ته عمريقي، پاپاري ڪاملون ڪم ڪري رهيو آهي، نه ٿيو 

📄 common_voice_sd_41397158.mp3 | WER: 0.7500
   REF: بخاري جي ٻين روايتن مان معلوم ٿئي ٿو
   HYP:  ٻو خاري جي بيٽن روائيٽن مان، معلوم ٿئي ٿو. 

📄 common_voice_sd_41397194.mp3 | WER: 0.4000
   REF: ڇهين مالا پيدا ٿئي ٿي
   HYP:  چهن مالا پيدا ٿئيٽيءَ ٿي 

📄 common_voice_sd_41397195.mp3 | WER: 0.5789
   REF: يقينن، ان جو مطلب اهو نه وٺڻ گهرجي ته هتي ڪي به مسئلا ۽ محروم طبقن جي کوٽ ناهي.
   HYP:  اهڪينن اڻهن اڄهن جو مطلب اهو نه وٺڻ گھرجي هٽي هٽي ڪي به مسلاح ۽ محرون تبقن جي خو تناحي 

📄 common_voice_sd_41397199.mp3 | WER: 0.5000
   REF: تنهن ڪري، انهن ۾ صالح آهن
   HYP:  تهن ڪري انهن ۾ سالي آهن 

📄 common_voice_sd_41397200.mp3 | WER: 1.0000
   REF: اهي حالتون تڏهن هيون.
   HYP:  کئي حالتو تڏي هئيون. 

📄 common_voice_sd_41397487.mp3 | WER: 0.8889
   REF: حمائمه ملڪ به وومين پروٽيڪشن بل