In [None]:
# Install PyTorch with GPU support (CUDA 11.8 version)
# The --index-url option points pip at the PyTorch cu118 wheel index, so torch,
# torchvision and torchaudio are all resolved from that index.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

import torch

# Report whether a CUDA device is usable. The device name is only queried when
# CUDA is available: torch.cuda.get_device_name(0) raises on a machine without
# a usable CUDA device, which would abort the cell instead of reporting it.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)  # Should be True for GPU training
if cuda_available:
    print("GPU:", torch.cuda.get_device_name(0))  # e.g. Quadro RTX 4000
else:
    print("GPU: none detected")

In [None]:
# Hugging Face libraries, plus jiwer which provides the wer/cer/mer functions
# used by compute_metrics further down.
%pip install transformers datasets evaluate jiwer
# scikit-learn and pandas are used for loading and splitting the TSV;
# librosa and soundfile are presumably the audio decoding backends - confirm.
%pip install librosa scikit-learn pandas
%pip install soundfile
# Required because the training arguments report to TensorBoard.
%pip install tensorboard

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load TSV. Column names are supplied explicitly, so the first line of the
# file is read as data, not as a header.
df = pd.read_csv("updated_file.tsv", sep="\t", names=["audio", "sentence"])

# Add the .wav extension only where it is missing. Appending it unconditionally
# would turn "clip.wav" into "clip.wav.wav" and would keep growing the name
# every time this cell is re-run.
audio_names = df["audio"].astype(str)
has_wav_ext = audio_names.str.lower().str.endswith(".wav")
df["audio"] = audio_names.where(has_wav_ext, audio_names + ".wav")

# Optionally prepend path if needed:
# df["audio"] = df["audio"].apply(lambda x: f"./audio_folder/{x}")

# Train-validation split (90% / 10%); the fixed random_state keeps the split
# reproducible across runs.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Optional: save for reference
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)

In [None]:
from datasets import Dataset, Audio

# Wrap each pandas split in a Dataset and, in the same expression, declare the
# "audio" column as an Audio feature with a 16 kHz target sampling rate.
train_dataset = Dataset.from_pandas(train_df).cast_column(
    "audio", Audio(sampling_rate=16000)
)
val_dataset = Dataset.from_pandas(val_df).cast_column(
    "audio", Audio(sampling_rate=16000)
)

In [None]:
# Sanity check: show the "audio" entry of the first training example.
print(train_dataset[0]["audio"])

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# All three preprocessing objects are loaded from the same checkpoint; the
# tokenizer and the processor are both configured for Tamil ("ta") transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-base"
)
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-base",
    language="ta",
    task="transcribe",
)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base",
    language="ta",
    task="transcribe",
)

# Show the column names of the training DataFrame.
print(train_df.columns)

In [None]:
def prepare_dataset(examples):
    """Convert one raw example into model inputs and labels.

    Despite the plural parameter name, the body indexes
    ``examples["audio"]["array"]`` directly, so it handles a single row at a
    time (i.e. ``Dataset.map`` without ``batched=True``).

    Args:
        examples: mapping with an "audio" entry exposing "array" and
            "sampling_rate", and a "sentence" entry holding the transcript.

    Returns:
        The same mapping, modified in place: "audio" and "sentence" are
        removed, "input_features" (first element of the feature extractor
        output) and "labels" (tokenizer input ids) are added.
    """
    audio = examples["audio"]

    # Compute log-Mel input features from the audio array. The sampling rate
    # stored with the audio is passed through instead of a hard-coded 16000,
    # so a mismatch is visible to the feature extractor rather than being
    # silently mislabelled as 16 kHz.
    examples["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    del examples["audio"]

    # Encode the target text to label ids.
    sentences = examples["sentence"]
    examples["labels"] = tokenizer(sentences).input_ids
    del examples["sentence"]
    return examples

In [None]:
# Apply prepare_dataset to every row of both splits, using a single process.
# NOTE(review): the names are rebound to the processed datasets, and
# prepare_dataset deletes the "audio" and "sentence" entries, so re-running
# this cell on its own output presumably fails - rebuild the datasets first.
train_dataset = train_dataset.map(function=prepare_dataset, num_proc=1)
val_dataset = val_dataset.map(function=prepare_dataset, num_proc=1)

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Attributes:
        processor: object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (plus ``tokenizer.bos_token_id`` for the
            fallback described below).
        decoder_start_token_id: id of the token the model prepends to the
            labels on its own when building decoder inputs. If every label
            sequence in the batch starts with this id, that first position is
            dropped so the token is not fed twice. When left as ``None`` the
            tokenizer's ``bos_token_id`` is used, which is the previous
            behaviour of this class.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and labels separately and merge them.

        Args:
            features: list of examples, each with "input_features" and
                "labels" entries.

        Returns:
            The padded feature batch with an extra "labels" tensor in which
            padded positions are set to -100.
        """
        # Inputs and labels have different lengths and need different padding
        # methods, so they are handled separately. Audio first.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the start token was already added during tokenization, cut it
        # here because it is prepended again later anyway.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id
        if labels.shape[1] > 0 and (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Initialise the data collator.
# Whisper label sequences start with "<|startoftranscript|>", which is not the
# tokenizer's bos token, so comparing against bos_token_id never strips it and
# the model would see the start token twice during training. The id is looked
# up from the tokenizer because the model is only loaded in a later cell.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>"),
)


In [None]:
! pip install evaluate jiwer

import evaluate
metric = evaluate.load("wer")

In [None]:
from jiwer import wer, cer, mer

def compute_metrics(pred):
    """Compute error-rate metrics (in percent) for one evaluation pass.

    Args:
        pred: object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids where ignored positions are
            -100). ``label_ids`` must support ``.copy()`` and boolean-mask
            assignment, e.g. a NumPy array.

    Returns:
        dict with keys "wer", "cer", "ter" and "ser". "ter" carries jiwer's
        match error rate (``mer``); the key name is kept as-is for
        compatibility with existing logs.
    """
    pred_ids = pred.predictions

    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()
    # -100 is not a valid token id; swap it for the pad token before decoding.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode token ids to strings, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Word, character and match error rates, as percentages.
    wer_score = wer(label_str, pred_str) * 100
    cer_score = cer(label_str, pred_str) * 100
    mer_score = mer(label_str, pred_str) * 100

    # SER: sentence error rate = percentage of sentences that are not an exact
    # match after stripping surrounding whitespace (any single error counts).
    mismatched = sum(
        ref.strip() != hyp.strip() for ref, hyp in zip(label_str, pred_str)
    )
    ser_score = mismatched / len(label_str) * 100

    return {
        "wer": wer_score,
        "cer": cer_score,
        "ter": mer_score,  # value is jiwer's MER
        "ser": ser_score,
    }

In [None]:
# accelerate (>= 0.26.0) and the torch extras of transformers.
# NOTE(review): presumably required by the Trainer used below; transformers is
# already imported earlier in the notebook, so a kernel restart may be needed
# after installing - confirm.
%pip install "accelerate>=0.26.0"
%pip install transformers[torch]

In [None]:
# Load a Pre-Trained Checkpoint
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Pin generation to Tamil transcription, matching the language/task the
# tokenizer was configured with. Without this, generate() (used for
# evaluation via predict_with_generate and for inference) has to guess the
# language. Checkpoint-provided forced decoder ids are cleared so they cannot
# override these settings.
model.generation_config.language = "ta"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration.
# Checkpoints are saved once per epoch, in step with evaluation. The previous
# save_steps=3 wrote a full checkpoint every three optimizer steps.
# load_best_model_at_end makes metric_for_best_model / greater_is_better take
# effect: after training the lowest-WER checkpoint is reloaded, so the model
# saved afterwards is the best one rather than simply the last one.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-base-ta",
    per_device_train_batch_size=48,
    gradient_accumulation_steps=1,
    learning_rate=1.7e-05,
    warmup_steps=500,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    save_total_limit=3,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the pieces built in the previous cells: model,
# arguments, both dataset splits, the padding collator and the metric function.
# The feature extractor is what gets passed through the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)




In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Continue training from an existing checkpoint instead of starting over.
# NOTE(review): presumably only meant to be run after an interrupted run; in a
# top-to-bottom execution it follows the plain trainer.train() call in the
# previous cell - confirm whether both cells should run.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Persist the model and the processor (feature extractor + tokenizer) into the
# same directory so both can later be reloaded from one path.
model.save_pretrained(save_directory="./whisper-tamil-model")
processor.save_pretrained(save_directory="./whisper-tamil-model")

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio
import torch

# Load the fine-tuned model and its processor from the directory written above.
processor = WhisperProcessor.from_pretrained("whisper-tamil-model")
model = WhisperForConditionalGeneration.from_pretrained("whisper-tamil-model")

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load Tamil audio file and bring it to the 16 kHz rate passed to the feature
# extractor below.
waveform, sr = torchaudio.load("male data/tag_00023_00002135809.wav")
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

# Down-mix multi-channel audio to a single channel.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# squeeze(0) removes only the channel axis; a blanket squeeze() would also
# collapse any other axis that happens to have length 1.
input_features = processor.feature_extractor(
    waveform.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt"
).input_features.to(device)

# Generate transcription. Language and task are passed explicitly so decoding
# does not depend on language auto-detection or on what was saved with the
# checkpoint.
predicted_ids = model.generate(input_features, language="ta", task="transcribe")
transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print("🗣️ Transcription:", transcription)