In [None]:
# Log in to the Hugging Face Hub interactively; later cells upload to the Hub
# (push_to_hub=True in the training arguments and model.push_to_hub).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install datasets

In [None]:
!pip install jiwer

In [None]:
!pip install evaluate

In [None]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor,WhisperTokenizer,Seq2SeqTrainingArguments,Seq2SeqTrainer
from google.colab import drive
import os
import pandas as pd
import csv
from datasets import Dataset,load_from_disk
import torchaudio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import numpy as np
import evaluate

In [None]:
# Pick the GPU when one is available instead of assuming CUDA exists, so the
# model-loading cell does not fail on a CPU-only runtime.
# NOTE(review): the training arguments later set fp16=True, which presumably
# still needs a CUDA device - confirm before training on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive at /content/drive; every data path below lives under it.
# (`drive` is also imported in the main import cell.)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Drive; the TSV manifests are resolved relative to it.
data_path = '/content/drive/MyDrive/asr_sinhala'

In [None]:
# Tab-separated manifest describing the training utterances.
tsv_file = os.path.join(data_path, 'trainF.tsv')

In [None]:
# Tab-separated manifest describing the test utterances.
test_tsv_file = os.path.join(data_path, 'spontaniousTest.tsv')

In [None]:
# Read the training manifest into a DataFrame with positional columns; later
# cells use column 0 as the audio file stem, 1 as the speaker id and 2 as the text.
# The encoding is pinned to UTF-8 so the Sinhala transcripts decode the same way
# regardless of the platform's default locale; newline='' is what the csv module
# expects so that embedded newlines inside quoted fields are handled correctly.
with open(tsv_file, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    df = pd.DataFrame(reader)
df.head()

In [None]:
# Read the test manifest into a DataFrame with positional columns; later cells
# use column 0 as the audio file stem, 1 as the speaker id and 2 as the text.
# The encoding is pinned to UTF-8 so the Sinhala transcripts decode the same way
# regardless of the platform's default locale; newline='' is what the csv module
# expects so that embedded newlines inside quoted fields are handled correctly.
with open(test_tsv_file, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    df_test = pd.DataFrame(reader)
df_test.head()

In [None]:
# Folder holding the training .flac files. Derived from data_path (like the TSV
# paths) so the Drive root is defined in exactly one place.
audio_data_path = os.path.join(data_path, 'audioDataF')

In [None]:
# Folder holding the test .flac files. Derived from data_path (like the TSV
# paths) so the Drive root is defined in exactly one place.
audio_data_path_test = os.path.join(data_path, 'SponAudio')

In [None]:
# Column-oriented view of the training manifest: full .flac path for each file
# stem in column 0, transcript from column 2, speaker id from column 1.
data_dict = dict(
    audio=[os.path.join(audio_data_path, f"{stem}.flac") for stem in df[0]],
    text=df[2],
    speaker_id=df[1],
)

In [None]:
# Column-oriented view of the test manifest: full .flac path for each file
# stem in column 0, transcript from column 2, speaker id from column 1
# (kept in case it is needed for further processing).
data_dict_test = dict(
    audio=[os.path.join(audio_data_path_test, f"{stem}.flac") for stem in df_test[0]],
    text=df_test[2],
    speaker_id=df_test[1],
)

In [None]:
# Wrap the training columns (audio path, text, speaker_id) in a Hugging Face Dataset.
dataset = Dataset.from_dict(data_dict)

In [None]:
# Wrap the test columns (audio path, text, speaker_id) in a Hugging Face Dataset.
test_dataset=Dataset.from_dict(data_dict_test)

In [None]:
# Feature extractor from the openai/whisper-small checkpoint; preprocess_data
# uses it to turn each waveform into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [None]:
# Tokenizer configured for Sinhala transcription; used to build labels in
# preprocess_data and to decode ids in compute_metrics.
# (WhisperTokenizer is also imported in the main import cell.)
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Sinhala", task="transcribe")

In [None]:
# Combined processor for the same checkpoint; tokenize_transcriptions, the data
# collator and the trainer access its .tokenizer / .feature_extractor.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Sinhala", task="transcribe")

In [None]:
def tokenize_transcriptions(examples):
    """Tokenize the ``'text'`` field with the processor's tokenizer.

    Args:
        examples: Mapping that contains the transcript(s) under ``'text'``.

    Returns:
        The tokenizer's output for that text, requested with padding and
        truncation enabled.
    """
    transcript = examples['text']
    encoded = processor.tokenizer(transcript, padding=True, truncation=True)
    return encoded

In [None]:
# Add the tokenizer outputs as extra columns on the training set.
# NOTE(review): preprocess_data tokenizes batch["text"] again to build the labels,
# so these extra columns do not appear to be read later - confirm they are needed.
tokenized_dataset = dataset.map(tokenize_transcriptions)

In [None]:
# Add the tokenizer outputs as extra columns on the test set.
# NOTE(review): preprocess_data tokenizes batch["text"] again to build the labels,
# so these extra columns do not appear to be read later - confirm they are needed.
tokenized_testset = test_dataset.map(tokenize_transcriptions)

In [None]:
def preprocess_data(batch):
    """Convert one example into Whisper input features and label token ids.

    The audio file is loaded, mixed down to a single channel and resampled to
    the feature extractor's sampling rate before feature extraction, so files
    recorded in stereo or at a different rate are handled instead of being
    passed through with a mismatched rate or shape.

    Args:
        batch: A single example with an ``'audio'`` file path and a ``'text'``
            transcript.

    Returns:
        dict with ``'input_features'`` (features for the utterance) and
        ``'labels'`` (token ids of the transcript).
    """
    waveform, sample_rate = torchaudio.load(batch['audio'])

    # torchaudio.load yields (channels, frames); average the channels so the
    # extractor always receives one mono signal rather than a 2-D array.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Bring the audio to the rate the feature extractor was configured for.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
        sample_rate = target_rate

    input_features = feature_extractor(
        waveform.squeeze(0).numpy(), sampling_rate=sample_rate
    ).input_features[0]
    labels = tokenizer(batch["text"]).input_ids

    return {"input_features": input_features, "labels": labels}

In [None]:
# Compute input features and labels for every training example, dropping the
# "audio" and "speaker_id" columns; the text and tokenizer columns are kept.
processed_dataset = tokenized_dataset.map(preprocess_data, remove_columns=["audio", "speaker_id"])

In [None]:
# Persist the preprocessed training set to Drive so it can be reloaded later.
# The location is built from data_path so it uses the same Drive root spelling
# ("MyDrive") as every other path in this notebook, rather than "My Drive".
save_path = os.path.join(data_path, 'ProcessedData', 'trained_dataset_f')
processed_dataset.save_to_disk(save_path)

In [None]:
# Compute input features and labels for every test example, dropping only "audio".
# NOTE(review): the training set also drops "speaker_id"; confirm whether keeping
# it here is intentional.
processed_test_dataset = tokenized_testset.map(preprocess_data, remove_columns=["audio"])

In [None]:
# Persist the preprocessed test set to Drive so it can be reloaded later.
# The location is built from data_path so it uses the same Drive root spelling
# ("MyDrive") as every other path in this notebook, rather than "My Drive".
save_path_test = os.path.join(data_path, 'ProcessedData', 'test_dataset_s')
processed_test_dataset.save_to_disk(save_path_test)

In [None]:
# Reload both preprocessed datasets from the locations they were saved to, so
# the training cells work from the on-disk copies.
processed_dataset = load_from_disk(save_path)
processed_test_dataset = load_from_disk(save_path_test)

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad``; both are called with ``return_tensors="pt"``.
        decoder_start_token_id: Token id that is stripped from the front of
            the labels when every label sequence in the batch begins with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad inputs and labels for a list of examples.

        Only examples that carry both ``"input_features"`` and ``"labels"`` are
        used, so row i of the inputs always corresponds to row i of the labels.
        Incomplete examples are counted and reported in a single short warning.

        Args:
            features: Examples, each expected to hold ``"input_features"`` and
                ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in which
            padded positions are replaced by -100.

        Raises:
            ValueError: If no example contains both required keys.
        """
        usable = [
            feature for feature in features
            if "input_features" in feature and "labels" in feature
        ]
        skipped = len(features) - len(usable)
        if skipped:
            # Report a count rather than dumping whole examples to the output.
            print(
                f"Warning: skipped {skipped} of {len(features)} example(s) "
                "missing 'input_features' or 'labels'"
            )
        if not usable:
            raise ValueError("No examples with both 'input_features' and 'labels' found in the features.")

        batch = self.processor.feature_extractor.pad(
            [{"input_features": feature["input_features"]} for feature in usable],
            return_tensors="pt",
        )
        labels_batch = self.processor.tokenizer.pad(
            [{"input_ids": feature["labels"]} for feature in usable],
            return_tensors="pt",
        )

        # Positions where the attention mask is not 1 are padding; mark them -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Drop the leading start token when every sequence in the batch has it.
        if labels.size(1) > 0 and (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# Load the pretrained openai/whisper-small weights and move the model to `device`.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

In [None]:
# Collator that pads inputs/labels per batch; it strips the model's decoder
# start token from the labels when every sequence in the batch begins with it.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
# "wer" metric loaded through the `evaluate` library; used by compute_metrics.
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a prediction output.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions).

    Returns:
        dict with a single ``"wer"`` entry: 100 times the metric's value.
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some outputs bundle extra arrays; the token ids come first.
        pred_ids = pred_ids[0]

    # -100 is not a valid token id and cannot be decoded. Replace it with the
    # pad id in fresh arrays (np.where) so the arrays held by `pred` are not
    # modified in place. Predictions are guarded as well because they may be
    # padded with -100 when batches of different lengths are concatenated.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# Fine-tuning configuration, grouped by concern.
# NOTE(review): no evaluation or checkpoint-selection schedule is set in this
# call, so confirm whether metric_for_best_model has any effect during training.
training_args = Seq2SeqTrainingArguments(
    # Output location, logging and Hub upload
    output_dir="./whisper-small-specaugment-sinhala",
    report_to=["tensorboard"],
    logging_steps=100,
    push_to_hub=True,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    # Batch sizing: 16 examples per device, accumulated over 8 steps
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=32,
    # Memory / speed trade-offs
    fp16=True,
    gradient_checkpointing=True,
    # Evaluation: score generated text with WER, where lower is better
    predict_with_generate=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
# Assemble the trainer: model and arguments first, then data, then the
# batching / scoring helpers. The feature extractor is what gets passed in
# the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    eval_dataset=processed_test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cells.
trainer.train()

In [None]:
# Upload the model to the named Hub repository.
# NOTE(review): only `model` is pushed here - confirm whether the processor /
# tokenizer files should be uploaded to the same repository as well.
model.push_to_hub("RRashmini/whisper-small-sinhala")

In [None]:
# Evaluate on the preprocessed test set; compute_metrics reports WER as a percentage.
trainer.evaluate(eval_dataset=processed_test_dataset)