In [1]:
%%capture
!pip install --upgrade pip
!pip install jiwer
!pip install evaluate
!pip install tensorboard
!pip install datasets
!pip install --upgrade transformers
!pip install --upgrade torch
!pip install --upgrade torchvision
!pip install --upgrade torchaudio
!pip install librosa
!pip install numpy==2.1.0
!pip install scipy==1.11.4
!pip install librosa==0.10.1
!pip install numba==0.58.1
!pip install datasets>=2.14.0
!pip install accelerate>=0.26.0
!pip install typing_extensions --upgrade

In [2]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable so the secret is
# never stored in the notebook. When the variable is unset, token is None and
# login() prompts for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Load the dataset from the Hugging Face Hub and pick out the two splits used below.
from datasets import load_dataset, concatenate_datasets, Audio

ds = load_dataset("kaarthu2003/SlrCvVoicesTtsDataset")
train_data = ds["train"]
# The "validation" split is what this notebook refers to as test_data.
test_data = ds["validation"]

In [None]:
def has_few_words(example):
    """Return True when the example's "sentence" is shorter than 35 characters.

    Despite the name, the length is measured in characters (``len`` of the
    string), not in words.
    """
    sentence_length = len(example["sentence"])
    return sentence_length < 35

# Keep only the examples accepted by has_few_words; test_data itself is left unfiltered here.
filtered_test_data = test_data.filter(has_few_words)

In [None]:
# Print the 30 longest sentences in test_data.
# Only the "sentence" column is read: sorting whole rows would load every other
# column of each example too (presumably including the audio) just to measure
# text length. sorted() is stable, so the order matches sorting the full rows.
longest_sentences = sorted(test_data["sentence"], key=len, reverse=True)[:30]
print("30 longest sentences:")
for rank, sentence in enumerate(longest_sentences, start=1):
    print(f"{rank}: ({len(sentence)} chars) {sentence}")


In [None]:
# Report how many examples are in the training split and in the filtered validation split.
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(filtered_test_data)}")

# Show the first training example to check which fields are present.
print("\nSample example:")
print(train_data[0])

In [None]:
# From here on, test_data refers to the length-filtered validation split.
test_data = filtered_test_data

In [5]:
# Characters stripped from every transcript before tokenization.
# Character names follow the Unicode Telugu block; confirm any entry with
# unicodedata.name(ch) before relying on the description.
telugu_special_unwanted_characters = [
    'ౄ',  # Vowel sign vocalic RR
    'ౢ',  # Vowel sign vocalic L
    'ౣ',  # Vowel sign vocalic LL
    'ౠ',  # Letter vocalic RR (long)
    'ఽ',  # Avagraha
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',  # Telugu digits
    'ఀ',  # Telugu Sign Combining Candrabindu Above
    'ౘ',  # Letter TSA
    'ౙ',  # Letter DZA
    'ౚ',  # Letter RRRA
    '౷',  # Sign Siddham
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',  # Common punctuation
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c' # Latin letters and zero-width non-joiner, unwanted in the dataset
]

In [6]:
import re

# Character class matching any single unwanted character. re.escape keeps
# entries such as '-', '\\' and '.' literal inside the brackets.
chars_to_remove_regex = f'[{re.escape("".join(telugu_special_unwanted_characters))}]'

def remove_special_characters(batch):
    """Strip the unwanted characters from one example's transcript.

    The cleaned text is written to ``batch["sentence"]``. The raw text is read
    from ``batch["transcription"]`` when that column exists; otherwise it falls
    back to ``batch["sentence"]``, the column the rest of this notebook uses,
    instead of raising ``KeyError``.

    Args:
        batch: a single dataset example (mapping of column name to value).

    Returns:
        The same example with ``"sentence"`` replaced by the cleaned text.
    """
    source_column = "transcription" if "transcription" in batch else "sentence"
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch[source_column])
    return batch

In [None]:
# Apply the character clean-up to every example of both splits.
train_data = train_data.map(remove_special_characters)
test_data = test_data.map(remove_special_characters)

In [8]:
# Name used both as the training output directory and as the Hub repository for uploads.
repo_name = "whisper-IEEEAccess-FinalRun-4Datasets"

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor from the pretrained whisper-small checkpoint; prepare_dataset uses it to build input_features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [None]:
from transformers import WhisperTokenizer

# Tokenizer from the same checkpoint, configured for Telugu transcription.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Telugu", task="transcribe")

In [None]:
# Upload the tokenizer files to the Hub repository named by repo_name.
tokenizer.push_to_hub(repo_name)

Ensuring the tokenizer is working correctly

In [None]:
# Round-trip check: encode the first training sentence to token ids, decode it
# back with and without special tokens, and compare with the original text.
input_str = train_data[0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
# True means the tokenizer reproduces the sentence exactly.
print(f"Are equal:             {input_str == decoded_str}")

In [13]:
from transformers import WhisperProcessor

# Processor exposing .feature_extractor and .tokenizer; the data collator below uses both.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Telugu", task="transcribe")

In [None]:
# Show the first training example before the audio column is re-cast.
print(train_data[0])

In [15]:
from datasets import Audio

# Declare the "audio" column as 16 kHz audio for both splits.
train_data = train_data.cast_column("audio", Audio(sampling_rate=16000))
test_data = test_data.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Show the first training example again, now that the audio column has been cast.
print(train_data[0])

In [17]:
def prepare_dataset(batch):
    """Add model inputs and label ids to a single example.

    Reads the example's "audio" and "sentence" fields, stores the feature
    extractor's output under "input_features" and the tokenized sentence under
    "labels", and returns the same example.
    """
    # Reading the "audio" field yields the waveform and its sampling rate.
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]

    # Log-Mel input features for this single waveform.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Target text encoded as label ids.
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [None]:
# Run prepare_dataset over both splits and drop all original columns, leaving
# only what prepare_dataset adds ("input_features" and "labels").
train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names)
test_data = test_data.map(prepare_dataset, remove_columns=test_data.column_names)

In [None]:
from transformers import WhisperForConditionalGeneration

# Start fine-tuning from the pretrained whisper-small checkpoint.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [None]:
# Set the generation language and task to Telugu transcription.
model.generation_config.language = "telugu"
model.generation_config.task = "transcribe"

# Clear any forced decoder ids stored in the generation config.
model.generation_config.forced_decoder_ids = None

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a batch, padding audio features and labels separately.

    Inputs and labels have different lengths and need different padding, so the
    feature extractor pads "input_features" and the tokenizer pads "labels".
    """

    # Object exposing .feature_extractor.pad(...) and .tokenizer.pad(...).
    processor: Any
    # Token id that is stripped from the front of the labels when every row starts with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one padded batch from a list of examples.

        Args:
            features: examples, each with "input_features" and "labels".

        Returns:
            The padded feature batch with an added "labels" tensor in which
            padded positions are set to -100.
        """
        audio_items = [{"input_features": item["input_features"]} for item in features]
        text_items = [{"input_ids": item["labels"]} for item in features]

        # Audio side: pad and return torch tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        # Text side: pad the label sequences to the longest one in the batch.
        padded_labels = self.processor.tokenizer.pad(text_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; -100 excludes them from the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If tokenization already put the start token first in every row, drop it,
        # since it is appended again later anyway.
        starts_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Build the collator with the processor and the model's decoder start token id.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate

# Word error rate and character error rate, both used by compute_metrics.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [None]:
def compute_metrics(pred):
    """Decode predictions and labels, then score them with WER and CER.

    Args:
        pred: object with ``predictions`` and ``label_ids`` token-id arrays.

    Returns:
        A dict with the keys "wer" and "cer".
    """
    label_ids = pred.label_ids
    # -100 marks ignored label positions; swap in the pad token id (in place)
    # so the ids can be decoded.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode both sides the same way, dropping special tokens.
    hypotheses, targets = (
        tokenizer.batch_decode(ids, skip_special_tokens=True)
        for ids in (pred.predictions, label_ids)
    )

    scores = {}
    for name, scorer in (("wer", wer_metric), ("cer", cer_metric)):
        scores[name] = scorer.compute(predictions=hypotheses, references=targets)
    return scores

In [None]:
import os
# Configure the PyTorch CUDA allocator through its environment variable.
# NOTE(review): presumably this has to be set before the first CUDA allocation
# to take effect — confirm it runs early enough in a fresh kernel.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import warnings
import torch

# Silence FutureWarning messages from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Print the CUDA memory summary before training.
# NOTE(review): presumably needs a CUDA device — confirm behavior on CPU-only machines.
print(torch.cuda.memory_summary())

In [None]:
from transformers import Seq2SeqTrainingArguments

# These are the best configurations found. Train with them for reproducibility;
# feel free to tweak them to explore other results.

training_args = Seq2SeqTrainingArguments(
    output_dir=repo_name,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    num_train_epochs=30,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    eval_steps=3000,
    logging_steps=300,
    save_steps=3000,  # same interval as eval_steps, as load_best_model_at_end requires
    save_total_limit=3,
    # Evaluate with generate() so compute_metrics receives token ids it can
    # decode. Without this flag evaluation passes raw logits instead and
    # generation_max_length below has no effect.
    predict_with_generate=True,
    generation_max_length=60,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
)


In [None]:
from transformers import Seq2SeqTrainer

# `processing_class` replaces the deprecated `tokenizer` argument. This notebook
# installs the latest transformers release, where the old name is no longer
# accepted.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)


In [None]:
import torch
# Ask PyTorch to release its cached CUDA memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Upload the trainer's model to the Hub.
trainer.push_to_hub()

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Reload the model and processor from the Hub repository for the stand-alone
# evaluation below. This rebinds `model` and `processor`.
# NOTE(review): the repository id is hard-coded to one user's namespace — change it to your own.
model = WhisperForConditionalGeneration.from_pretrained("kaarthu2003/whisper-IEEEAccess-FinalRun-4Datasets")
processor = WhisperProcessor.from_pretrained("kaarthu2003/whisper-IEEEAccess-FinalRun-4Datasets")

In [None]:
# Show the first example of test_data.
print(test_data[0])

In [None]:
import evaluate
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load both metrics here so this cell does not depend on metric objects created
# in earlier cells (the original referenced an undefined name `metric`).
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Decoded model outputs and decoded ground-truth texts, in dataset order.
predictions = []
references = []

# Transcribe the test data one example at a time.
for example in test_data:
    input_features = example["input_features"]
    label = example["labels"]

    # Add a batch dimension and move the features to the model's device.
    input_features = torch.tensor(input_features).unsqueeze(0).to(device)

    # Generate token ids and decode them to text.
    predicted_ids = model.generate(input_features)
    predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # A lone integer label is wrapped in a list so it can be decoded like a sequence.
    if isinstance(label, int):
        label = [label]

    # Decode the label ids to get the reference text.
    reference_text = processor.batch_decode([label], skip_special_tokens=True)[0]

    predictions.append(predicted_text)
    references.append(reference_text)

# Score the whole test set at once.
wer = wer_metric.compute(predictions=predictions, references=references)
print(f"Word Error Rate (WER): {wer:.2f}")
cer = cer_metric.compute(predictions=predictions, references=references)
print(f"Character Error Rate (CER): {cer:.2f}")