# **Initial Training of Whisper Model on Mozilla Common Voice for accent recognition**

## **Initial** Setup

In [None]:
# Register the jonathonf/ffmpeg-4 PPA, refresh the package index, then install ffmpeg.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

Repository: 'deb https://ppa.launchpadcontent.net/jonathonf/ffmpeg-4/ubuntu/ jammy main'
Description:
Backport of FFmpeg 4 and associated libraries. Now includes AOM/AV1 support!

FDK AAC is not compatible with GPL and FFmpeg can't be redistributed with it included. Please don't ask for it to be added to this public PPA.

---

PPA supporters:

BigBlueButton (https://bigbluebutton.org)

---

Donate to FFMPEG: https://ffmpeg.org/donations.html
Donate to Debian: https://www.debian.org/donations
Donate to this PPA: https://ko-fi.com/jonathonf
More info: https://launchpad.net/~jonathonf/+archive/ubuntu/ffmpeg-4
Adding repository.
Adding deb entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding key to /etc/apt/trusted.gpg.d/jonathonf-ubuntu-ffmpeg-4.gpg with fingerprint 4AB0F789CBA31744CC7DA76A8CF63AD3F06FC659
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ In

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-qdbxhmex
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-qdbxhmex
  Resolved https://github.com/huggingface/transformers to commit 1d063793318b20654ebb850f48f43e0a247ab7bb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.21,>=0.20 (from transformers==4.47.0.dev0)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: transformers
  

In [None]:
import os
from datasets import Dataset, DatasetDict, Audio
import pandas as pd
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from datasets import Audio

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

from transformers import Seq2SeqTrainingArguments
from transformers import WhisperForConditionalGeneration
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

In [None]:
from huggingface_hub import login

# Do not paste the access token into the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset or empty, None is passed so that
# login() asks for the token interactively instead.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Select CUDA device index 0 through the environment.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# Base checkpoint and dataset identifier.
model_name_or_path, dataset_name = "openai/whisper-large-v2", "mozilla-foundation/common_voice_17_0"

# Target language (full name and abbreviation) and task.
language, language_abbr, task = "English", "en", "transcribe"

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the dataset folder on the mounted Drive the current working directory.
os.chdir('/content/drive/My Drive/dataset_research')

In [None]:
# One-time step, kept commented out: extract the compressed dataset archive.
# import tarfile

# # Filename of the dataset archive inside the current directory
# dataset_tar = 'cv-corpus-17.0-delta-2024-03-15-en.tar.gz'

# # Extract the archive into the current directory
# with tarfile.open(dataset_tar, 'r:gz') as tar:
#     tar.extractall()

# Verify the extraction by listing the current directory
!ls  # This will list the files extracted from the tar.gz file


cv-corpus-17.0-delta-2024-03-15		   cv-corpus-19.0-delta-2024-09-13  wandb
cv-corpus-17.0-delta-2024-03-15-en.tar.gz  temp


## **Load Dataset**

In [None]:

# Folder holding the audio clips and the TSV table describing the validated clips.
clips_folder = '/content/drive/My Drive/dataset_research/cv-corpus-17.0-delta-2024-03-15/en/clips/'
metadata_file = '/content/drive/My Drive/dataset_research/cv-corpus-17.0-delta-2024-03-15/en/validated.tsv'

# Upper bound on the number of rows used; fewer are kept when the table is smaller.
max_samples = 10000
# Fixed seed so the train/test partition is the same on every run.
split_seed = 42

# Load the metadata and keep at most `max_samples` rows. The copy makes the
# column assignment below operate on an independent frame rather than a slice.
metadata = pd.read_csv(metadata_file, sep='\t').head(max_samples).copy()

# Store the full path of each clip in the 'audio' column.
metadata['audio'] = metadata['path'].apply(lambda x: os.path.join(clips_folder, x))

# Convert to a Hugging Face Dataset
dataset = Dataset.from_pandas(metadata)

# Split into train and test (80-20 split), seeded for reproducibility
train_test_split = dataset.train_test_split(test_size=0.2, seed=split_seed)

# Create a DatasetDict
common_voice = DatasetDict({
    "train": train_test_split['train'],
    "test": train_test_split['test']
})

# Cast the 'audio' column to the Audio feature type
common_voice = common_voice.cast_column("audio", Audio())

# Verify the DatasetDict structure
print(common_voice)


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio'],
        num_rows: 1501
    })
    test: Dataset({
        features: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment', 'audio'],
        num_rows: 376
    })
})


In [None]:
# Metadata columns that are not needed for training; after removing them only
# 'sentence' and 'audio' remain.
unused_columns = [
    "accents", "age", "client_id", "down_votes", "gender", "locale", "path",
    "segment", "up_votes", 'sentence_id', 'sentence_domain', 'variant',
]
common_voice = common_voice.remove_columns(unused_columns)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 1501
    })
    test: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 376
    })
})


## **Prepare Feature Extractor, Tokenizer and Data**

In [None]:
# Load the Whisper feature extractor published with the `model_name_or_path` checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
# Load the tokenizer for the same checkpoint, configured with the target language and task.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [None]:
# Load the processor for the same checkpoint; the data collator below uses its
# `feature_extractor` and `tokenizer` attributes for padding.
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

### Prepare Data

In [None]:
# Inspect the first training example before the audio column is re-cast to 16 kHz.
print(common_voice["train"][0])

{'sentence': 'He embarks on a sailboat for a tour of the world.', 'audio': {'path': '/content/drive/My Drive/dataset_research/cv-corpus-17.0-delta-2024-03-15/en/clips/common_voice_en_40057521.mp3', 'array': array([ 0.00000000e+00, -1.86986277e-13,  8.01778349e-13, ...,
       -4.56719533e-08, -1.25622307e-06, -8.75218007e-07]), 'sampling_rate': 32000}}


In [None]:
# Re-cast the audio column with a 16 kHz sampling rate.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

Re-loading the first audio sample in the Common Voice dataset will resample
it to the desired sampling rate:

In [None]:
# Inspect the first training example again to check its 'sampling_rate' after the cast.
print(common_voice["train"][0])

{'sentence': 'Will you, for instance, join the glad throng?', 'audio': {'path': '/content/drive/My Drive/dataset_research/cv-corpus-17.0-delta-2024-03-15/en/clips/common_voice_en_39666160.mp3', 'array': array([ 8.73114914e-11,  1.45519152e-11,  6.54836185e-11, ...,
        7.81213885e-08,  2.51861820e-08, -1.00890247e-08]), 'sampling_rate': 16000}}


In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs.

    Adds ``input_features`` (the first entry of the feature extractor's output
    for the example's audio array) and ``labels`` (the tokenizer's ids for
    ``sentence``) to ``batch`` and returns it. Relies on the notebook-level
    ``feature_extractor`` and ``tokenizer``.
    """
    clip = batch["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    # log-Mel input features for this clip; [0] picks the single example's features
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # token ids of the target transcript
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
# Apply prepare_dataset to every example and drop the original columns, so only
# the fields it adds ('input_features', 'labels') remain.
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1)

Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

Map:   0%|          | 0/376 [00:00<?, ? examples/s]

In [None]:
# Show the features and row count of the processed training split.
common_voice["train"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 1501
})

## **Training and Evaluation**

### Define a Data Collator

In [None]:

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
        decoder_start_token_id: Optional id of the token to strip from the front
            of the labels when every sequence in the batch starts with it. When
            ``None`` (the default) the tokenizer's ``bos_token_id`` is used,
            which is the previous behaviour.

    NOTE(review): for Whisper the tokenizer's ``bos_token_id`` is expected to
    differ from ``model.config.decoder_start_token_id``, in which case the default
    never strips anything - confirm before relying on it. Passing
    ``decoder_start_token_id`` shifts the labels by one position, so code that
    slices a fixed prompt off the labels (e.g. ``labels[:, :4]``) must be
    adjusted accordingly.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch.

        Args:
            features: Examples, each with ``input_features`` and ``labels``.

        Returns:
            The padded feature batch with an added ``labels`` tensor in which
            padded positions are set to -100.
        """
        # Inputs and labels have different lengths and need different padding
        # methods, so they are padded separately.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so the loss ignores those positions.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # If every sequence begins with the start token, cut it here because it
        # is added again later.
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

Let's initialise the data collator we've just defined:

In [None]:
# Collator instance used both by the trainer and by the evaluation DataLoader.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [None]:
import evaluate

# Load the "wer" (word error rate) metric; it is reused by the evaluation loop.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """Return the word error rate, as a percentage, for a set of predictions.

    Args:
        pred: Object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, using -100 for ignored
            positions). ``label_ids`` is modified in place.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the metric result.
    """
    references = pred.label_ids

    # swap the -100 marker for the pad token id (in place) before decoding
    references[references == -100] = tokenizer.pad_token_id

    # decode predictions first, then references, dropping special tokens
    pred_str, label_str = (
        tokenizer.batch_decode(ids, skip_special_tokens=True)
        for ids in (pred.predictions, references)
    )

    return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}

### Load a Pre-Trained Checkpoint

In [None]:
# `load_in_8bit=` on from_pretrained is deprecated; the 8-bit setting is passed
# through a BitsAndBytesConfig in `quantization_config` instead.
from transformers import BitsAndBytesConfig

model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

In [None]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

### Post-processing on the model

Finally, we need to apply some post-processing to the 8-bit model to enable training: we freeze all the layers and cast the layer norms to `float32` for stability. We also cast the output of the last layer to `float32` for the same reason.

In [None]:
# Exploratory only: list the names available in the peft package.
import peft
dir(peft)


In [None]:
# Apply PEFT's k-bit training preparation to the quantized model (the
# post-processing described in the text above).
model = prepare_model_for_kbit_training(model)

### Apply LoRA


In [None]:
# LoRA settings: rank 32, alpha 64, dropout 0.05, no bias terms, applied to the
# modules named q_proj and v_proj.
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the model with the adapters and report the trainable parameter count.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


### Define the Training Configuration

In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir="temp",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=3,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` spelling.
    eval_strategy="epoch",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=25,
    # NOTE(review): presumably needed because the PEFT wrapper hides the base
    # model's forward signature from the Trainer - confirm.
    remove_unused_columns=False,
    label_names=["labels"],
)



In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    # `processing_class=` replaces the deprecated `tokenizer=` keyword.
    processing_class=processor.feature_extractor,
)
# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

  trainer = Seq2SeqTrainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.209,0.198679
2,0.0913,0.182877
3,0.0272,0.193227


  return fn(*args, **kwargs)


TrainOutput(global_step=564, training_loss=0.21208929064426016, metrics={'train_runtime': 3596.8884, 'train_samples_per_second': 1.252, 'train_steps_per_second': 0.157, 'total_flos': 9.6626286415872e+18, 'train_loss': 0.21208929064426016, 'epoch': 3.0})

In [None]:
# Show the adapter configuration(s) attached to the PEFT model, keyed by adapter name.
print(model.peft_config)


{'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='openai/whisper-large-v2', revision=None, task_type=None, inference_mode=False, r=32, target_modules={'v_proj', 'q_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))}


In [None]:
model_name_or_path = "openai/whisper-large-v2"

# Adapter type name, read from the default adapter's configuration
peft_type = model.peft_config['default'].peft_type.value

# Repository id: the user prefix followed by "<base model>-<peft type>-colab",
# with every "/" in that suffix replaced by "-"
repo_suffix = f"{model_name_or_path}-{peft_type}-colab".replace("/", "-")
peft_model_id = "Sanjana6178/" + repo_suffix

# Upload the adapter to the Hub, then print the repository id
model.push_to_hub(peft_model_id)
print(peft_model_id)


adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

Sanjana6178/openai-whisper-large-v2-LORA-colab


# **Evaluation and Inference**

In [None]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer

# Adapter repository on the Hub and the configuration stored with it
peft_model_id = "Sanjana6178/openai-whisper-large-v2-LORA-colab"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Base Whisper checkpoint named in the adapter config, loaded with default
# settings (no 8-bit quantization, no device_map)
base_model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path)

# Attach the adapter weights from the PEFT repository to the base model
model = PeftModel.from_pretrained(base_model, peft_model_id)


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc

# Set up DataLoader
eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)

# Run generation on the GPU when one is available; on CPU a single batch of
# this model takes many minutes. Inputs are moved to the same device below.
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(eval_device)

# Set the model to evaluation mode
model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.no_grad():
        generated_tokens = (
            model.generate(
                input_features=batch["input_features"].to(eval_device),
                # the first four label ids are used as the decoder prompt
                decoder_input_ids=batch["labels"][:, :4].to(eval_device),
                max_new_tokens=255,
            )
            .cpu()
            .numpy()
        )

        # Labels stay on the CPU; -100 is replaced by the pad token id before decoding
        labels = batch["labels"].numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Update metric with decoded predictions and labels
        metric.add_batch(
            predictions=decoded_preds,
            references=decoded_labels,
        )

    # Free up memory
    del generated_tokens, labels, batch
    gc.collect()

# Compute the Word Error Rate (WER)
wer = 100 * metric.compute()
print(f"{wer=}")


  0%|          | 0/47 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
 13%|█▎        | 6/47 [2:10:26<15:02:01, 1320.03s/it]

## Inference Using AutomaticSpeechRecognitionPipeline


In [None]:
import torch
import gradio as gr
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig


# `load_in_8bit=` on from_pretrained is deprecated; the 8-bit setting is passed
# through a BitsAndBytesConfig in `quantization_config` instead.
from transformers import BitsAndBytesConfig

peft_model_id = "Sanjana6178/openai-whisper-large-v2-LORA-colab"
language = "English"
task = "transcribe"

# Base checkpoint named in the adapter config, loaded in 8-bit
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Attach the adapter, then load the matching tokenizer and processor
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

# Decoder prompt ids for the chosen language and task, passed to generation in transcribe()
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)


def transcribe(audio):
    """Transcribe one audio clip with the notebook-level ``pipe``.

    Args:
        audio: Path of the audio file to transcribe (the interface below is
            configured with ``type="filepath"``), or ``None`` when no audio
            was provided.

    Returns:
        str: The ``"text"`` field of the pipeline result, or an empty string
        when ``audio`` is ``None``.
    """
    # Nothing to transcribe: skip the pipeline call instead of failing inside it.
    if audio is None:
        return ""
    # torch.autocast(device_type="cuda") replaces the deprecated torch.cuda.amp.autocast().
    with torch.autocast(device_type="cuda"):
        text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
    return text


iface = gr.Interface(
    fn=transcribe,
    # Gradio 4+ takes a list in `sources`; the singular `source=` keyword is no
    # longer accepted.
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Speech-to-Text Phase 1 result Interface",
    description="Realtime demo for English speech recognition Whisper Large V2 model.",
)

# share=True additionally requests a public share link for the demo.
iface.launch(share=True)