# **Fine-tuning Whisper on the ATCOSIM dataset to understand aviation terminology**

## Initial Setup

In [None]:
# Add the third-party jonathonf/ffmpeg-4 PPA, refresh the package index, and install FFmpeg.
# NOTE(review): presumably FFmpeg is needed for audio decoding — confirm it is not already present in the runtime.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

Repository: 'deb https://ppa.launchpadcontent.net/jonathonf/ffmpeg-4/ubuntu/ jammy main'
Description:
Backport of FFmpeg 4 and associated libraries. Now includes AOM/AV1 support!

FDK AAC is not compatible with GPL and FFmpeg can't be redistributed with it included. Please don't ask for it to be added to this public PPA.

---

PPA supporters:

BigBlueButton (https://bigbluebutton.org)

---

Donate to FFMPEG: https://ffmpeg.org/donations.html
Donate to Debian: https://www.debian.org/donations
Donate to this PPA: https://ko-fi.com/jonathonf
More info: https://launchpad.net/~jonathonf/+archive/ubuntu/ffmpeg-4
Adding repository.
Found existing deb entry in /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding deb entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Found existing deb-src entry in /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding key

In [None]:
# Version specifiers are quoted: unquoted, the shell treats ">" as output
# redirection, so `pip install datasets>=2.6.1` installs an unconstrained
# `datasets` and writes pip's output to a file named "=2.6.1".
!pip install "datasets>=2.6.1"
!pip install librosa
# "0.30" compares greater than every released 0.4.x version, so the minimum is
# written as 0.3.0 to stay satisfiable once the specifier is actually honoured.
!pip install "evaluate>=0.3.0"
!pip install jiwer
!pip install gradio
!pip install -q bitsandbytes datasets accelerate loralib
# transformers is installed once, from main, together with peft (the separate
# duplicate install of the same repository was dropped).
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-thcm61n9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-thcm61n9
  Resolved https://github.com/huggingface/transformers to commit a3d69a8994d673899608a7c17fbf4f953f50474e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from datasets import load_dataset, DatasetDict
import os
from datasets import Audio
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperForConditionalGeneration
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

In [None]:
from huggingface_hub import login

# Never paste the access token into the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset, token=None makes login() prompt for it.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Select CUDA device index
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_name_or_path = "openai/whisper-large-v2"
language = "English"
language_abbr = "en"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_17_0"

## Load Dataset

In [None]:
# Load the first 3000 training and first 1000 test examples of the ATCOSIM corpus.
dataset = DatasetDict({
    split: load_dataset("Jzuluaga/atcosim_corpus", split=f"{split}[:{limit}]")
    for split, limit in (("train", 3000), ("test", 1000))
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Show the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1000
    })
})

In [None]:
# Cast the audio column of every split to a 16 kHz sampling rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Keep only the 'audio' and 'text' columns; the id and timing metadata are not used below.
# NOTE(review): re-running this cell on the already-trimmed dataset will presumably fail
# because the columns are gone — rerun from the load cell instead.
dataset = dataset.remove_columns(
    ['id','segment_start_time', 'segment_end_time', 'duration']
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 1000
    })
})


## Prepare Feature Extractor, Tokenizer and Data

In [None]:
# Feature extractor of the base checkpoint; prepare_dataset uses it to turn audio arrays into input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

In [None]:
# Tokenizer of the base checkpoint, configured with the language and task from the configuration cell.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

In [None]:
# Processor of the base checkpoint; the data collator uses its .feature_extractor and .tokenizer for padding.
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

### Prepare Data

In [None]:
# Peek at one raw training example: audio array, sampling rate, and transcript.
print(dataset["train"][0])

{'audio': {'path': None, 'array': array([-0.00551888, -0.0072564 , -0.00494211, ...,  0.00080846,
       -0.00853999, -0.00141465]), 'sampling_rate': 16000}, 'text': 'psa eight one zero turn right to trasadingen'}


In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Reads the example's ``audio`` (array plus sampling rate) and ``text``,
    then adds two entries to the same mapping and returns it:

    * ``input_features`` - first item of the features computed by the
      module-level ``feature_extractor`` from the audio array.
    * ``labels`` - token ids produced by the module-level ``tokenizer`` for
      the transcript.
    """
    # The audio column was cast to 16 kHz earlier in the notebook; the rate
    # stored with the example is passed through to the feature extractor.
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Transcript -> label ids.
    encoded = tokenizer(batch["text"])
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
# Apply prepare_dataset to every example of both splits in a single process,
# dropping the raw 'text' and 'audio' columns from the result.
dataset = dataset.map(prepare_dataset, remove_columns=["text", "audio"], num_proc=1)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Confirm the processed train split now holds only 'input_features' and 'labels'.
dataset['train']

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 3000
})

In [None]:
# Show both processed splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# A DatasetDict is keyed by split name, so dataset[0] raises KeyError; pick a
# split first. Only the feature length and the label ids are shown, because the
# full input_features array is far too large to display usefully.
first_example = dataset["train"][0]
len(first_example["input_features"]), first_example["labels"]

## Training and Evaluation

### Define a Data Collator

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``.
    Calling the collator with a list of examples (each holding
    ``input_features`` and ``labels``) returns the padded feature batch with a
    ``labels`` tensor added, in which padded positions are set to -100.
    """

    # Object providing .feature_extractor and .tokenizer (both with a .pad method).
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        label_tokenizer = self.processor.tokenizer

        # Audio features and label ids are padded separately by their own padder.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        token_inputs = [{"input_ids": example["labels"]} for example in features]
        padded_labels = label_tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; -100 marks them
        # so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading token only when every sequence starts with the
        # tokenizer's BOS id.
        every_row_starts_with_bos = bool((labels[:, 0] == label_tokenizer.bos_token_id).all())
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance shared by the Trainer and the manual evaluation DataLoader.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metrics

In [None]:
import evaluate

# Load the "wer" metric from the evaluate library; used by compute_metrics and the evaluation loop.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """Compute the WER (as a percentage) for a prediction object.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        ``{"wer": value}`` where value is 100 times the result of the
        module-level ``metric``.

    Note:
        ``pred.label_ids`` is modified in place: every -100 is overwritten
        with the tokenizer's pad token id so the ids can be decoded.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 is not a valid token id; swap it for the pad token before decoding.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # Special tokens are skipped on both sides so only the text is compared.
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}

### Load a Pre-Trained Checkpoint

In [None]:
from transformers import BitsAndBytesConfig

# Load the base checkpoint with 8-bit weights. Passing load_in_8bit directly to
# from_pretrained is deprecated (see the warning in this cell's output); the
# setting now goes through a BitsAndBytesConfig in quantization_config.
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

In [None]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

### Post-processing on the model


In [None]:
# Debugging aid: list the names exported by the installed peft package.
# NOTE(review): not needed for training; consider removing before sharing the notebook.
import peft
dir(peft)


['AdaLoraConfig',
 'AdaLoraModel',
 'AdaptionPromptConfig',
 'AdaptionPromptModel',
 'AutoPeftModel',
 'AutoPeftModelForCausalLM',
 'AutoPeftModelForFeatureExtraction',
 'AutoPeftModelForQuestionAnswering',
 'AutoPeftModelForSeq2SeqLM',
 'AutoPeftModelForSequenceClassification',
 'AutoPeftModelForTokenClassification',
 'BOFTConfig',
 'BOFTModel',
 'BoneConfig',
 'BoneModel',
 'EvaConfig',
 'FourierFTConfig',
 'FourierFTModel',
 'HRAConfig',
 'HRAModel',
 'IA3Config',
 'IA3Model',
 'LNTuningConfig',
 'LNTuningModel',
 'LoHaConfig',
 'LoHaModel',
 'LoKrConfig',
 'LoKrModel',
 'LoftQConfig',
 'LoraConfig',
 'LoraModel',
 'LoraRuntimeConfig',
 'MODEL_TYPE_TO_PEFT_MODEL_MAPPING',
 'MultitaskPromptTuningConfig',
 'MultitaskPromptTuningInit',
 'OFTConfig',
 'OFTModel',
 'PEFT_TYPE_TO_CONFIG_MAPPING',
 'PeftConfig',
 'PeftMixedModel',
 'PeftModel',
 'PeftModelForCausalLM',
 'PeftModelForFeatureExtraction',
 'PeftModelForQuestionAnswering',
 'PeftModelForSeq2SeqLM',
 'PeftModelForSequenceClassi

In [None]:
# Run PEFT's k-bit training preparation on the 8-bit model before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

### Apply LoRA


In [None]:
# LoRA settings: rank-32 adapters (alpha 64, dropout 0.05, no bias terms)
# attached to the modules named q_proj and v_proj.
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters, then report the trainable-parameter share.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


### Define the Training Configuration

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="temp",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=2,
    # `evaluation_strategy` is the deprecated spelling of this argument; this
    # notebook installs transformers from main, where `eval_strategy` is the
    # supported name. Evaluation still runs once per epoch.
    eval_strategy="epoch",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=25,
    # Keep every dataset column and name the label column explicitly.
    remove_unused_columns=False,
    label_names=["labels"],
)



In [None]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    # compute_metrics stays disabled here; WER is computed by the manual
    # evaluation loop later in the notebook.
    # compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Seq2SeqTrainer (it triggers the warning shown
    # in this cell's output); `processing_class=` is its replacement.
    processing_class=processor.feature_extractor,
)
# Turn off use_cache on the model config for training.
model.config.use_cache = False

  trainer = Seq2SeqTrainer(


In [None]:
# Run the fine-tuning; the output reports training and validation loss after each epoch.
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.049,0.244839
2,0.0136,0.226392


  return fn(*args, **kwargs)


TrainOutput(global_step=750, training_loss=0.146608353416125, metrics={'train_runtime': 4865.8504, 'train_samples_per_second': 1.233, 'train_steps_per_second': 0.154, 'total_flos': 1.28749215744e+19, 'train_loss': 0.146608353416125, 'epoch': 2.0})

In [None]:
# Print the adapter configuration(s) attached to the model, keyed by adapter name.
print(model.peft_config)


{'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='openai/whisper-large-v2', revision=None, task_type=None, inference_mode=False, r=32, target_modules={'q_proj', 'v_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))}


In [None]:
# Base checkpoint name (same value as set in the configuration cell).
model_name_or_path = "openai/whisper-large-v2"

# Retrieve the peft_type from the default adapter configuration
peft_type = model.peft_config['default'].peft_type.value

# Construct the model ID. The .replace() applies only to the f-string, so the
# "/" after the user name is kept while the one inside the base model name becomes "-".
peft_model_id = "Sanjana6178/" + f"{model_name_or_path}-{peft_type}-colab_phase2".replace("/", "-")

# Push the model to the hub (the output below shows adapter_model.safetensors being uploaded)
model.push_to_hub(peft_model_id)

# Print the model ID
print(peft_model_id)


adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

Sanjana6178/openai-whisper-large-v2-LORA-colab_phase2


## Evaluation and Inference

In [None]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer

# Fetch the adapter's configuration from the Hub; it records which base checkpoint the adapter belongs to.
peft_model_id = "Sanjana6178/openai-whisper-large-v2-LORA-colab_phase2"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the Whisper model without 8-bit quantization or device_map
# (it is moved to the GPU explicitly in the evaluation cell).
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path
)

# Attach the fine-tuned adapter weights from the specified PEFT model ID to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)


In [None]:
# Confirm the processed test split that the evaluation loop iterates over.
dataset["test"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 1000
})

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc

# Inputs are sent to the GPU below, so the model has to live there as well.
model = model.to("cuda")

eval_dataloader = DataLoader(dataset["test"], batch_size=8, collate_fn=data_collator)

model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    # Decode under CUDA autocast with gradient tracking switched off.
    with torch.amp.autocast("cuda"), torch.no_grad():
        # The first four label tokens of each example are passed as the decoder prompt.
        generated_tokens = model.generate(
            input_features=batch["input_features"].to("cuda"),
            decoder_input_ids=batch["labels"][:, :4].to("cuda"),
            max_new_tokens=255,
        ).cpu().numpy()

        # -100 marks ignored label positions; swap in the pad token id so the ids can be decoded.
        labels = batch["labels"].cpu().numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Release per-batch arrays before the next iteration.
    del generated_tokens, labels, batch
    gc.collect()

# Word error rate over all accumulated batches, as a percentage.
wer = 100 * metric.compute()
print(f"{wer=}")


100%|██████████| 125/125 [13:01<00:00,  6.25s/it]

wer=8.636068764190723





In [None]:
# Re-display the WER computed by the evaluation loop.
print(f"{wer=}")

wer=8.636068764190723


## Using AutomaticSpeechRecognitionPipeline

In [None]:
import torch
import gradio as gr
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    BitsAndBytesConfig,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig


# Adapter repository pushed earlier, plus the language/task used for decoding.
peft_model_id = "Sanjana6178/openai-whisper-large-v2-LORA-colab_phase2"
language = "English"
task = "transcribe"
peft_config = PeftConfig.from_pretrained(peft_model_id)
# Load the base checkpoint named in the adapter config with 8-bit weights.
# load_in_8bit as a direct from_pretrained argument is deprecated, so the
# setting is passed through a BitsAndBytesConfig instead.
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Attach the fine-tuned adapter, then build the pieces the pipeline needs.
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
# Decoder prompt ids for the chosen language and task; passed to generation in transcribe().
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)


def transcribe(audio):
    """Transcribe one audio input with the module-level ASR pipeline.

    Args:
        audio: Input forwarded unchanged to ``pipe`` (the Gradio interface in
            this notebook supplies a file path).

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    # torch.cuda.amp.autocast() is deprecated; torch.amp.autocast("cuda") is the
    # equivalent spelling and matches the evaluation loop earlier in the notebook.
    with torch.amp.autocast("cuda"):
        text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
    return text


# Web demo: record from the microphone, hand the recording's file path to
# transcribe(), and show the returned text.
iface = gr.Interface(
    fn=transcribe,
    # Gradio 4+ replaced the `source` string with a `sources` list; the unpinned
    # `pip install gradio` above installs a release where `source=` is rejected.
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Speech-to-Text Phase 2 result Interface",
    description="Realtime demo for English speech recognition Whisper Large V2 model.",
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)