In [None]:
! pip install datasets transformers peft bitsandbytes gradio accelerate jiwer evaluate librosa update
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt install -y ffmpeg
!apt update
! pip install --upgrade datasets
!pip install git+https://github.com/huggingface/transformers
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
import torch
import peft
import evaluate
import torch.nn as nn
import transformers
import bitsandbytes
from dataclasses import dataclass
from typing import Dict, List, Any, Union
from datasets import Audio, load_dataset, DatasetDict
from transformers import AutoTokenizer, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, default_data_collator,WhisperForConditionalGeneration,BitsAndBytesConfig, Seq2SeqTrainingArguments
from peft import PeftConfig, PeftModel, LoraConfig,LoraModel,get_peft_model, LoraConfig, prepare_model_for_kbit_training


os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_name = "openai/whisper-large-v2"
language = "Marathi"
language_abbr = "mr"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"

In [4]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset(dataset_name, language_abbr, split="train+validation")
common_voice["test"] = load_dataset(dataset_name, language_abbr, split="test")

print(common_voice)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

common_voice_11_0.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 3927
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 1816
    })
})


In [5]:
common_voice = common_voice.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3927
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1816
    })
})


In [6]:
# Prepare feature extractor Tokenizer and Processor
feature_extractor=WhisperFeatureExtractor.from_pretrained(model_name)

tokenizer=WhisperTokenizer.from_pretrained(model_name,language=language,task=task)

processor=WhisperProcessor.from_pretrained(model_name,language=language,task=task)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

## Prepare Data

In [7]:
# Since our input audio is sampled at 48Hz, We need to downsample it to 16Hz as our WhisperModel expect that Freq.
common_voice=common_voice.cast_column('audio',Audio(sampling_rate=16000))

In [8]:
def prepare_dataset(batch):
  # Load resampled audio from 48Hz to 16Hz
  audio=batch['audio']

  # Converts raw audio into log-Mel spectrogram featurs suitable for model input
  batch['input_features']=feature_extractor( audio['array'], sampling_rate=audio['sampling_rate']).input_features[0]

  # Encode target text into label ids
  batch['labels'] = tokenizer(batch['sentence']).input_ids

  return batch

common_voice=common_voice.map(prepare_dataset, remove_columns=common_voice.column_names['train'], num_proc=2)
common_voice['train']

Map (num_proc=2):   0%|          | 0/3927 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1816 [00:00<?, ? examples/s]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 3927
})

In [20]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
  processor:Any

  def __call__(self, features:List[Dict[str, Union[List[int],torch.Tensor]]]) -> Dict[str, torch.Tensor]:
    # Split the inputs and labels since they have to be different length and need different padding methods
    # Will treat the audio input by simply returning tensors
    input_features= [{'input_features':feature['input_features']} for feature in features ]  # --> Get input in sequence format.
    # Pad the input to max lenght
    batch= self.processor.feature_extractor.pad(input_features, return_tensors='pt')

    # Get tokenized label sequence
    label_features=[{'input_ids': feature['labels']} for feature in features]
    # Pad the label to max lenght
    labels_batch=self.processor.tokenizer.pad(label_features,return_tensors='pt')

    # Replace padding to -100 to ignore the optimizer
    labels=labels_batch['input_ids'].masked_fill(labels_batch.attention_mask.ne(1),-100)

    # Remove BOS[Begining of the sentence] here
    if (labels[:,0] ==self.processor.tokenizer.bos_token_id ).all().cpu().item():
      labels =labels[:,1:]
    batch['labels'] =labels

    return batch

In [21]:
data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Evaluation metrices- `WER` [Word Error Rate]

In [22]:
metric=evaluate.load('wer')

def compute_metrics(pred):
  pred_ids=pred.predictions
  label_ids=pred.label_ids

  # Replace -100 to pad_token_id
  label_ids[label_ids == -100]= tokenizer.pad_token_id

  pred_str=tokenizer.batch_decode(pred_ids,skip_special_tokens=True)
  label_str=tokenizer.batch_decode(label_ids,skip_special_tokens=True)

  wer= 100 * metric.compute(prediction=pred_str, references=label_str)

  return {'wer':wer}

In [None]:
## Load Pretrained Checkpoint
quant_config=BitsAndBytesConfig(load_in_8bit=True)

#model=WhisperForConditionalGeneration.from_pretrained(model_name, load_in_8bit=True)
model=WhisperForConditionalGeneration.from_pretrained(model_name, quantization_config=quant_config)


In [24]:
model.config.forced_decoder_ids=None
model.config.suppress_tokens=[]

In [25]:
# Apply LoRA
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)


config=LoraConfig(r=32, lora_alpha=64, target_modules=['q_proj','v_proj'],lora_dropout=0.05,bias='none')

model =get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


In [29]:
# Set Training Args
training_args=Seq2SeqTrainingArguments(
    output_dir='temp',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=1,
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=25,
    remove_unused_columns=False,
    label_names=['labels']

)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
from transformers import Seq2SeqTrainer,TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

class SavePeftModelCallback(TrainerCallback):
  def on_save(
      self,
      args:TrainingArguments,
      state: TrainerState,
      control: TrainerControl,
      **kwargs
  ):
      checkpoint_folder =os.path.join(args.output_dir,f"{PREFIX_CHECKPOINT_DIR}--{state.best_global_step}")
      peft_model_path=os.path.join(checkpoint_folder,'adapter_model')
      kwargs['model'].save_pretrained(peft_model_path)

      pytorch_model_path=os.path.join(checkpoint_folder,'pytorch_model.bin')
      if os.path.exists(pytorch_model_path):
        os.remove(pytorch_model_path)
      return control

trainer=Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice['train'],
    eval_dataset=common_voice['test'],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback]
)
model.config.use_cache

  trainer=Seq2SeqTrainer(
Using auto half precision backend


True

In [32]:
trainer.train()

***** Running training *****
  Num examples = 3,927
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 491
  Number of trainable parameters = 15,728,640


Step,Training Loss
25,0.5096
50,0.4225
75,0.3476
100,0.3411
125,0.3131
150,0.3158
175,0.2855
200,0.303
225,0.2813
250,0.2673


Saving model checkpoint to temp/checkpoint-491
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/config.json
Model config WhisperConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 32,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],

TrainOutput(global_step=491, training_loss=0.2881989760699923, metrics={'train_runtime': 2472.8944, 'train_samples_per_second': 1.588, 'train_steps_per_second': 0.199, 'total_flos': 8.4266361704448e+18, 'train_loss': 0.2881989760699923, 'epoch': 1.0})

In [35]:
model_name='openai/whisper-large-v2'
peft_model_id = "OmBhandwalkar/openai-whisper-large-v2-LORA-colab"
model.push_to_hub(peft_model_id)
print(peft_model_id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/config.json
Model config WhisperConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 32,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": tr

adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

OmBhandwalkar/openai-whisper-large-v2-LORA-colab


## Evaluate

In [36]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer

peft_model_id = "OmBhandwalkar/openai-whisper-large-v2-LORA-colab"
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)
model = PeftModel.from_pretrained(model, peft_model_id)

All model checkpoint weights were used when initializing WhisperForConditionalGeneration.

All the weights of WhisperForConditionalGeneration were initialized from the model checkpoint at openai/whisper-large-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use WhisperForConditionalGeneration for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/generation_config.json
Generate config GenerationConfig {
  "alignment_heads": [
    [
      10,
      12
    ],
    [
      13,
      17
    ],
    [
      16,
      11
    ],
    [
      16,
      12
    ],
    [
      16,
      13
    ],
    [
      17,
      15
    ],
    [
      17,
      16
    ],
    [
      18,
      4
    ],
    [
      18,
      11
    ],
    [
      18,
      19
    ],
    [
      19,
      11
    ],
    

adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc

eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)

model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"].to("cuda"),
                    decoder_input_ids=batch["labels"][:, :4].to("cuda"),
                    max_new_tokens=255,
                )
                .cpu()
                .numpy()
            )
            labels = batch["labels"].cpu().numpy()
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            metric.add_batch(
                predictions=decoded_preds,
                references=decoded_labels,
            )
    del generated_tokens, labels, batch
    gc.collect()
wer = 100 * metric.compute()
print(f"{wer=}")

  with torch.cuda.amp.autocast():
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
 24%|██▍       | 54/227 [22:08<1:03:47, 22.13s/it]

In [None]:
import torch
import gradio as gr
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig

peft_model_id='OmBhandwalkar/openai-whisper-large-v2-LORA-colab'
language='Marathi'
task='transcribe'
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name, load_in_8bit=True, device_map="auto"
)

model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name, language=language, task=task)
feature_extractor = processor.feature_extractor
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)


def transcribe(audio):
  with torch.cuda.amp.autocast():
    text =pipe(audio,generate_kwargs={'forced_decoder_ids':forced_decoder_ids},max_new_tokens=255)['text']
  return text

Intface=gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source='microphone',type='filepath'),
    outputs='text',
    title='PEFT LoRA + INT-8 Whisper Large V2 Marathi',
    description="Realtime demo for Marathi speech recognition using `PEFT-LoRA+INT8` fine-tuned Whisper Large V2 model."
)
Intface.launch(share=True)