In [3]:
! pip install datasets transformers peft bitsandbytes gradio accelerate jiwer evaluate librosa update
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt install -y ffmpeg
!apt update
!pip install git+https://github.com/huggingface/transformers
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [1]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget so a token can be entered
# interactively instead of being hard-coded in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
import os
import torch
import peft
import evaluate
import torch.nn as nn
import transformers
import bitsandbytes
from dataclasses import dataclass
from typing import Dict, List, Any, Union
from datasets import Audio, load_dataset, DatasetDict
from transformers import AutoTokenizer, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, default_data_collator,WhisperForConditionalGeneration,BitsAndBytesConfig, Seq2SeqTrainingArguments
from peft import PeftConfig, PeftModel, LoraConfig,LoraModel,get_peft_model, LoraConfig, prepare_model_for_kbit_training


# Notebook-wide configuration; every later cell reads these names.
# Expose only CUDA device "0" to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Hub id of the pretrained Whisper checkpoint to fine-tune.
model_name = "openai/whisper-large-v2"
# Language name passed to the Whisper tokenizer/processor.
language = "Marathi"
# Dataset configuration name used to pick the language subset.
language_abbr = "mr"
# Whisper task passed to the tokenizer/processor.
task = "transcribe"
# Hub id of the speech dataset.
dataset_name = "mozilla-foundation/common_voice_11_0"

In [6]:
# Load the selected language subset of the dataset.
# The official train and validation splits are merged for training;
# the test split is kept aside for evaluation.
common_voice = DatasetDict(
    {
        "train": load_dataset(dataset_name, language_abbr, split="train+validation"),
        "test": load_dataset(dataset_name, language_abbr, split="test"),
    }
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 3927
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 1816
    })
})


In [7]:
# Only the audio and its transcript ("sentence") are needed from here on;
# drop every other metadata column from both splits.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3927
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1816
    })
})


In [8]:
# Load the three preprocessing objects from the same pretrained checkpoint:
#   - feature_extractor: raw audio -> model input features
#   - tokenizer:         transcript text <-> token ids (language/task aware)
#   - processor:         wrapper exposing both of the above
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

tokenizer = WhisperTokenizer.from_pretrained(
    model_name,
    language=language,
    task=task,
)

processor = WhisperProcessor.from_pretrained(
    model_name,
    language=language,
    task=task,
)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

## Prepare Data

In [9]:
# Re-declare the audio column with a 16 kHz sampling rate so that decoded
# audio is resampled to 16,000 samples per second.
# NOTE(review): the original note says the source audio is 48 kHz and that
# Whisper expects 16 kHz — confirm against the dataset/model cards.
common_voice = common_voice.cast_column(
    "audio",
    Audio(sampling_rate=16000),
)

In [10]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and labels.

    Reads ``batch['audio']`` (a mapping with ``'array'`` and
    ``'sampling_rate'``) and ``batch['sentence']``, then adds two keys:

    * ``'input_features'`` - first element of the features returned by the
      module-level ``feature_extractor`` for the audio array.
    * ``'labels'`` - token ids returned by the module-level ``tokenizer`` for
      the sentence.

    The same ``batch`` object is mutated and returned.
    """
    waveform = batch["audio"]

    # Audio samples -> input features expected by the model.
    extracted = feature_extractor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Transcript text -> label token ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

# Run the preprocessing over every split with two worker processes, and drop
# the original columns so only the newly produced ones remain.
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=common_voice.column_names["train"],
    num_proc=2,
)
common_voice["train"]

Map (num_proc=2):   0%|          | 0/3927 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1816 [00:00<?, ? examples/s]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 3927
})

In [14]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Inputs and labels are padded separately because they have different
    lengths and need different padding: audio features are padded by
    ``processor.feature_extractor.pad`` and label ids by
    ``processor.tokenizer.pad``.
    """

    # Object exposing `.feature_extractor.pad(...)`, `.tokenizer.pad(...)` and
    # `.tokenizer.bos_token_id` (e.g. a WhisperProcessor).
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: examples, each with an ``'input_features'`` entry and a
                ``'labels'`` entry (list of token ids).

        Returns:
            The padded feature batch with an added ``'labels'`` tensor in
            which padded positions are set to -100.
        """
        # Audio side: wrap each example in the key the feature extractor pads.
        input_features = [
            {"input_features": feature["input_features"]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Label side: the tokenizer pads under the 'input_ids' key.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padded positions (attention_mask != 1) with -100 so they are
        # ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )

        # If every sequence starts with the BOS token, strip that first column.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels

        return batch

In [15]:
# Collator instance that pads batches using the processor loaded earlier.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
)

## Evaluation metric: `WER` (Word Error Rate)

In [18]:
# Load the "wer" metric through the `evaluate` library.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a prediction batch.

    Args:
        pred: object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions). ``label_ids`` is modified in place.

    Returns:
        ``{'wer': value}`` where ``value`` is 100 times the score returned by
        the module-level ``metric``.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 is not a valid token id; swap it for the pad token so the labels
    # can be decoded.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # The metric takes `predictions=` (plural); the singular keyword used
    # previously is rejected.
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [36]:
## Load Pretrained Checkpoint
# Request 8-bit loading through a BitsAndBytesConfig passed as
# `quantization_config`, rather than the bare `load_in_8bit=True` keyword
# on `from_pretrained`.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = WhisperForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=quant_config,
)


In [37]:
# Clear the checkpoint's forced decoder ids and its list of suppressed tokens.
# NOTE(review): presumably so no tokens are forced or suppressed during
# generation while fine-tuning — confirm this matches the intended setup.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [38]:
# Apply LoRA
# `prepare_model_for_kbit_training`, `LoraConfig` and `get_peft_model` are all
# imported in the notebook's import cell, so no extra import is needed here.

# Prepare the quantized model for training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the modules named q_proj and v_proj:
# rank 32, alpha 64, 5% dropout, no bias parameters trained.
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


In [None]:
# Set Training Args
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="temp",
    # Batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimisation schedule.
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=3,
    fp16=True,
    # Evaluation and logging.
    # NOTE(review): newer transformers releases may name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    generation_max_length=128,
    logging_steps=25,
    # Keep all dataset columns and tell the trainer which key holds the labels.
    remove_unused_columns=False,
    label_names=["labels"],
)