https://github.com/Vaibhavs10/fast-whisper-finetuning/blob/main/Whisper_w_PEFT.ipynb

In [31]:
!pip install -q transformers datasets librosa evaluate jiwer gradio bitsandbytes accelerate 
!pip install -q git+https://github.com/huggingface/peft.git@main

In [1]:
import pandas as pd
import os

# Location of the cached dataframe pickle and of the raw data directory.
DATAFRAME_PATH = './tmp/asr_dataframe.pkl'
FILE_PATH = "../../advanced"

# Use the cached pickle when it exists, otherwise rebuild the frame from FILE_PATH.
# NOTE(review): read_data is not defined anywhere in the visible notebook; it must
# be provided by an earlier cell or module for the cache-miss branch to work.
# It receives DATAFRAME_PATH, presumably to write the cache — confirm.
df = (
    pd.read_pickle(DATAFRAME_PATH)
    if os.path.exists(DATAFRAME_PATH)
    else read_data(FILE_PATH, DATAFRAME_PATH)
)

df.head()

Unnamed: 0,key,audio,transcript,b64
0,0,audio_0.wav,"Turret, prepare to deploy electromagnetic puls...",UklGRgB3BABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACAB...
1,1,audio_1.wav,Engage yellow drone with surface-to-air missil...,UklGRubrAgBXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACAB...
2,2,audio_2.wav,"Control to turrets, deploy electromagnetic pul...",UklGRgCXBwBXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACAB...
3,3,audio_3.wav,"Alfa, Echo, Mike Papa, deploy EMP tool heading...",UklGRiRgBABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACAB...
4,4,audio_4.wav,"Engage the grey, black, and green fighter plan...",UklGRvaYAwBXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACAB...


In [2]:
from datasets import Audio, Dataset

directory_prefix = "../../advanced/audio/"

def df_to_dataset(df):
    """Convert a dataframe of clips into a Dataset with an Audio column.

    Args:
        df: DataFrame whose ``audio`` column holds file names relative to the
            module-level ``directory_prefix``. The frame itself is left untouched.

    Returns:
        A Dataset built from ``df`` (with prefixed audio paths) whose ``audio``
        column is cast to ``Audio(sampling_rate=16000)``.
    """
    # Build the full paths on a copy. Assigning into ``df`` would rewrite the
    # caller's frame, so calling this twice on the same frame would prepend the
    # directory twice and produce paths that do not exist.
    with_paths = df.assign(audio=df["audio"].apply(lambda name: directory_prefix + name))
    dataset = Dataset.from_pandas(with_paths)
    return dataset.cast_column("audio", Audio(sampling_rate=16000))

In [3]:
from datasets import DatasetDict

# Hold out 20% of the rows as the test split; the fixed seed keeps the split stable.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

# Convert each split to a Dataset (train first, then test).
train_dataset, test_dataset = (df_to_dataset(split_df) for split_df in (train_df, test_df))

# Bundle the two splits under their conventional names.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['key', 'audio', 'transcript', 'b64', '__index_level_0__'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['key', 'audio', 'transcript', 'b64', '__index_level_0__'],
        num_rows: 700
    })
})


In [4]:
# Drop the base64 payload column and the `__index_level_0__` column that shows up
# after the pandas conversion; neither is read by the preprocessing step.
dataset_dict = dataset_dict.remove_columns(["b64", "__index_level_0__"])

In [5]:
# Checkpoint that the feature extractor, tokenizer, processor and model are loaded from,
# and the task value handed to the tokenizer/processor.
model_name_or_path = "openai/whisper-large-v2"
task = "transcribe"

# Language handed to the tokenizer/processor.
language = "English"
# NOTE(review): language_abbr is not referenced in the visible cells.
language_abbr = "en" # Short hand code for the language we want to fine-tune

In [9]:
from transformers import WhisperFeatureExtractor

# Feature extractor for the chosen checkpoint; prepare_dataset calls it to build input_features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)



In [11]:
from transformers import WhisperTokenizer

# Tokenizer for the chosen checkpoint, configured with the language and task set earlier;
# prepare_dataset calls it to encode transcripts into label ids.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import WhisperProcessor

# Processor for the chosen checkpoint; the data collator reads its
# .feature_extractor and .tokenizer attributes to pad batches.
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

In [None]:
# Print the first training example to inspect its fields.
print(dataset_dict['train'][0])

In [7]:
def prepare_dataset(batch):
    """Turn one example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate`` entries) and
    ``batch["transcript"]``. Adds ``input_features`` computed by the
    module-level ``feature_extractor`` and ``labels`` computed by the
    module-level ``tokenizer``, then returns the same mapping.
    """
    clip = batch["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    # The extractor returns a batch of features; keep the single entry for this clip.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Token ids of the reference transcript are the training labels.
    encoded = tokenizer(batch["transcript"])
    batch["labels"] = encoded.input_ids
    return batch

In [13]:
import os
import pickle

# Run prepare_dataset over every split, keeping only the columns it adds.
train_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict.column_names["train"], num_proc=1)

# Cache the processed splits so a later session can reload them without recomputing.
DATASETDICT_PATH = './tmp/asr_datasetdict.pkl'
# open(..., "wb") does not create missing directories, so make sure ./tmp exists.
os.makedirs(os.path.dirname(DATASETDICT_PATH), exist_ok=True)
with open(DATASETDICT_PATH, "wb") as f:
    pickle.dump(train_dict, f)

print(train_dict)
print(train_dict['train'])

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 700
    })
})
Dataset({
    features: ['input_features', 'labels'],
    num_rows: 2800
})


In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of prepared examples into a single training batch.

    Each example is a mapping with ``input_features`` and ``labels``. Inputs are
    padded with ``processor.feature_extractor.pad`` and labels with
    ``processor.tokenizer.pad``; padded label positions are replaced by -100.

    Attributes are documented inline below.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer``, each with a ``pad`` method.
    processor: Any
    # Id of the token to strip from the front of the labels when every sequence
    # starts with it. When left as None the tokenizer's ``bos_token_id`` is used,
    # which is the original behaviour. NOTE(review): for Whisper the token that
    # begins tokenized labels may differ from ``bos_token_id``; passing
    # ``model.config.decoder_start_token_id`` is presumably what is wanted — confirm.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` and return the padded batch including ``labels``."""
        # Inputs and labels have different lengths and need different padding,
        # so they are padded separately.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore loss correctly.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence starts with the start token, cut it here because it
        # is prepended again later.
        start_id = self.decoder_start_token_id
        if start_id is None:
            start_id = self.processor.tokenizer.bos_token_id
        # Guard against zero-width labels, where column 0 does not exist.
        if labels.shape[1] > 0 and (labels[:, 0] == start_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance that is later passed to the trainer as data_collator.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Load the "wer" metric from the evaluate library.
# NOTE(review): `metric` is not used by the trainer in this notebook because its
# compute_metrics argument is commented out.
metric = evaluate.load("wer")

In [46]:
# Second install of accelerate and bitsandbytes (both are also in the first install cell);
# the -i flag points pip at the public PyPI index explicitly.
# NOTE(review): the execution count shows this ran out of order — after a fresh
# kernel it should not be needed if the first install cell succeeded.
!pip -q install accelerate 
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [47]:
# Print the installed metadata (version, location, dependencies) of both packages.
!pip show bitsandbytes
!pip show accelerate

Name: bitsandbytes
Version: 0.37.0
Summary: 8-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/TimDettmers/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: 
Required-by: 
Name: accelerate
Version: 0.30.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /opt/conda/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: peft


from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig, WhisperConfig

# Request 8-bit weights through bitsandbytes. The documented way to ask for
# quantized loading is the `quantization_config` argument of the model's own
# from_pretrained; routing it through WhisperConfig.from_pretrained is not a
# documented path and may leave the model unquantized.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Initialize model
model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path, quantization_config=quantization_config)

In [None]:
from peft import prepare_model_for_kbit_training

# Run PEFT's k-bit preparation helper on the loaded model before adapters are attached.
# NOTE(review): this rebinds `model`, so the cell is meant to run once per model load.
model = prepare_model_for_kbit_training(model)

In [None]:
def make_inputs_require_grad(module, input, output):
    """Forward hook: flag the hooked module's output tensor as requiring grad (in place)."""
    output.requires_grad_(True)

# Attach the hook to the encoder's first convolution so its output requires grad.
# NOTE(review): presumably needed so gradients can flow with gradient checkpointing
# while the base weights are frozen — confirm. Re-running the cell registers the hook again.
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

In [None]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA settings: adapters on the q_proj and v_proj modules, no bias terms trained.
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./helps-peft",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=1,
    evaluation_strategy="steps",
    gradient_checkpointing=True,
    # fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    save_steps=100,
    logging_steps=5,
    max_steps=5, # only for testing purposes, remove this from your final run :)
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
    push_to_hub=True,
    # Best-checkpoint selection. The trainer in this notebook is built without a
    # compute_metrics function, so evaluation reports a loss but no "wer" value;
    # selecting on "wer" would reference a metric that is never produced.
    # greater_is_better must be False for any lower-is-better metric: per the
    # TrainingArguments docs a non-loss metric name otherwise defaults to
    # "greater is better", which would keep the worst checkpoint.
    # If WER computation is enabled later, switch the metric name back to "wer"
    # and keep greater_is_better=False.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

class SavePeftModelCallback(TrainerCallback):
    """Trainer callback that keeps only adapter weights in each checkpoint folder.

    On every save it writes the model passed in ``kwargs["model"]`` to an
    ``adapter_model`` sub-folder of the checkpoint directory and deletes the
    checkpoint's ``pytorch_model.bin`` if one exists.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Save the adapter for the current step and return ``control`` unchanged."""
        # Checkpoint folder name is derived from the current global step.
        checkpoint_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_dir = os.path.join(args.output_dir, checkpoint_name)

        kwargs["model"].save_pretrained(os.path.join(checkpoint_dir, "adapter_model"))

        # Remove the full-weights file when it was written alongside the adapter.
        full_weights = os.path.join(checkpoint_dir, "pytorch_model.bin")
        if os.path.exists(full_weights):
            os.remove(full_weights)
        return control


# Build the trainer from the PEFT-wrapped model, the processed splits, the padding
# collator and the adapter-saving callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dict["train"],
    eval_dataset=train_dict["test"],
    data_collator=data_collator,
    # NOTE(review): compute_metrics is disabled and no compute_metrics function is
    # defined in the visible cells, so evaluation does not report WER.
    # compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is what gets passed as `tokenizer` here.
    tokenizer=processor.feature_extractor,
    # The callback class itself is passed, not an instance.
    callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Start fine-tuning. training_args sets max_steps=5, so this is a short test run.
trainer.train()

In [2]:
import pickle
from datasets import Audio, Dataset, DatasetDict

DATASETDICT_PATH = './tmp/asr_datasetdict.pkl'

# Reload the pickled DatasetDict from disk.
# NOTE: pickle.load can execute arbitrary code from the file — only load caches
# that this notebook wrote itself.
with open(DATASETDICT_PATH, "rb") as cache_file:
    dataset_dict = pickle.load(cache_file)

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 700
    })
})


In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat May 18 12:36:55 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(cuda_available)

# Check the number of GPUs
print(torch.cuda.device_count())

# Get the name of the first GPU. get_device_name(0) raises when no CUDA device
# is usable, so only query it when CUDA is actually available.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device available')

True
1
Tesla T4
