In [None]:
# Upgrade bitsandbytes to the newest release; the model is loaded in 8-bit later in this notebook.
!pip install -U bitsandbytes



In [None]:
# Install the notebook's dependencies quietly (-q).
# NOTE(review): no versions are pinned, so a later run may pull different releases.
!pip install -q transformers datasets librosa evaluate jiwer gradio bitsandbytes accelerate
# PEFT is installed straight from the `main` branch of its Git repository.
!pip install -q git+https://github.com/huggingface/peft.git@main

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Dec  2 06:19:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import os

# Expose only device 0 to CUDA for the rest of this session.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the adapter is pushed to the Hub at the end of the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
# Checkpoint used for the feature extractor, tokenizer, processor and model below.
model_name_or_path = "openai/whisper-small.en"
# Task name handed to the tokenizer and processor.
task = "transcribe"

In [None]:
# Hub dataset that provides the "train" and "test" splits loaded below.
dataset_name = "Tarakeshwaran/Whisper-train-data"
# Language name handed to the tokenizer and processor.
language = "English"
language_abbr = "en"  # Shorthand code for the language we want to fine-tune

In [None]:
from datasets import load_dataset, DatasetDict

# Fetch the "train" and "test" splits and collect them in one DatasetDict.
whisper_data = DatasetDict(
    {split_name: load_dataset(dataset_name, split=split_name) for split_name in ("train", "test")}
)

print(whisper_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'start', 'end'],
        num_rows: 80
    })
    test: Dataset({
        features: ['audio', 'text', 'start', 'end'],
        num_rows: 20
    })
})


In [None]:
# Drop the `start` and `end` columns; only `audio` and `text` remain afterwards.
# Re-running this cell on the already-trimmed dataset would ask to remove
# columns that no longer exist.
whisper_data = whisper_data.remove_columns(
    ["start","end"]
)

print(whisper_data)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 80
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 20
    })
})


In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the chosen checkpoint; prepare_dataset uses it to build `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

In [None]:
from transformers import WhisperTokenizer

# Tokenizer of the chosen checkpoint, configured with the language and task set above.
# NOTE(review): the checkpoint name ends in ".en" — confirm that passing
# language/task is appropriate for that checkpoint.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

In [None]:
from transformers import WhisperProcessor

# Processor of the chosen checkpoint; the data collator uses its
# `feature_extractor` and `tokenizer` attributes for padding.
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

In [None]:
# Inspect the first raw training example (audio dict plus transcript text).
print(whisper_data["train"][0])

{'audio': {'path': 'sample-005811.mp3', 'array': array([-5.06648125e-24,  0.00000000e+00, -3.61891518e-24, ...,
       -1.07697160e-05,  2.15555119e-05, -2.88033334e-05]), 'sampling_rate': 16000}, 'text': "in alchemy it's called the soul of the world"}


In [None]:
from datasets import Audio

# Declare the `audio` column as an Audio feature with a 16 kHz sampling rate.
whisper_data = whisper_data.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Inspect the same example again after the column cast.
print(whisper_data["train"][0])

{'audio': {'path': 'sample-005811.mp3', 'array': array([-5.06648125e-24,  0.00000000e+00, -3.61891518e-24, ...,
       -1.07697160e-05,  2.15555119e-05, -2.88033334e-05]), 'sampling_rate': 16000}, 'text': "in alchemy it's called the soul of the world"}


In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model-ready features.

    Reads ``batch["audio"]`` (its sample array and sampling rate) and
    ``batch["text"]``. Stores the extractor's features under
    ``"input_features"`` and the tokenized transcript under ``"labels"``.
    The same ``batch`` mapping is updated in place and returned.
    """
    clip = batch["audio"]

    # log-Mel features of the clip; the extractor output is indexed at 0
    # because a single clip was passed in
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # token ids of the reference transcript
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
# Run prepare_dataset over every example with 2 worker processes and drop the
# original columns, so only the columns it adds are kept.
whisper_data = whisper_data.map(prepare_dataset, remove_columns=whisper_data.column_names["train"], num_proc=2)

In [None]:
# Display the processed training split.
whisper_data["train"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 80
})

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into a single padded batch.

    Audio features and label token ids have different lengths and need
    different padding methods, so each is padded by the matching component
    of ``processor``.
    """

    # object exposing `feature_extractor` and `tokenizer`; both must offer `pad`
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch for ``features``.

        Each item must provide ``"input_features"`` and ``"labels"``. The
        result is whatever the feature extractor's ``pad`` returns, with a
        ``"labels"`` entry added in which padded positions hold -100.
        """
        extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # audio side: hand the features to the extractor and get tensors back
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = extractor.pad(audio_items, return_tensors="pt")

        # label side: pad the token id sequences to a common length
        label_items = [{"input_ids": example["labels"]} for example in features]
        padded_labels = tokenizer.pad(label_items, return_tensors="pt")

        # positions whose attention mask is not 1 are padding; mark them with
        # -100 so they are ignored by the loss
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # when every row begins with the BOS token, strip that column here
        # because it is appended again later
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance passed to the trainer; it pads with the processor loaded above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Load the "wer" metric; compute_metrics calls its `compute` method.
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass and return ``{"wer": value}``.

    ``pred`` must expose ``predictions`` and ``label_ids``. ``label_ids`` is
    modified in place: every -100 entry is overwritten with the tokenizer's
    pad token id so the sequences can be decoded. The metric result is
    multiplied by 100 before being returned.
    """
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # -100 marks ignored label positions; swap in the pad id before decoding
    ignored_positions = reference_ids == -100
    reference_ids[ignored_positions] = tokenizer.pad_token_id

    # decode both sides to text with special tokens left out
    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    return {"wer": 100 * metric.compute(predictions=predicted_text, references=reference_text)}

In [None]:
from transformers import BitsAndBytesConfig, WhisperForConditionalGeneration

# Load the base checkpoint with 8-bit weights. The quantization settings are
# passed as a BitsAndBytesConfig through `quantization_config`, because the
# bare `load_in_8bit=True` keyword is deprecated and slated for removal.
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [None]:
from peft import prepare_model_for_kbit_training
# Pass the 8-bit `model` loaded above through PEFT's k-bit training preparation helper.
model = prepare_model_for_kbit_training(model)

In [None]:
def make_inputs_require_grad(module, input, output):
    """Forward hook that switches on gradient tracking for ``output``.

    ``module`` and ``input`` are part of the hook signature but are not used.
    ``output`` is modified in place and nothing is returned.
    """
    # requires_grad_() defaults to enabling gradients
    output.requires_grad_()

# Attach the hook to `encoder.conv1`, so that layer's output is marked as requiring grad on each forward pass.
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7a3a13270cd0>

In [None]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA settings: rank 32, alpha 64, dropout 0.05, applied to the modules
# named q_proj and v_proj, with bias="none".
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapter config and report the trainable parameter count.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 245,273,856 || trainable%: 1.4429


In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation and checkpointing both happen every 10
# steps, and generation is used during evaluation so WER can be computed.
training_args = Seq2SeqTrainingArguments(
    output_dir="./faster-whisper-small-en",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-3,
    warmup_steps=50,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy` argument
    fp16=True,
    save_steps=10,
    eval_steps=10,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    predict_with_generate=True,
    logging_steps=25,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    max_steps=500, # only for testing purposes, remove this from your final run :)
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)



In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# Checkpoint hook: writes the adapter weights into their own sub-folder and
# deletes the base model weights file from the checkpoint when one exists.
class SavePeftModelCallback(TrainerCallback):
    """Keep only the PEFT adapter inside each saved checkpoint."""

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Save the adapter for the current step and return ``control``.

        The checkpoint directory is derived from ``args.output_dir`` and
        ``state.global_step``; the model is taken from ``kwargs["model"]``.
        """
        step_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        # adapter weights live under <checkpoint>/adapter_model
        kwargs["model"].save_pretrained(os.path.join(step_dir, "adapter_model"))

        # remove the full weights file if the trainer wrote one
        full_weights_file = os.path.join(step_dir, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)

        return control


# Assemble the trainer. The feature extractor is passed as `processing_class`,
# which replaces the deprecated `tokenizer` keyword of the Trainer constructor.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=whisper_data["train"],
    eval_dataset=whisper_data["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtarakeshsampath1[0m ([33mtarakeshsampath1-none[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Wer
10,No log,2.769154,25.853659
20,No log,1.278829,155.609756
30,2.421900,0.734945,4.878049
40,2.421900,0.513023,4.878049
50,0.474800,0.102858,2.926829
60,0.474800,0.083441,4.390244
70,0.474800,0.078193,3.902439
80,0.008500,0.074802,4.390244
90,0.008500,0.075127,3.414634
100,0.003000,0.084832,3.414634


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.proce

TrainOutput(global_step=500, training_loss=0.5669511904753745, metrics={'train_runtime': 3064.3353, 'train_samples_per_second': 1.305, 'train_steps_per_second': 0.163, 'total_flos': 1.17472591872e+18, 'train_loss': 0.5669511904753745, 'epoch': 50.0})

In [None]:
# Hub repository id that receives the trained adapter.
peft_model_id = "Tarakeshwaran/faster-whisper-small-en"
# Upload the model to that repository (requires the login performed earlier).
model.push_to_hub(peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Tarakeshwaran/faster-whisper-small-en/commit/7fb1694df51792a0b8c6aed0a2aac1c54a1e362e', commit_message='Upload model', commit_description='', oid='7fb1694df51792a0b8c6aed0a2aac1c54a1e362e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Tarakeshwaran/faster-whisper-small-en', endpoint='https://huggingface.co', repo_type='model', repo_id='Tarakeshwaran/faster-whisper-small-en'), pr_revision=None, pr_num=None)

In [None]:
from google.colab import files
# Recursively archive the training output directory, including every checkpoint, into one zip file.
!zip -r /content/faster-whisper-small-en.zip /content/faster-whisper-small-en


  adding: content/faster-whisper-small-en/ (stored 0%)
  adding: content/faster-whisper-small-en/checkpoint-440/ (stored 0%)
  adding: content/faster-whisper-small-en/checkpoint-440/rng_state.pth (deflated 25%)
  adding: content/faster-whisper-small-en/checkpoint-440/trainer_state.json (deflated 82%)
  adding: content/faster-whisper-small-en/checkpoint-440/adapter_config.json (deflated 54%)
  adding: content/faster-whisper-small-en/checkpoint-440/adapter_model/ (stored 0%)
  adding: content/faster-whisper-small-en/checkpoint-440/adapter_model/adapter_config.json (deflated 54%)
  adding: content/faster-whisper-small-en/checkpoint-440/adapter_model/README.md (deflated 66%)
  adding: content/faster-whisper-small-en/checkpoint-440/adapter_model/adapter_model.safetensors (deflated 7%)
  adding: content/faster-whisper-small-en/checkpoint-440/README.md (deflated 66%)
  adding: content/faster-whisper-small-en/checkpoint-440/training_args.bin (deflated 51%)
  adding: content/faster-whisper-smal

FileNotFoundError: Cannot find file: /content/sample_data.zip

In [None]:
# Download the zip archive through Colab's `files` helper.
files.download('/content/faster-whisper-small-en.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>