In [None]:
%%capture
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install tensorboard
!pip install gradio
!pip install audiomentations soundfile
!pip install huggingface
!pip install -q bitsandbytes datasets accelerate
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install pyngrok

In [None]:
import os

# --- Runtime environment -----------------------------------------------------
os.environ.update({"CUDA_VISIBLE_DEVICES": "0", "WANDB_DISABLED": "true"})

# --- Feature toggles ---------------------------------------------------------
toggle_LoRa, toggle_bit_quantization = True, False

# --- Data --------------------------------------------------------------------
dataset_name = "linhtran92/viet_bud500"
# Directory handed to AddBackgroundNoise as the pool of background sounds.
noise_dir = "/kaggle/input/env-noise-data/16hz_audio"

# --- Model -------------------------------------------------------------------
model_type = "tiny"  # "tiny" or "base"
model_repo = "vinai/PhoWhisper-" + model_type
language, task = "vi", "transcribe"
# Hub repo id used for checkpoint uploads and for resuming training.
target_repo = "Reunoze565231/Whisper_tiny"

In [None]:
#Make sure that random seed is the same across training sessions
import torch
import numpy as np
import random

seed = 42
# Seed every random number generator used by this notebook with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

In [None]:
from datasets import load_dataset
from huggingface_hub import login

# Authenticate against the Hub with the token taken from the environment.
login(token=os.environ["HUGGINGFACE_TOKEN"])

# Open the dataset in streaming mode and shuffle it with the notebook-wide seed.
bud500 = load_dataset(dataset_name, streaming=True).shuffle(seed=seed, buffer_size=10_000)

## Prepare Feature Extractor, Tokenizer and Data

In [None]:
from transformers import WhisperFeatureExtractor
# Load the feature extractor published with the checkpoint named by `model_repo`;
# `prepare_dataset1` calls it to produce each example's `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_repo)

In [None]:
from transformers import WhisperTokenizer
# Load the tokenizer for the same checkpoint, configured with the notebook's
# `language` and `task`; it encodes transcriptions into `labels` and decodes
# ids back to text in `compute_metrics`.
tokenizer = WhisperTokenizer.from_pretrained(model_repo, language=language, task=task)

In [None]:
from transformers import WhisperProcessor
# Load the processor for the same checkpoint; the data collator uses its
# `feature_extractor` and `tokenizer` attributes for padding.
processor = WhisperProcessor.from_pretrained(model_repo, language=language, task=task)

In [None]:
import numpy as np
import accelerate
from audiomentations import Compose, PitchShift, TimeStretch, Gain, AddBackgroundNoise, TimeMask

# Lazily-built augmentation pipeline, shared by every call to `augment_audio`.
_AUGMENT_PIPELINE = None


def _get_augment_pipeline():
    """Return the shared augmentation pipeline, building it on first use."""
    global _AUGMENT_PIPELINE
    if _AUGMENT_PIPELINE is None:
        _AUGMENT_PIPELINE = Compose([
            AddBackgroundNoise(sounds_path=noise_dir, min_snr_db=3.0, max_snr_db=15.0, p=0.6),
            TimeStretch(min_rate=0.9, max_rate=1.1, p=0.4),
            PitchShift(min_semitones=-2, max_semitones=2, p=0.4),
            Gain(min_gain_db=-6, max_gain_db=6, p=0.2),
            TimeMask(min_band_part=0.05, max_band_part=0.15, p=0.2)
        ])
    return _AUGMENT_PIPELINE


def augment_audio(audio_data, sr=16_000):
    """Apply the random augmentation chain to one waveform.

    Args:
        audio_data: Waveform samples (array-like).
        sr: Sampling rate of `audio_data` in Hz.

    Returns:
        The augmented waveform returned by the audiomentations pipeline.

    Notes:
        The pipeline is built once and reused instead of being re-created for
        every example; each call still draws fresh random parameters.
        `noise_dir` is therefore read when the pipeline is first built.
    """
    # audiomentations recommends float32 input (it warns on float64).
    samples = np.asarray(audio_data, dtype=np.float32)
    return _get_augment_pipeline()(samples=samples, sample_rate=sr)

def prepare_dataset1(batch, augment=True):
    """Turn one raw example into model inputs.

    Args:
        batch: Example with an `audio` dict (`array`, `sampling_rate`) and a
            `transcription` string.
        augment: When True (default), the waveform goes through
            `augment_audio` before feature extraction. Pass False for
            evaluation data so metrics are computed on clean audio.

    Returns:
        The same example with `input_features` and `labels` added.
    """
    audio = batch["audio"]
    waveform = audio["array"]
    sampling_rate = audio["sampling_rate"]
    if augment:
        waveform = augment_audio(waveform, sr=sampling_rate)
    batch["input_features"] = feature_extractor(waveform, sampling_rate=sampling_rate).input_features[0]
    # Max length: inference sentences are short; the value was set by analyzing the training data distribution.
    batch["labels"] = tokenizer(batch["transcription"], truncation=True, max_length=64).input_ids
    return batch


# Random augmentation is applied to the training split only; the other splits
# keep their original audio so the evaluation loss (which also drives early
# stopping) is measured on clean, reproducible inputs.
bud500 = type(bud500)({
    split_name: split.map(
        prepare_dataset1,
        fn_kwargs={"augment": split_name == "train"},
        remove_columns=["audio", "transcription"],
    )
    for split_name, split in bud500.items()
})

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into a padded batch.

    Audio features are padded with the processor's feature extractor and
    label ids with its tokenizer. Padded label positions are replaced by -100,
    and a leading `decoder_start_token_id` is dropped when every label
    sequence in the batch starts with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split each example into its audio part and its label part.
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding: mark them -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the start token only if it leads every sequence in the batch.
        starts_with_bos = labels[:, 0] == self.decoder_start_token_id
        if bool(starts_with_bos.all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
import evaluate
# Word-error-rate metric; `compute_metrics` calls `metric.compute(...)` on it.
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for a prediction object.

    Args:
        pred: Object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 on ignored positions).

    Returns:
        A dict `{"wer": <percentage>}`.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 marks ignored label positions; swap in the pad id so they can be decoded.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Special tokens must be stripped from BOTH sides. The original passed
    # `tokens=True` for the predictions, which is not a decode option, so
    # predicted text kept its special tokens and the WER was inflated.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig

# Extra loading options: request 8-bit weights only when quantization is toggled on.
_quantization_kwargs = (
    {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
    if toggle_bit_quantization
    else {}
)
model = WhisperForConditionalGeneration.from_pretrained(model_repo, **_quantization_kwargs)

# Pin language and task for generation and clear any forced decoder ids.
for _attr, _value in (("language", language), ("task", task), ("forced_decoder_ids", None)):
    setattr(model.generation_config, _attr, _value)


In [None]:
# Build the batch collator from the processor and the model's configured
# decoder start token id (the collator strips that token when it leads every label sequence).
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id
)

Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):

In [None]:
# Apply the overrides on `model.config`: no forced decoder ids and an empty
# suppress-token list. (`model.generation_config.forced_decoder_ids` is also
# cleared where the model is loaded.)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from peft import prepare_model_for_kbit_training
# Only a model loaded with the 8-bit quantization config is passed through
# PEFT's k-bit training preparation; otherwise the model is left untouched.
if toggle_bit_quantization:
    model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# Use the second most available pairs (8, 16) in the LoRA config to ensure the performance of the Whisper model (16, 32 for base)
# The adapter will be applied at inference
# NOTE(review): the LoRA config below is commented out and `get_peft_model`
# is never called in this cell, so `toggle_LoRa` has no visible effect and the
# full model is what gets trained — confirm whether LoRA was meant to be on.
# config = LoraConfig(r=8,
#                     lora_alpha=16,
#                     target_modules=["q_proj", "v_proj"],
#                     bias="none")
# Presumably required so gradient checkpointing (enabled in the training
# arguments) can backpropagate through the input embeddings — TODO confirm.
model.enable_input_require_grads()


## Distributed training (continuous-training configuration)

In [None]:
from accelerate import Accelerator

# Review this link for further information: https://huggingface.co/docs/accelerate/en/usage_guides/checkpoint
# The accelerator's project directory is set to the Hub repo id string (`target_repo`).
accelerator = Accelerator(project_dir=target_repo)

### Define the Training Configuration

In [None]:
from transformers import Seq2SeqTrainingArguments
import multiprocessing
# Use half of the available CPU cores for data loading.
num_workers = multiprocessing.cpu_count() // 2

training_args = Seq2SeqTrainingArguments(
    # --- Optimisation ---
    learning_rate=3e-5,
    weight_decay=1e-3,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    max_steps=20_000,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    fp16=True,
    # --- Batching / data loading ---
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    dataloader_num_workers=num_workers,
    dataloader_pin_memory=True,
    remove_unused_columns=False,
    label_names=["labels"],
    # --- Evaluation ---
    eval_strategy="steps",
    eval_steps=1_000,
    generation_max_length=64,
    metric_for_best_model="loss",
    # --- Checkpointing ---
    save_strategy="steps",
    save_steps=500,
    # --- Logging ---
    logging_strategy="steps",
    logging_steps=10,
    logging_dir="./temp_logs",
    report_to=["tensorboard"],
    # --- Hub ---
    hub_model_id=target_repo,
    hub_token=os.environ["HUGGINGFACE_TOKEN"],
)

In [None]:
# Start every run from an empty ./temp_logs directory (the TensorBoard log dir).
!rm -rf ./temp_logs && mkdir ./temp_logs

In [None]:
from torch.utils.tensorboard import SummaryWriter
# NOTE(review): `writer` is not referenced by any other visible cell; the
# trainer logs to the same directory through its own `logging_dir` setting.
writer = SummaryWriter("./temp_logs")
# Load the TensorBoard notebook extension and point it at the log directory.
%load_ext tensorboard
%tensorboard --logdir ./temp_logs

In [None]:

import os

from pyngrok import ngrok

# SECURITY: the ngrok auth token must never be committed in the notebook.
# It is read from the environment instead; the token that was previously
# hard-coded here should be considered leaked and revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])

# Expose local port 6006 (presumably the TensorBoard started in the previous
# cell — confirm the port if TensorBoard is launched with a custom one).
public_url = ngrok.connect(6006)
print("Ngrok Tunnel URL:", public_url)

In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl, EarlyStoppingCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from huggingface_hub import upload_folder
class SavePeftModelToHubCallback(TrainerCallback):
    """Upload each saved checkpoint folder to the Hub, then delete it locally.

    On every save event the callback:
      1. writes the model with `save_pretrained` into `<checkpoint>/adapter_model`,
      2. removes `<checkpoint>/pytorch_model.bin` if it exists,
      3. uploads the whole checkpoint folder to `args.hub_model_id` under
         `checkpoint-<global_step>`,
      4. deletes the local checkpoint folder.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle a checkpoint-save event.

        Args:
            args: Training arguments; `output_dir`, `hub_model_id` and
                `hub_token` are read from it.
            state: Trainer state; `global_step` names the checkpoint.
            control: Trainer control object, returned unchanged.
            **kwargs: Must contain `model`, the model being trained.

        Returns:
            The `control` object that was passed in.
        """
        import shutil

        # Only the main process may upload and delete: in a multi-process run
        # every rank receives this event, and concurrent uploads / rmtree of
        # the same folder would race with each other.
        if not state.is_world_process_zero:
            return control

        checkpoint_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_folder = os.path.join(args.output_dir, checkpoint_name)

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)

        print(f"🚀 Uploading checkpoint {checkpoint_folder} to Hugging Face Hub...")
        upload_folder(
            path_in_repo=checkpoint_name,
            repo_id=args.hub_model_id,
            folder_path=checkpoint_folder,
            commit_message=f"Checkpoint at {state.global_step}",
            # Use the token configured for training; None falls back to the
            # credentials stored by `login`.
            token=args.hub_token,
        )

        # Reached only when the upload did not raise: free local disk space.
        shutil.rmtree(checkpoint_folder, ignore_errors=True)
        return control

# Build the trainer on the train/validation splits of the prepared dataset and
# pass it through `accelerator.prepare`.
# NOTE(review): `compute_metrics` is commented out, so early stopping (patience
# of 3 evaluations) relies on the evaluation loss named by
# `metric_for_best_model="loss"` in the training arguments.
# NOTE(review): `tokenizer=` is presumably deprecated in favour of
# `processing_class=` in recent transformers releases — confirm against the
# installed version.
trainer = accelerator.prepare(Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=bud500["train"],
    eval_dataset=bud500["validation"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelToHubCallback(), EarlyStoppingCallback(early_stopping_patience=3)],
))

# Disable the decoder key/value cache on the model config for training.
model.config.use_cache = False

In [None]:
# Fresh clone of the Hub repo so previously uploaded checkpoint-* folders are
# available locally under ./Whisper_tiny for resuming.
# NOTE(review): the repo URL is hard-coded; keep it in sync with `target_repo`.
!rm -rf Whisper_tiny && git clone https://huggingface.co/Reunoze565231/Whisper_tiny/

In [None]:
import glob
import os
import re


def get_last_checkpoint(repo_dir="Whisper_tiny"):
    """Return the path of the most recent `checkpoint-<step>` folder.

    Checkpoints are ordered by their numeric step, not alphabetically
    (alphabetically "checkpoint-1000" sorts before "checkpoint-500", and
    taking the first sorted entry returned the earliest-sorting folder instead
    of the latest one).

    Args:
        repo_dir: Directory containing the `checkpoint-*` folders.

    Returns:
        Path of the checkpoint folder with the highest step, or None when
        `repo_dir` holds no `checkpoint-<number>` directory.
    """
    step_pattern = re.compile(r"checkpoint-(\d+)$")
    latest_path, latest_step = None, -1
    for path in glob.glob(os.path.join(repo_dir, "checkpoint-*")):
        match = step_pattern.search(path)
        # Skip stray files and folders without a numeric step suffix.
        if match is None or not os.path.isdir(path):
            continue
        step = int(match.group(1))
        if step > latest_step:
            latest_path, latest_step = path, step
    return latest_path

In [None]:
from huggingface_hub import list_repo_files
import warnings
warnings.filterwarnings("ignore")

# Resume only when the Hub repo actually contains an uploaded checkpoint
# folder. Counting files is not reliable: a repo holding just a README and
# .gitattributes has more than one file but nothing to resume from.
repo_files = list_repo_files(target_repo)
has_remote_checkpoint = any(path.startswith("checkpoint-") for path in repo_files)

if has_remote_checkpoint:
    trainer.train(resume_from_checkpoint=get_last_checkpoint())
else:
    trainer.train()