In [None]:
%%capture
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install tensorboard
!pip install gradio
!pip install audiomentations soundfile
!pip install huggingface
!pip install -q bitsandbytes datasets accelerate
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install pyngrok

In [None]:
import os

# Process environment: CUDA_VISIBLE_DEVICES is set to "0" and WANDB_DISABLED to "true".
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "WANDB_DISABLED": "true",
})

# Directory handed to AddBackgroundNoise as the source of background sounds.
noise_dir = "/kaggle/input/env-noise-data/16hz_audio"

# Feature toggles.
toggle_LoRa = True
toggle_bit_quantization = True
is_testing_phase = False

# The on-the-fly WER callbacks run every `in_fly_evaluation_freq` training steps.
in_fly_evaluation_freq = 50

# Model, language and task selection.
model_type = "tiny"  # tiny or base
language = "vi"
task = "transcribe"
model_repo = f"vinai/PhoWhisper-{model_type}"
target_repo = f"Reunoze565231/PEFT_qLoRa_Whisper_{model_type}"

# Training / validation dataset on the Hugging Face Hub.
dataset_name = "linhtran92/viet_bud500"

In [None]:
# Seed every random number generator used here with the same value so that
# separate training sessions start from the same random state.
import random

import numpy as np
import torch

seed = 42
# Seeding order: torch, then Python's `random`, then NumPy.
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(seed)

In [None]:
from datasets import load_dataset
from huggingface_hub import login

# Authenticate with the token stored in the HUGGINGFACE_TOKEN environment variable.
login(token=os.environ["HUGGINGFACE_TOKEN"])

# Open the dataset in streaming mode and shuffle it through a 10,000-example
# buffer, seeded with the notebook-wide seed.
bud500 = load_dataset(dataset_name, streaming=True).shuffle(seed=seed, buffer_size=10_000)

In [None]:
from transformers import WhisperFeatureExtractor
# Feature extractor loaded from the `model_repo` checkpoint; the dataset
# preparation functions call it on raw audio arrays to produce `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_repo)

In [None]:
from transformers import WhisperTokenizer
# Tokenizer loaded from the `model_repo` checkpoint, configured with the
# notebook's `language` and `task`; used to encode transcriptions into label ids.
tokenizer = WhisperTokenizer.from_pretrained(model_repo, language=language, task=task)

In [None]:
from transformers import WhisperProcessor
# Processor loaded from the same checkpoint; the data collator uses its
# `feature_extractor` and `tokenizer` attributes for padding.
processor = WhisperProcessor.from_pretrained(model_repo, language=language, task=task)

In [None]:
import numpy as np
import accelerate
from functools import partial
from audiomentations import Compose, PitchShift, TimeStretch, Gain, AddBackgroundNoise, TimeMask
# When True, prepare_wer_eval_dataset lowercases the reference transcription
# before tokenizing it.
lower_case = True
# Ready-built augmentation pipelines, keyed by the truthiness of `minimize_augmentation`.
_AUGMENTATION_PIPELINES = {}


def _get_augmentation_pipeline(minimize_augmentation):
    """Return the augmentation pipeline for the requested strength, building it once."""
    key = bool(minimize_augmentation)
    if key not in _AUGMENTATION_PIPELINES:
        if key:
            # Light variant: background noise and a small gain change only.
            transforms = [
                AddBackgroundNoise(sounds_path=noise_dir, min_snr_db=3.0, max_snr_db=15.0, p=0.4),
                Gain(min_gain_db=-4, max_gain_db=4, p=0.1),
            ]
        else:
            # Full variant: adds time stretch, pitch shift and time masking.
            transforms = [
                AddBackgroundNoise(sounds_path=noise_dir, min_snr_db=3.0, max_snr_db=15.0, p=0.4),
                TimeStretch(min_rate=0.9, max_rate=1.1, p=0.1),
                PitchShift(min_semitones=-2, max_semitones=2, p=0.1),
                Gain(min_gain_db=-6, max_gain_db=6, p=0.1),
                TimeMask(min_band_part=0.05, max_band_part=0.15, p=0.1),
            ]
        _AUGMENTATION_PIPELINES[key] = Compose(transforms)
    return _AUGMENTATION_PIPELINES[key]


def augment_audio(audio_data, sr=16_000, with_augmentation=True, minimize_augmentation=True):
    """Apply random audio augmentation to one waveform.

    Args:
        audio_data: Waveform samples, passed to the pipeline as `samples`.
        sr: Sample rate of `audio_data`, passed to the pipeline as `sample_rate`.
        with_augmentation: When False, `audio_data` is returned untouched.
        minimize_augmentation: When truthy, use the light pipeline (background
            noise + gain); otherwise use the full pipeline.

    Returns:
        The augmented waveform, or `audio_data` itself when augmentation is off.
    """
    if not with_augmentation:
        return audio_data
    # The pipeline is built once per variant and reused. Building it on every
    # call re-created every transform, including AddBackgroundNoise over
    # `noise_dir`, for each individual sample.
    augment = _get_augmentation_pipeline(minimize_augmentation)
    return augment(samples=audio_data, sample_rate=sr)

def prepare_dataset(with_augmentation, batch):
    """Add `input_features` and `labels` to one example and return it.

    Args:
        with_augmentation: Forwarded to `augment_audio`; False leaves the
            waveform unchanged.
        batch: Example holding an "audio" entry (with "array" and
            "sampling_rate") and a "transcription" string.

    Returns:
        The same `batch` object with "input_features" and "labels" set.
    """
    clip = batch["audio"]
    sample_rate = clip["sampling_rate"]

    waveform = augment_audio(clip["array"], sr=sample_rate, with_augmentation=with_augmentation)
    extracted = feature_extractor(waveform, sampling_rate=sample_rate)
    batch["input_features"] = extracted.input_features[0]

    # Transcriptions are truncated to at most 64 token ids.
    encoded = tokenizer(batch["transcription"], truncation=True, max_length=64)
    batch["labels"] = encoded.input_ids
    return batch

def prepare_wer_eval_dataset(batch):
    """Add `input_features` and `labels` to one WER-evaluation example.

    No augmentation is applied. The reference transcription is lowercased
    first when the module-level `lower_case` flag is set.

    Returns:
        The same `batch` object with "input_features" and "labels" set.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    reference = batch["transcription"]
    reference = reference.lower() if lower_case else reference
    # References are truncated to at most 64 token ids.
    batch["labels"] = tokenizer(reference, truncation=True, max_length=64).input_ids
    return batch

# Training examples are augmented, validation examples are not; the raw
# "audio" and "transcription" columns are dropped after preprocessing.
for _split_name, _augment in (("train", True), ("validation", False)):
    bud500[_split_name] = bud500[_split_name].map(
        partial(prepare_dataset, _augment),
        remove_columns=["audio", "transcription"],
    )


def _load_wer_eval_stream(repo_id, split):
    """Stream `split` of `repo_id`, shuffle it (seed 42) and preprocess it for WER evaluation."""
    stream = load_dataset(repo_id, split=split, streaming=True)
    stream = stream.shuffle(seed=42)
    return stream.map(prepare_wer_eval_dataset, remove_columns=["audio", "transcription"])


# Two evaluation streams: the dataset's own validation split and a separate dataset.
bud500_wer_evaluation = _load_wer_eval_stream(dataset_name, "validation")

private_wer_evaluation = _load_wer_eval_stream("kanjiroreal/vietF_123", "train")

# The "test" split of bud500 is not preprocessed in this notebook.

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into one padded batch.

    Audio features are padded by `processor.feature_extractor`, label ids by
    `processor.tokenizer`. Padded label positions are replaced with -100.
    """

    # Object exposing `.feature_extractor.pad` and `.tokenizer.pad`.
    processor: Any
    # Token id dropped from the front of the labels when every row starts with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch for `features` with a "labels" entry added."""
        audio_items = []
        token_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            token_items.append({"input_ids": example["labels"]})

        collated = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_tokens = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Every position whose attention mask is not 1 is padding: mark it with -100.
        is_padding = padded_tokens.attention_mask.ne(1)
        target_ids = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # Strip the leading start token only when all rows begin with it.
        all_rows_start_with_bos = (target_ids[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_rows_start_with_bos:
            target_ids = target_ids[:, 1:]

        collated["labels"] = target_ids
        return collated

In [None]:
from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig

# A bitsandbytes quantization config is passed only when quantization is toggled on.
# NOTE(review): BitsAndBytesConfig() is created without arguments, so the
# quantization settings are whatever the library defaults to — confirm that
# this matches the intended setup.
quantization_kwargs = {}
if toggle_bit_quantization:
    quantization_kwargs["quantization_config"] = BitsAndBytesConfig()
model = WhisperForConditionalGeneration.from_pretrained(model_repo, **quantization_kwargs)

# Generation uses the notebook's language and task, with no forced decoder ids.
model.generation_config.forced_decoder_ids = None
model.generation_config.language = language
model.generation_config.task = task

In [None]:
from torch.utils.data import DataLoader

# One collator instance is shared by the trainer and the WER dataloaders.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)


def _make_wer_dataloader(dataset):
    """Wrap a WER-evaluation dataset in a DataLoader with batches of 128."""
    return DataLoader(dataset, batch_size=128, collate_fn=data_collator)


bud500_onfly_dataloader = _make_wer_dataloader(bud500_wer_evaluation)
private_onfly_dataloader = _make_wer_dataloader(private_wer_evaluation)

Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):

In [None]:
# Clear the forced decoder ids and empty the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from peft import prepare_model_for_kbit_training
# Run PEFT's k-bit training preparation only when the model was loaded with a
# quantization config (same toggle as the model-loading cell).
if toggle_bit_quantization:
    model = prepare_model_for_kbit_training(model) 

In [None]:
from peft import LoraConfig, PeftModel, get_peft_model, PeftConfig

# LoRA (rank, alpha) per model size: (8, 16) for "tiny" and (16, 32) for "base".
# The adapters are attached to the "q_proj" and "v_proj" modules, without bias terms.
_LORA_RANK_ALPHA = {"tiny": (8, 16), "base": (16, 32)}

# Fail early with a clear message: any other model_type would otherwise leave
# `config` undefined and surface as a NameError further down.
if model_type not in _LORA_RANK_ALPHA:
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected one of {sorted(_LORA_RANK_ALPHA)}"
    )

_lora_rank, _lora_alpha = _LORA_RANK_ALPHA[model_type]
config = LoraConfig(r=_lora_rank,
                    lora_alpha=_lora_alpha,
                    target_modules=["q_proj", "v_proj"],
                    bias="none")

model.enable_input_require_grads()
model = get_peft_model(model, config)
model.print_trainable_parameters()


## Distributed-training (Continuous training configuration)

In [None]:
import glob
def get_last_checkpoint(checkpoint_root=None):
    """Find the checkpoint directory with the highest step number.

    Args:
        checkpoint_root: Directory containing `checkpoint-<step>` entries.
            Defaults to `PEFT_qLoRa_Whisper_{model_type}`.

    Returns:
        A tuple `(checkpoint_path, step)` where `step` is the step number as a string.

    Raises:
        IndexError: If no `checkpoint-<step>` entry exists under `checkpoint_root`.
    """
    if checkpoint_root is None:
        checkpoint_root = f"PEFT_qLoRa_Whisper_{model_type}"

    numbered = []
    for path in glob.glob(f"{checkpoint_root}/checkpoint-*"):
        # Split on the LAST dash so dashes elsewhere in the path cannot be
        # mistaken for the step separator.
        step = path.rsplit("-", 1)[-1]
        if step.isdigit():
            numbered.append((int(step), path))

    if not numbered:
        raise IndexError(f"no checkpoint-<step> entries found under {checkpoint_root!r}")

    # Compare numerically so that e.g. checkpoint-1000 ranks above checkpoint-200.
    _, checkpoint_dir = max(numbered, key=lambda item: item[0])
    return checkpoint_dir, checkpoint_dir.rsplit("-", 1)[-1]

In [None]:
!rm -rf PEFT_qLoRa_Whisper_tiny && git clone https://huggingface.co/Reunoze565231/PEFT_qLoRa_Whisper_tiny/

In [None]:
from accelerate import Accelerator

# Locate the newest checkpoint in the cloned repository. When the repository
# holds no checkpoint yet, get_last_checkpoint() raises IndexError; treat that
# as "nothing to resume" so a first training run can proceed.
try:
    last_checkpoint, checkpoint_no = get_last_checkpoint()
except IndexError:
    last_checkpoint, checkpoint_no = None, None

accelerator = Accelerator(project_dir=last_checkpoint)

if last_checkpoint is not None:
    # Adapter weights live in the "adapter_model" sub-folder of the checkpoint.
    peft_model_id = f"{last_checkpoint}/adapter_model"

    config = PeftConfig.from_pretrained(f"{last_checkpoint}/")

    # NOTE(review): `model` was already wrapped by get_peft_model() in the LoRA
    # cell, so this wraps a PEFT model a second time — confirm this is intended.
    model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)

model.print_trainable_parameters()

### Define the Training Configuration

In [None]:
from transformers import Seq2SeqTrainingArguments
import multiprocessing

# Half of the available CPU cores are used as dataloader workers.
num_workers = multiprocessing.cpu_count() // 2

# Arguments that are identical in the testing phase and the real training run.
_shared_training_kwargs = dict(
    logging_strategy="steps",
    logging_dir="./temp_logs",
    save_strategy="steps",
    gradient_checkpointing=True,
    logging_steps=10,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    eval_strategy="steps",
    fp16=True,
    max_steps=20_000,
    generation_max_length=64,
    report_to=["tensorboard"],
    dataloader_num_workers=num_workers,
    dataloader_pin_memory=True,
    remove_unused_columns=False,
    weight_decay=1e-3,
    lr_scheduler_type= "cosine",
    metric_for_best_model="loss",
    hub_token=os.environ["HUGGINGFACE_TOKEN"],
    label_names=["labels"],
    hub_model_id=target_repo,
)

if is_testing_phase:
    # Testing phase: small batches and frequent evaluation.
    _phase_training_kwargs = dict(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        save_steps=500,
        warmup_steps=10,
        eval_steps=10,
    )
else:
    # Real run: large batches, explicit seed, and only one checkpoint kept.
    _phase_training_kwargs = dict(
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        save_steps=200,
        warmup_steps=500,
        eval_steps=100,
        ignore_data_skip =True,
        seed=seed,
        save_total_limit = 1,
        # load_best_model_at_end=True,
    )

training_args = Seq2SeqTrainingArguments(**_shared_training_kwargs, **_phase_training_kwargs)

In [None]:
# Start from an empty log directory: delete ./temp_logs and recreate it.
%rm -rf ./temp_logs && mkdir ./temp_logs

In [None]:
from torch.utils.tensorboard import SummaryWriter
# Writer on ./temp_logs; the OnflyEvaluation callback logs its WER scalars through it.
writer = SummaryWriter("./temp_logs")
# Load the TensorBoard notebook extension and open the dashboard on the same directory.
%load_ext tensorboard
%tensorboard --logdir ./temp_logs

In [None]:
from pyngrok import ngrok

# The ngrok auth token is a credential and must not be stored in the notebook:
# it is read from the NGROK_AUTHTOKEN environment variable, in the same way the
# Hugging Face token is read from HUGGINGFACE_TOKEN.
ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])

# Open a tunnel to local port 6006 and show its public URL.
public_url = ngrok.connect(6006)
print("Ngrok Tunnel URL:", public_url)

In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from huggingface_hub import upload_folder
import evaluate

# Word-error-rate metric; OnflyEvaluation calls metric.compute() on the decoded
# predictions and references.
metric = evaluate.load("wer")

class SavePeftModelToHubCallback(TrainerCallback):
    """On every checkpoint save, store the adapter, upload the checkpoint folder and delete it locally."""

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Upload `<output_dir>/<prefix>-<global_step>` to `args.hub_model_id`.

        The model passed in `kwargs["model"]` is saved into an "adapter_model"
        sub-folder first, and a `pytorch_model.bin` file in the checkpoint
        folder is removed if present. After the upload the local checkpoint
        folder is deleted.

        Returns:
            The unchanged `control` object.
        """
        import shutil

        checkpoint_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        local_checkpoint = os.path.join(args.output_dir, checkpoint_name)

        # Save the adapter next to the trainer's own checkpoint files.
        kwargs["model"].save_pretrained(os.path.join(local_checkpoint, "adapter_model"))

        # Drop pytorch_model.bin from the checkpoint before uploading.
        full_weights_file = os.path.join(local_checkpoint, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)

        print(f"🚀 Uploading checkpoint {local_checkpoint} to Hugging Face Hub...")
        upload_folder(
            repo_id=args.hub_model_id,
            folder_path=local_checkpoint,
            path_in_repo=checkpoint_name,
            commit_message=f"Checkpoint at {state.global_step}",
        )

        # The uploaded copy is the one kept; remove the local folder.
        shutil.rmtree(local_checkpoint, ignore_errors=True)
        return control

class OnflyEvaluation(TrainerCallback):
    """Compute WER over a dataloader every `per_steps` training steps.

    The score is written to the module-level TensorBoard `writer` under
    `eval/<metric_prefix>/wer` and appended to the trainer's `log_history`.
    """

    def __init__(self, per_steps, tokenizer, data_loader, metric_prefix):
        """
        Args:
            per_steps: Evaluate whenever `global_step` is a positive multiple of this.
            tokenizer: Tokenizer used to decode generated ids and label ids.
            data_loader: Iterable of batches with "input_features" and "labels".
            metric_prefix: Name used in the logged metric keys.
        """
        self.per_steps = per_steps
        self.tokenizer = tokenizer
        self.data_loader = data_loader
        self.metric_prefix = metric_prefix
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def on_step_end(self, args, state, control, **kwargs):
        """Run the WER evaluation when the current step is due."""
        if state.global_step <= 0 or (state.global_step % self.per_steps) != 0:
            return

        model = kwargs["model"]
        was_training = model.training
        model.eval()
        predictions, references = [], []

        try:
            with torch.no_grad():
                for batch in self.data_loader:
                    inputs = batch["input_features"].to(self.device)
                    # The notebook's collator marks padded label positions with
                    # -100, which is not a token id: swap it for the pad token
                    # before decoding.
                    labels = batch["labels"]
                    labels = labels.masked_fill(labels == -100, self.tokenizer.pad_token_id)

                    outputs = model.generate(inputs)

                    predictions.extend(self.tokenizer.batch_decode(outputs, skip_special_tokens=True))
                    references.extend(self.tokenizer.batch_decode(labels, skip_special_tokens=True))
        finally:
            # Put the model back into the mode it was in, even if generation failed.
            model.train(was_training)

        # Nothing to score when the dataloader produced no batches.
        if not references:
            return

        # NOTE(review): references prepared by prepare_wer_eval_dataset are
        # lowercased when `lower_case` is set, while predictions are compared
        # as generated — confirm whether predictions should be lowercased too.
        wer_score = metric.compute(predictions=predictions, references=references)
        writer.add_scalar(f'eval/{ self.metric_prefix}/wer', wer_score, global_step=state.global_step)
        state.log_history.append({"step": state.global_step, f"wer/{self.metric_prefix}": wer_score})

def _make_onfly_callback(data_loader, metric_prefix):
    """Build an OnflyEvaluation callback that runs every `in_fly_evaluation_freq` steps."""
    return OnflyEvaluation(
        per_steps=in_fly_evaluation_freq,
        tokenizer=tokenizer,
        data_loader=data_loader,
        metric_prefix=metric_prefix,
    )


bud500_onfly_callback = _make_onfly_callback(bud500_onfly_dataloader, "bud500")

private_onfly_callback = _make_onfly_callback(private_onfly_dataloader, "private")

# Build the trainer first, then hand it to the accelerator.
_seq2seq_trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=bud500["train"],
    eval_dataset=bud500["validation"],
    tokenizer=processor.feature_extractor,
    callbacks=[
        SavePeftModelToHubCallback(),
        bud500_onfly_callback,
        private_onfly_callback,
    ],
)
trainer = accelerator.prepare(_seq2seq_trainer)

# The decoder cache is switched off on the model config after the trainer is built.
model.config.use_cache = False

In [None]:
from huggingface_hub import list_repo_files
import warnings

# Silence all Python warnings for the duration of training.
warnings.filterwarnings("ignore")

# Files currently stored in the target hub repository.
repo_files = list_repo_files(target_repo)

# A repository with at most one file is treated as empty, i.e. a first run.
# Resuming also requires a local checkpoint path to resume from.
if len(repo_files) <= 1 or not last_checkpoint:
    trainer.train()
else:
    print(f"Continuous training from checkpoint {checkpoint_no}")
    trainer.train(resume_from_checkpoint=last_checkpoint)