In [1]:
import os

# Uncomment the line below to hide all CUDA devices and run on CPU only

#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import pandas as pd

from datasets import Dataset as HFDataset, Audio

In [3]:
def prep_df(root_path):
    """Build the utterance table for the corpus stored under ``root_path``.

    Reads ``<root_path>/utt_spk_text.tsv``. Each non-blank row holds three
    tab-separated fields: the utterance file name, a second field that is
    ignored here, and the transcript. The audio file for an utterance is
    expected at ``<root_path>/data/<first two characters of the file name>/
    <file name>.flac``; the path is only constructed, not checked for existence.

    Args:
        root_path: Directory containing ``utt_spk_text.tsv`` and ``data/``.

    Returns:
        A ``(df, dataset_dict)`` tuple. ``dataset_dict`` maps the column names
        ``"file_path"``, ``"folder_name"`` and ``"text"`` to parallel lists, and
        ``df`` is a ``pandas.DataFrame`` built from that same dictionary.

    Raises:
        ValueError: If a non-blank row has fewer than three tab-separated fields.
        OSError: If the TSV file cannot be opened.
    """
    tsv_file_path = os.path.join(root_path, "utt_spk_text.tsv")

    file_paths = []
    folder_names = []
    texts = []

    # Stream the file instead of materialising every line with readlines().
    with open(tsv_file_path, "r", encoding="utf-8") as tsv_file:
        for line_number, line in enumerate(tsv_file, start=1):
            row = line.strip()
            if not row:
                # Blank lines (e.g. a trailing newline at end of file) carry no data.
                continue

            # maxsplit=2 keeps any tab characters inside the transcript intact
            # instead of failing the three-way unpack.
            fields = row.split("\t", 2)
            if len(fields) != 3:
                raise ValueError(
                    f"{tsv_file_path}:{line_number}: expected 3 tab-separated "
                    f"fields, got {len(fields)}"
                )
            file_name, _, text = fields

            # Audio files are sharded into folders named after the first two
            # characters of the utterance file name.
            folder_name = file_name[:2]
            file_path = os.path.join(root_path, "data", folder_name, file_name + ".flac")

            file_paths.append(file_path)
            folder_names.append(folder_name)
            texts.append(text)

    # Column-oriented dictionary; the caller later hands it to HFDataset.from_dict.
    dataset_dict = {
        "file_path": file_paths,
        "folder_name": folder_names,
        "text": texts,
    }

    # Same data as a DataFrame for quick inspection.
    df = pd.DataFrame(dataset_dict)

    return df, dataset_dict


In [4]:
# Corpus root handed to prep_df, which looks for utt_spk_text.tsv and data/ beneath it.
OPENSLR_ROOT = "/home/venom/repo/Bengali.AI-Speech-Recognition/openslr/"
df, dataset_dict = prep_df(OPENSLR_ROOT)

In [5]:
# Plain-text preview of the first five rows of the utterance table.
print(df.head(n=5))

                                           file_path folder_name  \
0  /home/venom/repo/Bengali.AI-Speech-Recognition...          00   
1  /home/venom/repo/Bengali.AI-Speech-Recognition...          00   
2  /home/venom/repo/Bengali.AI-Speech-Recognition...          00   
3  /home/venom/repo/Bengali.AI-Speech-Recognition...          00   
4  /home/venom/repo/Bengali.AI-Speech-Recognition...          00   

                      text  
0  বাংলাদেশে দায়িত্ব নেবে  
1      এ ধরণের কার্ড নিয়ে  
2        হতে উপার্জিত অর্থ  
3    হাসির বিষয় হয়েই আছে  
4          সার্ক দেশগুলোতে  


In [6]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor


# The feature extractor, tokenizer and processor all come from the same base
# checkpoint; the tokenizer and processor are additionally configured for
# Bengali transcription.
WHISPER_CHECKPOINT = "openai/whisper-small"
WHISPER_PROMPT_KWARGS = {"language": "bengali", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
tokenizer = WhisperTokenizer.from_pretrained(WHISPER_CHECKPOINT, **WHISPER_PROMPT_KWARGS)
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT, **WHISPER_PROMPT_KWARGS)

In [7]:
# Wrap the column dictionary in a Hugging Face Dataset object
dataset_output = HFDataset.from_dict(dataset_dict)

# Cast the 'file_path' column to the 'Audio' feature so each row is decoded
# lazily into {"array", "sampling_rate", ...} when accessed.
# The sampling rate is pinned to 16 kHz because the data collator passes
# sampling_rate=16000 to the Whisper feature extractor; without pinning it,
# a file stored at any other rate would be decoded at its native rate and
# silently mislabelled as 16 kHz.
dataset = dataset_output.cast_column("file_path", Audio(sampling_rate=16000))

In [8]:
# Hold out 1% of the utterances for evaluation. The seed makes the split
# reproducible, so restarting the kernel (or resuming from a checkpoint)
# cannot move evaluation clips into the training set.
# NOTE: this rebinds `dataset` from a Dataset to a DatasetDict, so the cell
# must not be run twice without re-running the cast cell before it.
dataset = dataset.train_test_split(test_size=0.01, seed=42)

In [9]:
# Echo the DatasetDict so the notebook shows each split's features and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['file_path', 'folder_name', 'text'],
        num_rows: 216515
    })
    test: Dataset({
        features: ['file_path', 'folder_name', 'text'],
        num_rows: 2188
    })
})

In [10]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate raw dataset rows into a padded speech-to-text training batch.

    Each incoming row must provide ``row["file_path"]["array"]`` (the decoded
    waveform) and ``row["text"]`` (the transcript). Calling the collator
    returns the padded feature batch produced by
    ``processor.feature_extractor.pad`` with an extra ``"labels"`` entry whose
    padding positions are set to ``-100``.
    """

    # Supplies `.feature_extractor.pad`, `.tokenizer.pad` and `.tokenizer.bos_token_id`.
    processor: Any
    # Callable that turns one waveform into an object exposing `.input_features`.
    feature_extractor: Any
    # Callable that turns one transcript into an object exposing `.input_ids`.
    tokenizer: Any
    # Sampling rate announced to the feature extractor for every waveform.
    sampling_rate: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # --- audio side: per-row feature extraction, then batch padding ---
        audio_inputs = []
        for example in features:
            extracted = self.feature_extractor(
                example["file_path"]["array"],
                sampling_rate=self.sampling_rate,
            )
            audio_inputs.append({"input_features": extracted.input_features[0]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # --- text side: tokenize every transcript, then pad to the longest one ---
        tokenized = []
        for example in features:
            tokenized.append({"input_ids": self.tokenizer(example["text"]).input_ids})
        padded_labels = self.processor.tokenizer.pad(tokenized, return_tensors="pt")

        # Positions outside the attention mask are padding; mark them with -100
        # so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every sequence already begins with the BOS token, drop that
        # column here because it is appended again later.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


# Collator instance used by the trainer; 16000 is the rate reported to the
# feature extractor for every clip.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    sampling_rate=16000,
)

In [11]:
import evaluate
# Load the "wer" metric; compute_metrics calls metric.compute on decoded strings.
metric = evaluate.load("wer")

2023-07-30 14:44:30.449730: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-30 14:44:30.481240: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-07-30 14:44:33,807] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cpu (auto detect)


In [12]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids,
            or a tuple whose first element holds them) and ``label_ids``
            (reference token ids in which ignored positions are ``-100``).

    Returns:
        ``{"wer": value}`` where ``value`` is ``100 *`` the score returned by
        the module-level ``metric`` for the decoded predictions and references.

    Relies on the module-level ``tokenizer`` and ``metric`` objects.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some models return several outputs; the token ids come first.
        pred_ids = pred_ids[0]

    pad_token_id = tokenizer.pad_token_id

    # -100 is not a valid token id, so swap it for the pad token before
    # decoding. np.where builds new arrays, so the arrays owned by `pred`
    # (and therefore by the trainer) are never modified in place.
    label_ids = np.where(pred.label_ids == -100, pad_token_id, pred.label_ids)
    # Predictions can also contain -100 when batches of different lengths are
    # concatenated by the evaluation loop; guard against that as well.
    pred_ids = np.where(pred_ids == -100, pad_token_id, pred_ids)

    # Special tokens (including the pad tokens substituted above) are dropped
    # from the decoded text.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [13]:
from transformers import WhisperForConditionalGeneration

# Checkpoint that fine-tuning starts from. 8-bit loading is currently switched
# off: `load_in_8bit=True` is deliberately not passed to from_pretrained.
ASR_CHECKPOINT = "bangla-speech-processing/BanglaASR"
model = WhisperForConditionalGeneration.from_pretrained(ASR_CHECKPOINT)


In [14]:
# Clear any forced decoder token ids stored in the model config.
# NOTE(review): presumably so the language/task prompt comes from the
# tokenizer settings rather than the checkpoint's config — confirm.
model.config.forced_decoder_ids = None
# Empty the list of tokens suppressed during generation.
model.config.suppress_tokens = []

In [15]:
from peft import prepare_model_for_int8_training

# Run peft's int8-training preparation helper and rebind `model` to its result
# (re-running this cell applies the helper again to the already-prepared model).
# NOTE(review): the checkpoint is loaded without `load_in_8bit=True` (that
# argument is commented out in the model-loading cell), so confirm this call
# is still wanted for a non-quantised model.
model = prepare_model_for_int8_training(model)



In [16]:
import bitsandbytes as bnb

def find_all_linear_names(model, bits=8):
    """List the leaf names of the linear layers found in ``model``.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.
        bits: Chooses which layer class counts as "linear":
            ``4`` -> ``bnb.nn.Linear4bit``, ``8`` -> ``bnb.nn.Linear8bitLt``,
            anything else -> ``torch.nn.Linear``.

    Returns:
        A list (in arbitrary order, without duplicates) holding the last
        dot-separated component of every matching module's name, with
        ``'lm_head'`` excluded.
    """
    # Resolve the class with a lazy if/elif chain so bitsandbytes attributes
    # are only looked up for the bit width that was actually requested.
    if bits == 4:
        linear_cls = bnb.nn.Linear4bit
    elif bits == 8:
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    leaf_names = set()
    for qualified_name, submodule in model.named_modules():
        if not isinstance(submodule, linear_cls):
            continue
        # Keep only the part after the final dot ("a.b.q_proj" -> "q_proj").
        leaf_names.add(qualified_name.rsplit('.', 1)[-1])

    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [17]:
# Module names that receive LoRA adapters. Alternatives kept for reference:
#   target_modules = find_all_linear_names(model)
#   target_modules = ['q_proj', 'v_proj']
target_modules = [
    'k_proj',
    'fc2',
    'q_proj',
    'fc1',
    'out_proj',
    'v_proj',
]


In [18]:
from peft import LoraConfig,LoraConfig, get_peft_model

# LoRA hyper-parameters applied to every module named in `target_modules`.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=target_modules,
)


# Make the output of the input-embedding layer require gradients. Use the
# model's own helper when it has one; otherwise install an equivalent
# forward hook by hand.
if not hasattr(model, "enable_input_require_grads"):
    def make_inputs_require_grad(_module, _inputs, output):
        """Forward hook: flag the embedding output as requiring grad."""
        output.requires_grad_(True)

    input_embeddings = model.get_input_embeddings()
    input_embeddings.register_forward_hook(make_inputs_require_grad)
else:
    model.enable_input_require_grads()

In [19]:
# Wrap the model with the adapters described by `config` and rebind `model`
# to the wrapped version (re-running this cell would wrap it a second time).
model = get_peft_model(model, config)
# Print how many parameters are trainable versus the total count.
model.print_trainable_parameters()

trainable params: 3,244,032 || all params: 244,978,944 || trainable%: 1.3242085001395059


In [20]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- output and checkpointing ---
    output_dir="./whisper-small-bn",  # change to a repo name of your choice
    save_strategy="steps",
    save_steps=200,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # a lower "wer" is better
    # --- optimisation ---
    do_train=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=25,
    # max_steps=10,
    gradient_checkpointing=True,
    fp16=True,
    # use_ipex=True,
    # bf16=True,
    # --- evaluation ---
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    # --- logging ---
    logging_steps=25,
    report_to=["wandb", "tensorboard"],
    # --- PEFT compatibility ---
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)

In [21]:
from transformers import Seq2SeqTrainer


# Assemble the trainer from the pieces built in the earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

# Switch off the `use_cache` flag on the model config.
# NOTE(review): presumably because caching is incompatible with the gradient
# checkpointing enabled in the training arguments — confirm.
model.config.use_cache = False

In [22]:
# Mixed precision is already enabled through `fp16=True` in the training
# arguments, so the trainer manages autocast itself. The previous outer
# `torch.cuda.amp.autocast` wrapper put training, evaluation and generation
# inside a second, nested autocast region.
# NOTE(review): the recorded run aborted at the first evaluation (step 100)
# with "RuntimeError: expected scalar type BFloat16 but found Half". The
# nested autocast is the suspected cause — confirm on the target machine.
trainer.train()

2023-07-30 14:44:46,958 - wandb.jupyter - ERROR - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msupersecurehuman[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/40596 [00:00<?, ?it/s]



{'loss': 0.4476, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 0.3227, 'learning_rate': 9.993837963077075e-06, 'epoch': 0.0}
{'loss': 0.2951, 'learning_rate': 9.98767592615415e-06, 'epoch': 0.01}
{'loss': 0.2479, 'learning_rate': 9.981513889231224e-06, 'epoch': 0.01}


RuntimeError: expected scalar type BFloat16 but found Half