In [1]:
import psutil
import torch
import argparse
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets, load_dataset
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, \
    Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
#######################         GPU CHECK           #########################

print("GPU Available: ", torch.cuda.is_available())

GPU Available:  True


In [3]:
from huggingface_hub import notebook_login
import datasets

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
class args_class():

    model_name = 'openai/whisper-small.en' # which hf model to fine tune on

    language = 'English' # which language model will be adpated to in full, Camel Case

    sampling_rate = 16000 # audio sampling rate

    num_proc = 8 # 'Number of parallel jobs to run. Helps parallelize the dataset prep stage.

    train_strategy = 'epochs' # Training strategy. Choose between steps and epoch.
    learning_rate = 1e-5 # Learning rate for the fine-tuning process. Kind of a measure of how "fast" the model tunes its weights
    warmup = 500 # Number of warmup steps. These steps are done with a smaller learning rate to minimise the impact of devaiting model due to sudden new data exposure
    train_batchsize = 64 # Batch size during the training phase.
    test_batchsize = 64 # Batch size during the testing phase.
    num_epochs = 1 # Number of epochs to train for. (if on epoch strategy)
    num_steps = 100000 # Number of steps to train for.

    resume_from_ckpt = None # Path to a trained checkpoint to resume training from.

    output_dir = r"C:\Users\userAdmin\Desktop\whisper_final" # Output directory for the checkpoints generated.

    train_datasets = [r"C:\Users\userAdmin\Desktop\part2_no_repeats", r"C:\Users\userAdmin\Desktop\part1_2_no_repeat" ] # List of local datasets to be used for training.
    
    test_datasets = [] # List of HF datasets to be used for testing.
 
    combine_and_shuffle = True # set to true to combine all train and test datasets, shuffle and then split into new random train / test split
    test_size= 0.2 # ratio of test / train dataset when split (if combine_and_shuffle is true)

# instantiate args_class
args = args_class()

In [5]:
# gradient checkpointing helps reduce memory footprint, while increasingly training time marginally
# TO DO
gradient_checkpointing = True
freeze_feature_encoder = False
freeze_encoder = False


# do_normalize_eval will compute wer of the the normalized model output (ie all lower, case no punctuation). This means the model is not faulted for case or punctuation during evaluation
do_normalize_eval = True

# these are modifiers for the training data.
# do_lower_case converts all training data to lower case
# do_remove_punctuation will remove all puntuation

do_lower_case = False
do_remove_punctuation = False

# this is a function that normalizes text for us. It  does the following
# 1. Remove any phrases between matching brackets ([, ]).
# 2. Remove any phrases between matching parentheses ((, )).
# 3. Replace any markers, symbols, and punctuation characters with a space, i.e. when the Unicode category of each character in the NFKC-normalized string starts with M, S, or P.
# 4. make the text lowercase.
# 5. replace any successive whitespace characters with a space

# TO DO: verify if this works
normalizer = BasicTextNormalizer()

In [6]:
#############################       MODEL LOADING       #####################################

feature_extractor = WhisperFeatureExtractor.from_pretrained(args.model_name)
tokenizer = WhisperTokenizer.from_pretrained(args.model_name, language=args.language, task="transcribe")
processor = WhisperProcessor.from_pretrained(args.model_name, language=args.language, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(args.model_name)

if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

if freeze_feature_encoder:
    model.freeze_feature_encoder()

if freeze_encoder:
    model.freeze_encoder()
    model.model.encoder.gradient_checkpointing = False

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

if gradient_checkpointing:
    model.config.use_cache = False

In [7]:
###########################        DATASET LOADING AND PREP        ##########################

def load_custom_dataset(split):
    ds = []
    if split == 'train':
        for dset in args.train_datasets:
            ds.append(load_from_disk(dset))
    if split == 'test':
        for dset in args.test_datasets:
            ds.append(load_from_disk(dset))

    ds_to_return = concatenate_datasets(ds)
    ds_to_return = ds_to_return.shuffle(seed=22)
    return ds_to_return

class TokenizerWrapper:
    def __init__(self, processor, normalizer):
        self.processor = processor
        self.do_lower_case = do_lower_case
        self. do_remove_punctuation = do_remove_punctuation
        self.normalizer = normalizer
    
    def prepare_dataset(self, batch):
    
        # load and (possibly) resample audio data to 16kHz
        audio = batch["audio"]

        # compute log-Mel input features from input audio array
        batch["input_features"] = self.processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
        # compute input length of audio sample in seconds
        batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]

        # optional pre-processing steps
        transcription = batch["sentence"]
        if self.do_lower_case:
            transcription = transcription.lower()
        if self.do_remove_punctuation:
            transcription = self.normalizer(transcription).strip()

        # encode target text to label ids
        batch["labels"] = self.processor.tokenizer(transcription).input_ids
        
        return batch




max_label_length = model.config.max_length
min_input_length = 0.0
max_input_length = 30.0

class FilterWrapper:

    def __init__(self, max_label_length, min_input_length, max_input_length):

        self.max_label_length = max_label_length
        self.min_input_length = min_input_length
        self.max_input_length = max_input_length

    def is_in_length_range(self, length, labels, sentence):
        return self.min_input_length < length < self.max_input_length and 0 < len(labels) < self.max_label_length and 0 < len(sentence)


print('DATASET PREPARATION IN PROGRESS...')

# Case 1: train and test are both provided, no combine_and_shuffle. Note that if combine_and_shuffle is False, both train and test are guranteed to be provided (else I would have gotten an error in cell 4)
if not(args.combine_and_shuffle):
    raw_dataset = DatasetDict()
    raw_dataset["train"] = load_custom_dataset('train')
    raw_dataset["test"] = load_custom_dataset('test')

# case 2: combine_and_shuffle is true, train and test both provided
elif len(args.test_datasets) > 0:
    # load both datasets
    train_set = load_custom_dataset('train')
    test_set = load_custom_dataset('test')

    # combine both datasets
    combined_dataset = concatenate_datasets(train_set, test_set)

    # split dataset
    raw_dataset = DatasetDict()
    raw_dataset = combined_dataset.train_test_split(test_size = args.test_size, shuffle=True, seed=42)

# case 3: combine_and_shuffle is true, only train provided
else:

    train_set = load_custom_dataset('train')

    raw_dataset = train_set.train_test_split(test_size = args.test_size, shuffle=True, seed=42)

raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
print("Raw Dataset: ")
print(raw_dataset)    

DATASET PREPARATION IN PROGRESS...


Loading dataset from disk:   0%|          | 0/99 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/27 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [8]:
## ONLY RUN THIS CELL TO AUGMENT WITH DATA FROM HUGGINGFACE ##

## THIS WILL MERGE THE TRAIN AND TEST SPLITS OF RAW_DATASETS AND RESHUFFLE IT ##

# first load the extra data, change the path to the HF dataset as needed
extra_data = load_dataset("mozilla-foundation/common_voice_16_1", "en", split="train")

# next, lets do some pre-processing on the extra data to get it in the same format as the rest of the dataset
extra_data = extra_data.remove_columns(set(extra_data.features.keys()) - set(["audio", "sentence"]))

# next lets get a slice of the dataset (the whole dataset is too big, so lets just use 450k examples)
extra_data = extra_data.shuffle(seed=42)
extra_data = extra_data.select( [i for i in range(0, 450000)] )

# cast audio
print("Extra Data: "
print(extra_data)
extra_data = extra_data.cast_column("audio", Audio(sampling_rate=args.sampling_rate))

# perform any additional formatting on EXTRA data
def prepare_extra_dataset(batch):
  """Function to preprocess the dataset with the .map method"""
  transcription = batch["sentence"]
  
  if transcription.startswith('"') and transcription.endswith('"'):
    # we can remove trailing quotation marks as they do not affect the transcription
    transcription = transcription[1:-1]
  
  if transcription[-1] not in [".", "?", "!"]:
    # append a full-stop to sentences that do not end in punctuation
    transcription = transcription + "."
  
  batch["sentence"] = transcription
  
  return batch

extra_data = extra_data.map(prepare_dataset, num_proc=8)

print("Formatted extra data: ")
print(extra_data)

# now that our extra_data is ready, lets merge it with the rest of our dataset
raw_dataset = concatenate_datasets( [raw_dataset["train"], raw_dataset["test"], extra_data] )
raw_dataset = raw_dataset.shuffle(seed=22)

raw_dataset = raw_dataset.train_test_split(test_size = args.test_size, shuffle=True, seed=42)

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 450000
})
Dataset({
    features: ['audio', 'sentence'],
    num_rows: 450000
})


In [9]:
# now perform pre-processing on the entire dataset

print("Before Map:")
print(raw_dataset)


tokenizer_wrapper = TokenizerWrapper(processor, normalizer)

raw_dataset = raw_dataset.map(tokenizer_wrapper.prepare_dataset, num_proc=args.num_proc)

print("After Map:")
print(raw_dataset)

filter_wrapper = FilterWrapper(max_label_length, min_input_length, max_input_length)

raw_dataset = raw_dataset.filter(
    filter_wrapper.is_in_length_range,
    input_columns=["input_length", "labels", "sentence"],
    num_proc=args.num_proc,
)

print("After Filter:")
print(raw_dataset)

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 435141
})
Before Map:
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 708112
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 177029
    })
})
After Map:
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'input_length', 'labels'],
        num_rows: 708112
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_features', 'input_length', 'labels'],
        num_rows: 177029
    })
})
After Filter:
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'input_length', 'labels'],
        num_rows: 708111
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_features', 'input_length', 'labels'],
        num_rows: 177029
    })
})


In [10]:
###############################     DATA COLLATOR AND METRIC DEFINITION     ########################

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors

        # compute log-Mel input features from input audio arra
        #input_features = [{"input_features": self.processor.feature_extractor(feature["audio"]["array"], sampling_rate=feature["audio"]["sampling_rate"]).input_features[0]} for feature in features]
        
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        
        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print('DATASET PREPARATION COMPLETED')

DATASET PREPARATION COMPLETED


In [11]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    if do_normalize_eval:
        pred_str = [normalizer(pred) for pred in pred_str]
        label_str = [normalizer(label) for label in label_str]

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}



In [12]:
###############################     TRAINING ARGS AND TRAINING      ############################

if args.train_strategy == 'epochs' or args.train_strategy == 'epoch':
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.train_batchsize,
        gradient_accumulation_steps=1,
        learning_rate=args.learning_rate,
        warmup_steps=args.warmup,
        gradient_checkpointing=gradient_checkpointing,
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=args.num_epochs,
        save_total_limit=10,
        per_device_eval_batch_size=args.test_batchsize,
        predict_with_generate=True,
        generation_max_length=225,
        logging_steps=500,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        optim="adafactor",
        resume_from_checkpoint=args.resume_from_ckpt,
        #remove_unused_columns=False
    )

elif args.train_strategy == 'steps' or args.train_strategy == 'step':
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.train_batchsize,
        gradient_accumulation_steps=1,
        learning_rate=args.learning_rate,
        warmup_steps=args.warmup,
        gradient_checkpointing=gradient_checkpointing,
        fp16=True,
        evaluation_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        max_steps=num_steps,
        save_total_limit=10,
        per_device_eval_batch_size=args.test_batchsize,
        predict_with_generate=True,
        generation_max_length=225,
        logging_steps=500,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        optim="adafactor",
        resume_from_checkpoint=args.resume_from_ckpt,

    )

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=raw_dataset["train"],
    eval_dataset=raw_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor
)

processor.save_pretrained(training_args.output_dir)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


[]

In [13]:
print('INITIAL EVAL IN PROGRESS...')
print(trainer.evaluate())
print('DONE INITIAL EVAL')

INITIAL EVAL IN PROGRESS...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'eval_loss': 4.494086742401123, 'eval_wer': 19.883646857017613, 'eval_runtime': 46808.8052, 'eval_samples_per_second': 3.782, 'eval_steps_per_second': 0.059}
DONE INITIAL EVAL


In [None]:
print('TRAINING IN PROGRESS...')
trainer.train()
print('DONE TRAINING')