In [1]:
import json
import os
import torch
from torch.utils.data import Dataset
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Data preparation: Each data point is annotated using the start and end token in format [label_name] label [label_name]
def prepare_data(jsonl_file, audio_folder):
    data = []
    with open(jsonl_file, 'r') as f:
        for i, line in enumerate(f):
            item = json.loads(line)
            text = item['text']
            entities = item['entities']
            # Insert entity tags
            for start, end, label in reversed(entities):
                text = text[:start] + f"[{label}] " + text[start:end] + f" [{label}]" + text[end:]
            audio_file = f"{audio_folder}/test{i+1}.wav"
            if not os.path.exists(audio_file):
                print(f"Warning: Audio file not found: {audio_file}")
                continue
            data.append({
                'audio': audio_file,
                'text': text,
            })
    return data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Custom dataset class
class SpokenNERDataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        item = self.data[idx]
        audio_file = item['audio']
        text = item['text']
        # Load and process audio using librosa with sample rate as 16000
        speech, _ = librosa.load(audio_file, sr=16000)
        input_features = self.processor(speech, sampling_rate=16000, return_tensors="pt").input_features # Extracting the audio features 
        labels = self.processor(text=text, return_tensors="pt").input_ids  # Process text
        return {
            "input_features": input_features.squeeze(),
            "labels": labels.squeeze(),
        }

In [None]:
train_data = prepare_data("/content/Final_training_data.jsonl", "/content/Audio_Files_for_training")
test_data = prepare_data("/content/Final_testing_data.jsonl", "/content/Audio_Files_for_training")
eval_data = prepare_data("/content/Final_evaluation_data.jsonl", "/content/Audio_Files_for_training")
print(f"Train dataset size: {len(train_data)}")
print(f"Test dataset size: {len(test_data)}")
print(f"Eval dataset size: {len(eval_data)}")

Train dataset size: 137
Test dataset size: 39
Eval dataset size: 20


In [None]:
# Using Whisper base model and processor
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# Adding special tokens
special_tokens = ['[PERSON]', '[PHONE]', '[DATE]', '[CARDINAL]', '[GPE]', '[LOC]', '[MONEY]', '[ORG]', '[EMAIL]', '[CREDIT_CARD]', '[BANK_ACCOUNT]', '[CAR_PLATE]', '[NRIC]', '[PASSPORT_NUM]', '[TIME]']
processor.tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(processor.tokenizer))

train_dataset = SpokenNERDataset(train_data, processor)
test_dataset = SpokenNERDataset(test_data, processor)
eval_dataset = SpokenNERDataset(eval_data, processor)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# !pip install accelerate -U

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    fp16=False,  
    learning_rate=3e-5,
    num_train_epochs=3,
    max_steps=500,
    logging_steps=50,
    save_steps=100,
    eval_steps=100,
)

In [None]:
# Data collator function
def data_collator(features):
    input_features = [{"input_features": feature["input_features"]} for feature in features]
    labels = [feature["labels"] for feature in features]
    batch = processor.feature_extractor.pad(input_features, return_tensors="pt")
    batch['labels'] = processor.tokenizer.pad({"input_ids": labels}, return_tensors="pt")["input_ids"]
    return batch

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
100,0.4969,4.027322
200,0.0957,4.561027
300,0.0342,4.72169
400,0.0094,4.882617
500,0.0038,4.885138


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635

TrainOutput(global_step=500, training_loss=0.2596944146156311, metrics={'train_runtime': 328.8844, 'train_samples_per_second': 6.081, 'train_steps_per_second': 1.52, 'total_flos': 1.2699587248128e+17, 'train_loss': 0.2596944146156311, 'epoch': 14.285714285714286})

In [3]:
output_dir = "./ner_model_v2"
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)

In [4]:
processor = WhisperProcessor.from_pretrained(output_dir)
model = WhisperForConditionalGeneration.from_pretrained(output_dir)

def predict_entities(audio_file, processor, model):
    speech, _ = librosa.load(audio_file, sr=16000)
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features

    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]

    return transcription
audio_file = "Audio_Files_for_training/test1.wav"
transcription = predict_entities(audio_file, processor, model)
print(f"Transcription with entities: {transcription}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


: 