In [9]:
import librosa
import numpy as np
import whisper
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, Audio

In [4]:
def load_audio_and_targets(libri_speech_dir):
  libri_dict = {}
  for root, dirs, files in os.walk(libri_speech_dir):
    for file in files:
      if file.endswith('.txt'):
        target_path = os.path.join(root, file)
        with open(target_path, 'r') as f:
          targets =  f.readlines()
          for target in targets:
            target = target.split()
            file = os.path.join(root, target[0]+".mp3")
            target = " ".join(target[1:])
            libri_dict[file] = target
  return libri_dict

In [5]:
def tag_names_in_transcripts(names_dict, df):
    tagged_transcripts = {}
    for key, val in names_dict.items():
        words = val.lower().split()
        tagged_words = []  

        for i in range(len(df)):
            name = df.iloc[i, 0].lower()
            if name in words:
                tag = 'PER'
                tagged_words.append((name, tag))

        if tagged_words:
            tagged_transcripts[key] = tagged_words

    return tagged_transcripts

In [6]:
df = pd.read_csv('/home/g7/Desktop/baby-names.csv')
df.drop(['year', 'percent', 'sex'], inplace=True, axis=1)
df = df[df['name']!='Will']
df = df.head(329)
df

Unnamed: 0,name
0,John
1,William
2,James
3,Charles
4,George
...,...
325,Archibald
326,Caleb
327,Clint
328,Dudley


In [7]:
libri_dict = load_audio_and_targets("/home/g7/Desktop/LibriSpeech/train-clean-100/")
len(libri_dict)

28539

In [8]:
names_dict = {}
for i in range(len(df)):
  for key, val in libri_dict.items():
    if df.iloc[i, 0].lower() in val.lower().split():
      names_dict[key] = val

names_dict

{'/home/g7/Desktop/LibriSpeech/train-clean-100/4018/107312/4018-107312-0013.mp3': "YOU SENT UP FOR SNICKS I HAVE KNOWN YOU MAN AND BOY JOHN HILL THESE TWENTY SUMMERS AND NEVER HEARD A WORD AGAINST YOU TILL YOU GOT INTO SHUFFLE AND SCREW'S MILL OH THEY ARE A BAD YARN JOHN",
 '/home/g7/Desktop/LibriSpeech/train-clean-100/332/128985/332-128985-0079.mp3': 'TRIBUTOR AND A GREAT MANY OF THE PIRATES WERE KILLED OR TAKEN AND ROC THE BRAZILIAN HAD A TERRIBLE FALL THIS MOST MEMORABLE FALL OCCURRED IN THE ESTIMATION OF JOHN ESQUEMELING',
 '/home/g7/Desktop/LibriSpeech/train-clean-100/332/128985/332-128985-0087.mp3': 'BUT WHICH ARE NECESSARY TO MAKE UP THE TRUE CHARACTER OF A PIRATE THE HISTORIAN JOHN SEEMS TO HAVE BEEN VERY MUCH CUT UP BY THE MANNER IN WHICH HIS FAVORITE HERO HAD ROUNDED OFF HIS PIRATICAL CAREER',
 '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57158/5789-57158-0026.mp3': 'HE WAS OF COURSE ALTOGETHER UNCONSCIOUS WHAT GRAND THINGS HIS COUSIN JOHN HAD INTENDED TO DO BY HIM HAD 

In [9]:
ground_truth_annotations = tag_names_in_transcripts(names_dict, df)
print(len(ground_truth_annotations))
print(ground_truth_annotations)
len(names_dict)

3880
{'/home/g7/Desktop/LibriSpeech/train-clean-100/4018/107312/4018-107312-0013.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/332/128985/332-128985-0079.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/332/128985/332-128985-0087.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57158/5789-57158-0026.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57158/5789-57158-0033.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57195/5789-57195-0013.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57195/5789-57195-0015.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57195/5789-57195-0017.mp3': [('john', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/5789/57195/5789-57195-0025.mp3': [('john', 'PER'), ('general', 'PER')], '/home/g7/Desktop/LibriSpeech/train-clean-100/3857/180923/3857-180923-0014.mp3': [('john',

3880

In [10]:
data = pd.DataFrame(list(names_dict.items()), columns=['Audio', 'Sentence'])
data

Unnamed: 0,Audio,Sentence
0,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,YOU SENT UP FOR SNICKS I HAVE KNOWN YOU MAN AN...
1,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,TRIBUTOR AND A GREAT MANY OF THE PIRATES WERE ...
2,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,BUT WHICH ARE NECESSARY TO MAKE UP THE TRUE CH...
3,/home/g7/Desktop/LibriSpeech/train-clean-100/5...,HE WAS OF COURSE ALTOGETHER UNCONSCIOUS WHAT G...
4,/home/g7/Desktop/LibriSpeech/train-clean-100/5...,AH YES HE DIDN'T LIKE TO SAY UNCLE BECAUSE REG...
...,...,...
3875,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,MISTER DUDLEY HASTENED ROUND TO PREVENT THEIR ...
3876,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,HAD BEEN FAIRLY DRESSED FOR CHRISTMAS BY SPIRI...
3877,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,IN A TREMULOUS BUT VERY KIND VOICE GIVE YOUR M...
3878,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,OR DO PRISON LABOR ANY WORDS OF MINE WOULD BE ...


# Finetuning

In [11]:
df_train, df_test = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
df_train

Unnamed: 0,Audio,Sentence
1071,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,STUDIED AFTER THE WAR AT THE BERKELEY DIVINITY...
3777,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,EVEN SHOULD I BREAK ONE OF THEM WITH MY FIRST ...
2632,/home/g7/Desktop/LibriSpeech/train-clean-100/6...,WHICH HE HAD IMPARTED TO HER OF AN ARCH IT MUS...
3665,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,IN THIS WAY THEY HAVE THE ADVANTAGE OF THE PRI...
2453,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,I WENT IMMEDIATELY AND FOUND THE HISTORIC WAR ...
...,...,...
1130,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,MISSUS TEETER STOOD UP AND STEPPED ASIDE THEN ...
1294,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,AND COOL OFF BEFORE PLUNGING INTO THE EXCITEME...
860,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,THE CHANNINGS THEMSELVES WERE SILENT THEY COUL...
3507,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,AND THE FEW FINAL MINUTES BEFORE NOON PASSED B...


In [13]:
df_test

Unnamed: 0,Audio,Sentence
1826,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,HIS REWARD FOR THIS UNUSUAL DEMONSTRATION WAS ...
211,/home/g7/Desktop/LibriSpeech/train-clean-100/5...,I UNDERSTAND DAN WELL I DON'T RETURNED UNCLE J...
3768,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,AND PLACED IT UPON THE BLOTTING PAD BEFORE ME ...
410,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,THE ATMOSPHERE THROUGHOUT THE NEIGHBORHOOD WHE...
1188,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,MISTER OPOSSUM BUT PETER WASN'T LISTENING THE ...
...,...,...
2715,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,HAVING VOYAGED TWO HUNDRED AND SIXTY SEVEN MIL...
1114,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,I AM GLAD IT IS NO ONE BUT YOU PETER FOR I WAS...
2991,/home/g7/Desktop/LibriSpeech/train-clean-100/8...,KNOWING WHAT MIGHT BE EXPECTED FROM THE COMPAN...
175,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,AND NO SUCH NOBLE FAMILY SIGNOR MERRICK UNCLE ...


In [14]:
dataset = DatasetDict()
dataset["train"] = Dataset.from_pandas(df_train).cast_column("Audio", Audio())
dataset["test"] = Dataset.from_pandas(df_test).cast_column("Audio", Audio())
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['Audio', 'Sentence'],
        num_rows: 3104
    })
    test: Dataset({
        features: ['Audio', 'Sentence'],
        num_rows: 776
    })
})

In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
input_str = dataset["train"][0]["Sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


2024-10-05 01:56:44.710482: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-05 01:56:44.731806: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-05 01:56:44.731826: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-05 01:56:44.732398: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-05 01:56:44.736072: I tensorflow/core/platform/cpu_feature_guar

Input:                 STUDIED AFTER THE WAR AT THE BERKELEY DIVINITY SCHOOL AND HAS BEEN FOR MANY YEARS RECTOR OF SAINT JOHN'S CHURCH IN BRIDGEPORT LIEUTENANT AND BREVET CAPTAIN LEWIS W MUNGER
Decoded w/ special:    <|startoftranscript|><|en|><|transcribe|><|notimestamps|>STUDIED AFTER THE WAR AT THE BERKELEY DIVINITY SCHOOL AND HAS BEEN FOR MANY YEARS RECTOR OF SAINT JOHN'S CHURCH IN BRIDGEPORT LIEUTENANT AND BREVET CAPTAIN LEWIS W MUNGER<|endoftext|>
Decoded w/out special: STUDIED AFTER THE WAR AT THE BERKELEY DIVINITY SCHOOL AND HAS BEEN FOR MANY YEARS RECTOR OF SAINT JOHN'S CHURCH IN BRIDGEPORT LIEUTENANT AND BREVET CAPTAIN LEWIS W MUNGER
Are equal:             True


In [18]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
dataset = dataset.cast_column("Audio", Audio(sampling_rate=16000))

In [20]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["Audio"]
    
    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["Sentence"]).input_ids
    return batch


In [21]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=1)

Map:   0%|          | 0/3104 [00:00<?, ? examples/s]

Map:   0%|          | 0/776 [00:00<?, ? examples/s]

In [22]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3104
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 776
    })
})

In [24]:
model.generation_config.language = "english"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [25]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [26]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [27]:
import evaluate

metric = evaluate.load("wer")
#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

In [28]:
# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # replace -100 with the pad_token_id
#     label_ids[label_ids == -100] = tokenizer.pad_token_id

#     # we do not want to group tokens when computing the metrics
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     wer = 100 * metric.compute(predictions=pred_str, references=label_str)

#     return {"wer": wer}


In [29]:
# from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     output_dir="./names-whisper-en-spectrogram-pitch-shifted",  # change to a repo name of your choice
#     per_device_train_batch_size=16,
#     gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
#     learning_rate=1e-5,
#     warmup_steps=500,
#     max_steps=5000,
#     gradient_checkpointing=True,
#     fp16=True,
#     evaluation_strategy="steps",
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     generation_max_length=225,
#     save_steps=1000,
#     eval_steps=1000,
#     logging_steps=25,
#     report_to=["tensorboard"],
#     load_best_model_at_end=True,
#     metric_for_best_model="wer",
#     greater_is_better=False,
#     push_to_hub=True,
#     push_to_hub_model_id="names-whisper-en-spectrogram-pitch-shifted",
# )


In [28]:
# from transformers import Seq2SeqTrainer

# trainer = Seq2SeqTrainer(
#     args=training_args,
#     model=model,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["test"],
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     tokenizer=processor.feature_extractor,
# )


In [29]:
# trainer.train()

In [30]:
# kwargs = {
#     "dataset_tags": "Libri",
#     "dataset": "LibriSpeech",  # a 'pretty' name for the training dataset
#     "dataset_args": "config: en, split: test",
#     "language": "en",
#     "model_name": "names Whisper small",  # a 'pretty' name for your model
#     "finetuned_from": "openai/whisper-small",
#     "tasks": "automatic-speech-recognition",
# }


In [31]:
# trainer.save_model() 
# trainer.push_to_hub()
# tokenizer.push_to_hub("names-whisper-en-spectrogram-pitch-shifted") 


# Evaluate Using NER Metric

#

## Using bert-base-NER

In [33]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")

Some weights of the model checkpoint at dslim/bert-base-NER-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
from transformers import pipeline

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.993952, 'index': 4, 'word': 'wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.997895, 'index': 9, 'word': 'berlin', 'start': 34, 'end': 40}]


In [34]:
# pred_entities=[]
# doc_entities = []
# for entity in ner_results:
#     # Normalize B-PER and I-PER to PER and include only if PER
#     if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
#         doc_entities.append((entity['word'], 'PER'))
# if doc_entities:  # Only append if there are PER entities in the document
#     pred_entities.append(doc_entities)
# print("pred_entity: ", pred_entities)

In [None]:
# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # Replace -100 (used to mask padding in labels) with the pad_token_id
#     label_ids[label_ids == -100] = tokenizer.pad_token_id
50
                
#                 full_word = entity['word'].replace('##', '')
#                 previous_type = entity['entity']

#             elif entity['entity'] == 'I-PER' and previous_type in ['B-PER', 'I-PER']:
#                 full_word += ' ' + entity['word'].replace('##', '')

#         if previous_type in ['B-PER', 'I-PER'] and full_word:
#             print(f" - Text: {full_word}, Type: 'PERSON'")

#     wer = 100 * metric.compute(predictions=pred_str, references=label_str)

#     return {"wer": wer}

In [35]:
# true_entitiess = []
# for key, value in ground_truth_annotations.items():
#     doc_annotations = [entity for entity in value if entity[1] == 'PER']
#     true_entitiess.append(doc_annotations)
# print("pred_entity: ", true_entitiess)

In [80]:
# from seqeval.metrics import precision_score, recall_score, f1_score

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # Handle padding masking
#     label_ids[label_ids == -100] = tokenizer.pad_token_id

#     # Decode predictions and labels from their token IDs
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     # Assuming NER model 'nlp' outputs structured data for NER
#     for text in pred_str:
#         print("input to ner: ", text)
#         ner_results = nlp(text)

#     print(ner_results)
#     pred_entities = []
#     for result in ner_results:
#         doc_entities = []
#         for entity in result:
#             # Normalize B-PER and I-PER to PER and include only if PER
#             if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
#                 doc_entities.append((entity['word'], 'PER'))
#         if doc_entities:  # Only append if there are PER entities in the document
#             pred_entities.append(doc_entities)
#         print("pred_entity: ", pred_entities)

#     # true_entities = []
#     # for key in ground_truth_annotations:
#     #     # Each document's annotations
#     #     doc_annotations = [tuple(entity) for entity in ground_truth_annotations[key]]
#     #     true_entities.append(doc_annotations)

#     true_entities = []
#     for key, value in ground_truth_annotations.items():
#         doc_annotations = [entity for entity in value if entity[1] == 'PER']
#         true_entities.append(doc_annotations)

#     # Compute NER metrics
#     print("computing ner metrics")
#     precision = precision_score(true_entities, pred_entities)
#     recall = recall_score(true_entities, pred_entities)
#     f1 = f1_score(true_entities, pred_entities)

#     # Calculate WER if needed
#     print("computing wer metric")
#     wer = 100 * metric.compute(predictions=pred_str, references=label_str)

#     return {"precision": precision, "recall": recall, "f1": f1, "wer": wer}


In [35]:
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Handle padding masking
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode predictions and labels from their token IDs
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    

    pred_entities = []
    for result in pred_str:
        result = nlp(result)
        doc_entities = []
        for entity in result:
            # Normalize B-PER and I-PER to PER and include only if PER
            if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
                doc_entities.append((entity['word'], 'PER'))
        # if doc_entities:  # Only append if there are PER entities in the document
        pred_entities.append(len(doc_entities))
        # print("pred_entity: ", pred_entities)

    true_entities = []
    for result in label_str:
        result = nlp(result)
        doc_entities = []
        for entity in result:
            # Normalize B-PER and I-PER to PER and include only if PER
            if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
                doc_entities.append((entity['word'], 'PER'))
        # if doc_entities:  # Only append if there are PER entities in the document
        true_entities.append(len(doc_entities))
        # print("true_entity: ", true_entities)

    ratio = 0
    count = 0
    for i in range(len(pred_entities)):
        if true_entities[i]:
            ratio += 100 * (pred_entities[i] / true_entities[i])
            count += 1
    ratio /= count

    # # Assuming NER model 'nlp' outputs structured data for NER
    # for text in pred_str:
    #     print("input to ner: ", text)
    #     ner_results = nlp(text)

    # print(ner_results)
    # pred_entities = []
    # for result in ner_results:
    #     doc_entities = []
    #     for entity in result:
    #         # Normalize B-PER and I-PER to PER and include only if PER
    #         if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
    #             doc_entities.append((entity['word'], 'PER'))
    #     if doc_entities:  # Only append if there are PER entities in the document
    #         pred_entities.append(doc_entities)
    #     print("pred_entity: ", pred_entities)

    # true_entities = []
    # for key in ground_truth_annotations:
    #     # Each document's annotations
    #     doc_annotations = [tuple(entity) for entity in ground_truth_annotations[key]]
    #     true_entities.append(doc_annotations)



    # true_entities = []
    # for text in label_str:
    #     doc = nlp(text)
    #     doc_entities = []
    #     for ent in doc.ents:
    #         # Append entity with IOB tagging
    #         if ent.start == 0 or doc[ent.start - 1].ent_type_ != ent.label_:
    #             doc_entities.append(f"B-{ent.label_}")
    #         else:
    #             doc_entities.append(f"I-{ent.label_}")
    #         for token in doc[ent.start + 1:ent.end]:
    #             doc_entities.append(f"I-{ent.label_}")
    #     true_entities.append(doc_entities)


    # true_entities = []
    # for key, value in ground_truth_annotations.items():
    #     doc_annotations = [entity for entity in value if entity[1] == 'PER']
    #     true_entities.append(doc_annotations)

    # # Compute NER metrics
    # print("computing ner metrics")
    # precision = precision_score(true_entities, pred_entities)
    # recall = recall_score(true_entities, pred_entities)
    # f1 = f1_score(true_entities, pred_entities)

    # Calculate WER if needed
    print("computing wer metric")
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"ner percent": ratio, "wer": wer}


In [60]:
# # ground_entities = ner_ground(names_dict)

# import spacy
# from seqeval.metrics import precision_score, recall_score, f1_score
# from seqeval.scheme import IOB2

# # Load spaCy NER model
# new_nlp = spacy.load("en_core_web_sm")

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # Handle padding masking; adjust depending on your tokenizer's specifics
#     label_ids[label_ids == -100] = tokenizer.pad_token_id

#     # Decode predictions and labels from their token IDs
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     pred_entities = []
#     for text in pred_str:
#         doc = new_nlp(text)
#         doc_entities = []
#         for ent in doc.ents:
#             # Append entity with IOB tagging
#             if ent.start == 0 or doc[ent.start - 1].ent_type_ != ent.label_:
#                 doc_entities.append(f"B-{ent.label_}")
#             else:
#                 doc_entities.append(f"I-{ent.label_}")
#             for token in doc[ent.start + 1:ent.end]:
#                 doc_entities.append(f"I-{ent.label_}")
#         pred_entities.append(doc_entities)


#     true_entities = []
#     for text in label_str:
#         doc = new_nlp(text)
#         doc_entities = []
#         for ent in doc.ents:
#             # Append entity with IOB tagging
#             if ent.start == 0 or doc[ent.start - 1].ent_type_ != ent.label_:
#                 doc_entities.append(f"B-{ent.label_}")
#             else:
#                 doc_entities.append(f"I-{ent.label_}")
#             for token in doc[ent.start + 1:ent.end]:
#                 doc_entities.append(f"I-{ent.label_}")
#         true_entities.append(doc_entities)


#     # Compute NER metrics
#     precision = precision_score(true_entities, pred_entities, mode='strict', scheme=IOB2)
#     recall = recall_score(true_entities, pred_entities, mode='strict', scheme=IOB2)
#     f1 = f1_score(true_entities, pred_entities, mode='strict', scheme=IOB2)

#     # Calculate WER
#     print("computing wer metric")
#     wer = 100 * metric.compute(predictions=pred_str, references=label_str)

#     return {"precision": precision, "recall": recall, "f1": f1, "wer": wer}


In [36]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./names-whisper-en-spectrogram-original",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    push_to_hub_model_id="names-whisper-en-spectrogram-original",
)




In [37]:
# from transformers import Seq2SeqTrainer

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["test"],
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer,
# )

# trainer.train()

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs


In [38]:
trainer.train()

  0%|          | 0/5000 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 0.668, 'grad_norm': 7.672530651092529, 'learning_rate': 4.800000000000001e-07, 'epoch': 0.13}
{'loss': 0.5516, 'grad_norm': 4.618879795074463, 'learning_rate': 9.800000000000001e-07, 'epoch': 0.26}
{'loss': 0.4715, 'grad_norm': 3.5204668045043945, 'learning_rate': 1.48e-06, 'epoch': 0.39}
{'loss': 0.3971, 'grad_norm': 3.969933032989502, 'learning_rate': 1.98e-06, 'epoch': 0.52}
{'loss': 0.29, 'grad_norm': 3.4956040382385254, 'learning_rate': 2.4800000000000004e-06, 'epoch': 0.64}
{'loss': 0.243, 'grad_norm': 3.4974381923675537, 'learning_rate': 2.9800000000000003e-06, 'epoch': 0.77}
{'loss': 0.2284, 'grad_norm': 3.3919765949249268, 'learning_rate': 3.48e-06, 'epoch': 0.9}
{'loss': 0.2031, 'grad_norm': 2.5976808071136475, 'learning_rate': 3.980000000000001e-06, 'epoch': 1.03}
{'loss': 0.1736, 'grad_norm': 3.031116247177124, 'learning_rate': 4.48e-06, 'epoch': 1.16}
{'loss': 0.178, 'grad_norm': 2.6710314750671387, 'learning_rate': 4.980000000000001e-06, 'epoch': 1.29}
{'loss': 0

  0%|          | 0/97 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


computing wer metric
{'eval_loss': 0.1413373053073883, 'eval_ner percent': 104.83144462666647, 'eval_wer': 5.9865607819181434, 'eval_runtime': 197.788, 'eval_samples_per_second': 3.923, 'eval_steps_per_second': 0.49, 'epoch': 5.15}




{'loss': 0.0081, 'grad_norm': 0.3105965852737427, 'learning_rate': 8.835555555555557e-06, 'epoch': 5.28}
{'loss': 0.0101, 'grad_norm': 0.7461782097816467, 'learning_rate': 8.78e-06, 'epoch': 5.41}
{'loss': 0.0081, 'grad_norm': 0.19892141222953796, 'learning_rate': 8.724444444444445e-06, 'epoch': 5.54}
{'loss': 0.0079, 'grad_norm': 0.4248500466346741, 'learning_rate': 8.66888888888889e-06, 'epoch': 5.67}
{'loss': 0.009, 'grad_norm': 0.44265782833099365, 'learning_rate': 8.613333333333333e-06, 'epoch': 5.8}
{'loss': 0.0089, 'grad_norm': 2.658320903778076, 'learning_rate': 8.557777777777778e-06, 'epoch': 5.93}
{'loss': 0.008, 'grad_norm': 0.1356877237558365, 'learning_rate': 8.502222222222223e-06, 'epoch': 6.06}
{'loss': 0.0056, 'grad_norm': 0.29185619950294495, 'learning_rate': 8.446666666666668e-06, 'epoch': 6.19}
{'loss': 0.0048, 'grad_norm': 0.11643055081367493, 'learning_rate': 8.391111111111112e-06, 'epoch': 6.31}
{'loss': 0.0052, 'grad_norm': 1.0602757930755615, 'learning_rate': 8.

  0%|          | 0/97 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


computing wer metric
{'eval_loss': 0.1528462916612625, 'eval_ner percent': 104.72556193332642, 'eval_wer': 5.894929749541845, 'eval_runtime': 198.4524, 'eval_samples_per_second': 3.91, 'eval_steps_per_second': 0.489, 'epoch': 10.31}




{'loss': 0.0016, 'grad_norm': 0.04660412669181824, 'learning_rate': 6.613333333333334e-06, 'epoch': 10.44}
{'loss': 0.0014, 'grad_norm': 0.04812489449977875, 'learning_rate': 6.557777777777778e-06, 'epoch': 10.57}
{'loss': 0.0014, 'grad_norm': 0.04586305841803551, 'learning_rate': 6.502222222222223e-06, 'epoch': 10.7}
{'loss': 0.0018, 'grad_norm': 0.055737338960170746, 'learning_rate': 6.446666666666668e-06, 'epoch': 10.82}
{'loss': 0.0014, 'grad_norm': 0.05116792023181915, 'learning_rate': 6.391111111111111e-06, 'epoch': 10.95}
{'loss': 0.0018, 'grad_norm': 0.047065962105989456, 'learning_rate': 6.3377777777777786e-06, 'epoch': 11.08}
{'loss': 0.0017, 'grad_norm': 0.036227475851774216, 'learning_rate': 6.282222222222223e-06, 'epoch': 11.21}
{'loss': 0.0013, 'grad_norm': 0.03936515003442764, 'learning_rate': 6.2266666666666675e-06, 'epoch': 11.34}
{'loss': 0.002, 'grad_norm': 0.06453337520360947, 'learning_rate': 6.171111111111112e-06, 'epoch': 11.47}
{'loss': 0.0011, 'grad_norm': 0.04

  0%|          | 0/97 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


computing wer metric
{'eval_loss': 0.16284844279289246, 'eval_ner percent': 105.30742579377389, 'eval_wer': 5.976379556098554, 'eval_runtime': 199.3962, 'eval_samples_per_second': 3.892, 'eval_steps_per_second': 0.486, 'epoch': 15.46}




{'loss': 0.0007, 'grad_norm': 0.01992686279118061, 'learning_rate': 4.393333333333334e-06, 'epoch': 15.59}
{'loss': 0.0007, 'grad_norm': 0.019534708932042122, 'learning_rate': 4.337777777777778e-06, 'epoch': 15.72}
{'loss': 0.0007, 'grad_norm': 0.019361864775419235, 'learning_rate': 4.282222222222222e-06, 'epoch': 15.85}
{'loss': 0.0007, 'grad_norm': 0.028621623292565346, 'learning_rate': 4.226666666666667e-06, 'epoch': 15.98}
{'loss': 0.0006, 'grad_norm': 0.01895475760102272, 'learning_rate': 4.171111111111111e-06, 'epoch': 16.11}
{'loss': 0.0006, 'grad_norm': 0.018665099516510963, 'learning_rate': 4.115555555555556e-06, 'epoch': 16.24}
{'loss': 0.0006, 'grad_norm': 0.01781301200389862, 'learning_rate': 4.060000000000001e-06, 'epoch': 16.37}
{'loss': 0.0007, 'grad_norm': 0.019011227414011955, 'learning_rate': 4.004444444444445e-06, 'epoch': 16.49}
{'loss': 0.0006, 'grad_norm': 0.01836339943110943, 'learning_rate': 3.948888888888889e-06, 'epoch': 16.62}
{'loss': 0.0006, 'grad_norm': 0.

  0%|          | 0/97 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


computing wer metric
{'eval_loss': 0.16904863715171814, 'eval_ner percent': 104.92190924358158, 'eval_wer': 5.976379556098554, 'eval_runtime': 197.8879, 'eval_samples_per_second': 3.921, 'eval_steps_per_second': 0.49, 'epoch': 20.62}




{'loss': 0.0005, 'grad_norm': 0.014578601345419884, 'learning_rate': 2.1711111111111113e-06, 'epoch': 20.75}
{'loss': 0.0005, 'grad_norm': 0.014961343258619308, 'learning_rate': 2.1155555555555557e-06, 'epoch': 20.88}
{'loss': 0.0004, 'grad_norm': 0.012206647545099258, 'learning_rate': 2.06e-06, 'epoch': 21.01}
{'loss': 0.0004, 'grad_norm': 0.015218980610370636, 'learning_rate': 2.0044444444444446e-06, 'epoch': 21.13}
{'loss': 0.0004, 'grad_norm': 0.012669587507843971, 'learning_rate': 1.948888888888889e-06, 'epoch': 21.26}
{'loss': 0.0004, 'grad_norm': 0.013777726329863071, 'learning_rate': 1.8933333333333333e-06, 'epoch': 21.39}
{'loss': 0.0004, 'grad_norm': 0.011579757556319237, 'learning_rate': 1.837777777777778e-06, 'epoch': 21.52}
{'loss': 0.0005, 'grad_norm': 0.013147405348718166, 'learning_rate': 1.7822222222222225e-06, 'epoch': 21.65}
{'loss': 0.0004, 'grad_norm': 0.01393717247992754, 'learning_rate': 1.7266666666666667e-06, 'epoch': 21.78}
{'loss': 0.0004, 'grad_norm': 0.0138

  0%|          | 0/97 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


computing wer metric
{'eval_loss': 0.17204536497592926, 'eval_ner percent': 105.02856453368396, 'eval_wer': 5.989954523858006, 'eval_runtime': 198.7518, 'eval_samples_per_second': 3.904, 'eval_steps_per_second': 0.488, 'epoch': 25.77}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 7733.903, 'train_samples_per_second': 10.344, 'train_steps_per_second': 0.647, 'train_loss': 0.028067765967547894, 'epoch': 25.77}


TrainOutput(global_step=5000, training_loss=0.028067765967547894, metrics={'train_runtime': 7733.903, 'train_samples_per_second': 10.344, 'train_steps_per_second': 0.647, 'total_flos': 2.30868320256e+19, 'train_loss': 0.028067765967547894, 'epoch': 25.77319587628866})

In [39]:
kwargs = {
    "dataset_tags": "Libri",
    "dataset": "LibriSpeech",  # a 'pretty' name for the training dataset
    "dataset_args": "config: en, split: test",
    "language": "en",
    "model_name": "names Whisper small",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}


In [40]:
trainer.save_model() 
trainer.push_to_hub()
tokenizer.push_to_hub("names-whisper-en-spectrogram-original") 


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618

README.md:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shahd237/names-whisper-en-spectrogram-original/commit/784d80ef5f6cdcbdf53f11adf94e7195218556cd', commit_message='Upload tokenizer', commit_description='', oid='784d80ef5f6cdcbdf53f11adf94e7195218556cd', pr_url=None, pr_revision=None, pr_num=None)

# Testing

In [6]:
# Replace 'your-username/the-name-you-picked' with the appropriate model identifier
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

finetuned_model = WhisperForConditionalGeneration.from_pretrained("shahd237/names-whisper-en-spectrogram-original")
finetuned_processor = WhisperProcessor.from_pretrained("shahd237/names-whisper-en-spectrogram-original")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Testing on random 1000 of Librispeech

In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")
nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import librosa
import numpy as np
import whisper
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, Audio

In [9]:
#load data from random_1000_from_librispeech.csv
df_specs = pd.read_csv('random_1000_from_librispeech.csv')
df_specs

Unnamed: 0.1,Unnamed: 0,audio,sentence
0,0,/home/g7/Desktop/LibriSpeech/train-clean-100/5...,Olenin was as happy as a boy of twelve tie it ...
1,1,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,I could just make out that he had a book as we...
2,2,/home/g7/Desktop/LibriSpeech/train-clean-100/2...,And what a centrepiece it was it required the ...
3,3,/home/g7/Desktop/LibriSpeech/train-clean-100/6...,The iron was rusty the leather torn the wood w...
4,4,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,Will satisfy my everlasting hatred my courage ...
...,...,...,...
995,995,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,The piano bard the piano rhapsodist the piano ...
996,996,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,Then another and a different horror fell to my...
997,997,/home/g7/Desktop/LibriSpeech/train-clean-100/5...,But the cold drove us out and making a large f...
998,998,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,Only i beg it shall not be before midnight


In [10]:
# df_specs['sentence'] = df_specs['sentence'].apply(lambda x: x.capitalize())

df_specs = df_specs.sample(frac=1, random_state=42).reset_index(drop=True)
df_specs

Unnamed: 0.1,Unnamed: 0,audio,sentence
0,521,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,Love and water brought back all her strength s...
1,737,/home/g7/Desktop/LibriSpeech/train-clean-100/1...,That the captain determined to run into wigwam...
2,740,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,Growled sam kitteridge bitterly resenting the ...
3,660,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,Sir sidney colvin regrets that the love letter...
4,411,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,Leading their young full fledged and about as ...
...,...,...,...
995,106,/home/g7/Desktop/LibriSpeech/train-clean-100/4...,And unconscious of the danger stood her ground...
996,270,/home/g7/Desktop/LibriSpeech/train-clean-100/3...,And every knight shall have a squire and two y...
997,860,/home/g7/Desktop/LibriSpeech/train-clean-100/7...,His appearance was welcomed by a joyful shout ...
998,435,/home/g7/Desktop/LibriSpeech/train-clean-100/2...,Well isabel you must be aware that it is an aw...


In [12]:
import torch
from transformers import pipeline
from jiwer import wer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

pipe = pipeline(
    "automatic-speech-recognition",
    model=finetuned_model,
    tokenizer=finetuned_processor.tokenizer,
    feature_extractor=finetuned_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

n = len(df_specs)
wer_accumulator = []
cer_accumulator = []
pred_entities = []
predictions = []
sentences = []

for i in range (n):
  sample = df_specs['audio'][i]
  prediction = pipe(sample)["text"]
  print(i, sep=' ')
  #list of predictions
  predictions.append(prediction)
  #list of true sentences
  sentences.append(df_specs['sentence'][i])
  #calculate WER
  wer_ans = wer(prediction.lower(),df_specs['sentence'][i].lower())
  wer_accumulator.append(wer_ans)

# Calculate WER
avg_wer = 100 * (sum(wer_accumulator) / len(wer_accumulator))

print(f"Average Word Error Rate: {avg_wer:.2f}%")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [19]:

#  Calculate NER
#align first 
import jiwer

# make all predictions and sentences lowercase
predictions = [prediction.lower() for prediction in predictions]
sentences = [sentence.lower() for sentence in sentences]
out = jiwer.process_words(
    predictions,
    sentences,
)
visualization = jiwer.visualize_alignment(out)
ref_list = []
hyp_list = []

print("out:", out)
print("visualize: ",jiwer.visualize_alignment(out))
lines = visualization.split('\n')
for line in lines:
    if line.startswith("REF:"):
        ref_list.append(line.strip().replace("REF: ", ""))
    elif line.startswith("HYP:"):
        hyp_list.append(line.strip().replace("HYP: ", ""))

# Now you have REF and HYP lines in two separate lists
print("References:", ref_list)
print("Hypotheses:", hyp_list)


visualize:  sentence 2
REF: that the captain determined to run into ****** wigwamcove this is a snug little harbour not far from **** capehorn and here at christmas eve we anchored in smooth water the only thing which reminded us of the gale outside
HYP: that the captain determined to run into wigwam       cove this is a snug little harbour not far from cape     horn and here at christmas eve we anchored in smooth water the only thing which reminded us of the gale outside
                                                  I          S                                               I        S                                                                                                           

sentence 5
REF: leading their young full fledged and about as large and strong as the parents squirrels dry and elastic after the storms were busy about their stores of pine nuts and the latest     golden rods were still in bloom though it was now past the middle of october
HYP: leading their y

In [20]:
# calculate NER for each true sentence
true_entities = []
for i in range(len(ref_list)):
    result = nlp(ref_list[i])
    doc_entities = []
    for entity in result:
        # Normalize B-PER and I-PER to PER and include only if PER
        if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
            doc_entities.append((entity['word'], 'PER'))
    # if doc_entities:  # Only append if there are PER entities in the document
    true_entities.append(doc_entities)
print("len true_entity: ", len(true_entities))
print("true_entities: ", true_entities)

len true_entity:  712
true_entities:  [[], [], [], [('isabel', 'PER')], [], [], [], [], [], [], [], [], [], [('tr', 'PER'), ('##ogen', 'PER'), ('##s', 'PER')], [], [('mer', 'PER'), ('##ce', 'PER')], [], [], [('miss', 'PER'), ('gram', 'PER'), ('##ont', 'PER')], [], [('to', 'PER'), ('christ', 'PER'), ('##ol', 'PER')], [('fl', 'PER'), ('##ins', 'PER'), ('##tones', 'PER')], [('diana', 'PER')], [], [], [], [('epa', 'PER'), ('##nine', 'PER'), ('as', 'PER')], [], [('ut', 'PER'), ('##o', 'PER')], [('pr', 'PER'), ('##udence', 'PER'), ('margaret', 'PER'), ('pr', 'PER'), ('##udence', 'PER')], [], [], [('carly', 'PER'), ('##le', 'PER'), ('barbara', 'PER'), ('ma', 'PER'), ('##mma', 'PER')], [('sc', 'PER'), ('##ag', 'PER'), ('##g', 'PER'), ('##zy', 'PER')], [('eric', 'PER'), ('gunn', 'PER'), ('be', 'PER'), ('##orn', 'PER')], [], [('margaret', 'PER')], [], [], [], [], [], [], [], [], [], [], [('am', 'PER'), ('##nity', 'PER')], [('miss', 'PER'), ('##us', 'PER'), ('bran', 'PER'), ('##nan', 'PER')], [('

In [21]:
#merge names to one word if they are in the same entity depending on the hashtags #
true_entities_merged = []
for i in range(len(true_entities)):
    doc_entities = true_entities[i]
    doc_entities_merged = []
    for j in range(len(doc_entities)):
        if doc_entities[j][1] == 'PER':
            doc_entities_merged.append((doc_entities[j][0], 'PER'))
            while(j+1 < len(doc_entities) and doc_entities[j+1][1] == 'PER' and doc_entities[j+1][0][0] =='#'):
                #remove the hashtag and merge the names
                doc_entities_merged[-1] = (doc_entities_merged[-1][0] + doc_entities[j+1][0][2:], 'PER')
                doc_entities[j+1] = (doc_entities[j+1][0], '*')
                j += 1
            # doc_entities_merged.append(doc_entities[j])
    true_entities_merged.append(doc_entities_merged)

print("true_entities_merged: ", true_entities_merged)  
total_length = sum(len(u) for u in true_entities_merged)

true_entities_merged:  [[], [], [], [('isabel', 'PER')], [], [], [], [], [], [], [], [], [], [('trogens', 'PER')], [], [('merce', 'PER')], [], [], [('miss', 'PER'), ('gramont', 'PER')], [], [('to', 'PER'), ('christol', 'PER')], [('flinstones', 'PER')], [('diana', 'PER')], [], [], [], [('epanine', 'PER'), ('as', 'PER')], [], [('uto', 'PER')], [('prudence', 'PER'), ('margaret', 'PER'), ('prudence', 'PER')], [], [], [('carlyle', 'PER'), ('barbara', 'PER'), ('mamma', 'PER')], [('scaggzy', 'PER')], [('eric', 'PER'), ('gunn', 'PER'), ('beorn', 'PER')], [], [('margaret', 'PER')], [], [], [], [], [], [], [], [], [], [], [('amnity', 'PER')], [('missus', 'PER'), ('brannan', 'PER')], [('hyacinth', 'PER'), ('coronel', 'PER'), ('udo', 'PER')], [('d', 'PER'), ("'", 'PER'), ('artagnan', 'PER')], [('neilson', 'PER'), ('billy', 'PER'), ('alice', 'PER'), ('greggory', 'PER')], [], [], [('cassie', 'PER'), ('grandfather', 'PER')], [], [('elizabeth', 'PER')], [], [], [], [('rory', 'PER')], [('agatha', 'PER'

In [22]:
#add the other words add (add entity * to it) to the true entities merged list to have the same length as the predictions
final_true_entities_merged = [] #list of lists of tuples (word, entity) for each sentence for all the words in the sentence
for i in range(len(true_entities_merged)): #for each sentence 
    doc_entities = true_entities_merged[i] 
    ref_sentence = ref_list[i].split()
    k = 0

    final_true_entities = []
    for j in range(len(ref_sentence)):
        if (len(doc_entities) == k or doc_entities[k][0].lower() not in ref_sentence[j].lower()): #if the word is not in the entities list add it with entity *
            # print(ref_sentence[j], doc_entities[k][0])
            final_true_entities.append((ref_sentence[j], '*'))
        else:
            #if the word is a substring of the entity word, add the original word to the list iinstead of the entity word (WITH THE ENTITY)
            final_true_entities.append((ref_sentence[j], doc_entities[k][1]))
            k += 1
    final_true_entities_merged.append(final_true_entities)
print("true_entities_merged: ", final_true_entities_merged)
    



In [23]:
for i in range(len(final_true_entities_merged)):
    ref_sentence = ref_list[i].split()
    doc_entities = final_true_entities_merged[i]

    for j in range(len(ref_sentence)):
        if (len(ref_sentence[j]) != len(doc_entities[j][0])):
            print("ref_sentence: ", ref_sentence[j])
            print("doc_entities: ", doc_entities[j][0])

In [24]:
# calculate NER for each prediction
pred_entities = []
for i in range(len(hyp_list)):
    result = nlp(hyp_list[i])
    doc_entities = []
    for entity in result:
        # Normalize B-PER and I-PER to PER and include only if PER
        if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
            doc_entities.append((entity['word'], 'PER'))
        else:
            doc_entities.append((entity['word'], '*'))
    # if doc_entities:  # Only append if there are PER entities in the document
    pred_entities.append(doc_entities)
print("len pred_entities: ", len(pred_entities))
print("pred_entities: ", pred_entities)

len pred_entities:  712
pred_entities:  [[('wig', '*'), ('##wa', '*'), ('##m', '*'), ('cove', '*'), ('cape', '*'), ('horn', '*')], [], [], [('paris', '*'), ('isabel', 'PER')], [('mari', '*')], [], [], [], [], [], [('ry', 'PER'), ('##nch', 'PER')], [], [], [('trojan', '*'), ('fe', '*'), ('##e', '*'), ('rome', '*'), ('lap', '*'), ('##o', '*'), ('salt', '*'), ('##ere', '*'), ('##llo', '*')], [], [('mer', 'PER'), ('##ce', 'PER')], [], [], [('miss', 'PER'), ('gram', 'PER'), ('##mont', 'PER')], [], [('##r', '*'), ('old', 'PER'), ('toll', 'PER'), ('cr', 'PER'), ('##iste', 'PER'), ('##l', 'PER')], [('far', '*'), ('east', '*'), ('great', '*'), ('ocean', '*'), ('mankind', '*')], [('diana', 'PER')], [], [], [], [('ep', 'PER'), ('##oni', 'PER'), ('##ne', 'PER')], [], [('ud', '*'), ('##o', '*')], [('pr', 'PER'), ('##udence', 'PER'), ('marguerite', 'PER'), ('delicious', 'PER'), ('pr', 'PER'), ('##udence', 'PER')], [], [], [('carly', 'PER'), ('##le', 'PER'), ('barbara', 'PER'), ('ma', 'PER'), ('##mma

In [25]:
#merge names to one word if they are in the same entity depending on the hashtags #
pred_entities_merged = []
for i in range(len(pred_entities)):
    doc_entities = pred_entities[i]
    doc_entities_merged = []
    for j in range(len(doc_entities)):
        if doc_entities[j][1] == 'PER':
            doc_entities_merged.append((doc_entities[j][0], 'PER'))
            while(j+1 < len(doc_entities) and doc_entities[j+1][1] == 'PER' and doc_entities[j+1][0][0] =='#'):
                #remove the hashtag and merge the names
                doc_entities_merged[-1] = (doc_entities_merged[-1][0] + doc_entities[j+1][0][2:], 'PER')
                doc_entities[j+1] = (doc_entities[j+1][0], '*')
                j += 1
            # doc_entities_merged.append(doc_entities[j])
    pred_entities_merged.append(doc_entities_merged)

print("pred_entities_merged: ", pred_entities_merged)

pred_entities_merged:  [[], [], [], [('isabel', 'PER')], [], [], [], [], [], [], [('rynch', 'PER')], [], [], [], [], [('merce', 'PER')], [], [], [('miss', 'PER'), ('grammont', 'PER')], [], [('old', 'PER'), ('toll', 'PER'), ('cristel', 'PER')], [], [('diana', 'PER')], [], [], [], [('eponine', 'PER')], [], [], [('prudence', 'PER'), ('marguerite', 'PER'), ('delicious', 'PER'), ('prudence', 'PER')], [], [], [('carlyle', 'PER'), ('barbara', 'PER'), ('mamma', 'PER')], [('skaggsy', 'PER')], [('eric', 'PER'), ('gunnbiorn', 'PER')], [], [('margaret', 'PER')], [], [], [], [], [], [], [], [], [], [], [], [('missus', 'PER'), ('brannan', 'PER')], [('hyacinth', 'PER'), ('coronel', 'PER'), ('udo', 'PER')], [('d', 'PER'), ("'", 'PER'), ('artagnan', 'PER')], [('neilson', 'PER'), ('billy', 'PER'), ('alice', 'PER'), ('greggory', 'PER')], [], [], [('verry', 'PER'), ('cassy', 'PER'), ('grandfather', 'PER')], [], [('elizabeth', 'PER')], [('hetty', 'PER')], [], [], [], [('agatha', 'PER')], [], [], [], [], []

In [26]:
#add the other words add (add entity * to it) to the true entities merged list to have the same length as the predictions
final_pred_entities_merged = [] #list of lists of tuples (word, entity) for each sentence for all the words in the sentence
for i in range(len(pred_entities_merged)): #for each sentence 
    doc_entities = pred_entities_merged[i] 
    hyp_sentence = hyp_list[i].split()
    k = 0

    final_pred_entities = []
    for j in range(len(hyp_sentence)):
        if (len(doc_entities) == k or doc_entities[k][0].lower() not in hyp_sentence[j].lower()): #if the word is not in the entities list add it with entity *
            # print(ref_sentence[j], doc_entities[k][0])
            final_pred_entities.append((hyp_sentence[j], '*'))
        else:
            #if the word is a substring of the entity word, add the original word to the list iinstead of the entity word (WITH THE ENTITY)
            final_pred_entities.append((hyp_sentence[j], doc_entities[k][1]))
            k += 1
    final_pred_entities_merged.append(final_pred_entities)
print("final_pred_entities_merged: ", final_pred_entities_merged)



In [27]:
# Function to extract only the tags and replace '*' with 'O'
def extract_tags(nested_list):
    tags = []
    for sublist in nested_list:
        # Collect tags from each tuple in the sublist, replace '*' with 'O'
        sublist_tags = ['O' if tag == '*' else 'I-' + tag for word, tag in sublist]
        if sublist_tags:  # Only add non-empty lists
            tags.append(sublist_tags)
    return tags


In [28]:
true_extracted_tags = extract_tags(final_true_entities_merged)
predicted_extracted_tags = extract_tags(final_pred_entities_merged)

In [29]:
#calculate precision, recall, f1 score
from seqeval.metrics import precision_score, recall_score, f1_score

precision = precision_score(true_extracted_tags, predicted_extracted_tags)
recall = recall_score(true_extracted_tags, predicted_extracted_tags)
f1 = f1_score(true_extracted_tags, predicted_extracted_tags)

print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)


Precision:  0.8147058823529412
Recall:  0.7194805194805195
F1:  0.7641379310344827
