In [1]:
# This notebook uses https://github.com/openai/whisper, with edits to whisper_openAI/decoding.py so that decoding generates multiple hypotheses
import sys
import os
import argparse
import re
import json
import torch
import logging
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from datasets import load_dataset, DatasetDict, Audio
from dataclasses import dataclass
from typing import Dict, List, Union
from evaluate import load
from tqdm import tqdm
from datetime import datetime


In [4]:
# --- Held-out speaker -------------------------------------------------------
speaker_id = "M01"  # Example value; replace with the actual Speaker ID as needed
test_speaker = speaker_id  # the configured speaker doubles as the test speaker

# --- Optimisation settings --------------------------------------------------
learning_rate = 0.0001
train_batch_size = 4
eval_batch_size = 4
gradient_accumulation_steps = 2
optimizer = "adamw_torch"
lr_scheduler_type = "linear"
num_epochs = 20
seed = 42

# --- Run switches -----------------------------------------------------------
keep_all_data = False
debug = False
repo_suffix = ""

# Echo the whole configuration, one setting per line.
print(
    f"Speaker ID: {speaker_id}",
    f"Learning rate: {learning_rate}",
    f"Training batch size: {train_batch_size}",
    f"Evaluation batch size: {eval_batch_size}",
    f"Random seed: {seed}",
    f"Gradient accumulation steps: {gradient_accumulation_steps}",
    f"Optimizer type: {optimizer}",
    f"Learning rate scheduler type: {lr_scheduler_type}",
    f"Number of epochs: {num_epochs}",
    f"Keep all data: {keep_all_data}",
    f"Debug mode: {debug}",
    f"Repository suffix: {repo_suffix}",
    sep="\n",
)

Speaker ID: M01
Learning rate: 0.0001
Training batch size: 4
Evaluation batch size: 4
Random seed: 42
Gradient accumulation steps: 2
Optimizer type: adamw_torch
Learning rate scheduler type: linear
Number of epochs: 20
Keep all data: False
Debug mode: False
Repository suffix: 


In [5]:
import os
import logging

# Path to the TORGO metadata CSV. It is relative, so it is resolved against the
# working directory at the moment each cell that uses it runs.
torgo_csv_path = "data_preparation/torgo.csv"

# Report whether the CSV is reachable from the current working directory.
# os.path.isfile() is already False for a missing path, so no separate
# os.path.exists() check is needed.
if os.path.isfile(torgo_csv_path):
    print("The CSV file exists.")
else:
    print("The CSV file does not exist.")

torgo_dataset_path = '/work/van-speech-nlp/data/torgo'
# Directory form of the dataset path, always ending in '/'.
# str.endswith() is used instead of indexing [-1] so an empty path cannot raise
# IndexError.
torgo_dataset_dir_path = (
    torgo_dataset_path
    if torgo_dataset_path.endswith('/')
    else torgo_dataset_path + '/'
)
output_path = 'output'
print(f'torgo_dataset_path: {torgo_dataset_path}')
print(f'torgo_dataset_dir_path: {torgo_dataset_dir_path}')

# Repository name is derived from the held-out test speaker plus an optional suffix
repo_name = f'torgo_tiny_finetune_{test_speaker}{repo_suffix}'
repo_path = f'jindaxz/{repo_name}'

# Local path for saving the model / checkpoints: <output_path>/model/<repo_name>
model_local_path = output_path + '/model/' + repo_name

pretrained_model_name = "openai/whisper-tiny"

The CSV file does not exist.
torgo_dataset_path: /work/van-speech-nlp/data/torgo
torgo_dataset_dir_path: /work/van-speech-nlp/data/torgo/


In [6]:
# Move the working directory up one level. Presumably this is so that the relative
# paths and the local whisper_openAI package used by later cells resolve from the
# repository root -- TODO confirm.
%cd ..

/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/Whispering-LLaMA


In [7]:
import numpy
# The Whisper repo (https://github.com/openai/whisper), with the modified decoding.py file, has been renamed to whisper_openAI
import whisper_openAI.whisper as whisper
import torch
import tqdm
# Load the "tiny" Whisper checkpoint. The loader's result is unpacked as a pair and
# only the first element (the model) is kept. To swap the model, pass a different
# name here (the original note suggests largev2 or large).
model, _ = whisper.load_model("tiny")

In [8]:
# Read the same CSV twice: once as a pandas frame, once as a Hugging Face dataset
data_df = pd.read_csv(torgo_csv_path)
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)

# Columns that must be present in the loaded dataset
expected_columns = ['session', 'audio', 'text', 'speaker_id']

# Collect every expected column that is absent from the 'train' split
not_found_columns = [
    name for name in expected_columns
    if name not in dataset_csv['train'].column_names
]

# Abort with an error listing the missing columns, if there are any
if not_found_columns:
    logging.error(
        "The following columns are not found in the dataset:" + " [" + ", ".join(not_found_columns) + "]")
    sys.exit(1)

In [9]:
logging.info(
    "Splitting the dataset into training / validation / test sets...")

# Extract the unique speakers in the dataset
speakers = data_df['speaker_id'].unique()

logging.info("Unique speakers found in the dataset:")
logging.info(str(speakers) + '\n')

if test_speaker not in speakers:
    logging.error("Test Speaker not found in the dataset.")
    sys.exit(1)

# Validation uses speaker F03, unless F03 is the test speaker, in which case F04 is used
valid_speaker = 'F03' if test_speaker != 'F03' else 'F04'

# The validation speaker is hard-coded, so confirm it is present: an absent
# speaker would otherwise silently yield an empty validation split.
if valid_speaker not in speakers:
    logging.error("Validation Speaker not found in the dataset.")
    sys.exit(1)

# Every remaining speaker goes to the training split
train_speaker = [s for s in speakers if s not in [
    test_speaker, valid_speaker]]

# Build the three splits by filtering the single CSV 'train' split on speaker_id
torgo_dataset = DatasetDict()
torgo_dataset['train'] = dataset_csv['train'].filter(
    lambda x: x in train_speaker, input_columns=['speaker_id'])
torgo_dataset['validation'] = dataset_csv['train'].filter(
    lambda x: x == valid_speaker, input_columns=['speaker_id'])
torgo_dataset['test'] = dataset_csv['train'].filter(
    lambda x: x == test_speaker, input_columns=['speaker_id'])

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

In [10]:
def _keep_rows_with_flag(split, flag):
    """Keep the rows of `split` whose 'test_data' equals `flag`, then drop that column."""
    return split.filter(
        lambda x: x['test_data'] == flag).remove_columns(['test_data'])


# Sizes of the splits before any filtering, used for the summary below
original_data_count = {
    split_name: len(torgo_dataset[split_name])
    for split_name in ('train', 'validation', 'test')
}

if not keep_all_data:
    # Update the three dataset splits (if ['test_data'] == 1, keep in test,
    # if ['test_data'] == 0, keep in train and validation), then drop the column.
    for split_name, flag in (('train', 0), ('validation', 0), ('test', 1)):
        torgo_dataset[split_name] = _keep_rows_with_flag(
            torgo_dataset[split_name], flag)

    logging.info(
        "After removal of repeated prompts, the number of data in each dataset is:")
    # Labels are padded to a common width; only the last line ends with a newline.
    for label, split_name, suffix in (
        ('Train:       ', 'train', ''),
        ('Validation:  ', 'validation', ''),
        ('Test:        ', 'test', '\n'),
    ):
        kept = len(torgo_dataset[split_name])
        total = original_data_count[split_name]
        # Guard against a split that was empty to begin with (avoids ZeroDivisionError)
        percent = kept * 100 // total if total else 0
        logging.info(f'{label}{kept}/{total} ({percent}%){suffix}')

Filter:   0%|          | 0/14580 [00:00<?, ? examples/s]

Filter:   0%|          | 0/739 [00:00<?, ? examples/s]

In [11]:
# Remove special characters from the text.
# The pattern is a raw string: most of its backslash escapes (\, \? \. ...) are not
# valid Python string escapes and trigger an invalid-escape warning in a normal
# literal. The set of characters matched is unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'


def remove_special_characters(batch):
    """Clean the 'text' field of one example.

    Every character matched by ``chars_to_ignore_regex`` (listed punctuation,
    typographic quotes and the digits 0-9) is replaced by a single space, and
    the result is lower-cased.

    Args:
        batch: mapping with a string under the key 'text'; it is modified in place.

    Returns:
        The same ``batch`` object with its 'text' value replaced.
    """
    batch['text'] = re.sub(chars_to_ignore_regex,
                           ' ', batch['text']).lower()
    return batch

# Run remove_special_characters over torgo_dataset and rebind the name to the result,
# so the uncleaned text is no longer reachable through torgo_dataset afterwards.
torgo_dataset = torgo_dataset.map(remove_special_characters)

Map:   0%|          | 0/9519 [00:00<?, ? examples/s]

Map:   0%|          | 0/407 [00:00<?, ? examples/s]

In [12]:

# Spot-check: print the 'text' field of the training example at index 2
print(torgo_dataset['train'][2]['text'])

meat


In [26]:
import whisper_openAI.whisper as whisper
from tqdm import tqdm

# Load the "tiny" Whisper model again for this cell. The loader's result is unpacked
# as a pair and the second element is discarded.
model, _ = whisper.load_model("tiny")

# Transcribe a single audio file with the module-level Whisper `model`
def process_audio(file_path):
    """Decode one audio file and return the recognized text as a single string.

    Args:
        file_path: path of the audio file handed to ``whisper.load_audio``.

    Returns:
        The first element returned by ``whisper.decode``, joined with ``''.join``.
        NOTE(review): this presumably is a sequence of text pieces produced by the
        modified decoder -- confirm against whisper_openAI/decoding.py.
    """
    # Load the waveform and pad/trim it to fit 30 seconds
    waveform = whisper.pad_or_trim(whisper.load_audio(file_path))

    # Log-Mel spectrogram, placed on the same device as the model
    mel_features = whisper.log_mel_spectrogram(waveform).to(model.device)

    # English decoding, no timestamps, fp16 disabled
    decode_options = whisper.DecodingOptions(
        language='english',
        without_timestamps=True,
        fp16=False,
    )
    hypotheses, _ = whisper.decode(model, mel_features, decode_options)

    return ''.join(hypotheses)

# Start with two empty, independent lists: one for the recognized texts and one
# for the ground-truth texts
recognized_texts, ground_truth_texts = [], []

def normalize_text(text):
    """Replace listed punctuation, typographic quotes and digits with spaces, then lowercase.

    Args:
        text: the string to normalize.

    Returns:
        A new string in which every matched character has become one space and
        all letters are lower-case. Whitespace is not collapsed or stripped.
    """
    # Raw string: most of these backslash escapes are not valid Python string
    # escapes and trigger an invalid-escape warning in a normal literal. The set
    # of characters matched is unchanged.
    chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'
    return re.sub(chars_to_ignore_regex, ' ', text).lower()
    
# Word-error-rate metric from the `evaluate` package. `wer` was called below without
# being defined or imported anywhere in the notebook, which fails on a fresh kernel.
# Loading it before the decoding loop surfaces any metric problem early.
wer_metric = load("wer")


def wer(reference, hypothesis):
    """Return the word error rate of one hypothesis against one reference string."""
    return wer_metric.compute(predictions=[hypothesis], references=[reference])


# Iterate over the test split once per example (each row is fetched a single time)
for example in tqdm(torgo_dataset['test'], desc="Processing audio files"):
    # Transcribe the audio file, then normalize both the hypothesis and the reference
    recognized_texts.append(normalize_text(process_audio(example['audio'])))
    ground_truth_texts.append(normalize_text(example['text']))

# Calculate WER for each recognized text against the ground truth
wer_scores = [wer(gt, rt) for gt, rt in zip(ground_truth_texts, recognized_texts)]

# Print the average of the per-utterance WER scores (NaN when there is nothing to score)
average_wer = sum(wer_scores) / len(wer_scores) if wer_scores else float('nan')
print(f"Average WER: {average_wer}")

# Optional: Save the recognized texts and ground truths to files
# with open('recognized_texts.txt', 'w') as f:
#     for text in recognized_texts:
#         f.write(f"{text}\n")

# with open('ground_truth_texts.txt', 'w') as f:
#     for text in ground_truth_texts:
#         f.write(f"{text}\n")

Processing audio files: 100%|██████████| 407/407 [04:23<00:00,  1.54it/s]

Average WER: 1.3473965019419567





In [27]:
# Display the full list of normalized transcriptions collected in recognized_texts
recognized_texts

['when a big eat while eating a bit crack and covers i dream for',
 'preat ',
 'so im gonna',
 'see you',
 'great ',
 'again being doing doing doing or not or id permit',
 'i dont believe in things to enjoy',
 'i longed for the only mere clean do we do',
 'fear',
 'bid ',
 'byebye',
 'wow he is very manly very handsome',
 'right',
 'bye dear',
 'no',
 'yet in the banks as worthy and ever',
 'and i get',
 'its real boo',
 'borg',
 'shit ',
 'whoa ',
 'here',
 'i will go',
 'ear ',
 'i can do it again when i do it again',
 'dambu',
 'fork ',
 'you will do not know about my grandpas',
 'goal ',
 'give it to abelach in a burnout in the face of the idmultry pick',
 'yep',
 'right',
 'nope ',
 'dread',
 'no ',
 'see you again and thank you to him crazy wide wide i am',
 'no ',
 'claw ',
 'rave',
 'with the',
 'you are in the pain of your daughter and the hope in the empty',
 'or',
 'pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp