In [1]:
import sys
import os
import argparse
import re
import json
import torch
import logging
import pandas as pd
import numpy as np

# from dotenv import load_dotenv
from datasets import load_dataset, DatasetDict, Audio
from dataclasses import dataclass
from typing import Dict, List, Union
from evaluate import load
from tqdm import tqdm
from datetime import datetime
from jiwer import wer


# parser = argparse.ArgumentParser(
#     description='Process speaker ID and optional parameters.')
# # Required argument: speaker ID
# parser.add_argument('--speaker_id',
#                     type=str,
#                     help='Speaker ID in the format [MF]C?[0-9]{2}')
# parser.add_argument("--model_name", type=str, help="Name of the Whisper model to load (e.g., 'tiny', 'base', 'small', 'medium', 'large')")

# args = parser.parse_args()

speaker_id = 'M03'
test_speaker = speaker_id

learning_rate = 0.0001
train_batch_size = 4
eval_batch_size = 4
seed = 42
gradient_accumulation_steps = 2
optimizer = "adamw_torch"
lr_scheduler_type = "linear"
num_epochs = 20
keep_all_data = False
debug = False
repo_suffix = ""

print(f"Speaker ID: {speaker_id}")
print(f"Learning rate: {learning_rate}")
print(f"Training batch size: {train_batch_size}")
print(f"Evaluation batch size: {eval_batch_size}")
print(f"Random seed: {seed}")
print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
print(f"Optimizer type: {optimizer}")
print(f"Learning rate scheduler type: {lr_scheduler_type}")
print(f"Number of epochs: {num_epochs}")
print(f"Keep all data: {keep_all_data}")
print(f"Debug mode: {debug}")
print(f"Repository suffix: {repo_suffix}")
test_speaker = speaker_id

  from .autonotebook import tqdm as notebook_tqdm
  from pandas.core import (


Speaker ID: M03
Learning rate: 0.0001
Training batch size: 4
Evaluation batch size: 4
Random seed: 42
Gradient accumulation steps: 2
Optimizer type: adamw_torch
Learning rate scheduler type: linear
Number of epochs: 20
Keep all data: False
Debug mode: False
Repository suffix: 


In [2]:
from huggingsound import SpeechRecognitionModel

model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")

06/23/2024 15:32:55 - INFO - huggingsound.speech_recognition.model - Loading model...




In [3]:
import os
import logging

# Define the path to the CSV file
torgo_csv_path = "data_preparation/torgo.csv"

# Check if the path exists and is a file
if os.path.exists(torgo_csv_path) and os.path.isfile(torgo_csv_path):
    print("The CSV file exists.")
else:
    print("The CSV file does not exist.")

torgo_dataset_path = '/work/van-speech-nlp/data/torgo'
torgo_dataset_dir_path = torgo_dataset_path + \
        '/' if torgo_dataset_path[-1] != '/' else torgo_dataset_path
output_path = 'output'
print(f'torgo_dataset_path: {torgo_dataset_path}')
print(f'torgo_dataset_dir_path: {torgo_dataset_dir_path}')


The CSV file exists.
torgo_dataset_path: /work/van-speech-nlp/data/torgo
torgo_dataset_dir_path: /work/van-speech-nlp/data/torgo/


In [4]:
data_df = pd.read_csv(torgo_csv_path)
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)

# Check if the following columns exist in the dataset ['session', 'audio', 'text', 'speaker_id']
expected_columns = ['session', 'audio', 'text', 'speaker_id']
not_found_columns = []
for column in expected_columns:
    if column not in dataset_csv['train'].column_names:
        not_found_columns.append(column)

Generating train split: 16394 examples [00:00, 268825.07 examples/s]


In [5]:
logging.info(
    "Splitting the dataset into training / validation / test sets...")

# Extract the unique speakers in the dataset
speakers = data_df['speaker_id'].unique()

logging.info("Unique speakers found in the dataset:")
logging.info(str(speakers) + '\n')

if test_speaker not in speakers:
    logging.error("Test Speaker not found in the dataset.")
    sys.exit(1)

valid_speaker = 'F03' if test_speaker != 'F03' else 'F04'
train_speaker = [s for s in speakers if s not in [
    test_speaker, valid_speaker]]

torgo_dataset = DatasetDict()
torgo_dataset['train'] = dataset_csv['train'].filter(
    lambda x: x in train_speaker, input_columns=['speaker_id'])
torgo_dataset['validation'] = dataset_csv['train'].filter(
    lambda x: x == valid_speaker, input_columns=['speaker_id'])
torgo_dataset['test'] = dataset_csv['train'].filter(
    lambda x: x == test_speaker, input_columns=['speaker_id'])



Filter: 100%|██████████| 16394/16394 [00:00<00:00, 401782.27 examples/s]
Filter: 100%|██████████| 16394/16394 [00:00<00:00, 429372.44 examples/s]
Filter: 100%|██████████| 16394/16394 [00:00<00:00, 622827.66 examples/s]


In [6]:
original_data_count = {'train': len(torgo_dataset['train']), 'validation': len(
    torgo_dataset['validation']), 'test': len(torgo_dataset['test'])}

if not keep_all_data:
    # Update the three dataset splits (if ['test_data'] == 1, keep in test, if ['test_data'] == 0, keep in train and validation)
    torgo_dataset['train'] = torgo_dataset['train'].filter(
        lambda x: x['test_data'] == 0)
    torgo_dataset['validation'] = torgo_dataset['validation'].filter(
        lambda x: x['test_data'] == 0)
    torgo_dataset['test'] = torgo_dataset['test'].filter(
        lambda x: x['test_data'] == 1)

    # Drop the 'test_data' column
    torgo_dataset['train'] = torgo_dataset['train'].remove_columns([
                                                                   'test_data'])
    torgo_dataset['validation'] = torgo_dataset['validation'].remove_columns([
                                                                             'test_data'])
    torgo_dataset['test'] = torgo_dataset['test'].remove_columns([
                                                                 'test_data'])
    logging.info(
        f"After removal of repeated prompts, the number of data in each dataset is:")
    logging.info(
        f'Train:       {len(torgo_dataset["train"])}/{original_data_count["train"]} ({len(torgo_dataset["train"]) * 100 // original_data_count["train"]}%)')
    logging.info(
        f'Validation:  {len(torgo_dataset["validation"])}/{original_data_count["validation"]} ({len(torgo_dataset["validation"]) * 100 // original_data_count["validation"]}%)')
    logging.info(
        f'Test:        {len(torgo_dataset["test"])}/{original_data_count["test"]} ({len(torgo_dataset["test"]) * 100 // original_data_count["test"]}%)\n')


# In[11]:


# Remove special characters from the text
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'


def remove_special_characters(batch):
    batch['text'] = re.sub(chars_to_ignore_regex,
                           ' ', batch['text']).lower()
    return batch

torgo_dataset = torgo_dataset.map(remove_special_characters)


# In[12]:


print(torgo_dataset['train'][2]['text'])


Filter: 100%|██████████| 14519/14519 [00:00<00:00, 50846.78 examples/s]
Filter: 100%|██████████| 1075/1075 [00:00<00:00, 52702.73 examples/s]
Filter: 100%|██████████| 800/800 [00:00<00:00, 49997.66 examples/s]
Map: 100%|██████████| 9493/9493 [00:01<00:00, 8591.40 examples/s]
Map: 100%|██████████| 483/483 [00:00<00:00, 9734.00 examples/s]
Map: 100%|██████████| 442/442 [00:00<00:00, 9389.08 examples/s]

meat





In [14]:
from tqdm import tqdm


# Initialize the results and ground truth lists
recognized_texts = []
ground_truth_texts = []

def normalize_text(text):
    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'
    text = re.sub(chars_to_ignore_regex, ' ', text).lower()
    return text
    
# Iterate over the dataset
for i in tqdm(range(len(torgo_dataset['test'])), desc="Processing audio files"):
    # Get the file path and ground truth from the dataset
    file_path = torgo_dataset['test'][i]['audio']
    ground_truth = torgo_dataset['test'][i]['text']
    
    # Process the audio file
    recognized_text_lst = model.transcribe([file_path])
    recognized_text=recognized_text_lst[0]["transcription"]
    recognized_text = normalize_text(recognized_text)
    ground_truth = normalize_text(ground_truth)
    
    # Append the results to the lists
    recognized_texts.append(recognized_text)
    ground_truth_texts.append(ground_truth)
    
    # Print the recognized text
    # print(f"text {i+1}/{len(torgo_dataset['test'])}: {recognized_text}")
    # print(f"Ground truth: {ground_truth}")
    # print()

# Calculate WER for each recognized text against the ground truth
wer_scores = [wer(gt, rt) for gt, rt in zip(ground_truth_texts, recognized_texts)]

# Print the average WER
average_wer = sum(wer_scores) / len(wer_scores)
print(f"Average WER: {average_wer}")

100%|██████████| 1/1 [00:01<00:00,  1.33s/it]2 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  1.62it/s]2 [00:01<09:47,  1.33s/it]
100%|██████████| 1/1 [00:00<00:00,  1.16it/s]2 [00:01<06:42,  1.09it/s]
100%|██████████| 1/1 [00:00<00:00,  1.83it/s]2 [00:02<06:32,  1.12it/s]
100%|██████████| 1/1 [00:01<00:00,  1.23s/it]2 [00:03<05:31,  1.32it/s]
100%|██████████| 1/1 [00:01<00:00,  1.37s/it]2 [00:04<06:46,  1.08it/s]
100%|██████████| 1/1 [00:00<00:00,  1.21it/s]2 [00:05<07:51,  1.08s/it]
100%|██████████| 1/1 [00:00<00:00,  1.29it/s]2 [00:06<07:15,  1.00s/it]
100%|██████████| 1/1 [00:00<00:00,  1.16it/s]2 [00:07<06:43,  1.07it/s]
100%|██████████| 1/1 [00:00<00:00,  1.48it/s]2 [00:08<06:34,  1.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.32it/s]42 [00:09<06:02,  1.19it/s]
100%|██████████| 1/1 [00:00<00:00,  1.17it/s]42 [00:09<05:51,  1.23it/s]
100%|██████████| 1/1 [00:00<00:00,  1.13it/s]42 [00:10<05:55,  1.21it/s]
100%|██████████| 1/1 [00:01<00:00,  1.97s/it]42 [00:11<06:02,  1.18it

: 

: 

In [None]:
# Print the average WER
average_wer = sum(wer_scores) / len(wer_scores)
print(f"Average WER: {average_wer}")

id="wav2vec2"
# Ensure the directory exists
output_dir = f'runs/{id}'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Optional: Save the recognized texts and ground truths to files
with open(f'{output_dir}/{id}_{speaker_id}_recognized_texts.txt', 'w') as f:
    for text in recognized_texts:
        f.write(f"{text}\n")

with open(f'{output_dir}/{id}_{speaker_id}_ground_truth_texts.txt', 'w') as f:
    for text in ground_truth_texts:
        f.write(f"{text}\n")
with open(f'{output_dir}/{id}_{speaker_id}_wer.txt', 'w') as f:
    f.write(f"Average WER: {average_wer}\n")