In [1]:
# modified from https://github.com/SlangLab-NU/torgo_inference_on_cluster/blob/main/train.py
import sys
import os
import argparse
import re
import json
import torch
import logging
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from datasets import load_dataset, DatasetDict, Audio
from dataclasses import dataclass
from typing import Dict, List, Union
from evaluate import load
from tqdm import tqdm
from datetime import datetime

!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/zhang.jinda1/.cache/huggingface/token
Login successful


In [2]:
speaker_id = "MC01"  # Example value; replace with the actual Speaker ID as needed
learning_rate = 0.0001
train_batch_size = 4
eval_batch_size = 4
seed = 42
gradient_accumulation_steps = 2
optimizer = "adamw_torch"
lr_scheduler_type = "linear"
num_epochs = 20
keep_all_data = False
debug = False
repo_suffix = ""

print(f"Speaker ID: {speaker_id}")
print(f"Learning rate: {learning_rate}")
print(f"Training batch size: {train_batch_size}")
print(f"Evaluation batch size: {eval_batch_size}")
print(f"Random seed: {seed}")
print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
print(f"Optimizer type: {optimizer}")
print(f"Learning rate scheduler type: {lr_scheduler_type}")
print(f"Number of epochs: {num_epochs}")
print(f"Keep all data: {keep_all_data}")
print(f"Debug mode: {debug}")
print(f"Repository suffix: {repo_suffix}")

Speaker ID: MC01
Learning rate: 0.0001
Training batch size: 4
Evaluation batch size: 4
Random seed: 42
Gradient accumulation steps: 2
Optimizer type: adamw_torch
Learning rate scheduler type: linear
Number of epochs: 20
Keep all data: False
Debug mode: False
Repository suffix: 


In [3]:
if not re.match(r'^[MF]C?[0-9]{2}$', speaker_id):
    print("Please provide a valid speaker ID.")
    sys.exit(1)
test_speaker = speaker_id

if repo_suffix and not re.match(r'^[_-]', args.repo_suffix):
    repo_suffix = '_' + repo_suffix

In [4]:
config = {
    "evaluation_strategy": "steps",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 2,
    "eval_delay": 0,
    "learning_rate": 0.0001,
    "weight_decay": 0.005,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "lr_scheduler_type": "linear",
    "warmup_ratio": 0.0,
    "warmup_steps": 1000,
    "save_strategy": "steps",
    "save_steps": 500,
    "save_total_limit": 3,
    "report_to": "all",
    "seed": 42,
    "eval_steps": 1000,
    "num_train_epochs": 20,
    "optim": "adamw_torch",
    "optim_args": None,
    "adafactor": False,
    "group_by_length": True,
    "length_column_name": "length",
    "push_to_hub": True,
    "hub_strategy": "every_save"
}

In [5]:
import os

# Define the path to the CSV file
torgo_csv_path = "./torgo.csv"

# Check if the path exists and is a file
if os.path.exists(torgo_csv_path) and os.path.isfile(torgo_csv_path):
    print("The CSV file exists.")
else:
    print("The CSV file does not exist.")

torgo_dataset_path = '/work/van-speech-nlp/data/torgo'
torgo_dataset_dir_path = torgo_dataset_path + \
        '/' if torgo_dataset_path[-1] != '/' else torgo_dataset_path
output_path = 'output'
print(f'torgo_dataset_path: {torgo_dataset_path}')
print(f'torgo_dataset_dir_path: {torgo_dataset_dir_path}')

repo_name = f'torgo_tiny_finetune_{test_speaker}{repo_suffix}'
repo_path = f'jindaxz/{repo_name}'

# Path to save model / checkpoints{repo_name}'
model_local_path = output_path + '/model/' + repo_name

pretrained_model_name = "openai/whisper-tiny"

The CSV file exists.
torgo_dataset_path: /work/van-speech-nlp/data/torgo
torgo_dataset_dir_path: /work/van-speech-nlp/data/torgo/


In [6]:
if not os.path.exists(output_path + '/logs'):
    os.makedirs(output_path + '/logs')

log_dir = f'{output_path}/logs/{repo_name}'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

log_file_name = test_speaker + '_train' + '_' + \
    datetime.now().strftime("%Y%m%d_%H%M%S") + '.log'
log_file_path = log_dir + '/' + log_file_name

logging.basicConfig(
    filename=log_file_path,
    filemode='a',
    format='%(asctime)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO
)
# Log to console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
logging.getLogger().addHandler(console_handler)

logging.info("Test Speaker: " + test_speaker)
logging.info("Log File Path: " + log_file_path + '\n')
if keep_all_data:
    logging.info("Keep all data in training/validation/test sets\n")

Test Speaker: MC01
Log File Path: output/logs/torgo_tiny_finetune_MC01/MC01_train_20240531_013451.log



In [7]:
data_df = pd.read_csv(torgo_csv_path)
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)

# Check if the following columns exist in the dataset ['session', 'audio', 'text', 'speaker_id']
expected_columns = ['session', 'audio', 'text', 'speaker_id']
not_found_columns = []
for column in expected_columns:
    if column not in dataset_csv['train'].column_names:
        not_found_columns.append(column)

not_found_columns

[]

In [8]:
logging.info(
    "Splitting the dataset into training / validation / test sets...")

# Extract the unique speakers in the dataset
speakers = data_df['speaker_id'].unique()

logging.info("Unique speakers found in the dataset:")
logging.info(str(speakers) + '\n')

if test_speaker not in speakers:
    logging.error("Test Speaker not found in the dataset.")
    sys.exit(1)

valid_speaker = 'F03' if test_speaker != 'F03' else 'F04'
train_speaker = [s for s in speakers if s not in [
    test_speaker, valid_speaker]]

torgo_dataset = DatasetDict()
torgo_dataset['train'] = dataset_csv['train'].filter(
    lambda x: x in train_speaker, input_columns=['speaker_id'])
torgo_dataset['validation'] = dataset_csv['train'].filter(
    lambda x: x == valid_speaker, input_columns=['speaker_id'])
torgo_dataset['test'] = dataset_csv['train'].filter(
    lambda x: x == test_speaker, input_columns=['speaker_id'])

Splitting the dataset into training / validation / test sets...
Unique speakers found in the dataset:
['F01' 'F03' 'F04' 'FC01' 'FC02' 'FC03' 'M01' 'M02' 'M03' 'M04' 'M05'
 'MC01' 'MC02' 'MC03' 'MC04']



In [9]:
original_data_count = {'train': len(torgo_dataset['train']), 'validation': len(
    torgo_dataset['validation']), 'test': len(torgo_dataset['test'])}

if not keep_all_data:
    # Update the three dataset splits (if ['test_data'] == 1, keep in test, if ['test_data'] == 0, keep in train and validation)
    torgo_dataset['train'] = torgo_dataset['train'].filter(
        lambda x: x['test_data'] == 0)
    torgo_dataset['validation'] = torgo_dataset['validation'].filter(
        lambda x: x['test_data'] == 0)
    torgo_dataset['test'] = torgo_dataset['test'].filter(
        lambda x: x['test_data'] == 1)

    # Drop the 'test_data' column
    torgo_dataset['train'] = torgo_dataset['train'].remove_columns([
                                                                   'test_data'])
    torgo_dataset['validation'] = torgo_dataset['validation'].remove_columns([
                                                                             'test_data'])
    torgo_dataset['test'] = torgo_dataset['test'].remove_columns([
                                                                 'test_data'])
    logging.info(
        f"After removal of repeated prompts, the number of data in each dataset is:")
    logging.info(
        f'Train:       {len(torgo_dataset["train"])}/{original_data_count["train"]} ({len(torgo_dataset["train"]) * 100 // original_data_count["train"]}%)')
    logging.info(
        f'Validation:  {len(torgo_dataset["validation"])}/{original_data_count["validation"]} ({len(torgo_dataset["validation"]) * 100 // original_data_count["validation"]}%)')
    logging.info(
        f'Test:        {len(torgo_dataset["test"])}/{original_data_count["test"]} ({len(torgo_dataset["test"]) * 100 // original_data_count["test"]}%)\n')

After removal of repeated prompts, the number of data in each dataset is:
Train:       8218/13178 (62%)
Validation:  483/1075 (44%)
Test:        508/2141 (23%)



In [10]:
# Remove special characters from the text
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'

def remove_special_characters(batch):
    batch['text'] = re.sub(chars_to_ignore_regex,
                           ' ', batch['text']).lower()
    return batch

torgo_dataset = torgo_dataset.map(remove_special_characters)

# Create a diciontary of tokenizer vocabularies
vocab_list = []
for dataset in torgo_dataset.values():
    for text in dataset['text']:
        text = text.replace(' ', '|')
        vocab_list.extend(text)

vocab_dict = {}
vocab_dict['[PAD]'] = 0
vocab_dict['<s>'] = 1
vocab_dict['</s>'] = 2
vocab_dict['[UNK]'] = 3
vocab_list = sorted(list(set(vocab_list)))
vocab_dict.update({v: k + len(vocab_dict)
                  for k, v in enumerate(vocab_list)})

logging.info("Vocab Dictionary:")
logging.info(str(vocab_dict) + '\n')

# Create a directory to store the vocab.json file
if not os.path.exists(output_path + '/vocab'):
    os.makedirs(output_path + '/vocab')

vocab_file_name = repo_name + '_vocab.json'
vocab_file_path = output_path + '/vocab/' + vocab_file_name

# Save the vocab.json file
with open(vocab_file_path, 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

Vocab Dictionary:
{'[PAD]': 0, '<s>': 1, '</s>': 2, '[UNK]': 3, "'": 4, 'a': 5, 'b': 6, 'c': 7, 'd': 8, 'e': 9, 'f': 10, 'g': 11, 'h': 12, 'i': 13, 'j': 14, 'k': 15, 'l': 16, 'm': 17, 'n': 18, 'o': 19, 'p': 20, 'q': 21, 'r': 22, 's': 23, 't': 24, 'u': 25, 'v': 26, 'w': 27, 'x': 28, 'y': 29, 'z': 30, '|': 31}



In [11]:
sampling_rate = 16000

## import feature extractor
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

## Load WhisperTokenizer
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="English", task="transcribe")

## Combine To Create A WhisperProcessor        
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="English", task="transcribe")

In [13]:
sampling_rate = 16000
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-tiny')
feature_extractor = WhisperFeatureExtractor(feature_size=1, sampling_rate=sampling_rate, padding_value=0.0, return_attention_mask=True)
# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

def prepare_torgo_dataset(batch):
    # Load audio data into batch
    audio = batch['audio']

    # Extract values
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Encode to label ids
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids

    return batch

torgo_dataset = torgo_dataset.cast_column(
    "audio", Audio(sampling_rate=sampling_rate))
torgo_dataset = torgo_dataset.map(
    prepare_torgo_dataset, 
    remove_columns=['session', 'audio', 'speaker_id', 'text'], 
    num_proc=1)

Map:   0%|          | 0/8218 [00:00<?, ? examples/s]

MemoryError: Unable to allocate 803. GiB for an array with shape (480000, 448801) and data type float32

In [None]:


# Filter audio within a certain length
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0

torgo_dataset = torgo_dataset.filter(
    lambda x: min_input_length_in_sec *
    sampling_rate < x < max_input_length_in_sec * sampling_rate,
    input_columns=["input_length"]
)

logging.info(
    "After filtering audio within a certain length, the number of data in each dataset is:")

if original_data_count['train'] != 0:
    logging.info(
        f'Train:       {len(torgo_dataset["train"])}/{original_data_count["train"]} ({len(torgo_dataset["train"]) * 100 // original_data_count["train"]}%)')
else:
    logging.info(f'Train:       {len(torgo_dataset["train"])}/0 (0%)')

if original_data_count['validation'] != 0:
    logging.info(
        f'Validation:  {len(torgo_dataset["validation"])}/{original_data_count["validation"]} ({len(torgo_dataset["validation"]) * 100 // original_data_count["validation"]}%)')
else:
    logging.info(
        f'Validation:  {len(torgo_dataset["validation"])}/0 (0%)')

if original_data_count['test'] != 0:
    logging.info(
        f'Test:        {len(torgo_dataset["test"])}/{original_data_count["test"]} ({len(torgo_dataset["test"]) * 100 // original_data_count["test"]}%)\n')
else:
    logging.info(f'Test:        {len(torgo_dataset["test"])}/0 (0%)\n')

# Remove the "input_length" column
torgo_dataset = torgo_dataset.remove_columns(["input_length"])