In [1]:
# This notebook uses https://github.com/openai/whisper with edits to whisper_openAI/decoding.py to generate multiple hypotheses
import sys
import os
import argparse
import re
import json
import torch
import logging
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from datasets import load_dataset, DatasetDict, Audio
from dataclasses import dataclass
from typing import Dict, List, Union
from evaluate import load
from tqdm import tqdm
from datetime import datetime

!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/zhang.jinda1/.cache/huggingface/token
Login successful


In [2]:
# Run configuration: everything a reader might want to tune lives in this cell.
speaker_id = "F01"  # Example value; replace with the actual Speaker ID as needed
learning_rate = 0.0001
train_batch_size = 4
eval_batch_size = 4
seed = 42
gradient_accumulation_steps = 2
optimizer = "adamw_torch"
lr_scheduler_type = "linear"
num_epochs = 20
keep_all_data = False
debug = False
repo_suffix = ""

print(f"Speaker ID: {speaker_id}")
print(f"Learning rate: {learning_rate}")
print(f"Training batch size: {train_batch_size}")
print(f"Evaluation batch size: {eval_batch_size}")
print(f"Random seed: {seed}")
print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
print(f"Optimizer type: {optimizer}")
print(f"Learning rate scheduler type: {lr_scheduler_type}")
print(f"Number of epochs: {num_epochs}")
print(f"Keep all data: {keep_all_data}")
print(f"Debug mode: {debug}")
print(f"Repository suffix: {repo_suffix}")


def normalize_repo_suffix(suffix):
    """Return `suffix` ready to be appended to a repository name.

    An empty suffix, or one that already starts with '_' or '-', is returned
    unchanged; any other suffix gets a leading '_'.
    """
    if suffix and not re.match(r'^[_-]', suffix):
        return '_' + suffix
    return suffix


# A valid speaker ID is 'M' or 'F', an optional 'C', then exactly two digits (e.g. "F01", "MC02").
if not re.match(r'^[MF]C?[0-9]{2}$', speaker_id):
    print("Please provide a valid speaker ID.")
    sys.exit(1)
test_speaker = speaker_id

# The original check read `args.repo_suffix`, but no `args` object exists in this
# notebook, so any non-empty suffix raised NameError.
repo_suffix = normalize_repo_suffix(repo_suffix)

Speaker ID: F01
Learning rate: 0.0001
Training batch size: 4
Evaluation batch size: 4
Random seed: 42
Gradient accumulation steps: 2
Optimizer type: adamw_torch
Learning rate scheduler type: linear
Number of epochs: 20
Keep all data: False
Debug mode: False
Repository suffix: 


In [3]:
import os
import logging

# Make the logging.info(...) calls in the following cells visible: the root logger
# defaults to WARNING, which silently drops INFO records.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)  # covers the case where handlers were already installed

# Path to the CSV file. It is relative, so it is resolved against the current
# working directory at the moment it is used.
# NOTE(review): this check runs before the `%cd ..` cell below, so it may look in a
# different directory than the later pd.read_csv call — confirm the intended order.
torgo_csv_path = "data_preparation/torgo.csv"

# os.path.isfile() is already False for a missing path, so no separate exists() check is needed.
if os.path.isfile(torgo_csv_path):
    print("The CSV file exists.")
else:
    print("The CSV file does not exist.")

torgo_dataset_path = '/work/van-speech-nlp/data/torgo'
# Same directory, guaranteed to end with a path separator.
torgo_dataset_dir_path = os.path.join(torgo_dataset_path, '')
output_path = 'output'
print(f'torgo_dataset_path: {torgo_dataset_path}')
print(f'torgo_dataset_dir_path: {torgo_dataset_dir_path}')

repo_name = f'torgo_tiny_finetune_{test_speaker}{repo_suffix}'
repo_path = f'jindaxz/{repo_name}'

# Local path to save the model / checkpoints
model_local_path = output_path + '/model/' + repo_name

pretrained_model_name = "openai/whisper-tiny"

The CSV file does not exist.
torgo_dataset_path: /work/van-speech-nlp/data/torgo
torgo_dataset_dir_path: /work/van-speech-nlp/data/torgo/


In [4]:
# Move up one directory to the whisper folder (the recorded output shows this lands in
# the Whispering-LLaMA checkout); make sure the whisper environment is active.
# NOTE: relative paths used in later cells are resolved against this new working directory.
%cd ..

/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/Whispering-LLaMA


In [5]:
import numpy
# The Whisper repo (https://github.com/openai/whisper), with the changed decoding.py file, was renamed to whisper_openAI
import whisper_openAI.whisper as whisper
import torch
# NOTE: this rebinds `tqdm` from the class imported earlier (`from tqdm import tqdm`)
# to the module, so later cells must call `tqdm.tqdm(...)`.
import tqdm
# NOTE(review): load_model is unpacked as a 2-tuple — presumably the modified repo
# returns (model, extra); confirm against whisper_openAI.
model, _ = whisper.load_model("tiny") # the Whisper checkpoint can be swapped here, e.g. for largev2 or large

In [6]:
# Load the CSV twice: as a pandas frame (used later to list the speakers) and as a
# Hugging Face dataset (used for the actual splits).
data_df = pd.read_csv(torgo_csv_path)
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)

# Abort early if any of the columns ['session', 'audio', 'text', 'speaker_id'] is missing
expected_columns = ['session', 'audio', 'text', 'speaker_id']
available_columns = dataset_csv['train'].column_names
not_found_columns = [
    column for column in expected_columns if column not in available_columns]

if not_found_columns:
    logging.error(
        "The following columns are not found in the dataset:" + " [" + ", ".join(not_found_columns) + "]")
    sys.exit(1)

In [7]:
logging.info(
    "Splitting the dataset into training / validation / test sets...")

# Every distinct speaker that appears in the CSV
speakers = data_df['speaker_id'].unique()

logging.info("Unique speakers found in the dataset:")
logging.info(str(speakers) + '\n')

if test_speaker not in speakers:
    logging.error("Test Speaker not found in the dataset.")
    sys.exit(1)

# Validation uses F03, unless F03 is the held-out test speaker, in which case F04 is used.
valid_speaker = 'F04' if test_speaker == 'F03' else 'F03'
# Every speaker that is neither the test nor the validation speaker is used for training.
held_out_speakers = [test_speaker, valid_speaker]
train_speaker = [s for s in speakers if s not in held_out_speakers]

# Route each row of the CSV-backed dataset to a split based on its speaker_id
all_rows = dataset_csv['train']
torgo_dataset = DatasetDict({
    'train': all_rows.filter(
        lambda speaker: speaker in train_speaker, input_columns=['speaker_id']),
    'validation': all_rows.filter(
        lambda speaker: speaker == valid_speaker, input_columns=['speaker_id']),
    'test': all_rows.filter(
        lambda speaker: speaker == test_speaker, input_columns=['speaker_id']),
})

In [8]:
# Remember the split sizes before any filtering so the logs can report how much was kept
original_data_count = {
    split: len(torgo_dataset[split]) for split in ('train', 'validation', 'test')}


def has_test_data_flag(flag):
    """Build a row predicate that keeps rows whose 'test_data' column equals `flag`."""
    return lambda x: x['test_data'] == flag


if not keep_all_data:
    # Rows with test_data == 1 are kept only in the test split; rows with
    # test_data == 0 are kept only in the train and validation splits.
    for split, flag in (('train', 0), ('validation', 0), ('test', 1)):
        torgo_dataset[split] = torgo_dataset[split].filter(has_test_data_flag(flag))
        # The 'test_data' column is dropped once the filter has been applied
        torgo_dataset[split] = torgo_dataset[split].remove_columns(['test_data'])

    logging.info(
        "After removal of repeated prompts, the number of data in each dataset is:")
    for label, split, tail in (
            ('Train:       ', 'train', ''),
            ('Validation:  ', 'validation', ''),
            ('Test:        ', 'test', '\n')):
        kept = len(torgo_dataset[split])
        total = original_data_count[split]
        # An originally empty split would otherwise raise ZeroDivisionError here
        percent = kept * 100 // total if total else 0
        logging.info(f'{label}{kept}/{total} ({percent}%){tail}')

In [9]:
# Convert the sample rate of every audio file to 16 kHz by casting the "audio" column
audio_feature_16khz = Audio(sampling_rate=16000)
torgo_dataset = torgo_dataset.cast_column("audio", audio_feature_16khz)

In [10]:
# Clip-length bounds in seconds, and the sampling rate used to turn them into sample counts
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0
sampling_rate=16000


def filter_min_length(example):
    """Return True when the clip has strictly more samples than the minimum length."""
    num_samples = example["audio"]["array"].shape[0]
    min_samples = min_input_length_in_sec * sampling_rate
    return num_samples > min_samples


def filter_max_length(example):
    """Return True when the clip has strictly fewer samples than the maximum length."""
    num_samples = example["audio"]["array"].shape[0]
    max_samples = max_input_length_in_sec * sampling_rate
    return num_samples < max_samples

# Apply both length bounds in a single pass, so the dataset is scanned once instead
# of twice; the surviving rows are the same as when the filters run back to back.
torgo_dataset = torgo_dataset.filter(
    lambda example: filter_max_length(example) and filter_min_length(example))

In [11]:
# Report how many rows of each split survived, relative to the sizes recorded
# in original_data_count.
logging.info(
    "After filter, the number of data in each dataset is:")
for label, split, tail in (
        ('Train:       ', 'train', ''),
        ('Validation:  ', 'validation', ''),
        ('Test:        ', 'test', '\n')):
    kept = len(torgo_dataset[split])
    total = original_data_count[split]
    # An originally empty split would otherwise raise ZeroDivisionError here
    percent = kept * 100 // total if total else 0
    logging.info(f'{label}{kept}/{total} ({percent}%){tail}')

In [18]:
# Convenience handles for the three splits
train_dataset, validation_dataset, test_dataset = (
    torgo_dataset[split_name] for split_name in ("train", "validation", "test"))

In [19]:
# Display the training split's summary (feature names and row count)
train_dataset

Dataset({
    features: ['session', 'audio', 'text', 'speaker_id'],
    num_rows: 9638
})

In [22]:
import json
import os
import tqdm
import numpy as np


def generate_inference_json(dataset, dataset_name):
    """Decode every clip in `dataset` with Whisper and dump the results to a JSON file.

    For each item, a sampling temperature is drawn from {0.70, 0.71, ..., 0.80} and
    the clip is decoded with ``best_of=200``. If at most 10 results come back, the
    temperature is raised (by 0.2 when it is below 0.75, otherwise by 0.1) and the
    clip is decoded one more time.

    Args:
        dataset: iterable of items exposing ``item['audio']`` (with 'array', 'path'
            and 'sampling_rate'), ``item['text']`` and ``item['session']``.
        dataset_name: base name of the output file; the results are written to
            ``Inference/gs_inferences/<dataset_name>.json``.

    Returns:
        None. The output file is only written after the whole dataset has been
        processed, so an interrupted run produces no file.

    Relies on the notebook-level ``whisper`` module, ``model`` and ``tqdm`` module.
    NOTE(review): temperatures come from the global NumPy RNG, which is not seeded in
    the visible cells, so the generated hypotheses are not reproducible across runs.
    """

    def decode_hypotheses(mel, temperature):
        """Run one sampling decode and return its results as a list."""
        options = whisper.DecodingOptions(
            fp16=True, without_timestamps=True, temperature=temperature, best_of=200)
        # NOTE(review): the modified decoding.py presumably returns (hypotheses, extra) — confirm.
        result, _ = whisper.decode(model, mel, options)
        return list(result)

    to_json = []
    for item in tqdm.tqdm(dataset):
        waveform = item['audio']['array'].astype(np.single)
        # Measure the clip before padding: pad_or_trim changes the array length, so a
        # duration computed afterwards would describe the padded window, not the clip.
        duration = len(waveform) / item['audio']['sampling_rate']
        audio = whisper.pad_or_trim(waveform)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        # Turn the punctuation placeholders back into punctuation marks and lowercase the text
        ground_truth = item['text'].replace(' <COMMA>', ',').replace(' <PERIOD>', '.').replace(' <QUESTIONMARK>', '?').replace(' <EXCLAMATIONPOINT>', '!').lower()
        source = 'NP-Torgo'
        cat = 'NP-Torgo'
        path_to_file = item['audio']['path']

        random_temperature = np.random.randint(70, 81) / 100
        result = decode_hypotheses(mel, random_temperature)

        if len(result) <= 10:
            # Too few results: retry once with a higher temperature
            if random_temperature < 0.75:
                random_temperature += 0.2
            else:
                random_temperature += 0.1
            result = decode_hypotheses(mel, random_temperature)

        # One single-key dict per clip, keyed by the session identifier
        to_json.append({
            item['session']: {
                'temp': random_temperature,
                'path': path_to_file,
                'ground_truth': ground_truth,
                'inference': result,
                'source': source,
                'category': cat,
                'time': duration,
            }
        })

    output_dir = "Inference/gs_inferences"
    os.makedirs(output_dir, exist_ok=True)
    save_path = f'{output_dir}/{str(dataset_name)}.json'
    with open(save_path, "w") as file:
        json.dump(to_json, file, indent=4)


# Generate the results for the training split only; they are written to
# Inference/gs_inferences/torgo_train.json. The validation and test splits are not processed here.
generate_inference_json(train_dataset, 'torgo_train')

  0%|          | 0/9638 [00:00<?, ?it/s]

{'session': 'F04-Session1-arrayMic-0009', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0009.wav', 'array': array([-0.0017395 , -0.00170898, -0.00106812, ...,  0.00469971,
        0.0012207 ,  0.00311279]), 'sampling_rate': 16000}, 'text': 'LEFT', 'speaker_id': 'F04'}


  0%|          | 1/9638 [00:00<1:15:02,  2.14it/s]

{'session': 'F04-Session1-arrayMic-0010', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0010.wav', 'array': array([ 0.00021362,  0.00286865, -0.00149536, ..., -0.0022583 ,
        0.00082397, -0.00164795]), 'sampling_rate': 16000}, 'text': 'AIR', 'speaker_id': 'F04'}


  0%|          | 2/9638 [00:00<1:01:42,  2.60it/s]

{'session': 'F04-Session1-arrayMic-0011', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0011.wav', 'array': array([-0.00140381,  0.00219727, -0.00115967, ...,  0.00170898,
       -0.00076294, -0.00021362]), 'sampling_rate': 16000}, 'text': 'TORN', 'speaker_id': 'F04'}


  0%|          | 3/9638 [00:01<1:02:38,  2.56it/s]

{'session': 'F04-Session1-arrayMic-0012', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0012.wav', 'array': array([-6.40869141e-04,  1.46484375e-03, -2.62451172e-03, ...,
        3.05175781e-05,  4.73022461e-03,  1.15966797e-03]), 'sampling_rate': 16000}, 'text': 'BORN', 'speaker_id': 'F04'}


  0%|          | 5/9638 [00:01<45:55,  3.50it/s]  

{'session': 'F04-Session1-arrayMic-0017', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0017.wav', 'array': array([-0.00100708,  0.00088501,  0.00250244, ..., -0.00100708,
        0.00189209,  0.00405884]), 'sampling_rate': 16000}, 'text': 'STICKS', 'speaker_id': 'F04'}
{'session': 'F04-Session1-arrayMic-0018', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0018.wav', 'array': array([ 0.00161743,  0.00274658,  0.00280762, ...,  0.00030518,
        0.00057983, -0.00146484]), 'sampling_rate': 16000}, 'text': 'STORM', 'speaker_id': 'F04'}


  0%|          | 6/9638 [00:01<42:33,  3.77it/s]

{'session': 'F04-Session1-arrayMic-0019', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0019.wav', 'array': array([-0.00231934, -0.00021362,  0.00183105, ..., -0.00167847,
        0.00119019, -0.00146484]), 'sampling_rate': 16000}, 'text': 'RAGE', 'speaker_id': 'F04'}


  0%|          | 7/9638 [00:02<46:57,  3.42it/s]

{'session': 'F04-Session1-arrayMic-0022', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0022.wav', 'array': array([-0.00033569,  0.00027466, -0.00439453, ...,  0.00085449,
        0.00106812, -0.00204468]), 'sampling_rate': 16000}, 'text': 'BEAT', 'speaker_id': 'F04'}


  0%|          | 9/9638 [00:02<40:35,  3.95it/s]

{'session': 'F04-Session1-arrayMic-0026', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0026.wav', 'array': array([-5.49316406e-04,  8.23974609e-04, -2.96020508e-03, ...,
        7.01904297e-04,  6.10351562e-05, -1.15966797e-03]), 'sampling_rate': 16000}, 'text': 'SWARM', 'speaker_id': 'F04'}
{'session': 'F04-Session1-arrayMic-0027', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0027.wav', 'array': array([ 0.00262451,  0.0032959 , -0.00054932, ...,  0.00570679,
       -0.00445557,  0.00271606]), 'sampling_rate': 16000}, 'text': 'USUALLY MINUS SEVERAL BUTTONS', 'speaker_id': 'F04'}


  0%|          | 10/9638 [00:03<1:04:51,  2.47it/s]

{'session': 'F04-Session1-arrayMic-0028', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0028.wav', 'array': array([-0.00244141,  0.0032959 , -0.00027466, ..., -0.00192261,
       -0.00125122, -0.00015259]), 'sampling_rate': 16000}, 'text': 'STORE', 'speaker_id': 'F04'}


  0%|          | 11/9638 [00:03<1:00:01,  2.67it/s]

{'session': 'F04-Session1-arrayMic-0032', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0032.wav', 'array': array([-0.00036621, -0.00082397, -0.00268555, ..., -0.00506592,
        0.00201416, -0.00442505]), 'sampling_rate': 16000}, 'text': 'NO', 'speaker_id': 'F04'}


  0%|          | 12/9638 [00:04<1:00:05,  2.67it/s]

{'session': 'F04-Session1-arrayMic-0034', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0034.wav', 'array': array([0.00732422, 0.01577759, 0.01428223, ..., 0.0005188 , 0.0050354 ,
       0.0010376 ]), 'sampling_rate': 16000}, 'text': 'GRANDFATHER LIKES TO BE MODERN IN HIS LANGUAGE', 'speaker_id': 'F04'}


  0%|          | 13/9638 [00:04<59:49,  2.68it/s]  

{'session': 'F04-Session1-arrayMic-0035', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0035.wav', 'array': array([-0.00097656,  0.00015259, -0.00100708, ...,  0.00091553,
       -0.00326538, -0.00112915]), 'sampling_rate': 16000}, 'text': 'TIP', 'speaker_id': 'F04'}


  0%|          | 15/9638 [00:05<1:00:34,  2.65it/s]

{'session': 'F04-Session1-arrayMic-0039', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0039.wav', 'array': array([ 0.00259399,  0.00238037,  0.00463867, ...,  0.00311279,
       -0.00302124,  0.00064087]), 'sampling_rate': 16000}, 'text': 'GROW', 'speaker_id': 'F04'}
{'session': 'F04-Session1-arrayMic-0041', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0041.wav', 'array': array([-0.00567627, -0.00338745,  0.01153564, ...,  0.00708008,
        0.        ,  0.00143433]), 'sampling_rate': 16000}, 'text': 'TWO', 'speaker_id': 'F04'}


  0%|          | 16/9638 [00:05<1:02:01,  2.59it/s]

{'session': 'F04-Session1-arrayMic-0043', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0043.wav', 'array': array([-0.00128174, -0.00222778,  0.00213623, ..., -0.00311279,
        0.00064087,  0.00231934]), 'sampling_rate': 16000}, 'text': 'BUT HE ALWAYS ANSWERS BANANA OIL', 'speaker_id': 'F04'}


  0%|          | 17/9638 [00:06<1:05:32,  2.45it/s]

{'session': 'F04-Session1-arrayMic-0044', 'audio': {'path': '/work/van-speech-nlp/data/torgo/F04/Session1/wav_arrayMic/0044.wav', 'array': array([-0.00042725, -0.00021362, -0.00213623, ...,  0.00048828,
       -0.00161743,  0.00326538]), 'sampling_rate': 16000}, 'text': 'AIR', 'speaker_id': 'F04'}


  0%|          | 17/9638 [00:06<1:03:23,  2.53it/s]


KeyboardInterrupt: 