In [1]:
import sys
import os
import argparse
import re
import json
import torch
import logging
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from datasets import load_dataset, DatasetDict, Audio
from dataclasses import dataclass
from typing import Dict, List, Union
from evaluate import load
from tqdm import tqdm
from datetime import datetime

In [2]:
# --- Run configuration ---
# Speaker whose recordings are held out as the test split.
test_speaker = 'F01'
# Alias bound to the same value as test_speaker.
speaker_id = test_speaker
# When False, the 'test_data' flag column decides which rows stay in each split.
keep_all_data = False
# Debug switch (not read by any of the cells visible in this notebook section).
debug = False

In [3]:
# Location of the TORGO metadata CSV, relative to the notebook's working directory.
torgo_csv_path = './torgo.csv'

# The same file is loaded twice: as a Hugging Face dataset (exposed under the
# default 'train' split) for filtering/mapping, and as a pandas DataFrame for
# quick column-level queries such as listing the unique speakers.
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)
data_df = pd.read_csv(torgo_csv_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Verify that every column the later cells rely on is present in the CSV.
# 'session', 'audio', 'text' and 'speaker_id' are always required. 'test_data'
# is additionally required when keep_all_data is False, because the
# repeated-prompt filtering step reads (and then drops) that column; checking
# it here turns a late KeyError inside `filter` into an early, readable error.
expected_columns = ['session', 'audio', 'text', 'speaker_id']
if not keep_all_data:
    expected_columns.append('test_data')

available_columns = dataset_csv['train'].column_names
not_found_columns = [
    column for column in expected_columns if column not in available_columns
]

if not_found_columns:
    logging.error(
        "The following columns are not found in the dataset:" + " [" + ", ".join(not_found_columns) + "]")
    sys.exit(1)

In [5]:
print("Splitting the dataset into training / validation / test sets...")

# Every distinct speaker label that occurs in the CSV
speakers = data_df['speaker_id'].unique()

print("Unique speakers found in the dataset:")
print(str(speakers) + '\n')

# Stop right away when the requested held-out speaker has no rows
if test_speaker not in speakers:
    print("Test Speaker not found in the dataset.")
    sys.exit(1)

# Validation uses F03, unless F03 is itself the test speaker; then F04 is used
if test_speaker == 'F03':
    valid_speaker = 'F04'
else:
    valid_speaker = 'F03'

# Everyone who is neither the test nor the validation speaker is used for training
train_speaker = [
    candidate for candidate in speakers
    if candidate not in (test_speaker, valid_speaker)
]

print("Train speakers:", train_speaker)
print("Validation speaker:", valid_speaker)
print("Test speaker:", test_speaker)

# Speaker-disjoint splits, each carved out of the single 'train' split that
# load_dataset produced for the CSV (built in train / validation / test order)
full_split = dataset_csv['train']
torgo_dataset = DatasetDict({
    'train': full_split.filter(
        lambda spk: spk in train_speaker, input_columns=['speaker_id']),
    'validation': full_split.filter(
        lambda spk: spk == valid_speaker, input_columns=['speaker_id']),
    'test': full_split.filter(
        lambda spk: spk == test_speaker, input_columns=['speaker_id']),
})

print("Dataset split completed.")

Splitting the dataset into training / validation / test sets...
Unique speakers found in the dataset:
['F01' 'F03' 'F04' 'FC01' 'FC02' 'FC03' 'M01' 'M02' 'M03' 'M04' 'M05'
 'MC01' 'MC02' 'MC03' 'MC04']

Train speakers: ['F04', 'FC01', 'FC02', 'FC03', 'M01', 'M02', 'M03', 'M04', 'M05', 'MC01', 'MC02', 'MC03', 'MC04']
Validation speaker: F03
Test speaker: F01


Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Dataset split completed.


In [6]:
# Remember how many rows each split had before filtering so the retained
# share can be reported afterwards.
original_data_count = {
    split_name: len(torgo_dataset[split_name])
    for split_name in ('train', 'validation', 'test')
}


def _retained_percent(kept, total):
    """Return the integer percentage kept/total, or 0 when the split started empty.

    The guard avoids a ZeroDivisionError when a split has no rows to begin
    with (for example when the validation speaker is absent from the CSV).
    """
    return kept * 100 // total if total else 0


if not keep_all_data:
    # Rows flagged test_data == 1 are kept only in the test split;
    # rows flagged test_data == 0 are kept only in train and validation.
    torgo_dataset['train'] = torgo_dataset['train'].filter(lambda x: x['test_data'] == 0)
    torgo_dataset['validation'] = torgo_dataset['validation'].filter(lambda x: x['test_data'] == 0)
    torgo_dataset['test'] = torgo_dataset['test'].filter(lambda x: x['test_data'] == 1)

    # The flag column has served its purpose; drop it from every split
    for split_name in original_data_count:
        torgo_dataset[split_name] = torgo_dataset[split_name].remove_columns(['test_data'])

    print("After removal of repeated prompts, the number of data in each dataset is:")
    # (label, split, trailing text) - labels are padded to a common width of 13
    report_rows = (
        ('Train:', 'train', ''),
        ('Validation:', 'validation', ''),
        ('Test:', 'test', '\n'),
    )
    for label, split_name, tail in report_rows:
        kept = len(torgo_dataset[split_name])
        total = original_data_count[split_name]
        print(f'{label.ljust(13)}{kept}/{total} ({_retained_percent(kept, total)}%){tail}')

Filter:   0%|          | 0/15091 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1075 [00:00<?, ? examples/s]

Filter:   0%|          | 0/228 [00:00<?, ? examples/s]

After removal of repeated prompts, the number of data in each dataset is:
Train:       9749/15091 (64%)
Validation:  483/1075 (44%)
Test:        126/228 (55%)



In [7]:
# Remove special characters from the text.
# The character class covers punctuation, several quote marks, '%', the
# U+FFFD replacement character and the digits 0-9.
# It is written as a raw string: the previous non-raw literal depended on
# invalid escape sequences such as '\,' and '\?', which Python reports as
# warnings (and plans to reject). The set of matched characters is unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\`\�0-9]'


def remove_special_characters(batch):
    """Replace each ignored character in batch['text'] with a space and lowercase it.

    Args:
        batch: mapping with a string under the 'text' key (one dataset example).

    Returns:
        The same mapping object, with 'text' updated in place. Because each
        removed character becomes a single space, runs of spaces and
        leading/trailing spaces can remain in the result.
    """
    batch['text'] = re.sub(chars_to_ignore_regex, ' ', batch['text']).lower()
    return batch

# Apply the text clean-up to every split (train / validation / test) of the DatasetDict.
torgo_dataset = torgo_dataset.map(remove_special_characters)

Map:   0%|          | 0/9749 [00:00<?, ? examples/s]

Map:   0%|          | 0/483 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [8]:
# Peek at the first training example to sanity-check the remaining columns and the cleaned text.
torgo_dataset['train'][0]

{'session': 'F04-Session1-arrayMic-0007',
 'audio': '/F04/Session1/wav_arrayMic/0007.wav',
 'text': 'sheet',
 'speaker_id': 'F04'}

In [9]:
# Show the current working directory; the relative paths used in this notebook
# ('./torgo.csv', './<test_speaker>/') are resolved against it.
# os.getcwd() is used instead of the bare `pwd`, which is only understood
# through IPython's automagic and is not valid Python on its own.
os.getcwd()

'/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data'

In [10]:

def process_dataset(dataset, test_speaker):
    """Write every split of `dataset` to a JSON Lines file under ./<test_speaker>/.

    For each (split_name, split_data) pair one file named
    "<test_speaker>_<split_name>.jsonl" is created (or overwritten). Every
    entry becomes one line of the form
    {"key": <session>, "source": <audio>, "target": <text>}.

    Args:
        dataset: mapping of split name -> iterable of entries; each entry must
            provide the keys 'session', 'audio' and 'text'.
        test_speaker: label used both as the output directory name and as the
            file-name prefix.

    Returns:
        None. The path of each written file is printed.
    """
    # Output directory is named after the test speaker, relative to the working directory
    output_dir = os.path.join('.', test_speaker)

    # exist_ok=True replaces the separate exists() check, so re-running the
    # cell (or a concurrent creation of the directory) cannot raise here
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_data in dataset.items():
        jsonl_filename = os.path.join(output_dir, f"{test_speaker}_{split_name}.jsonl")
        # Explicit encoding keeps the output independent of the platform default
        with open(jsonl_filename, 'w', encoding='utf-8') as jsonl_file:
            for entry in split_data:
                json_entry = {
                    "key": entry['session'],
                    "source": entry['audio'],
                    "target": entry['text']
                }
                jsonl_file.write(json.dumps(json_entry) + '\n')
        print(f"{split_name} dataset saved to {jsonl_filename}")

# Export all splits for the configured held-out speaker.
# Pass the test_speaker variable itself: the string literal "speaker_id" used
# before made the files land in ./speaker_id/speaker_id_<split>.jsonl.
process_dataset(torgo_dataset, test_speaker=test_speaker)

train dataset saved to F01_train.jsonl
validation dataset saved to F01_validation.jsonl
test dataset saved to F01_test.jsonl


In [1]:
import os
import shutil

# Define the base directory (update this to your actual directory path)
base_dir = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/torgo"

# List all entries in the directory
files = os.listdir(base_dir)

# Move every "<prefix>_<rest>.jsonl" file into a sub-directory named after its prefix
for file_name in files:
    src_path = os.path.join(base_dir, file_name)

    # Only regular *.jsonl files are organized; directories and other files are left alone
    if not file_name.endswith(".jsonl") or not os.path.isfile(src_path):
        continue

    # Extract the prefix (e.g., "F01" from "F01_train.jsonl")
    prefix, separator, _rest = file_name.partition('_')
    if not separator or not prefix:
        # Without a "<prefix>_" part, split('_')[0] used to return the whole
        # file name, so makedirs was asked to create a directory at the file's
        # own path and raised FileExistsError. Such files are skipped instead.
        print(f"Skipping {file_name}: expected a name of the form <prefix>_<split>.jsonl")
        continue

    # Create the subdirectory if it doesn't exist
    sub_dir = os.path.join(base_dir, prefix)
    os.makedirs(sub_dir, exist_ok=True)

    # Move the file to the corresponding subdirectory
    dest_path = os.path.join(sub_dir, file_name)
    shutil.move(src_path, dest_path)

print("Files have been organized into individual folders.")

Files have been organized into individual folders.
