In [2]:
import pandas as pd
import os
import json
from tqdm import tqdm

# Input locations: the directory holding the audio files and the CSV that
# describes the dataset (one row per audio file, including its split).
audio_root = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/aphasia/data_processed/audios"
csv_path = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/aphasia/data_processed/dataset_splitted.csv"

# Load the whole CSV into memory as a DataFrame
df = pd.read_csv(csv_path)

# One independent, initially empty list per split; records are appended later
data_splits = {split_name: [] for split_name in ("train", "validation", "test")}

# Turn every CSV row into one ASR record and file it under its split.
#
# A row is skipped, with a message, when
#   * any of the four fields read below is empty (pandas loads empty cells as
#     NaN, which would make os.path.join raise or be serialised by json.dumps
#     as the non-standard JSON token NaN), or
#   * its audio file is not present on disk.
# Rows whose split label is not a key of data_splits are counted and reported
# after the loop instead of disappearing silently.
unknown_split_counts = {}  # split label -> number of rows dropped for it

# Walking the four columns in lockstep avoids the per-row Series that
# df.iterrows() builds; these are the only columns the loop reads.
row_fields = zip(df['split'], df['file_cut'], df['transcriptions'], df['folder_name'])
for split, file_name, transcription, folder_name in tqdm(
    row_fields, total=len(df), desc="Processing rows"
):
    if any(pd.isna(value) for value in (split, file_name, transcription, folder_name)):
        # tqdm.write prints the message without corrupting the progress bar
        tqdm.write(
            "Skipping row with missing field(s): "
            f"split={split!r}, folder_name={folder_name!r}, "
            f"file_cut={file_name!r}, transcriptions={transcription!r}"
        )
        continue

    audio_path = os.path.join(audio_root, folder_name, file_name)

    # isfile (rather than exists) so that a directory at this path is not
    # mistaken for an audio file.
    if not os.path.isfile(audio_path):
        tqdm.write(f"Audio file {audio_path} not found")
        continue

    if split not in data_splits:
        unknown_split_counts[split] = unknown_split_counts.get(split, 0) + 1
        continue

    # Create the JSON object
    json_object = {
        "key": f"{file_name}_ASR",
        "source": audio_path,
        "target": transcription,
        "prompt": "Transcribe Speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated."
    }

    # Append to the appropriate split
    data_splits[split].append(json_object)

# Make dropped rows visible so that the per-split totals can be reconciled
# with the number of rows in the CSV.
for unknown_label, dropped_rows in unknown_split_counts.items():
    print(f"Dropped {dropped_rows} rows with unrecognised split {unknown_label!r}")

# Dump every split to "<split>.jsonl" (a relative path, so the file lands in
# the current working directory), one JSON object per line, and report how
# many records each split holds.
for split_name, records in data_splits.items():
    out_path = f"{split_name}.jsonl"
    with open(out_path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in records)
    print(f"{split_name.capitalize()} set: {len(records)} entries")

print("JSONL files created for train, validation, and test sets.")


Processing rows: 100%|██████████| 120747/120747 [01:30<00:00, 1330.88it/s]


Train set: 95353 entries
Validation set: 13162 entries
Test set: 12232 entries
JSONL files created for train, validation, and test sets.
