In [1]:
from datasets import load_dataset, Dataset
from tqdm import tqdm
import os
import json

# Load train-100 dataset (100-hour subset of the clean training data)
# librispeech_train_100 = load_dataset(
#     "librispeech_asr", 
#     "clean", 
#     split="train.100", 
#     cache_dir="/work/van-speech-nlp/temp", 
#     trust_remote_code=True
# )


# # Load validation (dev-clean) dataset
# librispeech_val_360 = load_dataset(
#     "librispeech_asr", 
#     "clean", 
#     split="validation", 
#     cache_dir="/work/van-speech-nlp/temp", 
#     trust_remote_code=True
# )

# Keyword arguments shared by both LibriSpeech test-set loads.
_librispeech_load_kwargs = dict(
    split="test",
    cache_dir="/work/van-speech-nlp/temp",
    trust_remote_code=True,
)

# test-clean: the "test" split of the "clean" configuration
librispeech_test_clean = load_dataset("librispeech_asr", "clean", **_librispeech_load_kwargs)

# test-other: the "test" split of the "other" configuration
librispeech_test_other = load_dataset("librispeech_asr", "other", **_librispeech_load_kwargs)

Downloading data:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.6G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import os
import json
import re
from tqdm import tqdm
from g2p import make_g2p

# Dataset identifier; used as the prefix of the generated JSONL file names
identifier = "librispeech"

# Grapheme-to-phoneme converter built for the 'eng' -> 'eng-arpabet' mapping
transducer = make_g2p('eng', 'eng-arpabet')

def get_phonemes(sentence):
    """Return the G2P output for each whitespace-delimited token, joined by single spaces."""
    converted_tokens = []
    # Every maximal run of non-whitespace characters is converted on its own.
    for token_match in re.finditer(r'\S+', sentence):
        converted = transducer(token_match.group())
        converted_tokens.append(converted.output_string)
    return " ".join(converted_tokens)

def create_jsonl_file(dataset, split_name, output_dir):
    """Write one JSON line per sample of ``dataset`` to a JSONL file.

    The file is written to ``<output_dir>/<identifier>_<split_name>.jsonl``,
    where ``identifier`` is the module-level dataset identifier. A file already
    present at that path is overwritten. Each line is a JSON object with the
    keys ``key`` (sample id), ``source`` (audio path), ``target`` (lower-cased
    transcription) and ``phoneme`` (``get_phonemes`` applied to the target).

    Args:
        dataset: Iterable of samples. Each sample must provide ``sample['id']``,
            ``sample['audio']['path']`` and ``sample['text']``.
        split_name: Label used in the output file name and the progress bar.
        output_dir: Directory that receives the JSONL file; created (including
            parents) if it does not exist.

    Returns:
        None. The path of the generated file is printed when writing finishes.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory in between.
    os.makedirs(output_dir, exist_ok=True)

    jsonl_path = os.path.join(output_dir, f"{identifier}_{split_name}.jsonl")

    # Mode 'w' truncates any previous file, so no explicit removal is needed.
    # The encoding is pinned so the output does not depend on the platform locale.
    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for sample in tqdm(dataset, desc=f"Processing {split_name} split"):
            audio_id = sample['id']
            audio_path = sample['audio']['path']
            transcription = sample['text'].lower()

            # Phonemes are generated from the lower-cased transcription.
            phonemes = get_phonemes(transcription)

            json_data = {
                "key": audio_id,
                "source": audio_path,
                "target": transcription,
                "phoneme": phonemes
            }

            jsonl_file.write(json.dumps(json_data) + "\n")

    print(f"Generated {jsonl_path}")

# Destination directory for the generated JSONL files
output_directory = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech_asr_phoneme"

# The train and validation exports are disabled; to run them, load the
# corresponding datasets first and uncomment the matching call:
# create_jsonl_file(librispeech_train_100, "train.100", output_directory)
# create_jsonl_file(librispeech_val_360, "validation", output_directory)

# Export the two test splits
create_jsonl_file(
    dataset=librispeech_test_clean,
    split_name="test_clean",
    output_dir=output_directory,
)
create_jsonl_file(
    dataset=librispeech_test_other,
    split_name="test_other",
    output_dir=output_directory,
)

Processing train.100 split: 100%|██████████| 28539/28539 [35:18<00:00, 13.47it/s]


Generated /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech_asr_phoneme/librispeech_train.100.jsonl


Processing validation split: 100%|██████████| 2703/2703 [01:57<00:00, 22.95it/s]

Generated /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech_asr_phoneme/librispeech_validation.jsonl



