In [1]:
import os
import json
import re
from tqdm import tqdm
from g2p import make_g2p

# Build the grapheme-to-phoneme transducer ('eng' -> 'eng-arpabet') once at
# module load; get_phonemes() reuses this single instance for every word.
transducer = make_g2p('eng', 'eng-arpabet')

def get_phonemes(sentence):
    """Return the space-separated G2P transcription of ``sentence``.

    The sentence is cut into maximal runs of non-whitespace characters; each
    run is passed through the module-level ``transducer`` and the resulting
    ``output_string`` values are joined with single spaces.  A sentence with
    no non-whitespace characters yields an empty string.
    """
    tokens = re.findall(r'\S+', sentence)
    return " ".join(transducer(token).output_string for token in tokens)

def _add_phonemes_to_file(input_file_path, output_file_path, file_name):
    """Copy one JSONL file to ``output_file_path``, adding a 'phoneme' field.

    Each non-blank line of the input is parsed as a JSON object. When the
    object has a non-empty 'target' value, its phoneme transcription (from
    ``get_phonemes``) is stored under the 'phoneme' key; other objects are
    copied through unchanged.

    Args:
        input_file_path: Path of the .jsonl file to read.
        output_file_path: Path of the .jsonl file to write.
        file_name: Name shown in the per-file progress bar.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (raw_line.strip() for raw_line in file)
        # Blank lines (e.g. a trailing empty line) carry no record; dropping
        # them here keeps json.loads from failing on an empty string.
        records = [line for line in stripped_lines if line]

    updated_data = []
    for line in tqdm(records, desc=f"Processing {file_name}"):
        data_dict = json.loads(line)
        target_sentence = data_dict.get('target', '')
        if target_sentence:
            data_dict['phoneme'] = get_phonemes(target_sentence)
        updated_data.append(data_dict)

    # The output is written only after every record has been processed, so a
    # failure part-way through the input never leaves a truncated output file.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for updated_record in updated_data:
            json.dump(updated_record, output_file)
            output_file.write('\n')


def process_json_files(input_folder, output_folder):
    """Add phoneme transcripts to every .jsonl file in ``input_folder``.

    For each file whose name ends with '.jsonl', a file of the same name is
    written to ``output_folder`` (created if missing). Files with any other
    extension are ignored. Blank lines in the input are skipped rather than
    treated as malformed records.

    Args:
        input_folder: Directory containing the source .jsonl files.
        output_folder: Directory that receives the updated .jsonl files.
    """
    os.makedirs(output_folder, exist_ok=True)

    for file_name in tqdm(os.listdir(input_folder), desc="Processing Files"):
        if not file_name.endswith('.jsonl'):
            continue
        _add_phonemes_to_file(
            os.path.join(input_folder, file_name),
            os.path.join(output_folder, file_name),
            file_name,
        )

# Folder containing the source .jsonl files (change to your own path).
input_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/librispeech-100/"
# Folder that receives the phoneme-augmented .jsonl files (change to your own path).
output_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/librispeech-100_phoneme"

# Convert every .jsonl file found in the input folder.
process_json_files(input_folder=input_folder, output_folder=output_folder)

Processing dev-clean.jsonl: 100%|██████████| 2703/2703 [01:43<00:00, 26.15it/s]
Processing loaded_librispeech_test_clean.jsonl: 100%|██████████| 2620/2620 [01:39<00:00, 26.33it/s]
Processing loaded_librispeech_test_other.jsonl: 100%|██████████| 2939/2939 [01:39<00:00, 29.53it/s]
Processing train-clean-100.jsonl: 100%|██████████| 28539/28539 [32:01<00:00, 14.85it/s]
Processing Files: 100%|██████████| 4/4 [37:04<00:00, 556.24s/it]


: 