In [3]:
import os
import json
import re
from tqdm import tqdm

def process_json_files(input_folder, output_folder):
    """Split every ``.jsonl`` file in ``input_folder`` into per-task records.

    Each non-blank input line must hold one JSON object. For every object a
    record with the prompt "Transcribe Speech to text." and the object's
    ``target`` value is emitted. When the object has a non-empty ``phoneme``
    value, a second record with the prompt "Transcribe Speech to phonemes."
    follows, carrying the phoneme value in its ``target`` field. ``key`` and
    ``source`` are copied through; missing fields default to ``''``.

    Output files are named after the split only: everything before a trailing
    ``train.jsonl``, ``validation.jsonl`` or ``test.jsonl`` is dropped, so
    ``ami_phoneme_test.jsonl`` is written as ``test.jsonl``. Names that do not
    end that way are kept unchanged.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.jsonl`` files.
        output_folder: Directory receiving the converted files; created if
            it does not exist.

    Raises:
        ValueError: If two input files map to the same output name. The check
            runs before any data file is written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Resolve all output names up front. Stripping the prefix can map two
    # inputs (e.g. "a_train.jsonl" and "b_train.jsonl") onto one output, and
    # the file processed last would silently overwrite the other.
    planned = {}  # output file name -> input file name
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.jsonl'):  # Only process .jsonl files
            continue
        # Use regex to remove everything before 'train', 'validation', or 'test'
        new_file_name = re.sub(r'.*(train|validation|test)\.jsonl', r'\1.jsonl', file_name)
        if new_file_name in planned:
            raise ValueError(
                f"{planned[new_file_name]!r} and {file_name!r} would both be "
                f"written to {new_file_name!r}"
            )
        planned[new_file_name] = file_name

    for new_file_name, file_name in tqdm(planned.items(), desc="Processing Files"):
        input_file_path = os.path.join(input_folder, file_name)
        output_file_path = os.path.join(output_folder, new_file_name)

        # Read the whole input before the output is opened, so an output path
        # equal to the input path cannot truncate the file while it is read.
        with open(input_file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()

        updated_data = []
        for line in tqdm(data, desc=f"Processing {file_name}"):
            line = line.strip()
            if not line:
                # Blank lines are not records; json.loads would raise on them.
                continue

            data_dict = json.loads(line)
            key = data_dict.get('key', '')
            source = data_dict.get('source', '')

            # Text-transcription record: always emitted
            updated_data.append({
                "key": key,
                "source": source,
                "prompt": "Transcribe Speech to text.",
                "target": data_dict.get('target', ''),
            })

            # Phoneme-transcription record: only when a phoneme transcript exists
            phoneme_transcript = data_dict.get('phoneme', '')
            if phoneme_transcript:
                updated_data.append({
                    "key": key,
                    "source": source,
                    "prompt": "Transcribe Speech to phonemes.",
                    "target": phoneme_transcript,
                })

        # Write one JSON object per line
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for updated_record in updated_data:
                json.dump(updated_record, output_file)
                output_file.write('\n')

# Folder read by process_json_files and folder it writes to.
# Both are absolute paths; adjust them to your environment.
# NOTE: the spelling "seperate" is part of the path literal as used here.
input_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme/"
output_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_seperate"

# Convert every .jsonl file found in input_folder
process_json_files(input_folder=input_folder, output_folder=output_folder)

Processing ami_phoneme_test.jsonl: 100%|██████████| 7546/7546 [00:00<00:00, 120075.49it/s]
Processing ami_phoneme_train.jsonl: 100%|██████████| 66698/66698 [00:00<00:00, 152910.05it/s]
Processing ami_phoneme_validation.jsonl: 100%|██████████| 8351/8351 [00:00<00:00, 249626.79it/s]
Processing Files: 100%|██████████| 3/3 [00:02<00:00,  1.20it/s]


# Convert with existing fields

In [1]:
import os
import json
import re
from tqdm import tqdm

def process_json_files(input_folder, output_folder):
    """Split every ``.jsonl`` file in ``input_folder`` into per-task records.

    Each non-blank input line must hold one JSON object. For every object a
    record with the prompt "Transcribe Speech to text." and the object's
    ``target`` value is emitted. When the object has a non-empty ``phoneme``
    value, a second record with the prompt "Transcribe Speech to phonemes."
    follows, carrying the phoneme value in its ``target`` field. ``key`` and
    ``source`` are copied through; missing fields default to ``''``.

    Every output file keeps the name of the input file it was produced from.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.jsonl`` files.
        output_folder: Directory receiving the converted files; created if
            it does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all files in the input folder
    for file_name in tqdm(os.listdir(input_folder), desc="Processing Files"):
        if not file_name.endswith('.jsonl'):  # Only process .jsonl files
            continue

        input_file_path = os.path.join(input_folder, file_name)
        output_file_path = os.path.join(output_folder, file_name)

        # Read the whole input before the output is opened, so an output path
        # equal to the input path cannot truncate the file while it is read.
        with open(input_file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()

        updated_data = []
        for line in tqdm(data, desc=f"Processing {file_name}"):
            line = line.strip()
            if not line:
                # Blank lines are not records; json.loads would raise on them.
                continue

            data_dict = json.loads(line)
            key = data_dict.get('key', '')
            source = data_dict.get('source', '')

            # Text-transcription record: always emitted
            updated_data.append({
                "key": key,
                "source": source,
                "prompt": "Transcribe Speech to text.",
                "target": data_dict.get('target', ''),
            })

            # Phoneme-transcription record: only when a phoneme transcript exists
            phoneme_transcript = data_dict.get('phoneme', '')
            if phoneme_transcript:
                updated_data.append({
                    "key": key,
                    "source": source,
                    "prompt": "Transcribe Speech to phonemes.",
                    "target": phoneme_transcript,
                })

        # Write one JSON object per line
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for updated_record in updated_data:
                json.dump(updated_record, output_file)
                output_file.write('\n')

# Source folder and destination folder handed to process_json_files.
# Both are absolute paths; change them to match your own setup.
# NOTE: the spelling "seperate" is part of the path literal as used here.
input_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme/"
output_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_seperate"

# Run the conversion over each .jsonl file in input_folder
process_json_files(input_folder=input_folder, output_folder=output_folder)

Processing ami_phoneme_test.jsonl: 100%|██████████| 7546/7546 [00:00<00:00, 155781.95it/s]
Processing ami_phoneme_train.jsonl: 100%|██████████| 66698/66698 [00:00<00:00, 139948.20it/s]
Processing ami_phoneme_validation.jsonl: 100%|██████████| 8351/8351 [00:00<00:00, 247260.20it/s]
Processing Files: 100%|██████████| 3/3 [00:02<00:00,  1.17it/s]


In [3]:
import os

# Define the directory where the files are located
directory = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme_seperate"

# Get the folder name dynamically. normpath drops a trailing separator first:
# os.path.basename("some/dir/") is "", which would leave nothing to match on.
folder_name = os.path.basename(os.path.normpath(directory))  # Extracts 'librispeech-100_phoneme_seperate'

# Find the part of the folder name before the "_seperate" to use it for comparison
base_name = folder_name.split('_seperate')[0]

old_prefix = f'{base_name}_'
new_prefix = f'{folder_name}_'

# Iterate through all files in the directory (sorted for a stable report order)
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith('.jsonl'):
        continue

    # new_prefix itself starts with old_prefix, so a file that was already
    # renamed would match again and gain another "_seperate" on every re-run.
    if file_name.startswith(new_prefix) or not file_name.startswith(old_prefix):
        continue

    # Create the new file name by replacing the prefix with the folder name
    new_file_name = file_name.replace(old_prefix, new_prefix, 1)

    # Construct full file paths
    old_file_path = os.path.join(directory, file_name)
    new_file_path = os.path.join(directory, new_file_name)

    # os.rename silently replaces an existing destination on POSIX; never clobber data
    if os.path.exists(new_file_path):
        print(f"Skipped: {file_name} ({new_file_name} already exists)")
        continue

    # Rename the file
    os.rename(old_file_path, new_file_path)

    # Print the renaming operation for verification
    print(f"Renamed: {file_name} -> {new_file_name}")

Renamed: librispeech-100_phoneme_test.jsonl -> librispeech-100_phoneme_seperate_test.jsonl
Renamed: librispeech-100_phoneme_train.jsonl -> librispeech-100_phoneme_seperate_train.jsonl
Renamed: librispeech-100_phoneme_val.jsonl -> librispeech-100_phoneme_seperate_val.jsonl
