In [None]:
import os
import json
import re
from tqdm import tqdm

def _target_and_phoneme_entries(record):
    """Build the output entries for one parsed input record.

    Always returns an entry carrying the ``target`` text; a second entry
    carrying the ``phoneme`` transcript is appended only when that field is
    present and non-empty. Missing fields default to an empty string.
    """
    key = record.get('key', '')
    source = record.get('source', '')

    entries = [
        {
            "key": key,
            "source": source,
            "prompt": "Transcribe Speech to text.",
            "target": record.get('target', ''),
        }
    ]

    phoneme_transcript = record.get('phoneme', '')
    if phoneme_transcript:
        entries.append(
            {
                "key": key,
                "source": source,
                "prompt": "Transcribe Speech to phonemes.",
                "target": phoneme_transcript,
            }
        )
    return entries


def process_json_files(input_folder: str) -> None:
    """Split every ``.jsonl`` file in ``input_folder`` into per-task records.

    Each non-blank input line is parsed as a JSON object. It yields one record
    holding the ``target`` text with the prompt "Transcribe Speech to text."
    and, when the ``phoneme`` field is non-empty, a second record holding the
    phoneme transcript with the prompt "Transcribe Speech to phonemes.". Both
    records copy ``key`` and ``source`` from the input line.

    Output is written as JSON Lines into ``<input_folder>_separate`` (created
    if it does not exist). In the output file name, everything before a
    trailing ``train``/``validation``/``test`` split name is dropped, so
    ``foo_train.jsonl`` becomes ``train.jsonl``; other names are kept as-is.

    Args:
        input_folder: Directory containing the ``.jsonl`` files to process.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Output folder is the input folder name with "_separate" appended.
    output_folder = f"{input_folder.rstrip(os.sep)}_separate"
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file_name in tqdm(sorted(os.listdir(input_folder)), desc="Processing Files"):
        if not file_name.endswith('.jsonl'):
            continue

        # Strip everything before 'train', 'validation', or 'test' in the name.
        # NOTE: inputs sharing a split suffix (e.g. a_train.jsonl and
        # b_train.jsonl) map to the same output file; the later one overwrites.
        new_file_name = re.sub(r'.*(train|validation|test)\.jsonl', r'\1.jsonl', file_name)

        input_file_path = os.path.join(input_folder, file_name)
        output_file_path = os.path.join(output_folder, new_file_name)

        # Read all lines up front so the inner progress bar knows its total.
        with open(input_file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()

        updated_data = []
        for line in tqdm(data, desc=f"Processing {file_name}"):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) are not records;
                # json.loads would raise on them.
                continue
            updated_data.extend(_target_and_phoneme_entries(json.loads(stripped)))

        # Write only after the whole input parsed, one JSON object per line.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for updated_record in updated_data:
                json.dump(updated_record, output_file)
                output_file.write('\n')

# Folder holding the source .jsonl files -- update this to your input folder path.
input_folder = (
    "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/psst_phoneme"
)

# Convert every .jsonl file found in that folder.
process_json_files(input_folder=input_folder)