# adjust prompt

# convert with existing fields

In [5]:
import os
import json
from tqdm import tqdm

def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file at *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [stripped for stripped in map(str.strip, handle) if stripped]


def process_json_files_entrywise(input_folder1, input_folder2, output_folder):
    """
    Interleave same-named JSONL files from two folders, entry by entry.

    For every ``.jsonl`` file in ``input_folder1``, the file with the same
    name is read from ``input_folder2``. The output file (same name, written
    to ``output_folder``) contains entry 1 of the first file, entry 1 of the
    second file, entry 2 of the first file, entry 2 of the second file, and
    so on, one JSON document per line.

    Blank or whitespace-only lines (e.g. a trailing empty line at the end of
    a file) are ignored rather than being parsed as JSON or counted as
    entries.

    Args:
        input_folder1 (str): Path to the first input folder.
        input_folder2 (str): Path to the second input folder.
        output_folder (str): Path to the output folder (created if missing).

    Raises:
        FileNotFoundError: If a ``.jsonl`` file of ``input_folder1`` has no
            counterpart with the same name in ``input_folder2``.
        ValueError: If the two files of a pair hold a different number of
            entries. ``json.JSONDecodeError`` (a ``ValueError`` subclass) is
            raised when a non-blank line is not valid JSON.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sort so that runs are
    # reproducible and the progress bar advances in a predictable sequence.
    for file_name in tqdm(sorted(os.listdir(input_folder1)), desc="Processing Files"):
        if not file_name.endswith('.jsonl'):  # Only process .jsonl files
            continue

        input_file_path1 = os.path.join(input_folder1, file_name)
        input_file_path2 = os.path.join(input_folder2, file_name)
        output_file_path = os.path.join(output_folder, file_name)

        lines1 = _read_nonblank_lines(input_file_path1)
        lines2 = _read_nonblank_lines(input_file_path2)

        # Both files must pair up one-to-one before anything is written
        if len(lines1) != len(lines2):
            raise ValueError(f"Files {file_name} in both folders have different numbers of lines.")

        # Parse everything before opening the output file, so a malformed
        # line cannot leave a half-written output file behind.
        interleaved_data = []
        for line1, line2 in zip(lines1, lines2):
            interleaved_data.append(json.loads(line1))
            interleaved_data.append(json.loads(line2))

        # Save the interleaved data, one JSON document per line
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(
                json.dumps(record) + '\n' for record in interleaved_data
            )

# Dataset name; every input/output directory below is derived from it
folder_name = "aphasia"

base_path = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data"

# <base>/<name>, <base>/<name>_phoneme and <base>/<name>_phoneme_seperate
input_folder1, input_folder2, output_folder = (
    os.path.join(base_path, folder_name + suffix)
    for suffix in ("", "_phoneme", "_phoneme_seperate")
)

# Interleave the entries of the two input folders into the output folder
process_json_files_entrywise(input_folder1, input_folder2, output_folder)

Processing Files: 100%|██████████| 3/3 [00:05<00:00,  1.92s/it]
