In [1]:
import os
import json
import random

def sample_jsonl_files(input_folder, output_folder, sample_ratio=0.1, seed=42, file_names=None):
    """
    Randomly sample entries from JSONL files in the input folder and save them
    to files of the same name in the output folder.

    Each non-blank line of an input file is treated as one entry. Blank lines are
    ignored, and every written entry is terminated with a newline, so a final
    input line that lacks a trailing newline cannot be fused with another entry
    in the (shuffled) output.

    :param input_folder: Folder containing the input JSONL files
    :param output_folder: Folder to save the output sampled JSONL files (created if missing)
    :param sample_ratio: Fraction of entries to keep per file, between 0 and 1; default is 0.1 (10%).
                         The per-file count is truncated towards zero.
    :param seed: Random seed for reproducibility, default is 42
    :param file_names: Optional iterable of file names to process; defaults to
                       'ami_test.jsonl', 'ami_train.jsonl' and 'ami_validation.jsonl'
    :raises ValueError: If sample_ratio is not within [0, 1]
    :raises FileNotFoundError: If one of the input files does not exist
    """
    # Fail early with a clear message instead of an opaque error from random.sample
    if not 0 <= sample_ratio <= 1:
        raise ValueError(f"sample_ratio must be between 0 and 1, got {sample_ratio!r}")

    if file_names is None:
        file_names = ('ami_test.jsonl', 'ami_train.jsonl', 'ami_validation.jsonl')

    # Ensure the output folder exists (no error if it is already there)
    os.makedirs(output_folder, exist_ok=True)

    # Seed once, before the loop: the files draw consecutively from the same stream.
    # Note that this reseeds the module-level `random` generator.
    random.seed(seed)

    for file_name in file_names:
        input_file = os.path.join(input_folder, file_name)
        output_file = os.path.join(output_folder, file_name)

        # Read the entries, dropping blank lines and the line terminators
        with open(input_file, 'r', encoding='utf-8') as f:
            entries = [line.rstrip('\n') for line in f if line.strip()]

        # Determine how many entries to keep (truncated, so small files may yield 0)
        sample_size = int(len(entries) * sample_ratio)

        # Draw without replacement; the result is in random order, not file order
        sampled_entries = random.sample(entries, sample_size)

        # Write one entry per line, each with its own terminator
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(entry + '\n' for entry in sampled_entries)

        print(f"Sampled {sample_size} entries from {file_name} and saved to {output_file}.")


# Source folder holding the full AMI JSONL files, and the destination for the sampled copies.
input_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/ami'
output_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/ami-1h'

# Keep about one entry in sixty from each file; the fixed seed makes reruns pick the same entries.
# NOTE(review): the 1/60 ratio presumably relates to the "ami-1h" target name — confirm.
sample_jsonl_files(input_folder_path, output_folder_path, sample_ratio=1/60, seed=42)

Sampled 125 entries from ami_test.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/ami-1h/ami_test.jsonl.
Sampled 1111 entries from ami_train.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/ami-1h/ami_train.jsonl.
Sampled 139 entries from ami_validation.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_expriments/slam-llm/data/ami-1h/ami_validation.jsonl.
