In [3]:
import os
import json
import random

def sample_n_json_entries(input_file, output_file, n=6, seed=42):
    """
    Randomly sample up to n entries from a JSONL file and write them to a new JSONL file.

    Blank (whitespace-only) lines are not treated as entries. Every sampled
    entry is written on its own newline-terminated line, even when the final
    line of the input file has no trailing newline.

    If the input contains no entries, a message is printed and the output
    file is not created.

    :param input_file: Path to the input JSONL file
    :param output_file: Path to save the output sampled JSONL file
    :param n: Maximum number of samples to select, default is 6. If the file
        has fewer entries, all of them are selected (in random order).
    :param seed: Random seed for reproducibility, default is 42
    :raises ValueError: If n is negative (raised by ``random.sample``)
    """
    # Seed the module-level generator so repeated calls give the same sample.
    random.seed(seed)

    # JSON text is UTF-8, so decode explicitly instead of relying on the
    # platform default. Normalise each kept line to end in exactly one '\n':
    # an unterminated last line would otherwise be glued to the following
    # record when it is sampled into a non-final position.
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = [line.rstrip('\n') + '\n' for line in f if line.strip()]

    if not entries:
        print(f"File {input_file} is empty or does not contain any lines.")
        return

    # random.sample rejects a sample size larger than the population, so cap it.
    sampled_lines = random.sample(entries, min(n, len(entries)))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(sampled_lines)

    print(f"Sampled {len(sampled_lines)} entries from {input_file} and saved to {output_file}.")


# Usage: write a 2-entry subset of test.jsonl next to the original file as
# test_small.jsonl.
input_folder = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme'
input_file_path = os.path.join(input_folder, 'test.jsonl')
output_file_path = os.path.join(input_folder, 'test_small.jsonl')

# A fixed seed keeps the chosen subset the same across re-runs.
sample_n_json_entries(
    input_file=input_file_path,
    output_file=output_file_path,
    n=2,
    seed=42,
)

Sampled 2 entries from /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme/test.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme/test_small.jsonl.
