In [1]:
import os
import json
import random

def sample_jsonl_files(input_folder, output_folder, sample_ratio=0.1, seed=42, file_names=None):
    """
    Randomly sample entries from JSONL files in the input folder and save to new JSONL files in the output folder.

    Each file is read fully into memory. Blank (whitespace-only) lines are not
    counted as entries, and every written entry is newline-terminated so that
    records can never be fused together in the output.

    :param input_folder: Folder containing the input JSONL files
    :param output_folder: Folder to save the output sampled JSONL files (created if missing)
    :param sample_ratio: Sampling ratio in [0, 1], default is 0.1 (10%). The number of
        sampled entries per file is ``int(num_entries * sample_ratio)`` (rounded down)
    :param seed: Random seed for reproducibility, default is 42
    :param file_names: Optional iterable of file names to process, in order. Defaults to
        ``['ami_test.jsonl', 'ami_train.jsonl', 'ami_validation.jsonl']``
    :raises ValueError: If ``sample_ratio`` is outside [0, 1]
    """
    # Fail early with a clear message instead of an opaque error from random.sample.
    if not 0 <= sample_ratio <= 1:
        raise ValueError(f"sample_ratio must be between 0 and 1, got {sample_ratio!r}")

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Set random seed for reproducibility. The generator is seeded once and shared
    # across files, so the processing order below is part of the reproducible result.
    random.seed(seed)

    # List of JSONL files to process
    if file_names is None:
        file_names = ['ami_test.jsonl', 'ami_train.jsonl', 'ami_validation.jsonl']

    for file_name in file_names:
        input_file = os.path.join(input_folder, file_name)
        output_file = os.path.join(output_folder, file_name)

        # Read all entries, skipping blank lines and making sure each one ends
        # with a newline (the last line of a file often does not).
        with open(input_file, 'r', encoding='utf-8') as f:
            entries = [
                line if line.endswith('\n') else line + '\n'
                for line in f
                if line.strip()
            ]

        # Determine the number of entries to sample
        num_entries = len(entries)
        sample_size = int(num_entries * sample_ratio)

        # Randomly sample indices (without replacement)
        sample_indices = random.sample(range(num_entries), sample_size)

        # Write the sampled entries, in sampled order, to the new JSONL file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(entries[i] for i in sample_indices)

        print(f"Sampled {sample_size} entries from {file_name} and saved to {output_file}.")


# Source folder holding the AMI JSONL files, and the destination folder for
# the down-sampled copies (same file names are reused in the destination).
input_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami'
output_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami-1h'

# Keep 1/60 of the entries of each file, with a fixed seed so the subset is repeatable.
# NOTE(review): the `ami-1h` folder name suggests the 1/60 ratio targets roughly
# one hour of audio — TODO confirm against the dataset's total duration.
sample_jsonl_files(
    input_folder=input_folder_path,
    output_folder=output_folder_path,
    sample_ratio=1/60,
    seed=42
)

Sampled 125 entries from ami_test.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami-1h/ami_test.jsonl.
Sampled 1111 entries from ami_train.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami-1h/ami_train.jsonl.
Sampled 139 entries from ami_validation.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami-1h/ami_validation.jsonl.


In [2]:
import os
import json
import random

def sample_one_json_entry(input_folder, output_folder, seed=42):
    """
    Randomly sample one entry from each JSONL file in the input folder and save to new JSONL files in the output folder.

    Files are processed in sorted name order so that, for a given seed, the
    entry chosen from each file does not depend on the order in which the
    filesystem happens to list the directory. Blank (whitespace-only) lines are
    not counted as entries, and the written entry is always newline-terminated.
    Files with no entries are reported and produce no output file.

    :param input_folder: Folder containing the input JSONL files
    :param output_folder: Folder to save the output sampled JSONL files (created if missing)
    :param seed: Random seed for reproducibility, default is 42
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Set random seed for reproducibility (one generator shared across all files)
    random.seed(seed)

    # List all JSONL files in the input folder. os.listdir returns names in
    # arbitrary order, so sort them to keep the per-file draws reproducible.
    jsonl_files = sorted(file for file in os.listdir(input_folder) if file.endswith('.jsonl'))

    for file_name in jsonl_files:
        input_file = os.path.join(input_folder, file_name)
        output_file = os.path.join(output_folder, file_name)

        # Read all non-blank lines from the input JSONL file
        with open(input_file, 'r', encoding='utf-8') as f:
            entries = [line for line in f if line.strip()]

        if not entries:
            print(f"File {file_name} is empty or does not contain any lines.")
            continue

        # Randomly select one entry from the file
        sampled_line = entries[random.randint(0, len(entries) - 1)]
        # The last line of a file may lack a newline; keep the output well-formed.
        if not sampled_line.endswith('\n'):
            sampled_line += '\n'

        # Write the sampled entry to the new JSONL file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(sampled_line)

        print(f"Sampled one entry from {file_name} and saved to {output_file}.")


# Usage
# Write a one-entry copy of every .jsonl file found in the
# `librispeech-100_phoneme` folder into the sibling
# `librispeech-100_phoneme-test` folder, keeping the original file names.
# NOTE(review): presumably used as a tiny smoke-test dataset — TODO confirm.
input_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme'
output_folder_path = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme-test'

# Fixed seed so the same entries are picked on every run.
sample_one_json_entry(
    input_folder=input_folder_path,
    output_folder=output_folder_path,
    seed=42
)


Sampled one entry from librispeech-100_phoneme_test.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme-test/librispeech-100_phoneme_test.jsonl.
Sampled one entry from librispeech-100_phoneme_train.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme-test/librispeech-100_phoneme_train.jsonl.
Sampled one entry from librispeech-100_phoneme_val.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme-test/librispeech-100_phoneme_val.jsonl.
Sampled one entry from loaded_librispeech_test_other.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme-test/loaded_librispeech_test_other.jsonl.


In [1]:
import os
import json
import random

def sample_n_json_entries(input_file, output_file, n=6, seed=42):
    """
    Randomly sample n entries from the input JSONL file and save to a new JSONL file.

    Entries are sampled without replacement and written in sampled order. If the
    file holds fewer than ``n`` entries, all of them are written. Blank
    (whitespace-only) lines are not counted as entries, and every written entry
    is newline-terminated so that records can never be fused together. If the
    file has no entries, a message is printed and no output file is written.

    :param input_file: Path to the input JSONL file
    :param output_file: Path to save the output sampled JSONL file
    :param n: Number of samples to select, default is 6
    :param seed: Random seed for reproducibility, default is 42
    :raises ValueError: If ``n`` is negative
    """
    # Fail early with a clear message instead of an opaque error from random.sample.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")

    # Set random seed for reproducibility
    random.seed(seed)

    # Read all entries, skipping blank lines and making sure each one ends
    # with a newline (the last line of a file often does not).
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = [
            line if line.endswith('\n') else line + '\n'
            for line in f
            if line.strip()
        ]

    if not entries:
        print(f"File {input_file} is empty or does not contain any lines.")
        return

    # Randomly select up to n entries from the file
    num_entries = len(entries)
    sample_indices = random.sample(range(num_entries), min(n, num_entries))
    sampled_lines = [entries[i] for i in sample_indices]

    # Write the sampled entries to the new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(sampled_lines)

    print(f"Sampled {len(sampled_lines)} entries from {input_file} and saved to {output_file}.")


# Usage
# Draw a 20-entry subset of `test.jsonl` in the `psst_phoneme` data folder and
# write it next to the source file as `test_small.jsonl`.
input_folder = '/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/psst_phoneme'
input_file_path = os.path.join(input_folder, 'test.jsonl')
output_file_path = os.path.join(input_folder, 'test_small.jsonl')

# Fixed seed so the same 20 entries are picked on every run.
sample_n_json_entries(
    input_file=input_file_path,
    output_file=output_file_path,
    n=20,
    seed=42
)

Sampled 20 entries from /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/psst_phoneme/test.jsonl and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/psst_phoneme/test_small.jsonl.
