In [1]:
import os
import json

def filter_even_lines(input_file, output_file):
    """
    Keep every second record of a JSONL file and write the result to a new JSONL file.

    Despite the function name, the lines kept are those at ODD 0-based positions,
    i.e. the 2nd, 4th, 6th, ... physical lines of the file (the even line numbers
    when counting from 1). For a file made of consecutive record pairs this keeps
    the second record of each pair.

    Each kept line is parsed with ``json.loads`` and re-serialised with
    ``json.dumps``, one record per output line. A selected line that is empty or
    whitespace-only (for example a trailing blank line) is skipped instead of
    being passed to the JSON parser.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write. It is opened only after the
            input has been read completely, so it may be the same path as
            ``input_file``.

    Raises:
        json.JSONDecodeError: If a selected, non-blank line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    kept_entries = []

    # Stream the input instead of materialising every line with readlines();
    # only the kept records are held in memory.
    with open(input_file, 'r', encoding='utf-8') as infile:
        for index, line in enumerate(infile):
            # Positions 0, 2, 4, ... (1st, 3rd, 5th, ... lines) are dropped.
            if index % 2 == 0:
                continue
            stripped = line.strip()
            if not stripped:
                # Blank lines still count towards the position but carry no record.
                continue
            kept_entries.append(json.loads(stripped))

    # Write only after reading has finished so in-place filtering stays safe.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in kept_entries:
            outfile.write(json.dumps(entry) + "\n")

    print(f"Processed {len(kept_entries)} entries and saved to {output_file}")

def process_all_jsonl_files(input_dir, output_dir):
    """
    Run ``filter_even_lines`` on every ``.jsonl`` file found directly in ``input_dir``.

    For each matching file, a file with the same name is written to
    ``output_dir`` containing every second line of the input (the 2nd, 4th,
    6th, ... lines). Files are processed in sorted name order so the progress
    output is the same on every run. Sub-directories are not descended into.

    Args:
        input_dir: Directory whose ``.jsonl`` files are read.
        output_dir: Directory the filtered files are written to; created
            (including missing parents) if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".jsonl"):
            continue

        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, filename)

        filter_even_lines(input_file, output_file)

# Example usage: filter every .jsonl file in input_folder and write a file with
# the same name into output_folder.
# NOTE(review): "seperate" is the spelling used in the path itself; presumably it
# matches the directory on disk, so do not correct it here without renaming there.
input_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_seperate"
output_folder = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_only"
process_all_jsonl_files(input_folder, output_folder)

Processed 7545 entries and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_only/test.jsonl
Processed 66681 entries and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_only/train.jsonl
Processed 8348 entries and saved to /work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami_phoneme_only/validation.jsonl


# Clean up the file names

In [3]:
import os
import re

def clean_filenames(directory):
    """
    Rename the LibriSpeech split files in ``directory`` to short split names.

    Renames:
    - librispeech-100_phoneme_seperate_train.jsonl -> train.jsonl
    - librispeech-100_phoneme_seperate_val.jsonl -> val.jsonl
    - librispeech-100_phoneme_seperate_test.jsonl -> test.jsonl

    Only files whose ENTIRE name matches one of the three names above are
    renamed; names that merely start with such a prefix (for example
    ``..._train.jsonl.bak``) are left untouched.

    Note: ``os.rename`` is used as-is, so on POSIX systems an already existing
    ``train.jsonl`` / ``val.jsonl`` / ``test.jsonl`` is silently replaced, while
    on Windows the rename raises ``FileExistsError``.

    Args:
        directory: Directory containing the files to rename.
    """
    # "seperate" is intentionally spelled as in the on-disk file names.
    pattern = re.compile(r"librispeech-100_phoneme_seperate_(train|val|test)\.jsonl")

    # Sorted for a deterministic order of renames and log lines.
    for filename in sorted(os.listdir(directory)):
        # fullmatch (not match) so trailing text after ".jsonl" disqualifies a file.
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        # The captured group is the split name: train, val or test.
        new_name = f"{match.group(1)}.jsonl"
        old_path = os.path.join(directory, filename)
        new_path = os.path.join(directory, new_name)

        os.rename(old_path, new_path)
        print(f"Renamed: {filename} -> {new_name}")

# Example usage: rename the librispeech-100_phoneme_seperate_{train,val,test}.jsonl
# files found in this directory to train.jsonl / val.jsonl / test.jsonl.
directory_path = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme_only"
clean_filenames(directory_path)

Renamed: librispeech-100_phoneme_seperate_test.jsonl -> test.jsonl
Renamed: librispeech-100_phoneme_seperate_train.jsonl -> train.jsonl
Renamed: librispeech-100_phoneme_seperate_val.jsonl -> val.jsonl


In [6]:
import os

def count_lines_and_preview_jsonl(directory, preview_lines=4):
    """
    Print a line count and a short preview for every ``.jsonl`` file in ``directory``.

    For each file (in sorted name order) this prints the file name with its
    number of lines, then up to ``preview_lines`` leading lines with surrounding
    whitespace stripped, then a 50-character dash separator.

    Args:
        directory: Directory to scan; only entries ending in ``.jsonl`` are read.
        preview_lines: Maximum number of leading lines to print per file.
            Defaults to 4.
    """
    # Sorted so the report order does not depend on the filesystem.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".jsonl"):
            continue

        file_path = os.path.join(directory, filename)

        # Stream the file: count every line but retain only the preview lines,
        # so large files are not loaded into memory in full.
        line_count = 0
        preview = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line_count += 1
                if len(preview) < preview_lines:
                    preview.append(line)

        print(f"{filename}: {line_count} lines")
        print(f"First {preview_lines} lines:")

        for i, line in enumerate(preview, start=1):
            print(f"Line {i}: {line.strip()}")

        print("-" * 50)  # Separator for readability

# Example usage: print the line count and a short preview of every .jsonl file
# in the directory below.
# NOTE(review): directory_path is also assigned in other cells of this notebook,
# so its value depends on which cell ran last.
directory_path = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme_seperate"
count_lines_and_preview_jsonl(directory_path)

librispeech-100_phoneme_seperate_test.jsonl: 5240 lines
First 4 lines:
Line 1: {"key": "6930-75918-0000", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-clean/6930/75918/6930-75918-0000.flac", "prompt": "Transcribe Speech to text.", "target": "concord returned to its place amidst the tents"}
Line 2: {"key": "6930-75918-0000", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-clean/6930/75918/6930-75918-0000.flac", "prompt": "Transcribe Speech to phonemes.", "target": "K AA N K AO R D  R IH T ER N D  T UW  IH T S  P L EY S  AH M IH D S T  DH AH  T EH N T S "}
Line 3: {"key": "6930-75918-0001", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-clean/6930/75918/6930-75918-0001.flac", "prompt": "Transcribe Speech to text.", "target": "the english forwarded to the french baskets of flowers of which they had made a plentiful provision to greet the arrival of the young princess the french in return invited the english to a supper which was to be given the n

In [7]:
# Point directory_path at the "phoneme_only" directory and print the same
# line-count / preview report for the .jsonl files found there.
directory_path = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/librispeech-100_phoneme_only"
count_lines_and_preview_jsonl(directory_path)

loaded_librispeech_test_other.jsonl: 2939 lines
First 4 lines:
Line 1: {"key": "7902-96591-0000", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-other/7902/96591/7902-96591-0000.flac", "prompt": "Transcribe Speech to phonemes.", "target": "AY  AE M  F R AH M  DH AH  K AH T ER  L AY IH NG  AO F  DH AH  K OW S T "}
Line 2: {"key": "7902-96591-0001", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-other/7902/96591/7902-96591-0001.flac", "prompt": "Transcribe Speech to phonemes.", "target": "D AA N 'T IY  K R AY  HH IY  S EH D  AY  W AA Z  AH B L AY JH D  T UW  K AH M "}
Line 3: {"key": "7902-96591-0002", "source": "/work/van-speech-nlp/librispeech/LibriSpeech/test-other/7902/96591/7902-96591-0002.flac", "prompt": "Transcribe Speech to phonemes.", "target": "AH N D  AH N D  Y UW  HH AE V  N AA T  F AW N D  AW T  EH N IY TH IH NG  K EY M  IH N  K W IH K  F R AY T AH N D  T OW N Z "}
Line 4: {"key": "7902-96591-0003", "source": "/work/van-speech-nlp/librispeech/Lib