In [2]:
import logging
import os

import datasets
from datasets import load_dataset, Audio

# Enable INFO-level logging so library log records are shown in the cell output
logging.basicConfig(level=logging.INFO)

# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook: a literal token is exposed to everyone who can read the
# file or its history. When HF_TOKEN is unset this is None and no token is passed.
hf_token = os.environ.get("HF_TOKEN")

# Load the "ihm" configuration of the AMI dataset
dataset = load_dataset(
    "edinburghcstr/ami", "ihm",
    cache_dir='/work/van-speech-nlp/temp',
    use_auth_token=hf_token
)

# Print the number of data points in each split before filtering
print("Before filtering, the number of data in each dataset split is:")
for split, data in dataset.items():
    print(f"{split}: {len(data)}")

# Define the min and max input lengths in seconds (both bounds are exclusive)
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0

# Calculate input_length as the difference between end_time and begin_time.
# input_columns hands only the two timestamp columns to the callback, so the
# rest of the example (including the audio column) is not needed here.
dataset = dataset.map(
    lambda begin_time, end_time: {'input_length': end_time - begin_time},
    input_columns=['begin_time', 'end_time'],
)

# Keep only the samples whose input_length lies strictly between the bounds;
# again only the single column the predicate needs is passed in.
dataset = dataset.filter(
    lambda input_length: min_input_length_in_sec < input_length < max_input_length_in_sec,
    input_columns=['input_length'],
)

# Print the number of data points in each split after filtering
print("\nAfter filtering audio within a certain length, the number of data in each dataset split is:")
for split, data in dataset.items():
    print(f"{split}: {len(data)}")




Before filtering, the number of data in each dataset split is:
train: 108502
validation: 13098
test: 12643

After filtering audio within a certain length, the number of data in each dataset split is:
train: 66698
validation: 8351
test: 7546


In [None]:
# Inspect the train split: as the last expression of the cell, its repr is
# rendered as the cell output.
dataset['train']

In [None]:
import os
import json
from tqdm import tqdm

# Default file-name prefix of the generated manifests: "<identifier>_<split>.jsonl"
identifier = "ami"


def create_jsonl_file(dataset, output_dir, splits=("train", "validation", "test"), prefix=None):
    """Write one JSON Lines manifest per dataset split.

    Each sample of a split becomes one line holding a JSON object with the
    keys ``"key"`` (the sample's ``audio_id``), ``"source"`` (the sample's
    ``audio["path"]``) and ``"target"`` (the sample's ``text``, lower-cased).
    An existing manifest with the same name is overwritten.

    Args:
        dataset: Mapping from split name to an iterable of samples; every
            sample must provide ``audio_id``, ``audio["path"]`` and ``text``.
        output_dir: Directory that receives the manifests. It is created,
            together with any missing parents, when it does not exist.
        splits: Names of the splits to export. Defaults to the three splits
            ``train``, ``validation`` and ``test``.
        prefix: File-name prefix of the manifests. Defaults to the
            module-level ``identifier``.

    Raises:
        KeyError: If a requested split or a required sample field is missing.
        OSError: If the directory or a manifest file cannot be written.
    """
    if prefix is None:
        prefix = identifier

    # exist_ok=True replaces the separate exists() check and is not affected
    # by the directory appearing between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    for split in splits:
        jsonl_path = os.path.join(output_dir, f"{prefix}_{split}.jsonl")

        # Mode "w" truncates an existing file, so no explicit removal is
        # needed for a rerun to replace the previous manifest.
        with open(jsonl_path, "w", encoding="utf-8") as jsonl_file:
            for sample in tqdm(dataset[split], desc=f"Processing {split} split"):
                record = {
                    "key": sample["audio_id"],
                    "source": sample["audio"]["path"],
                    "target": sample["text"].lower(),
                }
                jsonl_file.write(json.dumps(record) + "\n")

        print(f"Generated {jsonl_path}")

# Absolute destination directory for the generated JSONL manifests
output_directory = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami"

# Write one manifest per split of `dataset` into that directory
create_jsonl_file(dataset, output_directory)