In [None]:
import logging
import os

import datasets
from datasets import load_dataset, Audio

# Load the AMI dataset
# SECURITY: the access token is read from the HF_TOKEN environment variable so
# that no credential is stored in the notebook. Any token that was previously
# committed in this cell should be revoked. When HF_TOKEN is unset, None is
# passed and the library's default authentication behaviour applies.
dataset = load_dataset(
    "edinburghcstr/ami", "ihm",
    cache_dir='/work/van-speech-nlp/temp',
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Define the min and max input lengths in seconds
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0

# Add an 'input_length' column: the difference between end_time and begin_time
dataset = dataset.map(
    lambda x: {'input_length': x['end_time'] - x['begin_time']}
)

# Keep only samples whose length lies strictly between the two bounds
# (samples of exactly the min or max length are dropped)
dataset = dataset.filter(
    lambda x: min_input_length_in_sec < x['input_length'] < max_input_length_in_sec
)

# Header for the per-split counts printed by the next cell.
# It is emitted only when INFO-level logging is enabled.
logging.info(
    "After filtering audio within a certain length, the number of data in each dataset is:"
)



In [None]:
# Assuming the dataset is split into train/validation sets, log each count
# NOTE(review): the "original" counts below are read from `dataset` after it has
# been filtered, so the ratio always prints as 100%. Record the split sizes
# before filtering if the real retention rate is wanted.
def _percent_of(count, total):
    """Return count as an integer percentage of total, or 0 when total is 0."""
    # Guard against an empty split, which would otherwise raise ZeroDivisionError.
    return count * 100 // total if total else 0


if 'train' in dataset:
    original_data_count_train = len(dataset['train'])
    train_count = len(dataset['train'])
    print(f'Train:       {train_count}/{original_data_count_train} ({_percent_of(train_count, original_data_count_train)}%)')
else:
    print('Train:       0/0 (0%)')

if 'validation' in dataset:
    original_data_count_validation = len(dataset['validation'])
    validation_count = len(dataset['validation'])
    print(f'Validation:  {validation_count}/{original_data_count_validation} ({_percent_of(validation_count, original_data_count_validation)}%)')
else:
    print('Validation:  0/0 (0%)')

In [None]:
# Inspect the train split; as the cell's last expression, its repr is displayed.
dataset['train']

In [None]:
import os
import json
from tqdm import tqdm

# File-name prefix for the generated manifests: "<identifier>_<split>.jsonl".
identifier = "ami"

def create_jsonl_file(dataset, output_dir, splits=('train', 'validation', 'test'), prefix=None):
    """Write one JSONL manifest per dataset split into ``output_dir``.

    Each line of ``<prefix>_<split>.jsonl`` is a JSON object with the keys
    ``key`` (the sample's ``audio_id``), ``source`` (``sample['audio']['path']``)
    and ``target`` (the sample's ``text``, lower-cased).

    Args:
        dataset: Mapping from split name to an iterable of samples. Each sample
            must provide ``audio_id``, ``audio['path']`` and ``text``.
        output_dir: Directory that receives the manifests; created if missing.
        splits: Split names to export, in order. Names that are not present in
            ``dataset`` are skipped instead of raising ``KeyError``.
        prefix: File-name prefix. Defaults to the module-level ``identifier``.

    Returns:
        None. Progress and the generated paths are printed.
    """
    if prefix is None:
        prefix = identifier

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for split in splits:
        if split not in dataset:
            print(f"Skipping {split} split: not present in dataset")
            continue

        jsonl_path = os.path.join(output_dir, f"{prefix}_{split}.jsonl")

        # Mode 'w' truncates an existing manifest, so no explicit removal is needed.
        with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
            # NOTE(review): reading sample['audio'] may decode the waveform for
            # every row if the audio column decodes on access — confirm; only
            # the path is used here.
            for sample in tqdm(dataset[split], desc=f"Processing {split} split"):
                record = {
                    "key": sample['audio_id'],
                    "source": sample['audio']['path'],
                    "target": sample['text'].lower(),
                }
                jsonl_file.write(json.dumps(record) + "\n")

        print(f"Generated {jsonl_path}")

# Destination directory (absolute path) for the generated JSONL manifests.
output_directory = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami"

# Write one manifest per split of the filtered dataset.
create_jsonl_file(dataset, output_directory)