In [1]:
import datasets
from datasets import load_dataset, Audio
import logging
import os

# Enable INFO-level logging so the datasets library reports what it is doing
logging.basicConfig(level=logging.INFO)

# Read the Hugging Face access token from the environment instead of
# hardcoding it in the notebook. If HF_TOKEN is unset this is None, i.e. no
# explicit token is passed to load_dataset.
# NOTE(review): the token that used to be written in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
# NOTE(review): newer `datasets` releases rename `use_auth_token` to `token`;
# confirm against the installed version before switching.
hf_token = os.environ.get("HF_TOKEN")

# Load the AMI dataset ("ihm" configuration)
dataset = load_dataset(
    "edinburghcstr/ami", "ihm",
    cache_dir='/work/van-speech-nlp/temp',
    use_auth_token=hf_token
)

# Print the number of data points in each split before filtering
print("Before filtering, the number of data in each dataset split is:")
for split, data in dataset.items():
    print(f"{split}: {len(data)}")

# Define the min and max input lengths in seconds (both bounds are exclusive)
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0

# Calculate input_length as the difference between end_time and begin_time.
# `input_columns` passes only the two timestamp columns to the function, so
# the lambda never touches the audio column.
dataset = dataset.map(
    lambda begin_time, end_time: {'input_length': end_time - begin_time},
    input_columns=['begin_time', 'end_time'],
)

# Filter audio samples based on the calculated input_length; again only the
# column that the predicate needs is handed to it.
dataset = dataset.filter(
    lambda input_length: min_input_length_in_sec < input_length < max_input_length_in_sec,
    input_columns=['input_length'],
)

# Print the number of data points in each split after filtering
print("\nAfter filtering audio within a certain length, the number of data in each dataset split is:")
for split, data in dataset.items():
    print(f"{split}: {len(data)}")




Before filtering, the number of data in each dataset split is:
train: 108502
validation: 13098
test: 12643

After filtering audio within a certain length, the number of data in each dataset split is:
train: 66698
validation: 8351
test: 7546


In [5]:
# Inspect the first example of the test split to check its fields
# (including the derived 'input_length' column).
dataset['test'][0]

{'meeting_id': 'EN2002a',
 'audio_id': 'AMI_EN2002a_H00_MEE073_0019663_0019800',
 'text': 'YEAH',
 'audio': {'path': '/work/van-speech-nlp/temp/downloads/extracted/af54b322915698bd55d07a3bc1f323ed8802dca8159367bd7188421a458688af/EN2002a/eval_ami_en2002a_h00_mee073_0019663_0019800.wav',
  'array': array([-9.15527344e-05, -1.52587891e-04, -1.52587891e-04, ...,
         -3.05175781e-05, -3.05175781e-05, -6.10351562e-05]),
  'sampling_rate': 16000},
 'begin_time': 196.6300048828125,
 'end_time': 198.0,
 'microphone_id': 'H00',
 'speaker_id': 'MEE073',
 'input_length': 1.3699951171875}

In [6]:
# Assuming the dataset is loaded as 'dataset'.
# NOTE: despite its name, this variable holds an utterance-level `audio_id`
# (not a `meeting_id`); it is compared against the 'audio_id' column below.
meeting_id_to_find = "AMI_TS3003c_H03_MTD012ME_0183799_0183930"

# Keep only the test rows whose audio_id matches. `input_columns` hands the
# predicate just the 'audio_id' value instead of the whole example.
filtered_data = dataset['test'].filter(
    lambda audio_id: audio_id == meeting_id_to_find,
    input_columns=['audio_id'],
)

# Check the result (expected: a Dataset with exactly one row)
print(filtered_data)


Filter:   0%|          | 0/7546 [00:00<?, ? examples/s]

Dataset({
    features: ['meeting_id', 'audio_id', 'text', 'audio', 'begin_time', 'end_time', 'microphone_id', 'speaker_id', 'input_length'],
    num_rows: 1
})


In [8]:
# Show the first (and, per the row count printed earlier, only) row matched
# by the audio_id lookup.
filtered_data[0]

{'meeting_id': 'TS3003c',
 'audio_id': 'AMI_TS3003c_H03_MTD012ME_0183799_0183930',
 'text': 'YEAH',
 'audio': {'path': '/work/van-speech-nlp/temp/downloads/extracted/d613367cf787ca9a867f57a209cfd7c43eb5619b9b34597d3c24e5fd65298ddb/TS3003c/eval_ami_ts3003c_h03_mtd012me_0183799_0183930.wav',
  'array': array([ 0.00000000e+00, -1.22070312e-04, -3.05175781e-05, ...,
          4.88281250e-04,  2.13623047e-04,  9.15527344e-05]),
  'sampling_rate': 16000},
 'begin_time': 1837.989990234375,
 'end_time': 1839.300048828125,
 'microphone_id': 'H03',
 'speaker_id': 'MTD012ME',
 'input_length': 1.31005859375}

In [None]:
import os
import json
from tqdm import tqdm

# Prefix for the generated JSONL file names, e.g. "ami_train.jsonl"
identifier = "ami"

def create_jsonl_file(dataset, output_dir, splits=("train", "validation", "test")):
    """Write one JSONL manifest per dataset split into ``output_dir``.

    Each split is written to ``<identifier>_<split>.jsonl``, where
    ``identifier`` is the module-level file-name prefix. Every line is a JSON
    object with the keys:

    - ``"key"``: the sample's ``audio_id``
    - ``"source"``: the sample's ``audio["path"]``
    - ``"target"``: the sample's ``text``, lower-cased

    A manifest that already exists under the same name is overwritten.

    Args:
        dataset: Mapping from split name to an iterable of samples. Each
            sample must provide ``audio_id``, ``audio["path"]`` and ``text``.
        output_dir: Directory to write into; created (with parents) if missing.
        splits: Names of the splits to export. Defaults to the three splits
            this function has always written.

    Raises:
        KeyError: If a requested split or a required sample field is missing
            (for dict-like inputs).
        OSError: If the directory or a manifest file cannot be created.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    for split in splits:
        jsonl_path = os.path.join(output_dir, f"{identifier}_{split}.jsonl")

        # Mode "w" truncates an existing file, so no explicit delete is needed.
        # The encoding is pinned so the output does not depend on the locale.
        with open(jsonl_path, "w", encoding="utf-8") as jsonl_file:
            for sample in tqdm(dataset[split], desc=f"Processing {split} split"):
                record = {
                    "key": sample["audio_id"],
                    "source": sample["audio"]["path"],
                    "target": sample["text"].lower(),
                }
                jsonl_file.write(json.dumps(record) + "\n")

        print(f"Generated {jsonl_path}")

# Destination folder for the generated AMI manifests
output_directory = "/work/van-speech-nlp/jindaznb/jslpnb/mllm_experiments/slam-llm/data/ami"

# Write the JSONL manifests for the length-filtered dataset
create_jsonl_file(dataset=dataset, output_dir=output_directory)