In [2]:
import json
import os

data_dir = "/fs/archive/share/mm_datasets/t2t_data"

# Splits to load for each dataset; each (dataset, split) pair maps to the file
# "<dataset>/<split>_with_retrieved_docs.jsonl" under data_dir.
SUBSET_SPLIT = {
    "2wikimultihopqa": ['train', 'dev'],
    "bamboogle": ['test'],
    "hotpotqa": ['train', 'dev'],
    "musique": ['train', 'dev'],
}


def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per record.

    Whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped rather than passed to ``json.loads``, which raises
    ``json.JSONDecodeError`` on an empty string.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list of decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


# datasets[dataset_name][split] -> list of records decoded from that split's file.
datasets = {}
for dataset_name, splits in SUBSET_SPLIT.items():
    datasets[dataset_name] = {}

    for split in splits:
        dataset_path = f"{dataset_name}/{split}_with_retrieved_docs.jsonl"
        print(f"Attempting to read: {dataset_path}")

        data_from_file = _read_jsonl(os.path.join(data_dir, dataset_path))
        datasets[dataset_name][split] = data_from_file

        print(f"Successfully loaded {len(data_from_file)} items from {dataset_path}")

Attempting to read: 2wikimultihopqa/train_with_retrieved_docs.jsonl
Successfully loaded 15000 items from 2wikimultihopqa/train_with_retrieved_docs.jsonl
Attempting to read: 2wikimultihopqa/dev_with_retrieved_docs.jsonl
Successfully loaded 12576 items from 2wikimultihopqa/dev_with_retrieved_docs.jsonl
Attempting to read: bamboogle/test_with_retrieved_docs.jsonl
Successfully loaded 125 items from bamboogle/test_with_retrieved_docs.jsonl
Attempting to read: hotpotqa/train_with_retrieved_docs.jsonl
Successfully loaded 90447 items from hotpotqa/train_with_retrieved_docs.jsonl
Attempting to read: hotpotqa/dev_with_retrieved_docs.jsonl
Successfully loaded 7405 items from hotpotqa/dev_with_retrieved_docs.jsonl
Attempting to read: musique/train_with_retrieved_docs.jsonl
Successfully loaded 19938 items from musique/train_with_retrieved_docs.jsonl
Attempting to read: musique/dev_with_retrieved_docs.jsonl
Successfully loaded 2417 items from musique/dev_with_retrieved_docs.jsonl


In [9]:
# Index into item["retrieved_docs"] of the document used as the positive text.
POS_DOC_INDEX = 0
# Documents from this index onward are used as negatives.
# NOTE(review): with these values, documents 1-9 are used as neither positive
# nor negative — presumably a deliberate gap; confirm with the dataset owner.
NEG_DOC_START = 10


def _to_retrieval_item(item, pos_index=POS_DOC_INDEX, neg_start=NEG_DOC_START):
    """Convert one loaded record into the query / positive / negative layout.

    Args:
        item: A record with "question", "retrieved_docs" and "golden_answers"
            keys. "retrieved_docs" must be indexable and sliceable
            (presumably a list — verify against the data files).
        pos_index: Index of the retrieved document stored as "pos_text".
        neg_start: Start of the "retrieved_docs" slice stored as "neg_text".

    Returns:
        A new dict with the keys "qry", "qry_image_path", "pos_text",
        "pos_image_path", "neg_text", "neg_image_path" and "ans". The three
        image-path fields are always empty lists.

    Raises:
        KeyError: If one of the expected keys is missing from ``item``.
        IndexError: If "retrieved_docs" is a sequence with no element at
            ``pos_index``.
    """
    retrieved_docs = item["retrieved_docs"]
    return {
        "qry": item["question"],
        "qry_image_path": [],
        "pos_text": retrieved_docs[pos_index],
        "pos_image_path": [],
        "neg_text": retrieved_docs[neg_start:],
        "neg_image_path": [],
        "ans": item["golden_answers"],
    }


# processed_data[dataset_name][split] -> list of converted items, in input order.
processed_data = {}
for dataset_name, splits in SUBSET_SPLIT.items():
    for split in splits:
        new_split_data = [
            _to_retrieval_item(item) for item in datasets[dataset_name][split]
        ]
        # setdefault creates the per-dataset dict on the first split only.
        processed_data.setdefault(dataset_name, {})[split] = new_split_data
        print(f"Processed {len(new_split_data)} items for {dataset_name} - {split}")
    print("=" * 50)

Processed 15000 items for 2wikimultihopqa - train
Processed 12576 items for 2wikimultihopqa - dev
Processed 125 items for bamboogle - test
Processed 90447 items for hotpotqa - train
Processed 7405 items for hotpotqa - dev
Processed 19938 items for musique - train
Processed 2417 items for musique - dev
