In [12]:
from datasets import load_dataset, DatasetDict
import json
import os

# Subset names processed by this script; each one is interpolated into the
# parquet glob passed to load_dataset below.
medical_benchmarks = [
    "chatdoctor_healthcaremagic",
    "chatdoctor_icliniq",
    "medical_meadow_cord19",
    "medical_meadow_health_advice",
    "medical_meadow_medical_flashcards",
    "medical_meadow_mediqa",
    "medical_meadow_medqa",
    "medical_meadow_mmml",
    "medical_meadow_pubmed_causal",
    "medical_meadow_wikidoc",
    "medical_meadow_wikidoc_patient_information",
]

# Alpaca-format conversion
def convert_to_alpaca_format(dataset):
    """Return the records of *dataset* as a list of Alpaca-style dicts.

    Each element of ``dataset`` must support ``.get(key, default)``.
    Only the 'instruction', 'input' and 'output' keys are copied, in that
    order; a key that is absent from a record is filled with an empty
    string and any other keys are dropped.

    NOTE: the field names are assumed to match the source data; adjust
    them here if a dataset uses different column names.
    """
    return [
        {
            "instruction": record.get("instruction", ""),
            "input": record.get("input", ""),
            "output": record.get("output", ""),
        }
        for record in dataset
    ]

# Save a dataset to a JSON file
def save_dataset(dataset, save_path):
    """Write *dataset* to *save_path* as UTF-8 JSON with 4-space indentation.

    Missing parent directories of ``save_path`` are created first. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        dataset: Any JSON-serialisable object.
        save_path: Destination file path; may be a bare filename.

    Raises:
        OSError: If the directory or the file cannot be created.
        TypeError: If ``dataset`` is not JSON-serialisable.
    """
    parent_dir = os.path.dirname(save_path)
    # dirname() is "" for a bare filename, and makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

# Main script: split every benchmark subset and export it in Alpaca format
save_path = "./data/llama-factory"

# Fixed seed so that re-running the script reproduces the same split instead
# of drawing a new random test set each time.
split_seed = 42

# Splits written to disk. Only the test split is exported; add "train" and/or
# "valid" here to export those as well.
splits_to_save = ("test",)

# Iterate over the medical subsets
for benchmark in medical_benchmarks:
    try:
        # Load the complete subset from its parquet shards
        dataset = load_dataset(
            "Malikeh1375/medical-question-answering-datasets",
            data_files={"train": f"{benchmark}/train-*.parquet"},
            split="train"
        )

        # Split into 90% train and 10% held-out (test + validation)
        train_testvalid = dataset.train_test_split(test_size=0.1, seed=split_seed)

        # Split the held-out part into 50% test and 50% validation
        test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=split_seed)

        # Gather the three splits into one DatasetDict
        train_test_valid_dataset = DatasetDict({
            'train': train_testvalid['train'],
            'test': test_valid['test'],
            'valid': test_valid['train']
        })

        # Convert and write only the requested splits, so that splits which
        # are not exported are never materialised as Python lists.
        for split_name in splits_to_save:
            save_dataset(
                convert_to_alpaca_format(train_test_valid_dataset[split_name]),
                f"{save_path}/Medical/{benchmark}_{split_name}.json",
            )

        # Report exactly which splits were written
        print(f"Saved Medical benchmark {benchmark} {', '.join(splits_to_save)} split(s) to Alpaca format.")
    except Exception as e:
        # Best-effort batch job: report the failure and continue with the
        # remaining benchmarks.
        print(f"Failed to process Medical benchmark {benchmark}: {e}")


Saved Medical benchmark chatdoctor_healthcaremagic train, test, and valid sets to Alpaca format.
Saved Medical benchmark chatdoctor_icliniq train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_cord19 train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_health_advice train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_medical_flashcards train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_mediqa train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_medqa train, test, and valid sets to Alpaca format.


Using the latest cached version of the dataset since Malikeh1375/medical-question-answering-datasets couldn't be found on the Hugging Face Hub


Failed to process Medical benchmark medical_meadow_mmml: Couldn't find cache for Malikeh1375/medical-question-answering-datasets for config 'default-c7ef9f2d0f4ded5c'
Available configs in the cache: ['default-4f4b19e1fee98406', 'default-735cf2986ff21102', 'default-8073e0ffd7d4de50', 'default-8bc334fc6dbb8637', 'default-8ea3d03aa36961ec', 'default-a3d5c4e94d59ef60', 'default-a824bbe8c00394fc', 'default-b64e7dfa36979661', 'default-c9032c12c111c5d9', 'default-d6e51a8b35b1d2f1']
Saved Medical benchmark medical_meadow_pubmed_causal train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_wikidoc train, test, and valid sets to Alpaca format.
Saved Medical benchmark medical_meadow_wikidoc_patient_information train, test, and valid sets to Alpaca format.
