In [None]:
import os
import glob
import json

dataset_dir = "./dataset"
json_files = glob.glob(os.path.join(dataset_dir, "*.json"))
json_files.sort()  # process the files in a stable, name-sorted order

# Concatenate the records of every dataset file, in file order.
merged_data = []
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        items = json.load(f)
    merged_data.extend(items)

# Give every record a sequential, zero-padded string id ("0001", "0002", ...),
# overwriting any "id" the record already carried.
for current_id, item in enumerate(merged_data, start=1):
    item["id"] = f"{current_id:04d}"
current_id = len(merged_data) + 1  # next unused id

# Write the merged records to a single data.json file.
output_path = "./data.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=2)

output_path


'./data.json'

In [4]:
import os
import json
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# Load the merged dataset.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Load the sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Set of distinct category labels present in the dataset.
labels = set(item["label"] for item in data)
dim = 384  # model output dimension; every index is created with this size
index_dir = "faiss_index"
os.makedirs(index_dir, exist_ok=True)

# Per-label vector index and per-label metadata. Row i of
# label_to_index[label] corresponds to label_to_data[label][i].
label_to_index = {}
label_to_data = defaultdict(list)

# Group the records by label first, preserving their order in `data`, so each
# label's questions can be embedded with a single batched call.
for item in data:
    label_to_data[item["label"]].append(item)

# Build one flat L2 index per label. Encoding a whole group at once avoids one
# model.encode() call per record; vectors are added in the same order as the
# metadata list, which keeps index rows and metadata aligned.
for label in labels:
    questions = [item["question"] for item in label_to_data[label]]
    vectors = model.encode(questions, convert_to_numpy=True).astype("float32")

    label_to_index[label] = faiss.IndexFlatL2(dim)
    label_to_index[label].add(vectors)

# Persist every index together with its metadata.
for label in labels:
    index_path = os.path.join(index_dir, f"{label}_index.bin")
    faiss.write_index(label_to_index[label], index_path)

    meta_path = os.path.join(index_dir, f"{label}_meta.pkl")
    with open(meta_path, "wb") as f:
        pickle.dump(label_to_data[label], f)

print("✅ 所有类别的索引与元数据构建完成！")


  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  attn_output = torch.nn.functional.scaled_dot_product_attention(


✅ 所有类别的索引与元数据构建完成！


In [5]:
def search_by_label(model, query, label, top_k=5):
    """Return up to ``top_k`` stored records of one label closest to ``query``.

    The index and metadata for ``label`` are read from
    ``faiss_index/{label}_index.bin`` and ``faiss_index/{label}_meta.pkl``.
    Records whose ``"question"`` text was already returned are skipped.

    Args:
        model: Encoder whose ``encode([text], convert_to_numpy=True)`` returns
            an array that can be cast to float32 and searched in the index.
        query: Text to search for.
        label: Category label selecting which index/metadata pair to use.
        top_k: Maximum number of records to return.

    Returns:
        ``None`` when no index file exists for ``label``; otherwise a list of
        metadata records in the order the index returned them. The list is
        empty when ``top_k`` is not positive and may hold fewer than ``top_k``
        records, because only ``top_k * 2`` neighbours are fetched before
        duplicates are removed.
    """
    index_path = f"faiss_index/{label}_index.bin"
    meta_path = f"faiss_index/{label}_meta.pkl"

    if not os.path.exists(index_path):
        return None

    # A non-positive k is rejected by the index search; nothing was requested,
    # so answer with an empty result instead of crashing.
    if top_k <= 0:
        return []

    index = faiss.read_index(index_path)
    # NOTE: pickle.load can execute arbitrary code; only load metadata files
    # that were written by a trusted process.
    with open(meta_path, "rb") as f:
        metadata = pickle.load(f)

    query_vector = model.encode([query], convert_to_numpy=True).astype("float32")
    # Fetch extra neighbours so enough candidates remain after de-duplication.
    _, neighbor_ids = index.search(query_vector, top_k * 2)

    seen_questions = set()
    result = []

    for idx in neighbor_ids[0]:
        # Skip ids that do not map to a metadata record (negative or out of range).
        if idx < 0 or idx >= len(metadata):
            continue
        item = metadata[idx]
        if item["question"] in seen_questions:
            continue
        seen_questions.add(item["question"])
        result.append(item)
        if len(result) >= top_k:
            break

    return result


In [6]:
# Example query: the ten closest stored questions within one category.
search_by_label(
    model,
    query="Are vaccines safe?",
    label="medication-side-effect",
    top_k=10,
)


[{'question': "I've been taking warfarin for the past 3 months to prevent blood clots and I've been experiencing frequent nosebleeds. Is this a side effect of the medication?",
  'answer': 'Frequent nosebleeds can be a sign of excess anticoagulation. You should report this to your doctor immediately to adjust the dose.',
  'label': 'medication-side-effect',
  'id': '0055'},
 {'question': 'I am concerned about the potential interactions between my medications and a new supplement I started taking. I am taking azathioprine for my rheumatoid arthritis and was prescribed a probiotic that contains alfalfa.',
  'answer': 'Some supplements like alfalfa can affect immune modulation. Discuss any new supplement with your rheumatologist or pharmacist.',
  'label': 'medication-side-effect',
  'id': '0063'},
 {'question': "I've been taking paracetamol for 2 weeks, but my back pain has become worse. Could it be a side effect of the medication?",
  'answer': "While paracetamol generally doesn’t worse