# 民法RAG

### 获取环境变量

In [None]:
import os

# Read the DeepSeek API key from the API_KEY environment variable
# (the result is None when the variable is not set).
api_key = os.environ.get('API_KEY')

### 准备数据

In [None]:
from glob import glob

# Accumulates the text chunks of every Markdown file found below.
text_lines = []

# NOTE: recursive=True only has an effect when the pattern contains "**";
# with "*.md" only the current working directory is scanned.
for file_path in glob("*.md", recursive=True):
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # can fail on, or garble, the Chinese source text on non-UTF-8 locales.
    with open(file_path, "r", encoding="utf-8") as file:
        file_text = file.read()

    # Split on the literal "# " (Markdown heading marker) so that each
    # piece becomes one chunk to embed.
    text_lines.extend(file_text.split("# "))

In [None]:
# Display how many text chunks were loaded from the Markdown files.
len(text_lines)

In [None]:
from openai import OpenAI

# DeepSeek is reached through the OpenAI client class; only the endpoint
# and the key differ from a default OpenAI client.
deepseek_client = OpenAI(
    base_url="https://api.deepseek.com/v1",  # base URL of the DeepSeek API
    api_key=api_key,
)

定义 embedding 模型：使用 milvus_model 提供的 DefaultEmbeddingFunction 来生成文本嵌入。

In [None]:
from pymilvus import model as milvus_model

# Embedding function used to turn both the documents and the query into vectors.
embedding_model = milvus_model.DefaultEmbeddingFunction()

In [None]:
# Smoke-test the embedding model. The embedding function operates on a list
# of texts, so wrap the sentence in a list and take the first result.
test_embedding = embedding_model.encode_queries(["This is a test sentence."])[0]
# The vector length is reused as the dimension of the Milvus collection.
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])  # first 10 components of the test vector

### 将数据加载到Milvus


In [None]:
# Create the Milvus client and (re)create the collection.
from pymilvus import MilvusClient

# A local file URI stores the Civil Code vector database in a single file.
milvus_client = MilvusClient(uri='./milvus_mfd.db')
collection_name = "law_collection"


# Drop any existing collection with this name so the notebook can be re-run
# from scratch.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

# Create the collection; the vector dimension must match the embedding model.
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # inner-product similarity
    consistency_level='Strong',  # consistency level
)

### 插入数据

In [None]:
from tqdm import tqdm

# Embed every chunk with a single batch call.
doc_embeddings = embedding_model.encode_documents(text_lines)

# One row per chunk: its position as id, its embedding and the raw text.
data = [
    {"id": idx, "vector": doc_embeddings[idx], "text": chunk}
    for idx, chunk in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

milvus_client.insert(collection_name=collection_name, data=data)

### 构建RAG

In [None]:
import json

# The question to answer.
question = '债务人不履行到期债务怎么办？'

# Retrieve the 3 chunks most similar to the question.
search_res = milvus_client.search(
    collection_name=collection_name,
    data=embedding_model.encode_queries(
        [question]
    ),  # convert the question into an embedding vector
    limit=3,  # return the top 3 results
    search_params={"metric_type": "IP", "params": {}},  # inner-product metric
    output_fields=["text"],  # return the "text" field
)

# Pair each retrieved chunk with its similarity score.
retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
# ensure_ascii=False keeps the Chinese text readable instead of printing
# \uXXXX escape sequences.
print(json.dumps(retrieved_lines_with_distances, indent=4, ensure_ascii=False))


### 使用LLM获取RAG响应


In [None]:
# Concatenate the retrieved chunks (dropping their scores), one per line,
# to form the context handed to the LLM.
context = "\n".join(text for text, _distance in retrieved_lines_with_distances)

# Show the assembled context.
print(context)

In [None]:
# Prompt templates: a fixed system role plus a user message that embeds the
# retrieved context and the question inside XML-like tags.
SYSTEM_PROMPT = """你是一个法律助手。你能够从提供的上下文段落片段中找到问题的答案。"""

USER_PROMPT = f"""
请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。
<context>
{context}
</context>
<question>
{question}
</question>
"""

# Conversation sent to the model: system instruction first, then the user turn.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

# Ask the deepseek-chat model to generate an answer from the prompts.
response = deepseek_client.chat.completions.create(
    model="deepseek-chat",
    messages=chat_messages,
)
print(response.choices[0].message.content)