In [2]:
# 通用 rag 逻辑
import os
import glob
import logging
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
# base_retrieval.py
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
import pinecone  # 假设使用Pinecone作为云端数据库


# Logging setup: create this module's logger, then configure the root logger
# so that INFO-level (and more severe) records are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Document chunking
def split_documents(documents, chunk_size=1000, overlap=100):
    """
    Split documents into chunks and return the resulting list of chunks.

    Args:
        documents: Iterable whose items are raw text strings and/or
            ``Document`` objects. ``Document`` items are handed to the
            splitter as-is; every other item is wrapped with
            ``Document(page_content=item)``. A single bare string is treated
            as one document instead of being iterated character by character.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``.
        overlap: Forwarded to ``CharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        The value returned by ``CharacterTextSplitter.split_documents`` for
        the prepared ``Document`` list.
    """
    # Iterating a bare string would yield one "document" per character,
    # which is never what the caller wants.
    if isinstance(documents, str):
        documents = [documents]

    # Only wrap items that are not already Document objects; re-wrapping a
    # Document would pass a non-string as page_content.
    doc_objects = [
        doc if isinstance(doc, Document) else Document(page_content=doc)
        for doc in documents
    ]

    # NOTE(review): per the LangChain docs CharacterTextSplitter splits on a
    # single separator (default "\n\n"), so text without that separator may
    # come back in chunks larger than chunk_size -- confirm this splitter is
    # the right choice for the corpus.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    return text_splitter.split_documents(doc_objects)

In [4]:
# Build the vector index (Chroma)
def build_index(documents, model_name="shibing624/text2vec-base-chinese"):
    """
    Embed the documents and build a Chroma vector index from them.

    Args:
        documents: Iterable whose items are ``Document`` objects and/or
            arbitrary values. ``Document`` items (for example the output of
            ``split_documents``) are indexed as-is; any other item is
            converted with ``str()`` and wrapped in a ``Document``. A single
            bare string is treated as one document.
        model_name: Name passed to ``SentenceTransformerEmbeddings``.
            Defaults to the model that was previously hard-coded here.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    # Iterating a bare string would index one "document" per character.
    if isinstance(documents, str):
        documents = [documents]

    embeddings = SentenceTransformerEmbeddings(model_name=model_name)

    # Pass existing Document objects through untouched: calling str() on a
    # Document yields its "page_content='...' metadata=..." rendering, which
    # would be embedded instead of the real text and would drop the metadata.
    doc_objects = [
        doc if isinstance(doc, Document) else Document(page_content=str(doc))
        for doc in documents
    ]

    # Only a Chroma index is built here; no other backend is used.
    return Chroma.from_documents(doc_objects, embeddings)

In [5]:
# Retrieve relevant documents
def recall_documents(query, index, k=5):
    """
    Look up the ``k`` entries in ``index`` that are most similar to ``query``.

    Delegates to ``index.similarity_search(query, k=k)`` and returns its
    result unchanged.
    """
    top_matches = index.similarity_search(query, k=k)
    return top_matches