In [2]:
!pip install langchain==0.2.7 langchain_community==0.2.7 langchain_openai==0.1.15 langchain-chroma==0.1.1 langchainhub==0.1.20

Collecting langchain==0.2.7
  Downloading langchain-0.2.7-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain_community==0.2.7
  Downloading langchain_community-0.2.7-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_openai==0.1.15
  Downloading langchain_openai-0.1.15-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-chroma==0.1.1
  Downloading langchain_chroma-0.1.1-py3-none-any.whl.metadata (1.3 kB)
Collecting langchainhub==0.1.20
  Downloading langchainhub-0.1.20-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb<0.6.0,>=0.4.0 (from langchain-chroma==0.1.1)
  Downloading chromadb-0.5.16-py3-none-any.whl.metadata (6.8 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub==0.1.20)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Collecting build>=1.0.3 (from chromadb<0.6.0,>=0.4.0->langchain-chroma==0.1.1)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chroma

# 基于langchain实现私有知识库

## 1. 依赖安装部署

## 2. 依赖包导入

In [8]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatZhipuAI
from zhipuai import ZhipuAI
import os
from dotenv import load_dotenv

## 3. 初始化大模型

In [9]:
ZHIPUAI_API_KEY = "11f7603f3b47bb8f76abd6c0b7eb1017.rqh2hPOKFdfZLjOE"
WEB_URL = "https://github.com/WangShaoyu1/myBlogPaperMod/blob/master/content/posts/%E6%8E%A8%E8%8D%90%E4%B8%80%E4%B8%AA%E5%9C%A8%E4%BD%BF%E7%94%A8%E7%9A%84%E5%86%85%E7%BD%91%E7%A9%BF%E9%80%8F%E5%B7%A5%E5%85%B7cpolar.md"
ORI_WEB_URL = "https://lilianweng.github.io/posts/2023-06-23-agent/"

In [10]:
chat = ChatZhipuAI(
    model="glm-4",
    temperature=0.8,
    zhipuai_api_key=ZHIPUAI_API_KEY,
)

## 4. 加载解析数据

In [11]:
loader = WebBaseLoader(
    web_paths=(WEB_URL,),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("Box-sc-g0xbh4-0")
        )
    ),
)

In [12]:
docs = loader.load()

## 5. 数据切割

In [28]:
# 可视化观察切割效果的页面：https://chunkviz.up.railway.app/
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
splits = text_splitter.split_documents(docs)

## 6. 知识向量化

In [45]:
class EmbeddingGenerator:
    def __init__(self, model_name):
        self.model_name = model_name
        self.client = ZhipuAI(api_key=ZHIPUAI_API_KEY)

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            response = self.client.embeddings.create(model=self.model_name, input=text)
            if hasattr(response, 'data') and response.data:
                embeddings.append(response.data[0].embedding)
            else:
                # 如果获取嵌入失败，返回一个零向量
                embeddings.append([0] * 1024)  # 假设嵌入向量维度为 1024
        return embeddings


    def embed_query(self, query):
        # 使用相同的处理逻辑，只是这次只为单个查询处理
        response = self.client.embeddings.create(model=self.model_name, input=query)
        if hasattr(response, 'data') and response.data:
            return response.data[0].embedding
        return [0] * 1024  # 如果获取嵌入失败，返回零向量


embedding_generator = EmbeddingGenerator(model_name="embedding-3")

In [43]:
# 文本列表
texts = [content for document in splits for split_type, content in document if split_type == 'page_content']


## 7. 知识入库

In [46]:
chroma_store = Chroma(
    collection_name="example_collection",
    embedding_function=embedding_generator,  # 使用定义的嵌入生成器实例
    create_collection_if_not_exists=True
)


In [47]:
# 添加文本到 Chroma VectorStore
IDs = chroma_store.add_texts(texts=texts)

ValueError: Expected IDs to be a non-empty list, got 0 IDs

## 8. 构建检索器

In [48]:
retriever = chroma_store.as_retriever()

## 9.构建调用链路

In [49]:
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [50]:
prompt.pretty_print()


You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: [33;1m[1;3m{question}[0m 
Context: [33;1m[1;3m{context}[0m 
Answer:


In [51]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [52]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)


## 10. 进行提问

In [54]:
"""
1. 查询处理：该命令接受查询“什么是任务分解？”并将其传递给 retriever 组件。检索器本质上是系统中的搜索功能，设置为在预先索引的数据集中查找信息 - 这里是根据博客内容创建的矢量存储。
2. 语义搜索：检索器使用向量存储中存储的文本片段的嵌入（向量表示）来执行语义搜索。它将查询的向量表示与存储的片段的向量进行比较，以识别在语义上与查询最相似的片段。
3. 检索相关文本片段：根据相似度分数，检索器从博客中选择并返回与查询最匹配的文本片段。这些片段包含被认为与回答任务分解问题最相关的信息。
"""

rag_res = rag_chain.invoke("cpolar的操作步骤有哪些?")
print(rag_res)

我不知道cpolar的操作步骤是什么，因为提供的信息中没有相关的内容。如果可以提供更多上下文或详细信息，我可能能够帮助回答这个问题。


## 11. 删除数据，从头测试

In [None]:
chroma_store.delete_collection()