# 检索本地文档

In [None]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain_community.embeddings import QianfanEmbeddingsEndpoint
from langchain_community.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector stores built later in this notebook.
# Alternative backend using QianfanEmbeddingsEndpoint:
# embeddings = QianfanEmbeddingsEndpoint(model="bge_large_zh", endpoint="bge_large_zh")
embeddings = OpenAIEmbeddings()

In [None]:
# Load the review CSV into `docs`.
# Variant that additionally sets source_column / metadata_columns:
# loader = CSVLoader(file_path='京东评论100.csv',encoding='utf-8',source_column='评论内容',metadata_columns=["型号"])
loader = CSVLoader(
    file_path='京东评论100.csv',
    encoding='utf-8',
)
docs = loader.load()
# Show the entry at index 5 as the cell output.
docs[5]

In [None]:
# Build an in-memory vector store from the loaded documents.
db = DocArrayInMemorySearch.from_documents(docs, embeddings)

# For the question below, look up the most similar reviews.
query = "哪个型号比较美观"
# query = "哪个型号比较护眼"
related_docs = db.similarity_search(query)
# Display the retrieved documents as the cell output.
related_docs

In [None]:
from langchain_community.chat_models import ChatOpenAI

# Chat model used to answer the question from the retrieved reviews.
llm = ChatOpenAI(temperature=0.0)
# Concatenate the text of every retrieved review, separated by blank lines,
# iterating the documents directly instead of indexing by position.
qdocs = "\n\n".join(doc.page_content for doc in related_docs)
# The prompt is the retrieved context followed by the question.
prompt = f"{qdocs} Question: {query}"
llm.invoke(prompt)

In [None]:
# Compare cosine similarities.
import numpy as np


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    The value lies between -1 and 1; a larger value means the two vectors
    are more similar.

    Args:
        vec1: First vector (any sequence NumPy can treat as an array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two norms, or ``0.0``
        when either vector has zero norm (the ratio would otherwise be
        0/0, i.e. ``nan`` with a RuntimeWarning).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Direction is undefined for a zero vector; report "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / denominator

query = "哪个型号比较美观"

# Two sample review texts to compare against the question.
txt1 = '''
型号: 2022时尚\n评论内容: 很快 外观大气漂亮
'''
txt2 = '''
型号: 2023豪华\n评论内容: 很不错，还没有开机，但是速度还可以  外观也不错
'''

# Embed the question and then each sample review, in that order.
embedding1, embedding2, embedding3 = (
    embeddings.embed_query(text) for text in (query, txt1, txt2)
)

# Score each review against the question. Despite the "distance" names,
# these hold cosine similarities (higher means closer).
distance12, distance13 = (
    cosine_similarity(embedding1, review_vector)
    for review_vector in (embedding2, embedding3)
)
print(f'distance12:{distance12}\ndistance13:{distance13}')

In [None]:
import langchain

# Keep LangChain's debug output switched off for this run.
langchain.debug = False
# Vector stores expose their retriever through as_retriever(); there is no
# retriever() method, so the previous call raised AttributeError.
retriever = db.as_retriever()
# "stuff" chain type: question answering over the retrieved documents.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
qa_stuff.run(query)

In [None]:
# Try a different chain type: "map_reduce".

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.llms.baidu_qianfan_endpoint import QianfanLLMEndpoint

llm = QianfanLLMEndpoint(temperature=0.1)
# llm = ChatOpenAI(temperature=0)

# Load the text file and split it into chunks of at most 3000 characters,
# with 100 characters of overlap between chunks.
loader = TextLoader(file_path='客服.txt', encoding='utf-8')
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=100,
)
split_documents = text_splitter.split_documents(docs)
# Show how many chunks were produced.
len(split_documents)

In [None]:
# Rebuild the in-memory vector store, this time from the split chunks.
db = DocArrayInMemorySearch.from_documents(split_documents, embeddings)

In [None]:
# Turn LangChain's debug output on for this run.
langchain.debug = True
llm = ChatOpenAI(temperature=0)
# llm = QianfanLLMEndpoint(temperature=0.1)
retriever = db.as_retriever()
query = "他们主要谈了什么内容"
# Same RetrievalQA setup as before, but with the "map_reduce" chain type.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="map_reduce",
    llm=llm,
)
qa_stuff.run(query)

In [260]:
# Use the "refine" chain type: start by loading the markdown document.
loader = TextLoader(
    file_path='使用手册.md',
    encoding='utf-8',
)
docs = loader.load()
# Display the loaded documents as the cell output.
docs

[Document(page_content='# 糖醋排骨做法\n- 1、准备用料，排骨300克；菠萝适量；洋葱适量；老姜少许；葱少许；蒜瓣少许；生粉少许；老抽少许；白醋少许；料酒少许；盐适量；白糖少许\n- 2、洋葱切块，姜切片、葱切断，蒜瓣两个 菠萝一般切小粒，一般切块 排骨提前用清水，泡去血水，取出，沥下水\n- 3、生粉，姜片，部分洋葱，生抽 拌匀，腌制排骨15-20分钟\n- 4、小锅子放适量油，腌制好的排骨入锅大火炸40秒左右\n- 5、关火，余温继续炸1分钟  取出沥油  锅中放适量炸排骨的油\n- 6、放洋葱、蒜瓣、姜片拌炒爆香  放炸好的排骨，拌炒\n- 7、放菠萝小粒  放生抽  放白醋  放少许老抽，白糖一点点  拌炒均匀  放入葱段，加适量清水，没过排骨的量\n- 8、盖上锅盖大火煮开，继续煮5分钟  转中火焖煮，中间加盐，调味，煮至熟烂\n- 9、最后转大火收治，关火，再放菠萝块拌炒出锅', metadata={'source': '使用手册.md'})]

In [261]:
# Split on newlines only, with a 40-character chunk size and no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n'],
    chunk_overlap=0,
    chunk_size=40,
)
split_documents = text_splitter.split_documents(docs)

In [270]:
# Index the newline-split chunks in a fresh in-memory vector store.
db = DocArrayInMemorySearch.from_documents(split_documents, embeddings)
langchain.debug = True
# Ask the retriever for 8 chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 8})
query = "先腌制排骨，还是先切洋葱。不要多说废话"
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    # chain_type = "map_reduce",
    chain_type="refine",
    verbose=True,
)
qa_stuff.run(query)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "先腌制排骨，还是先切洋葱。不要多说废话"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:RefineDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:RefineDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "context_str": "\n- 2、洋葱切块，姜切片、葱切断，蒜瓣两个 菠萝一般切小粒，一般切块 排骨提前用清水，泡去血水，取出，沥下水",
  "question": "先腌制排骨，还是先切洋葱。不要多说废话"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 3:chain:RefineDocumentsChain > 4:chain:LLMChain > 5:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: Context information is below.\n------------\n\n- 2、洋葱切块，姜切片、葱切断，蒜瓣两个 菠萝一般切小粒，一般切块 排骨提前用清水，泡去血水，取出，沥下水\n------------\nGiven the context information and not prior knowledge, answer any questions\nHuman: 先腌制排骨，还是先切洋葱。不要多说废话"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 3:chain:Ref

'根据提供的新信息，我们可以先切洋葱。'