In [1]:
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain.document_transformers import (
    LongContextReorder,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma

# Sentence-embedding model used to index and query the example texts.
# Note: this embedding model is independent of the LLM used later on; it only
# affects the retrieval stage, i.e. the similarity computation.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Small toy corpus: a few Celtics-related sentences mixed with unrelated ones.
texts = [
    "Basquetball is a great sport.",
    "Fly me to the moon is one of my favourite songs.",
    "The Celtics are my favourite team.",
    "This is a document about the Boston Celtics",
    "I simply love going to the movies",
    "The Boston Celtics won the game by 20 points",
    "This is just a random text.",
    "Elden Ring is one of the best games in the last 15 years.",
    "L. Kornet is one of the best Celtics players.",
    "Larry Bird was an iconic NBA player.",
]

# Index the texts in Chroma, then expose the store as a retriever.
# `k` is the number of documents returned per query.
# NOTE(review): the original author observed that results appear to be cached
# inside the running kernel -- after lowering `k`, later results stayed limited
# until the kernel (VS Code) was restarted. Restart the kernel if results look stale.
vectorstore = Chroma.from_texts(texts, embedding=embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

query = "What can you tell me about the Celtics?"

# Fetch the documents for the query, ordered by relevance score.
docs = retriever.get_relevant_documents(query)
docs

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


[Document(page_content='This is a document about the Boston Celtics'),
 Document(page_content='The Celtics are my favourite team.'),
 Document(page_content='L. Kornet is one of the best Celtics players.'),
 Document(page_content='The Boston Celtics won the game by 20 points'),
 Document(page_content='Larry Bird was an iconic NBA player.'),
 Document(page_content='Elden Ring is one of the best games in the last 15 years.'),
 Document(page_content='Basquetball is a great sport.'),
 Document(page_content='I simply love going to the movies'),
 Document(page_content='Fly me to the moon is one of my favourite songs.'),
 Document(page_content='This is just a random text.')]

In [3]:
# Reorder the documents with LongContextReorder:
# less relevant documents are moved to the middle of the list, while the more
# relevant ones are placed at the beginning / end.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

# Display the result to confirm that the 4 Celtics-related documents now sit
# at the beginning and the end of the list.
reordered_docs

[Document(page_content='The Celtics are my favourite team.'),
 Document(page_content='The Boston Celtics won the game by 20 points'),
 Document(page_content='Elden Ring is one of the best games in the last 15 years.'),
 Document(page_content='I simply love going to the movies'),
 Document(page_content='This is just a random text.'),
 Document(page_content='Fly me to the moon is one of my favourite songs.'),
 Document(page_content='Basquetball is a great sport.'),
 Document(page_content='Larry Bird was an iconic NBA player.'),
 Document(page_content='L. Kornet is one of the best Celtics players.'),
 Document(page_content='This is a document about the Boston Celtics')]

In [4]:
# Prepare and run a custom "stuff"-style chain with the reordered docs as context.
import getpass
import os

# Do not hardcode (or blank out) the API key in the notebook: reuse the value
# already present in the environment and only ask for it when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Template used to render each individual document into a string.
document_prompt = PromptTemplate(
    input_variables=["page_content"], template="{page_content}"
)
# Name of the prompt variable that receives the combined documents.
document_variable_name = "context"
# Separator inserted between rendered documents ("\n\n" is the default used by
# StuffDocumentsChain).
document_separator = "\n\n"

stuff_prompt_override = """Given this text extracts:
-----
{context}
-----
Please answer the following question:
{query}"""
# Prompt that is finally sent to the LLM.
prompt = PromptTemplate(
    template=stuff_prompt_override, input_variables=["context", "query"]
)

# Completion-style LLM (not a chat model).
llm = OpenAI()

# This reproduces by hand what
#   StuffDocumentsChain(llm_chain=LLMChain(llm=llm, prompt=prompt),
#                       document_prompt=document_prompt,
#                       document_variable_name=document_variable_name)
# does: each document is first formatted into a string with `document_prompt`,
# the strings are joined with the document separator, and the combined string
# is inserted into the LLM prompt under `document_variable_name`.
# Passing the raw list of Document objects instead would put the list's repr
# (including the "Document(page_content=...)" wrappers) into the prompt.
context_text = document_separator.join(
    document_prompt.format(page_content=doc.page_content) for doc in reordered_docs
)

chain = prompt | llm
print(chain.invoke({document_variable_name: context_text, "query": query}))



The Celtics are a professional basketball team based in Boston, Massachusetts. They have won several championships and have some of the most iconic players in NBA history, including Larry Bird. They recently won a game by 20 points, and they have a player named L. Kornet who is considered one of the best Celtics players.


In [5]:
from langchain.embeddings import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

# Short sample sentences to embed.
sample_sentences = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
# These are sentence-level embeddings, not word embeddings: each whole sentence
# is mapped to one vector. The vectors therefore have the same length no matter
# how long the sentence is, so similarities can be computed directly.
embeddings = embeddings_model.embed_documents(sample_sentences)

In [14]:
# Show, for every embedded sentence, the container type, the vector length
# and the element type.
for vector in embeddings:
    print(type(vector), len(vector), type(vector[0]))

<class 'list'> 1536 <class 'float'>
<class 'list'> 1536 <class 'float'>
<class 'list'> 1536 <class 'float'>
<class 'list'> 1536 <class 'float'>
<class 'list'> 1536 <class 'float'>


In [6]:
# Embed a single query string and preview the first five components.
question = "What was the name mentioned in the conversation?"
embedded_query = embeddings_model.embed_query(question)
embedded_query[:5]

[0.0053546813655943075,
 -0.0005715346531097275,
 0.038875909934336914,
 -0.0029596003572924623,
 -0.008966285328704282]

In [16]:
# Check which Python type `embed_query` returned for the query embedding.
type(embedded_query)

list

In [19]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial.distance` has been loaded. This still binds `scipy`.
import scipy.spatial.distance

# The query vector is the same for every comparison, so build it once.
que = np.array(embedded_query)

# Cosine similarity = 1 - cosine distance, for each document embedding
# against the query embedding.
for e in embeddings:
    doc = np.array(e)
    print('similarity:', 1 - scipy.spatial.distance.cosine(doc, que))

similarity: 0.7707570228601861
similarity: 0.7853193706420822
similarity: 0.8355098705084145
similarity: 0.7740705554272965
similarity: 0.7561473238913311


In [8]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.storage import LocalFileStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import CacheBackedEmbeddings

# Embedding model whose results will be cached.
underlying_embeddings = OpenAIEmbeddings()

# Local, file-based store that holds the embedding cache.
store = LocalFileStore("./cache/")

# Embedder that wraps the underlying model with the cache store; the model
# name is used as the cache namespace.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)

# Load the text file. Per the original author's note, each line of the source
# file holds one or two sentences.
loader = TextLoader("state_of_the_union.txt")
raw_documents = loader.load()

# Split into chunks, combining pieces while targeting at most 1000 characters
# per chunk, with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

In [21]:
%%time
# Embed the documents and build the FAISS index. Because the cache-backed
# embedder is used, the embeddings are also saved to its file store.
db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 296 ms, sys: 7.64 ms, total: 304 ms
Wall time: 1.51 s


In [22]:
# Peek at the first five keys currently held in the cache store.
list(store.yield_keys())[:5]

['text-embedding-ada-002704c76af-3696-5383-9858-6585616669ef',
 'text-embedding-ada-00281426526-23fe-58be-9e84-6c7c72c8ca9a',
 'text-embedding-ada-002abeef673-2b2a-5614-b612-d4ff3ef54c23',
 'text-embedding-ada-0023f7b9f1f-79ae-55e3-966a-d0ec952476ed',
 'text-embedding-ada-002a5ef11e4-0474-5725-8d80-81c91943b37f']

In [23]:
from langchain.embeddings import CacheBackedEmbeddings 
from langchain.storage import InMemoryByteStore
# A byte store created this way lives only in memory; nothing is written to disk.
store = InMemoryByteStore()

# As before, the store is wrapped into the embedder, which uses it as the
# cache for the underlying embedding model (namespaced by the model name).
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)