In [52]:
! python3 -m pip install --upgrade pymilvus langchain openai tiktoken



In [32]:
from os import environ

# Connection settings and credentials.
# Each value is read from the environment variable of the same name when it is
# set, so real secrets do not have to be pasted into the notebook. The literals
# below are only fallbacks/placeholders and match the original defaults.
# Endpoint example: "in01-17f69c292d4a50a.aws-us-west-2.vectordb.zillizcloud.com"
ZILLIZ_ENDPOINT = environ.get("ZILLIZ_ENDPOINT", "https://in01-4c18d4ae9d2398b.ali-cn-hangzhou.vectordb.zilliz.com.cn:19530")
ZILLIZ_USER = environ.get("ZILLIZ_USER", "db_admin")  # cluster user name
ZILLIZ_PASS = environ.get("ZILLIZ_PASS", "xxx")  # password of the user above
# OpenAI API key, example: "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
OPENAI_API_KEY = environ.get("OPENAI_API_KEY", "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

## Set up environment variables
# A key that was already exported is written back unchanged; the placeholder is
# only stored when no key was present, so a real key is never clobbered.
environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [53]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Zilliz
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter

# Load the listed page(s) with WebBaseLoader
page_urls = ["https://milvus.io/docs/overview.md"]
loader = WebBaseLoader(page_urls)
loaded_pages = loader.load()

# Split the loaded documents into chunks using a text splitter
# configured with chunk_size=2048 and no overlap between chunks
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=2048)
docs = text_splitter.split_documents(loaded_pages)

In [54]:
# Show how many chunks the splitter produced
chunk_count = len(docs)
print(chunk_count)

6


In [55]:
# Inspect the first chunk to sanity-check the loaded content
first_chunk = docs[0]
print(first_chunk)

page_content='Introduction Milvus documentationDocsTutorialsToolsBlogCommunityStars0Join SlackTry Managed Milvus FREESearchHomev2.2.x\u200bAbout MilvusWhat is MilvusMilvus AdoptersMilvus RoadmapMilvus LimitsReleasesEnhancement ProposalsBootcampGet StartedUser GuideAdministration GuideIntegrationsBenchmarksToolsReferenceExample ApplicationsFAQsAPI referenceIntroduction\nThis page aims to give you an overview of Milvus by answering several questions. After reading this page, you will learn what Milvus is and how it works, as well as the key concepts, why use Milvus, supported indexes and metrics, example applications, the architecture, and relevant tools.\nWhat is Milvus vector database?\nMilvus was created in 2019 with a singular goal: store, index, and manage massive embedding vectors generated by deep neural networks and other machine learning (ML) models.\nAs a database specifically designed to handle queries over input vectors, it is capable of indexing vectors on a trillion scale. 

In [56]:
# Embedding model used to turn each document into its vector representation
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Create a vector database that stores the document vectors.
# Here the database is created on Zilliz Cloud.
zilliz_connection = {
    "uri": ZILLIZ_ENDPOINT,
    "user": ZILLIZ_USER,
    "password": ZILLIZ_PASS,
    "secure": True,
}
vector_store = Zilliz.from_documents(
    docs,
    embedding=embeddings,
    connection_args=zilliz_connection,
)

In [59]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

# Build a question-answering-with-sources chain of type "map_reduce" on top of
# an OpenAI LLM (temperature=0), asking it to also return intermediate steps.
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True)

# Other example questions to try: "What is milvus?", "When was milvus created?"
query = "What is embedding?"

# Keep the search hits in their own variable: rebinding `docs` here would
# overwrite the chunk list that the earlier cells print and index, so
# re-running those cells afterwards would operate on the search results.
matched_docs = vector_store.similarity_search(query)

# Run the chain over the retrieved documents and print the question and answer
res = chain({"input_documents": matched_docs, "question": query}, return_only_outputs=True)
print("Question：" + query + "\n")
print("Answer：" + res['output_text'])

Question：What is embedding?

Answer： Embedding is a feature abstraction of unstructured data, such as emails, IoT sensor data, Instagram photos, protein structures, and much more, which is used to convert unstructured data to embedding vectors and measure similarities among vectors.
SOURCES: https://milvus.io/docs/overview.md
