In [2]:
import json
import textwrap

from bibtexparser import parse_file
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.milvus import Milvus

In [4]:
# Parse the BibTeX bibliography that lists the reference material.
sources = parse_file("data/sources.bib")

# Collect the URL of every entry.  Entries that carry no `url` field are
# skipped: indexing fields_dict["url"] on them would raise KeyError and abort
# the whole cell, even though the remaining entries are perfectly usable.
urls = [
    entry.fields_dict["url"].value
    for entry in sources.entries
    if "url" in entry.fields_dict
]

# Display the collected URLs as the cell output.
urls

['https://www.pinecone.io/learn/vector-database',
 'https://codelabs.milvus.io/vector-database-101-what-is-a-vector-database/#0',
 'https://codelabs.milvus.io/vector-database-101-introduction-to-unstructured-data/#0',
 'https://learn.microsoft.com/en-us/semantic-kernel/memories/vector-db',
 'https://zilliz.com/what-is-milvus',
 'https://zilliz.com/comparison/milvus-vs-weaviate',
 'https://milvus.io/docs/product_faq.md',
 'https://milvus.io/docs/operational_faq.md',
 'https://milvus.io/docs/benchmark.md']

In [70]:
# Splitter: tries the separators in order (blank line, newline, sentence end,
# single space) with a chunk size of 200 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " "],
)

# Download every referenced page.
loader = WebBaseLoader(web_path=urls)
docs = loader.load()

# Nest each page's metadata dict under a single 'metadata' key; the search
# cell reads it back through doc.metadata['metadata'].
for page in docs:
    page.metadata = dict(metadata=page.metadata)

# Replace the full pages with their chunks.
docs = text_splitter.split_documents(docs)

In [71]:
# Embedding model with the library's default settings.
embeddings = HuggingFaceEmbeddings()

# Embed all chunks and store them in the "iCitation" Milvus collection.
# NOTE(review): drop_old=True presumably discards an existing collection of
# the same name, so re-running this cell rebuilds the index — confirm.
vector_store = Milvus.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="iCitation",
    drop_old=True,
)

Batches: 100%|██████████| 15/15 [01:02<00:00,  4.15s/it]


In [73]:
def _source_url(metadata_field):
    """Return the 'source' entry of a stored metadata field.

    The field is accepted as UTF-8 encoded JSON bytes (the form the original
    code assumed), as a JSON string, or as an already-parsed mapping, so the
    lookup does not fail with AttributeError when the value is not bytes.

    Raises:
        KeyError: if the decoded metadata has no 'source' key.
        json.JSONDecodeError: if a bytes/str value is not valid JSON.
    """
    if isinstance(metadata_field, (bytes, bytearray)):
        metadata_field = metadata_field.decode('utf-8')
    if isinstance(metadata_field, str):
        metadata_field = json.loads(metadata_field)
    return metadata_field['source']


# Sentence for which supporting sources should be found.
sentence = "Milvus dynamically allocates worker nodes to perform each type of action. This enables simpler scalability and resource management while ensuring performance."

# Retrieve the 5 best-matching chunks together with their scores.
output = vector_store.similarity_search_with_score(sentence, k=5)

print("Related sources with scores (smaller means more similar):")

# Group the hits by source URL: {url: [(score, wrapped_text), ...]}.
# NOTE: this rebinds `sources`, which held the parsed BibTeX library after the
# bibliography cell; re-run that cell before using the library again.
sources = {}

for doc, score in output:
    source = _source_url(doc.metadata['metadata'])
    # Wrap long passages to 60 columns so they stay readable in the output.
    wrapped_text = '\n'.join(textwrap.wrap(doc.page_content, width=60))
    sources.setdefault(source, []).append((score, wrapped_text))

# One section per source: the URL, then each passage prefixed by its score.
for source, passages in sources.items():
    print(source)
    for score, wrapped_text in passages:
        print(f"{score:.2f} " + wrapped_text)
        print("")
    print("")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.59it/s]

Related sources with scores (smaller means more similar):
https://zilliz.com/comparison/milvus-vs-weaviate
0.27 Regarding scalability, Milvus uses worker nodes for each
type of action  (components to handle connections, data
nodes to handle ingestion, index nodes to index, and query
nodes to search). Each nod

0.61 e has its own assigned CPU and memory resources. Milvus can
dynamically allocate new nodes to an action group, speeding
up operations or reducing the number of nodes, thus freeing
resources for othe


https://milvus.io/docs/benchmark.md
0.58 We have recently run a benchmark against Milvus 2.2.3 and
have the following key findings:

0.61 Milvus instances (standalone or cluster) are deployed via
Helm on a Kubernetes cluster based on physical or virtual
machines.


https://zilliz.com/what-is-milvus
0.61 n various use cases. With extensive isolation of individual
system components, Milvus is highly resilient and
reliable.Highly ScalableMilvus's distributed and high-
throughput n


