In [1]:
from pymilvus import MilvusClient

# Address of the Milvus server this notebook talks to.
MILVUS_URI = "http://localhost:19530"

# Client handle reused by the cells below.
client = MilvusClient(uri=MILVUS_URI)

In [2]:
# Start from a clean slate: remove "demo_collection" if an earlier run left it
# behind, then create it again for 768-dimensional vectors.
demo_exists = client.has_collection(collection_name="demo_collection")
if demo_exists:
    client.drop_collection(collection_name="demo_collection")

client.create_collection(collection_name="demo_collection", dimension=768)

In [3]:
from pymilvus import model


# Embedding function shipped in pymilvus' `model` module; the recorded output
# below shows it produces 768-dimensional vectors.
embedding_fn = model.DefaultEmbeddingFunction()

# Sample documents to index.
texts = [
    "Milvus is an open-source vector database for scalable similarity search.",
    "Vector search enables semantic search in unstructured data like text or images.",
    "You can deploy Milvus using Docker or Kubernetes for flexibility.",
    "Embedding models like BERT convert text into vector representations.",
    "Hybrid search in Milvus combines keyword-based and vector-based retrieval.",
    "Use FAISS, HNSW, or IVF indexes in Milvus for efficient similarity search.",
    "Milvus supports filtering results with scalar fields like tags or categories.",
    "Text embeddings capture semantic meaning beyond exact keyword matching.",
    "You can use sentence-transformers to generate embeddings for your documents.",
    "Milvus integrates well with FastAPI, LangChain, and other modern tools.",
    "Reranking strategies help improve search result quality after retrieval.",
    "Milvus is commonly used in AI applications like chatbots and recommendation systems.",
]

# One embedding per document, in the same order as `texts`.
vectors = embedding_fn.encode_documents(texts)
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

# One entity per document: the list position doubles as the primary key.
data = [
    {"id": idx, "vector": vector, "text": texts[idx], "subject": "history"}
    for idx, vector in enumerate(vectors)
]

# Quick sanity check on the payload before inserting it.
first_entity = data[0]
print("Data has", len(data), "entities, each with fields: ", first_entity.keys())
print("Vector dim:", len(first_entity["vector"]))

Dim: 768 (768,)
Data has 12 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [4]:
# Write the prepared entities into the demo collection and show the
# server's response.
res = client.insert(
    collection_name="demo_collection",
    data=data,
)
print(res)

{'insert_count': 12, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}


In [5]:
# Vector search: embed the question with the same embedding function used for
# the documents, then ask Milvus for the closest entities.
question = "Who is Alan Turing?"
query_vectors = embedding_fn.encode_queries([question])

res = client.search(
    collection_name="demo_collection",  # collection to search in
    data=query_vectors,                 # one query vector per question
    limit=2,                            # how many entities to return
    output_fields=["text", "subject"],  # fields to return with each hit
)
print(res)


data: [[{'id': 11, 'distance': 0.10358095169067383, 'entity': {'text': 'Milvus is commonly used in AI applications like chatbots and recommendation systems.', 'subject': 'history'}}, {'id': 4, 'distance': 0.09719092398881912, 'entity': {'text': 'Hybrid search in Milvus combines keyword-based and vector-based retrieval.', 'subject': 'history'}}]]


In [1]:
# Importing PDF
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A single string with the extracted text of all pages. Pages are joined
        without a separator; a page for which ``extract_text()`` yields
        ``None`` or an empty string contributes nothing.
    """
    reader = PdfReader(pdf_path)
    # `or ""` keeps a page without extractable text from raising TypeError;
    # join avoids rebuilding the accumulated string on every page.
    return "".join(page.extract_text() or "" for page in reader.pages)

# PDF to index, resolved relative to the notebook's working directory.
PDF_PATH = "jess105.pdf"
pdf_text = extract_text_from_pdf(PDF_PATH)


In [2]:
def split_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in order of appearance; empty if ``text`` is
        empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would give a zero/negative stride (the loop would
    # never finish); a negative overlap would silently skip characters.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]

# Chunk the extracted PDF text with the default size/overlap and preview it.
chunks = split_text(pdf_text)

chunk_count = len(chunks)
first_chunk = chunks[0]
print("Number of chunks:", chunk_count)
print("First chunk:", first_chunk)

Number of chunks: 70
First chunk: We use dif ferent things in our daily life made
from metal. Can you list a number of items
used in your house made of metals. Where do
these metals come from?
You have studied that the earth’s crust is
made up of different minerals embedded in the
rocks. V arious metals ar e extracted fr om these
minerals after proper refinement.
Minerals are an indispensable part of our
lives. Almost everything we use, from a tiny pin
to a towering building or a big ship, all are
made from minerals. The railway


In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-transformers checkpoint used for both the chunks and the query.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# One embedding per chunk, in chunk order.
embeddings = model.encode(chunks, show_progress_bar=True)


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection, utility

# Register the connection under the alias "default", which the ORM objects
# below use implicitly.
connections.connect("default", host="localhost", port="19530")

# Make the cell re-runnable: without this, "pdf_chunks" survives between runs
# and every execution of the insert cell appends the same chunks again.
# NOTE: this discards the existing collection; re-run the insert cell afterwards.
if utility.has_collection("pdf_chunks"):
    utility.drop_collection("pdf_chunks")

fields = [
    # Primary key generated by Milvus (auto_id), so inserts do not supply it.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # Must match the embedding model's output size (384 here —
    # presumably the dimension of all-MiniLM-L6-v2; confirm if the model changes).
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    # Raw chunk text; chunks are at most 500 characters with split_text's
    # defaults, so 1000 leaves headroom.
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000)
]

schema = CollectionSchema(fields, description="PDF Chunk Embeddings")
collection = Collection("pdf_chunks", schema)

# IVF_FLAT index with L2 distance on the embedding field. The search cell must
# use the same metric type.
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
collection.create_index(field_name="embedding", index_params=index_params)

# Load the collection so it can be searched.
collection.load()


In [5]:
import pandas as pd

# Column-oriented payload: one list per field, in schema order and without the
# auto-generated "id" field (embedding first, then text).
embedding_column = embeddings.tolist()
data_to_insert = [embedding_column, chunks]

collection.insert(data_to_insert)
# Flush so the inserted rows are persisted before searching.
collection.flush()


In [21]:
from sentence_transformers import SentenceTransformer

# Same checkpoint that produced the chunk embeddings, so query and chunk
# vectors live in the same space.
model = SentenceTransformer("all-MiniLM-L6-v2")

query = "What are ill effects of mining?"

# Encode as a batch of one and take its single row.
query_batch = model.encode([query])
query_vector = query_batch[0]  # 1D vector


In [27]:
# Search-time parameters. The metric must match the one the index was built
# with (L2); nprobe controls how many IVF clusters are probed per query.
# (nlist is an index-build parameter and does not belong here.)
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

# Ensure the collection is loaded before searching.
collection.load()

# One query vector in, so `results` holds a single list of hits (results[0]).
results = collection.search(
    data=[query_vector],
    anns_field="embedding",   # vector field to search on
    param=search_params,
    limit=10,                 # number of hits to return
    output_fields=["text"],   # return the chunk text with each hit
)




In [23]:
# Print every hit for the (single) query: distance first, then the chunk text.
separator = "-" * 50
first_query_hits = results[0]

for match in first_query_hits:
    print(f"Score (distance): {match.distance:.4f}")
    print(f"Matched Text:\n{match.entity.get('text')}")

    print(separator)


Score (distance): 0.5526
Matched Text:
s.
Hazards of Mining
Have you ever wondered about the efforts the miners make in making life comfortable
for you? What are the impacts of mining on
the health of the miners and the environment?
The dust and noxious fumes inhaled by
miners make them vulnerable to pulmonary
diseases. The risk of collapsing mine roofs,
inundation and fires in coalmines are a
constant threat to miners.
The water sources in the region get
contaminated due to mining. Dumping of waste
and slurry leads to degradation of
--------------------------------------------------
Score (distance): 1.0106
Matched Text:
insecticides and paints.
Fig. 5.2:   Iron ore mine
Dig a little deeper: Superimpose the maps
showing distribution of iron ore, manganese,
coal and iron and steel industry . Do you see
any correlation. Why?
Non-Ferrous Minerals
India’s reserves and production of non-
ferrous minerals is not very satisfactory.
However , these minerals, which include
copper , bauxite, le

In [34]:
# "Rerank" the hits of the first query by ascending distance. This orders by
# the same score the search returned, so no new relevance signal is applied.
reranked_results = sorted(results[0], key=lambda hit: hit.distance)

divider = "-" * 50
for reranked_result in reranked_results:
    distance_line = f"Reranked Score (distance): {reranked_result.distance:.4f}"
    text_line = f"Reranked Matched Text:\n{reranked_result.entity.get('text')}"
    print(distance_line)
    print(text_line)

    print(divider)

Reranked Score (distance): 0.5526
Reranked Matched Text:
s.
Hazards of Mining
Have you ever wondered about the efforts the miners make in making life comfortable
for you? What are the impacts of mining on
the health of the miners and the environment?
The dust and noxious fumes inhaled by
miners make them vulnerable to pulmonary
diseases. The risk of collapsing mine roofs,
inundation and fires in coalmines are a
constant threat to miners.
The water sources in the region get
contaminated due to mining. Dumping of waste
and slurry leads to degradation of
--------------------------------------------------
Reranked Score (distance): 1.0106
Reranked Matched Text:
insecticides and paints.
Fig. 5.2:   Iron ore mine
Dig a little deeper: Superimpose the maps
showing distribution of iron ore, manganese,
coal and iron and steel industry . Do you see
any correlation. Why?
Non-Ferrous Minerals
India’s reserves and production of non-
ferrous minerals is not very satisfactory.
However , these minerals

In [35]:
# Sanity checks: how many chunks were indexed, what type a chunk is, how many
# queries the result set covers, and how many hits the first query returned.
first_query_hits = results[0]

print("chunks", len(chunks))
print(type(chunks[1]))
print(len(results))
print("results", len(first_query_hits))
# Full dump of the raw search result.
print(results)

chunks 70
<class 'str'>
1
results 10
data: [[{'id': 458017999143319542, 'distance': 0.5526074767112732, 'entity': {'text': 's.\nHazards of Mining\nHave you ever wondered about the efforts the miners make in making life comfortable\nfor you? What are the impacts of mining on\nthe health of the miners and the environment?\nThe dust and noxious fumes inhaled by\nminers make them vulnerable to pulmonary\ndiseases. The risk of collapsing mine roofs,\ninundation and fires in coalmines are a\nconstant threat to miners.\nThe water sources in the region get\ncontaminated due to mining. Dumping of waste\nand slurry leads to degradation of'}}, {'id': 458017999143319534, 'distance': 1.0105557441711426, 'entity': {'text': 'insecticides and paints.\nFig. 5.2:   Iron ore mine\nDig a little deeper: Superimpose the maps\nshowing distribution of iron ore, manganese,\ncoal and iron and steel industry . Do you see\nany correlation. Why?\nNon-Ferrous Minerals\nIndia’s reserves and production of non-\nferro