In [1]:
# Importing PDF
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

pdf_text = extract_text_from_pdf("jess105.pdf")


In [2]:
def split_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

chunks = split_text(pdf_text)
print("Number of chunks:", len(chunks))
print("First chunk:", chunks[0])

Number of chunks: 70
First chunk: We use dif ferent things in our daily life made
from metal. Can you list a number of items
used in your house made of metals. Where do
these metals come from?
You have studied that the earth’s crust is
made up of different minerals embedded in the
rocks. V arious metals ar e extracted fr om these
minerals after proper refinement.
Minerals are an indispensable part of our
lives. Almost everything we use, from a tiny pin
to a towering building or a big ship, all are
made from minerals. The railway


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, show_progress_bar=True)


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection

connections.connect("default", host="localhost", port="19530")

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000)
]

schema = CollectionSchema(fields, description="PDF Chunk Embeddings")
collection = Collection("pdf_chunks", schema)


index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
collection.create_index(field_name="embedding", index_params=index_params)


collection.load()


In [5]:
import pandas as pd

data_to_insert = [embeddings.tolist(), chunks]
collection.insert([data_to_insert[0], data_to_insert[1]])
collection.flush()


In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

query = "Which states have wind farm in India?"
query_vector = model.encode([query])[0]  # 1D vector


In [7]:
search_params = {
    "metric_type": "L2",       
    "params": {"nprobe": 10}   
}

collection.load()

results = collection.search(
    data=[query_vector],
    anns_field="embedding",       
    param=search_params,
    limit=10,                     
    output_fields=["text"]      
)

In [8]:
for hit in results[0]:
    print(f"Score (distance): {hit.distance:.4f}")
    print(f"Matched Text:\n{hit.entity.get('text')}")
    
    print("-" * 50)


Score (distance): 0.5477
Matched Text:
 on firewood and dung cakes,
which in turn will contribute to environmental
conservation and adequate supply of manure
in agriculture.Fig. 5.10:
  Solar operated electronic milk
testing equipment
Fig. 5.11:   Wind mills – NagarcoilWind power
India has great potential of wind power . The
largest wind far m cluster is located in T amil
Nadu from Nagarcoil to Madurai. Apart from
these, Andhra Pradesh, Kar nataka, Gujarat,
Kerala, Maharashtra and Lakshadweep have
important wind farms. Nagarcoil and

--------------------------------------------------
Score (distance): 0.5477
Matched Text:
 on firewood and dung cakes,
which in turn will contribute to environmental
conservation and adequate supply of manure
in agriculture.Fig. 5.10:
  Solar operated electronic milk
testing equipment
Fig. 5.11:   Wind mills – NagarcoilWind power
India has great potential of wind power . The
largest wind far m cluster is located in T amil
Nadu from Nagarcoil to Madurai. Ap

In [9]:
reranked_results = sorted(results[0], key=lambda x: x.distance) 
for reranked_result in reranked_results:
    print(f"Reranked Score (distance): {reranked_result.distance:.4f}")
    print(f"Reranked Matched Text:\n{reranked_result.entity.get('text')}")
    
    print("-" * 50) 

Reranked Score (distance): 0.5477
Reranked Matched Text:
 on firewood and dung cakes,
which in turn will contribute to environmental
conservation and adequate supply of manure
in agriculture.Fig. 5.10:
  Solar operated electronic milk
testing equipment
Fig. 5.11:   Wind mills – NagarcoilWind power
India has great potential of wind power . The
largest wind far m cluster is located in T amil
Nadu from Nagarcoil to Madurai. Apart from
these, Andhra Pradesh, Kar nataka, Gujarat,
Kerala, Maharashtra and Lakshadweep have
important wind farms. Nagarcoil and

--------------------------------------------------
Reranked Score (distance): 0.5477
Reranked Matched Text:
 on firewood and dung cakes,
which in turn will contribute to environmental
conservation and adequate supply of manure
in agriculture.Fig. 5.10:
  Solar operated electronic milk
testing equipment
Fig. 5.11:   Wind mills – NagarcoilWind power
India has great potential of wind power . The
largest wind far m cluster is located in T ami

In [10]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_api_key


In [11]:
top_n = 10
top_chunks = [res.entity.get("text") for res in reranked_results[:top_n]]
context = "\n\n".join(top_chunks)

In [12]:
question = "Which states have wind farm in India?"

In [13]:
template = PromptTemplate.from_template(
    """You are a helpful assistant.

    Use the following context to answer the query. Only return the specific answer, be brief, and do not repeat the context.

    Context:
    {context}

    Question: {question}
    Answer:"""
)

In [14]:
llm = ChatOpenAI(model="gpt-3.5-turbo")
parser = StrOutputParser()
chain = template | llm | parser
answer = chain.invoke({"context": context, "question": question})

In [15]:
question = "Which states have wind farm in India?"

In [16]:
print(" Final Answer:\n", answer)

 Final Answer:
 Tamil Nadu, Andhra Pradesh, Karnataka, Gujarat, Kerala, Maharashtra and Lakshadweep.
