In [1]:
# Importing PDF
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

pdf_text = extract_text_from_pdf("jeff103.pdf")


In [4]:
def split_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

chunks = split_text(pdf_text)
print("Number of chunks:", len(chunks))
print("First chunk:", chunks[0])

Number of chunks: 46
First chunk: BEFORE YOU R EAD
Since the earliest times, humans have dreamt of conquering the
skies. Here are two stories about flying.
I.A young seagull is afraid to fly. How does he conquer his fear?
II.A pilot is lost in storm clouds. Does he arrive safe? Who helps
him?
I
His First Flight
THE young seagull was alone on his ledge. His two
brothers and his sister had already flown away the
day before. He had been afraid to fly with them.
Somehow when he had taken a little run forward to
the brink of the ledg


In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, show_progress_bar=True)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection

connections.connect("default", host="localhost", port="19530")

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000)
]

schema = CollectionSchema(fields, description="PDF Chunk Embeddings")
collection = Collection("pdf_chunks", schema)


index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
collection.create_index(field_name="embedding", index_params=index_params)


collection.load()


In [7]:
import pandas as pd

data_to_insert = [embeddings.tolist(), chunks]
collection.insert([data_to_insert[0], data_to_insert[1]])
collection.flush()


In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

query = "In which school seagull was studing"
query_vector = model.encode([query])[0]  # 1D vector


In [9]:
search_params = {
    "metric_type": "L2",       
    "params": {"nprobe": 10}   
}

collection.load()

results = collection.search(
    data=[query_vector],
    anns_field="embedding",       
    param=search_params,    
    limit=10,                     
    output_fields=["text"]      
)

In [10]:
for hit in results[0]:
    print(f"Score (distance): {hit.distance:.4f}")
    print(f"Matched Text:\n{hit.entity.get('text')}")
    
    print("-" * 50)


Score (distance): 0.8464
Matched Text:
ranteed, or was it important for
you to try, regardless of a possibility of failure?
We have just r ead about the first flight of a young seagull. Y our teacher will
now divide the class into groups. Each group will work on one of the following
topics. Prepare a presentation with your group members and then present it
to the entire class.
•Progression of Models of Airplanes
•Progression of Models of Motorcars
•Birds and Their Wing Span
•Migratory Birds — Tracing Their Flights
Write a short compo
--------------------------------------------------
Score (distance): 0.8464
Matched Text:
ranteed, or was it important for
you to try, regardless of a possibility of failure?
We have just r ead about the first flight of a young seagull. Y our teacher will
now divide the class into groups. Each group will work on one of the following
topics. Prepare a presentation with your group members and then present it
to the entire class.
•Progression of Models of Air

In [11]:
reranked_results = sorted(results[0], key=lambda x: x.distance) 
for reranked_result in reranked_results:
    print(f"Reranked Score (distance): {reranked_result.distance:.4f}")
    print(f"Reranked Matched Text:\n{reranked_result.entity.get('text')}")
    
    print("-" * 50) 

Reranked Score (distance): 0.8464
Reranked Matched Text:
ranteed, or was it important for
you to try, regardless of a possibility of failure?
We have just r ead about the first flight of a young seagull. Y our teacher will
now divide the class into groups. Each group will work on one of the following
topics. Prepare a presentation with your group members and then present it
to the entire class.
•Progression of Models of Airplanes
•Progression of Models of Motorcars
•Birds and Their Wing Span
•Migratory Birds — Tracing Their Flights
Write a short compo
--------------------------------------------------
Reranked Score (distance): 0.8464
Reranked Matched Text:
ranteed, or was it important for
you to try, regardless of a possibility of failure?
We have just r ead about the first flight of a young seagull. Y our teacher will
now divide the class into groups. Each group will work on one of the following
topics. Prepare a presentation with your group members and then present it
to the entire 

In [12]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_api_key


In [13]:
top_n = 10
top_chunks = [res.entity.get("text") for res in reranked_results[:top_n]]
context = "\n\n".join(top_chunks)

In [None]:
question = query
template = PromptTemplate.from_template(
    """You are a helpful assistant.

    Use the following context to answer the query. Only return the specific answer, be brief, and do not repeat the context. Give answer if only the answer is available in context else say "Sorry, answer for this question is not present in context".

    Context:
    {context}

    Question: {question}
    Answer:"""
)

In [15]:
llm = ChatOpenAI(model="gpt-3.5-turbo")
parser = StrOutputParser()
chain = template | llm | parser
answer = chain.invoke({"context": context, "question": question})

In [16]:
print(" Final Answer:\n", answer)

 Final Answer:
 Sorry, answer for this question is not present in context.
