# Install Dependencies

In [1]:
#!pip install -r requirements.txt

# STEP1- Import Libraries
# Purpose: Load all Python libraries needed for PDF reading, NLP, embeddings, and search.

In [1]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import re





# STEP2- PDF Loader
# Purpose: Upload or read the PDF and extract all text.

In [2]:
pdf_path = "data/sample.pdf"  # Replace with your PDF path
reader = PdfReader(pdf_path)

pdf_text = ""
for page in reader.pages:
    text = page.extract_text()
    if text:
        pdf_text += text + " "

print("PDF loaded successfully")


PDF loaded successfully


# STEP3- Preprocess PDF Text
# Purpose: Clean and split PDF into chunks for semantic search.

In [3]:
def chunk_text(text, chunk_size=150, overlap=30):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

documents = chunk_text(pdf_text)
df = pd.DataFrame(documents, columns=["text"])
print("Total chunks created:", len(df))



Total chunks created: 36


# STEP4- Load Pretrained Embedding Model
# Purpose: Prepare embedding model for semantic similarity search.

In [4]:
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded")

Embedding model loaded


# STEP5- Create Embeddings
# Purpose: Convert PDF text chunks into numeric vectors for semantic search.

In [5]:
embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)
print("Embeddings generated:", embeddings.shape)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings generated: (36, 384)


# STEP6- Create FAISS Index

# Purpose: Store embeddings in FAISS for fast nearest-neighbor search

In [6]:
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

print("PDF chunks indexed in FAISS:", index.ntotal)


PDF chunks indexed in FAISS: 36


# STEP7 - Semantic Search Function

# Purpose: Find the top-k relevant PDF chunks based on question similarity.

In [7]:
def semantic_search(query, top_k=3):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [df.iloc[i]["text"] for i in indices[0]]


# Step 8- Answer Generation Logic (KEY STEP)
# purpose: Extracts relevant PDF content and returns clean bullet-point answers based on the user’s question.

In [8]:
def generate_answer(question, retrieved_chunks, max_points=3):
    text = " ".join(retrieved_chunks)
    lines = text.replace("•", "\n").replace("\r", "\n").split("\n")
    bullets = []

    query_words = [w.lower() for w in question.split() if len(w) > 3]

    for line in lines:
        line = line.strip()
        low = line.lower()
        words = line.split()

        # Skip headings/faculty/unit/short gibberish lines
        if (
            len(line) < 15 or
            "professor" in low or
            "dept" in low or
            "unit" in low or
            sum(len(w) == 1 for w in words) > len(words) * 0.6
        ):
            continue

        # Split line into sentences
        sentences = re.split(r'(?<=[.!?]) +', line)
        for sent in sentences:
            sent = sent.strip()
            if len(sent) > 20 and any(word in sent.lower() for word in query_words):
                bullets.append(f"- {sent}")
            if len(bullets) == max_points:
                break
        if len(bullets) == max_points:
            break

    if bullets:
        return "\n".join(bullets)
    else:
        return "Answer not clearly found in the document."


# STEP 9- Combine Search + Answer

# Purpose: End-to-end answer from PDF.

In [9]:
def direct_pdf_search(question):
    chunks = semantic_search(question, top_k=5)
    query_words = [w.lower() for w in question.split() if len(w) > 3]

    candidate_sentences = []

    for chunk in chunks:
        sentences = re.split(r'(?<=[.!?]) +', chunk)
        for sent in sentences:
            sent_low = sent.lower().strip()
            # Skip headings/questions
            if len(sent_low) < 30:
                continue
            if any(word in sent_low for word in query_words):
                # skip if sentence ends with a question mark (likely a heading)
                if sent_low.endswith('?'):
                    continue
                candidate_sentences.append((sent.strip(), sum(word in sent_low for word in query_words)))

    if not candidate_sentences:
        return "Answer not clearly found in the document."

    # Sort sentences by keyword matches
    candidate_sentences.sort(key=lambda x: x[1], reverse=True)

    # Take top 2–3 sentences
    bullets = [f"- {s[0]}" for s in candidate_sentences[:3]]
    return "\n".join(bullets)


# STEP10- Interactive Question Loop
# purpose: Continuously takes user questions and displays answers, enabling real-time interaction with the PDF content.

In [None]:
print("PDF Question Answering System")
print("Type 'exit' to stop\n")

while True:
    query = input("Ask a question from the PDF: ")
    if query.lower() == "exit":
        break

    answer = direct_pdf_search(query)
    print("\nAnswer:\n", answer)
    print("-" * 60)


PDF Question Answering System
Type 'exit' to stop



Ask a question from the PDF:  what is Need for Cyber Law?



Answer:
 - Need for Cyber Law(cont’d) • Cyberspace offers never-seen-before economic efficiency.
- Need for Cyber Law(cont’d) • A software source code worth crores of rupees or a movie can be pirated across the globe within hours of their release.
- computer trespassing • Computer vandalism • Transmission of harmful programmes • Stealing secret information & data • Copy rights Against Property Against government • * Hacking government website • * Cyber extortion • * Cyber terrorism • * Computer viruses Some other crimes: • Logic bombs -virus, worms, Trojan horse, email bombing • Spamming - E-mail abuse Need for Cyber Law There are various reasons why it is extremely difficult for conventional law to cope with cyberspace.
------------------------------------------------------------


Ask a question from the PDF:  what is Cyber Laws of India?



Answer:
 - Hence the need for Cyberlaws in India.
- Initially it may seem that Cyberlaws is a very technical field and that it does not have any bearing to most activities in Cyberspace.
- Cyber Laws of India • In Simple way we can say that cyber crime is unlawful acts wherein the computer is either a tool or a target or both.
------------------------------------------------------------


Ask a question from the PDF:  Why Cyberlaw in India ?



Answer:
 - Hence the need for Cyberlaws in India.
- • Cyberlaw is important because it touches almost all aspects of transactions and activities on and concerning the Internet, the World Wide Web and Cyberspace.
- Initially it may seem that Cyberlaws is a very technical field and that it does not have any bearing to most activities in Cyberspace.
------------------------------------------------------------


Ask a question from the PDF:  Does Cyberlaw concern me ?



Answer:
 - • Yes, Cyberlaw does concern you.
- • Cyberlaw is important because it touches almost all aspects of transactions and activities on and concerning the Internet, the World Wide Web and Cyberspace.
- Initially it may seem that Cyberlaws is a very technical field and that it does not have any bearing to most activities in Cyberspace.
------------------------------------------------------------
