# Install Dependencies

In [1]:
#!pip install -r requirements.txt

# STEP 1 - Import Libraries
# Purpose: Load all Python libraries needed for PDF reading, NLP, embeddings, and search.

In [95]:
import numpy as np  # array handling for the embeddings passed to FAISS
import pandas as pd  # stores the text chunks in a DataFrame
from pypdf import PdfReader  # opens the PDF and exposes its pages
from sentence_transformers import SentenceTransformer  # embeds chunks and queries
import faiss  # nearest-neighbour index over the chunk embeddings

# Visible confirmation that the import cell ran to completion.
print("Step 1: Libraries imported successfully")


Step 1: Libraries imported successfully


# STEP 2 - PDF Loader
# Purpose: Read the PDF from disk and extract the text of every page.

In [96]:
# Location of the document to index.
pdf_path = "data/sample.pdf"  # Replace with your PDF path
reader = PdfReader(pdf_path)

# Extract the text of each page lazily, skip pages that yield nothing
# (None or an empty string), and follow every kept page with one space
# so words at page boundaries do not run together.
page_texts = (page.extract_text() for page in reader.pages)
pdf_text = "".join(f"{page_text} " for page_text in page_texts if page_text)

print("PDF loaded successfully")


PDF loaded successfully


# STEP 3 - Preprocess PDF Text
# Purpose: Split the extracted PDF text into overlapping word chunks for semantic search.

In [97]:
def chunk_text(text, chunk_size=300, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be >= 1).
        overlap: Number of words shared by two consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings, each made of the chunk's words joined by
        single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size gives a zero/negative stride; a negative
        # overlap gives a stride larger than the window and drops words.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reached the last word: any later window
        # would be a pure suffix of it, i.e. duplicated content.
        if start + chunk_size >= len(words):
            break
    return chunks

# Chunk the extracted PDF text and keep the chunks in a one-column table.
documents = chunk_text(pdf_text)
df = pd.DataFrame({"text": documents})
print(f"Step 3: Total chunks created = {len(df)}")

Step 3: Total chunks created = 17


# STEP 4 - Load Pretrained Embedding Model
# Purpose: Prepare embedding model for semantic similarity search.

In [98]:
# Identifier of the pretrained sentence-embedding checkpoint to load.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Step 4: Embedding model loaded")


Step 4: Embedding model loaded


# STEP 5 - Create FAISS Index

# Purpose: Store embeddings in FAISS for fast nearest-neighbor search

In [99]:
# Embed every chunk and store the vectors in an exact L2 index.
chunk_texts = df["text"].tolist()
if not chunk_texts:
    # Without any chunk there is no embedding dimension to size the index with.
    raise ValueError("No text chunks to index; check that the PDF contains extractable text")

# FAISS expects a C-contiguous float32 matrix; enforce that explicitly
# instead of relying on the encoder's default output type.
embeddings = np.ascontiguousarray(model.encode(chunk_texts), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print(f"Step 5: PDF chunks indexed in FAISS = {index.ntotal}")


Step 5: PDF chunks indexed in FAISS = 17


# STEP 6 - Semantic Search

# Purpose: Find the top-k relevant PDF chunks based on question similarity.

In [100]:
def semantic_search(query, top_k=3):
    """Return the text of the indexed chunks closest to ``query``.

    Args:
        query: Question or search string to embed with ``model``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings taken from ``df["text"]``,
        in the order FAISS returns them. The list is shorter than ``top_k``
        when the index holds fewer vectors, and empty when nothing can be
        returned.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with -1, and df.iloc[-1] would silently hand back the
    # last chunk as if it were a real hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, k)
    # Defensive filter in case any -1 placeholder is still returned.
    return [df.iloc[i]["text"] for i in indices[0] if i >= 0]


# STEP 7 - Answer Generation (bullet-point style)

In [102]:
def generate_answer(question, retrieved_chunks, max_points=3):
    """Build a bullet-point answer from sentences of the retrieved chunks.

    The chunks are joined, split into sentences on ".", and the first
    sentences that mention a keyword of the question are returned as
    bullets. If no sentence matches, the first sentences longer than 40
    characters are used instead.

    Args:
        question: The user's question; its non-stop-word tokens, with
            surrounding punctuation removed, are the keywords.
        retrieved_chunks: Iterable of text chunks to search for sentences.
        max_points: Number of bullets at which collection stops.

    Returns:
        The bullets joined by newlines (each prefixed with "- "), or the
        message "Answer not clearly found in the document." when no
        sentence qualifies.
    """
    text = " ".join(retrieved_chunks)
    sentences = [
        piece.strip()
        for piece in text.replace("•", " ").replace("\n", " ").split(".")
    ]

    stop_words = {"what","is","are","define","explain","describe","why","the","a","an","of","in","law"}
    # Substring filters for boilerplate lines; note that these match inside
    # longer words too (e.g. "unit" also excludes "community").
    excluded_terms = ("faculty", "professor", "dept", "unit", "ethics")
    punctuation = "?!.,;:\"'()[]{}"

    # Strip punctuation BEFORE the stop-word test so that "Law?" is seen as
    # the stop word "law" and "crimes?" becomes the usable keyword "crimes".
    keywords = []
    for word in question.split():
        token = word.lower().strip(punctuation)
        # An empty token would be a substring of every sentence.
        if token and token not in stop_words:
            keywords.append(token)

    bullets = []
    seen = set()  # overlapping chunks repeat sentences; emit each only once

    for sent in sentences:
        low = sent.lower()
        if (
            len(sent) < 30
            or sent.isupper()
            or any(term in low for term in excluded_terms)
        ):
            continue
        if sent not in seen and any(k in low for k in keywords):
            seen.add(sent)
            bullets.append(f"- {sent}")
        if len(bullets) == max_points:
            break

    # Fallback: first meaningful lines when no sentence matched a keyword.
    if not bullets:
        for sent in sentences:
            if len(sent) > 40 and sent not in seen:
                seen.add(sent)
                bullets.append(f"- {sent}")
            if len(bullets) == max_points:
                break

    if bullets:
        return "\n".join(bullets)
    return "Answer not clearly found in the document."


# STEP 8 - Direct PDF Search

In [104]:
def direct_pdf_search(question):
    """Answer ``question`` from the PDF.

    Retrieves the 3 chunks closest to the question with
    ``semantic_search`` and passes them to ``generate_answer``, which
    returns up to 3 bullet points (or its not-found message).
    """
    return generate_answer(
        question,
        semantic_search(question, top_k=3),
        max_points=3,
    )


# STEP 9 - Demo Questions (to show on GitHub)

In [106]:
# Sample questions used to showcase the pipeline end to end.
demo_questions = [
    "What is Cyber Law?",
    "What are Cyber crimes?",
    "Why Cyberlaw in India?",
]

# Answer each demo question and print it followed by a 60-dash rule.
for question in demo_questions:
    demo_answer = direct_pdf_search(question)
    print(f"Question: {question}\nAnswer:\n{demo_answer}\n{'-'*60}")


Question: What is Cyber Law?
Answer:
- What is Cyber Law?   Cyber Law is the law governing cyber space
- Cyber space is a very wide term and includes computers, networks, software, data storage devices (such as hard disks, USB disks etc), the Internet, websites, emails and even electronic devices such as cell phones, ATM machines etc
- Cyber law encompasses laws relating to 1
------------------------------------------------------------
Question: What are Cyber crimes?
Answer:
- Cyber Laws of India   In Simple way we can say that cyber crime is unlawful acts wherein the computer is either a tool or a target or both
- Cyber crimes can involve criminal activities that are traditional in nature, such as theft, fraud, forgery, defamation and mischief, all of which are subject to the Indian Penal Code
- We can categorize Cyber crimes in two ways   The Computer as a Target :-using a computer to attack other computers
------------------------------------------------------------
Question: Why C

# STEP 10 - Interactive Question Loop
# Purpose: Continuously take user questions and display answers, enabling real-time interaction with the PDF content.

In [None]:
print("PDF Question Answering System")
print("Type 'exit' to stop\n")

# Read questions until the user types 'exit' (any case, surrounding
# whitespace ignored), closes stdin, or interrupts the cell.
while True:
    try:
        query = input("Ask a question from the PDF: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or cell interrupted: leave the loop quietly
        # instead of ending the cell with a traceback.
        break

    if not query:
        # Nothing was typed; prompt again rather than searching for "".
        continue
    if query.lower() == "exit":
        break

    answer = direct_pdf_search(query)
    print("\nAnswer:\n", answer)
    print("-" * 60)


PDF Question Answering System
Type 'exit' to stop



Ask a question from the PDF:  what is cyber crime?



Answer:
 - Cyber Laws of India   In Simple way we can say that cyber crime is unlawful acts wherein the computer is either a tool or a target or both
- Cyber crimes can involve criminal activities that are traditional in nature, such as theft, fraud, forgery, defamation and mischief, all of which are subject to the Indian Penal Code
- We can categorize Cyber crimes in two ways   The Computer as a Target :-using a computer to attack other computers
------------------------------------------------------------


Ask a question from the PDF:  what is cyber law?



Answer:
 - What is Cyber Law?   Cyber Law is the law governing cyber space
- Cyber space is a very wide term and includes computers, networks, software, data storage devices (such as hard disks, USB disks etc), the Internet, websites, emails and even electronic devices such as cell phones, ATM machines etc
- Cyber law encompasses laws relating to 1
------------------------------------------------------------


Ask a question from the PDF:  what is Information Warfare?



Answer:
 - the adversary’s information without involving the intervening perceptive and analytical functions
- Indirect Information Warfare changes the adversary’s information by creating phenomena that the adversary must then observe and analyze
- Information Warfare: Past and Present Information warfare is not a new phenomenon
------------------------------------------------------------
