In [16]:
# Step 1: Install the required libraries (only needed on the first run)
!pip install pdfplumber sentence-transformers transformers faiss-cpu torch requests --quiet

# Step 2: Imports
import os
import pdfplumber
import requests
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# Step 3: Make sure the local directory that holds the PDFs exists.
# The message is printed only when the directory had to be created.
pdf_folder = "sample_pdfs"
folder_already_present = os.path.exists(pdf_folder)
if not folder_already_present:
    os.makedirs(pdf_folder)
    print(f"Folder '{pdf_folder}' created.")

# Step 4: URLs of 10 public research papers (arXiv PDFs)
# The download step saves the entry at position i (0-based) as paper_{i+1}.pdf.
# NOTE(review): the trailing topic labels are unverified. The run output
# recorded with this cell shows paper_6.pdf (2004.13632) yielding text about a
# COVID-19 lockdown, which does not match its "Document understanding" label.
# Confirm each ID against arXiv before relying on these labels.
pdf_urls = [
    "https://arxiv.org/pdf/1810.04805.pdf",  # BERT
    "https://arxiv.org/pdf/1907.11692.pdf",  # RoBERTa
    "https://arxiv.org/pdf/1908.10084.pdf",  # Sentence-BERT
    "https://arxiv.org/pdf/2003.08271.pdf",  # Transformers survey
    "https://arxiv.org/pdf/1908.09447.pdf",  # FAISS paper
    "https://arxiv.org/pdf/2004.13632.pdf",  # Document understanding
    "https://arxiv.org/pdf/1906.02221.pdf",  # QA with BERT
    "https://arxiv.org/pdf/1708.02709.pdf",  # Deep Learning NLP
    "https://arxiv.org/pdf/1609.02728.pdf",  # Information Retrieval
    "https://arxiv.org/pdf/1801.06146.pdf",  # AI basics
]

# Step 5: Download PDFs automatically
# Each URL is stored as paper_<n>.pdf (n starts at 1). A file that is already
# on disk is not fetched again.
for i, url in enumerate(pdf_urls):
    file_name = os.path.join(pdf_folder, f"paper_{i+1}.pdf")
    if os.path.exists(file_name):
        print(f"{file_name} already exists, skipping download.")
        continue

    print(f"Downloading {url} ...")
    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a ".pdf",
    # which the exists-check above would then never re-download.
    r.raise_for_status()
    # Write under a temporary name and rename afterwards, so an interrupted
    # write cannot leave a truncated paper_<n>.pdf behind.
    partial_name = file_name + ".part"
    with open(partial_name, "wb") as f:
        f.write(r.content)
    os.replace(partial_name, file_name)

print("All PDFs are downloaded.")

# Step 6: Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages are processed in document order and joined with a newline. A page
    whose ``extract_text()`` result is falsy (e.g. ``None``) contributes an
    empty string instead.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

# Step 7: Extract text from all PDFs
# Maps each PDF file name in pdf_folder to the text extracted from it.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the document order (and the chunk positions derived from it later)
# reproducible between runs.
pdf_texts = {}
for filename in sorted(os.listdir(pdf_folder)):
    # Compare case-insensitively so files named "*.PDF" are not skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    full_path = os.path.join(pdf_folder, filename)
    pdf_texts[filename] = extract_text_from_pdf(full_path)

print(f"Extracted text from {len(pdf_texts)} PDFs.")

# Step 8: Load sentence transformer model for embeddings
# This one instance is used to encode both the document chunks and the query.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Step 9: Chunk text into fixed-size character chunks (500 by default)
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings in their original order. Every chunk has exactly
        ``chunk_size`` characters except possibly the last one. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all of the
    # text; a zero step raises an unhelpful range() error. Reject both here.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Two parallel lists: all_chunks[k] is a chunk of text and chunk_to_doc[k] is
# the name of the document that chunk was cut from.
all_chunks = []
chunk_to_doc = []

for doc_name, text in pdf_texts.items():
    for piece in chunk_text(text):
        all_chunks.append(piece)
        chunk_to_doc.append(doc_name)

print(f"Created {len(all_chunks)} chunks from documents.")

# Step 10: Generate embeddings for all chunks
if not all_chunks:
    # Without any chunk there is no embedding matrix whose width could be read
    # below; stop with a clear message instead of an opaque IndexError.
    raise RuntimeError("No text chunks to index: no text was extracted from the PDFs.")

embeddings = embedder.encode(all_chunks, show_progress_bar=True)
# FAISS expects a C-contiguous float32 matrix; enforce that explicitly rather
# than relying on what the encoder happens to return.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Step 11: Create FAISS index and add embeddings
# Row k of the index corresponds to all_chunks[k].
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

print("FAISS index created with all embeddings.")

# Step 12: Load Question Answering pipeline (HuggingFace)
# answer_query() calls this with a question and one retrieved chunk as context,
# and reads the 'answer' and 'score' fields of the result.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Step 13: Function to answer query using nearest neighbor chunks
def answer_query(query, top_k=3):
    """Answer *query* from the chunks closest to it in the FAISS index.

    The query is embedded with ``embedder``, the ``top_k`` nearest chunk
    embeddings are looked up in ``index``, and ``qa_pipeline`` is run once
    per retrieved chunk with that chunk as context.

    Args:
        query: The question, as a string.
        top_k: Number of nearest chunks to retrieve from the index.

    Returns:
        A list of dicts in retrieval order, each with the keys
        ``"document"`` (source file name), ``"answer"`` and ``"score"``
        (from the QA pipeline) and ``"context_snippet"`` (first 200
        characters of the chunk). It holds fewer than ``top_k`` entries when
        the index returns fewer valid neighbours.
    """
    # FAISS searches with float32 vectors; make the dtype explicit.
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    _distances, indices = index.search(query_embedding, top_k)

    answers = []
    for idx in indices[0]:
        # FAISS pads the result with -1 when it finds fewer than top_k
        # neighbours. Used as a list index, -1 would silently select the
        # last chunk, so padded slots are skipped.
        if idx < 0:
            continue
        context = all_chunks[idx]
        doc = chunk_to_doc[idx]
        result = qa_pipeline(question=query, context=context)
        answers.append({
            "document": doc,
            "answer": result['answer'],
            "score": result['score'],
            "context_snippet": context[:200]
        })
    return answers

# Step 14: Run one example query through the retrieval + QA function
query = "What penalty is described?"
results = answer_query(query)

# Step 15: Print each answer, numbered from 1, with its source and score
for rank, hit in enumerate(results, start=1):
    print(f"\nAnswer {rank}:")
    print(f"Document: {hit['document']}")
    print(f"Answer: {hit['answer']}")
    print(f"Score: {hit['score']:.4f}")
    print(f"Context snippet: {hit['context_snippet']}...")


Downloading https://arxiv.org/pdf/1810.04805.pdf ...
Downloading https://arxiv.org/pdf/1907.11692.pdf ...
Downloading https://arxiv.org/pdf/1908.10084.pdf ...
Downloading https://arxiv.org/pdf/2003.08271.pdf ...
Downloading https://arxiv.org/pdf/1908.09447.pdf ...
Downloading https://arxiv.org/pdf/2004.13632.pdf ...
Downloading https://arxiv.org/pdf/1906.02221.pdf ...
Downloading https://arxiv.org/pdf/1708.02709.pdf ...
Downloading https://arxiv.org/pdf/1609.02728.pdf ...
Downloading https://arxiv.org/pdf/1801.06146.pdf ...




All PDFs are downloaded.




Extracted text from 10 PDFs.
Created 1222 chunks from documents.


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

FAISS index created with all embeddings.


Device set to use cpu



Answer 1:
Document: paper_6.pdf
Answer: 
ff
reducingworkingsta
Score: 0.0050
Context snippet: t declared a lock down on the 23rd of March and took early
measures to reduce physical contact, like stopping the local trains and auto-rickshaws or
ff
reducingworkingsta ingovernmentbyhalforshuttingd...

Answer 2:
Document: paper_6.pdf
Answer: 
ff
a di erence
Score: 0.0027
Context snippet: ne week after implementation of the lock down whereas states like Gujarat, Maha-
rashtra, Karnataka, Madhya Pradesh, Delhi attain their peak number of cases at around the
end of the lock down period w...

Answer 3:
Document: paper_6.pdf
Answer: lockdown till 31st; violators will be booked
Score: 0.0495
Context snippet: case. 2020. url: https://www.cnbc.
com/2020/01/30/india-confirms-first-case-of-the-coronavirus.html.
[15] Tanya Thomas. Maharashtra imposes lockdown till 31st; violators will be booked. 2020.
url: htt...
