#  Combined Ingestion: PDFs + Web Pages → FAISS Index

In [None]:
# 📦 Install required packages
!pip install langchain sentence-transformers faiss-cpu pymupdf beautifulsoup4 requests


## 📄 Step 1: Load PDFs

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from pathlib import Path

# Collect every PDF that sits directly inside the data folder.
pdf_dir = Path(r"C:/Rag_data")
pdf_files = list(pdf_dir.glob("*.pdf"))
all_docs = []

for pdf_path in pdf_files:
    # The loader returns one Document per page of the PDF.
    pdf_pages = PyMuPDFLoader(str(pdf_path)).load()
    for page_number, page_doc in enumerate(pdf_pages, start=1):
        # Record the bare file name and a 1-based page number on every page.
        page_doc.metadata.update({"source": pdf_path.name, "page": page_number})
    all_docs += pdf_pages

print(f"✅ Loaded {len(all_docs)} pages from {len(pdf_files)} PDFs.")


##  Step 2: Load Web Pages

In [None]:
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document

# Web pages to ingest; each successfully scraped page is appended to all_docs
# as a single Document whose "source" metadata is the URL.
urls = [
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://www.ibm.com/topics/natural-language-processing"
]

for url in urls:
    # Best-effort scraping: a failure on one URL is reported and skipped so the
    # remaining URLs are still processed.
    try:
        response = requests.get(url, timeout=10)
        # Turn HTTP error statuses (4xx/5xx) into exceptions so that error
        # pages are not indexed as if they were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove non-content elements before extracting the text.
        for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator="\n")
        # Strip each line and drop the ones that end up empty.
        cleaned = "\n".join(line.strip() for line in text.splitlines() if line.strip())
        doc = Document(page_content=cleaned, metadata={"source": url})
        all_docs.append(doc)
    except Exception as e:
        print(f"❌ Failed to scrape {url}: {e}")

print(f"✅ Total documents after web scraping: {len(all_docs)}")


##  Step 3: Chunk All Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every loaded document into chunks of at most 500 characters, with a
# 50-character overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(all_docs)

# Tag each chunk with its position in the overall chunk list.
for i, chunk in enumerate(chunks):
    chunk.metadata["chunk_index"] = i

print(f"📦 Created {len(chunks)} chunks.")
# Guard the preview: indexing chunks[0] would raise IndexError when nothing
# was loaded in the earlier steps.
if chunks:
    print("Sample chunk:", chunks[0].page_content[:300])
else:
    print("⚠️ No chunks were created - check that Steps 1 and 2 loaded documents.")


##  Step 4: Embed and Store in FAISS

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to vectorise every chunk
embedding_model_name = "BAAI/bge-small-en"
embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the vector store from the chunks, then persist it to a local folder
index_folder = "combined_faiss_index"
index = FAISS.from_documents(chunks, embedder)
index.save_local(index_folder)

print("✅ FAISS index saved to 'combined_faiss_index'")


##  Load Embedder and FAISS Index

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Recreate the embedder with the same model name that was used to build the index
embedding_model_name = "BAAI/bge-small-en"
embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Read the saved index back from its folder.
# NOTE(review): allow_dangerous_deserialization=True presumably enables
# pickle-based loading - only point this at index folders you created or trust.
load_options = {
    "folder_path": "combined_faiss_index",
    "embeddings": embedder,
    "allow_dangerous_deserialization": True,
}
index = FAISS.load_local(**load_options)

print("✅ FAISS index loaded.")


##  Run Semantic Query

In [None]:
# Natural-language question to search the index with
query = "What is NLP and how is it used in real life?"

# Retrieve the 3 best-matching chunks, each paired with its score
top_k = 3
results = index.similarity_search_with_score(query, k=top_k)

# Print rank, score, origin and a 500-character preview for every hit
for rank, hit in enumerate(results, start=1):
    matched_doc, match_score = hit
    origin = matched_doc.metadata.get('source')
    page_label = matched_doc.metadata.get('page', 'N/A')
    print(f"\n🔹 Rank {rank} (Score: {match_score:.4f})")
    print(f"Source: {origin} | Page: {page_label}")
    print(matched_doc.page_content[:500])


##  Plot the similarity scores

In [None]:
import matplotlib.pyplot as plt

# Split the (document, score) pairs into parallel lists: the scores, the
# source of each document, and a one-line 100-character text preview.
scores = [score for _, score in results]
sources = [doc.metadata.get("source", "unknown") for doc, _ in results]
texts = [doc.page_content[:100].replace("\n", " ") + "..." for doc, _ in results]

# One horizontal bar per result, labelled with its rank and source
fig, ax = plt.subplots(figsize=(10, 6))
positions = range(len(scores))
bars = ax.barh(positions, scores, color='skyblue')
ax.set_yticks(positions)
ax.set_yticklabels([f"{rank}. {src}" for rank, src in enumerate(sources, start=1)])
ax.set_xlabel("Similarity Score (lower is better)")
ax.set_title("Top-k FAISS Document Similarity Scores")

# Write the numeric score just past the end of each bar
for bar, score in zip(bars, scores):
    ax.text(bar.get_width() + 0.01, bar.get_y() + 0.4, f"{score:.4f}")

# Flip the y-axis so the first result is drawn at the top
ax.invert_yaxis()
fig.tight_layout()
plt.show()
