In [None]:

import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')
from docx import Document as DocxDocument


# 📄 PDF/Text Parsing & Web Scraping
import fitz  
from bs4 import BeautifulSoup
import requests

from sentence_transformers import SentenceTransformer

# 🔢 Vector Store
import faiss

# 🧠 LLMs
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# 🧱 RAG Chains (optional, but powerful)
from langchain_community.vectorstores import FAISS as LC_FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import warnings
warnings.filterwarnings("ignore")

print("✅ All libraries imported successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joane\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.



✅ All libraries imported successfully.


In [None]:

def load_pdf(file_path):
    """Extract cleaned text and metadata from a PDF, one record per page.

    Args:
        file_path: Path to the PDF file, opened with PyMuPDF (``fitz.open``).

    Returns:
        A list of dicts, one per page whose extracted text is not
        blank. Each dict has a ``"text"`` key (the page text passed through
        ``clean_text``) and a ``"metadata"`` dict with ``"source"`` (file
        base name), ``"type"`` (``"pdf"``) and ``"page"`` (1-based page number).
    """
    source_name = os.path.basename(file_path)
    texts = []
    doc = fitz.open(file_path)
    try:
        for page_number, page in enumerate(doc, start=1):
            page_text = page.get_text()
            # Skip pages with no extractable text (blank or whitespace only).
            if not page_text.strip():
                continue
            texts.append({
                "text": clean_text(page_text),
                "metadata": {
                    "source": source_name,
                    "type": "pdf",
                    "page": page_number
                }
            })
    finally:
        # Release the file handle even if extraction fails part-way through.
        doc.close()
    return texts


def load_text_file(file_path):
    """Read a UTF-8 text file and wrap its cleaned contents in a one-item list.

    Args:
        file_path: Path to the text file; it is decoded as UTF-8.

    Returns:
        A single-element list holding a dict with ``"text"`` (the file
        contents passed through ``clean_text``) and ``"metadata"`` (``"source"``
        = file base name, ``"type"`` = ``"txt"``).
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()

    record = {
        "text": clean_text(contents),
        "metadata": {
            "source": os.path.basename(file_path),
            "type": "txt",
        },
    }
    return [record]


def load_docx_file(file_path):
    """Read a Word (.docx) file and return its cleaned paragraph text.

    Args:
        file_path: Path to the DOCX file, opened with ``DocxDocument``.

    Returns:
        A single-element list holding a dict with ``"text"`` (all non-blank
        paragraphs joined by newlines, then passed through ``clean_text``) and
        ``"metadata"`` (``"source"`` = file base name, ``"type"`` = ``"docx"``).
    """
    word_doc = DocxDocument(file_path)

    # Keep only paragraphs that contain something other than whitespace.
    kept_paragraphs = []
    for paragraph in word_doc.paragraphs:
        if paragraph.text.strip():
            kept_paragraphs.append(paragraph.text)
    joined_text = "\n".join(kept_paragraphs)

    record = {
        "text": clean_text(joined_text),
        "metadata": {
            "source": os.path.basename(file_path),
            "type": "docx",
        },
    }
    return [record]


def load_webpage(url, timeout=10):
    """Download a web page and return its visible text plus source metadata.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        A single-element list holding a dict with ``"text"`` (page text with
        ``<script>``/``<style>`` elements removed, passed through
        ``clean_text``) and ``"metadata"`` (``"source"`` = ``url``,
        ``"type"`` = ``"web"``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            error pages are not indexed as if they were real content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Script and style bodies are not readable content; drop them before
    # extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    raw_text = soup.get_text(separator="\n")

    return [{
        "text": clean_text(raw_text),
        "metadata": {
            "source": url,
            "type": "web"
        }
    }]


def clean_text(text):
    """Normalise text to single-spaced ASCII.

    Each run of non-ASCII characters is replaced by a space, then every run
    of whitespace is collapsed to one space and the ends are trimmed.

    The non-ASCII pass runs first on purpose: doing it after the whitespace
    pass would re-introduce multi-space runs wherever a stripped character
    had spaces around it (e.g. ``"a — b"`` becoming ``"a   b"``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing but whitespace/non-ASCII remained.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [None]:
#Load Multiple Documents from a Folder (PDF, TXT, DOCX) + URLs

docs = []

# 🔹 Load from files
input_dir = r"C:\Users\joane\OneDrive\Desktop\Sem 7\RAG\input_docs"  # ← Put all your files here

# Maps a lower-case filename suffix to the loader that handles it.
loaders_by_suffix = {
    ".pdf": load_pdf,
    ".txt": load_text_file,
    ".docx": load_docx_file,
}

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and everything built from it later) the same on every run.
for filename in sorted(os.listdir(input_dir)):
    filepath = os.path.join(input_dir, filename)
    if not os.path.isfile(filepath):
        continue  # e.g. a sub-folder whose name happens to end in ".pdf"
    lowered_name = filename.lower()
    for suffix, loader in loaders_by_suffix.items():
        if lowered_name.endswith(suffix):
            docs.extend(loader(filepath))
            break  # files with unsupported suffixes fall through untouched

# 🔹 Load from web pages (optional)
web_urls = [
    # "https://en.wikipedia.org/wiki/Artificial_intelligence",
    # "https://www.example.com/article.html"
]
for url in web_urls:
    docs.extend(load_webpage(url))

# ✅ Preview result
for i, doc in enumerate(docs):
    print(f"\n📄 Document {i+1} — Source: {doc['metadata']['source']}")
    print(doc['text'][:300])  # Show first 300 chars only




📄 Document 1 — Source: Report on IEEE DBCE Student Branch Meet.docx
Report on IEEE DBCE Student Branch Meet & Greet Date: 26th March 2025 Organized by: IEEE Student Branch, Don Bosco College of Engineering (IEEE DBCE SB) The IEEE DBCE Student Branch conducted a Meet & Greet session to introduce committee members, foster networking, and plan future activities. The ev

📄 Document 2 — Source: Resume.pdf
Ayden Xavier Alvito Joanes +91 9923577502 | joanesayden@gmail.com |   Ayden Joanes |   Ayden Joanes | Bengaluru, India Machine Learning Intern USP: A self-taught AI enthusiast and active swing trader driven to integrate deep learning with finance. I offer hands-on ML experience, creative thinking, a

📄 Document 3 — Source: Resume.pdf
Fake News Detection Using XLM-RoBERTa and Microsoft Autogen with Dynamic Weighted Agents Tools: Python, Transformers, HuggingFace, XLM-RoBERTa, Microsoft Autogen, Pandas, scikit-learn   Objective: Built a flexible fake news detection system using a fine-tuned

In [None]:
#Smart Chunking with Metadata Preservation

# NOTE: the loaders run every text through clean_text(), which collapses all
# whitespace to single spaces, so the "\n\n" and "\n" separators never match
# here; in practice splitting happens on ".", then " ", then per character.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)

# Final container for all processed chunks
chunked_docs = []

# Go through each doc from the loader
for doc in docs:
    splits = text_splitter.split_text(doc["text"])
    for i, chunk in enumerate(splits):
        chunked_docs.append(Document(
            page_content=chunk,
            metadata={**doc["metadata"], "chunk": i + 1}  # retain metadata + chunk number
        ))

print(f"✅ Chunked into {len(chunked_docs)} total pieces.")
# Only preview a sample when there is one; indexing [0] on an empty list
# (no documents loaded) would raise IndexError.
if chunked_docs:
    print("📄 Sample chunk:\n", chunked_docs[0].page_content)
    print("📎 Metadata:", chunked_docs[0].metadata)
else:
    print("⚠️ No chunks were produced; check that `docs` is not empty.")


✅ Chunked into 22 total pieces.
📄 Sample chunk:
 Report on IEEE DBCE Student Branch Meet & Greet Date: 26th March 2025 Organized by: IEEE Student Branch, Don Bosco College of Engineering (IEEE DBCE SB) The IEEE DBCE Student Branch conducted a Meet & Greet session to introduce committee members, foster networking, and plan future activities. The event aimed to strengthen collaboration, encourage active participation, and align with IEEE s mission of advancing technology for the benefit of society
📎 Metadata: {'source': 'Report on IEEE DBCE Student Branch Meet.docx', 'type': 'docx', 'chunk': 1}


In [None]:
# Create embeddings for the chunks and store in FAISS vector DB

# Embedding checkpoint and the folder the index is written to.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # lightweight, fast, and accurate
vector_store_dir = "vector_store"

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Embed every chunk and build the FAISS vector store from the results
vector_store = LC_FAISS.from_documents(chunked_docs, embedding_model)

# Persist the vector store so a later cell can reload it from disk
vector_store.save_local(vector_store_dir)

print("✅ FAISS index created and saved successfully.")


✅ FAISS index created and saved successfully.


In [None]:
#Load the saved FAISS index + Ask questions using a generator model

import torch
import requests
from transformers import pipeline

# (1) Prevent download issues
# NOTE(review): this probably does not add retries. requests' HTTPAdapter
# appears to bind its max_retries default when the class is defined, so
# reassigning the module constant afterwards likely has no effect. TODO confirm.
requests.adapters.DEFAULT_RETRIES = 5

# (2) Load vector store
# NOTE(review): allow_dangerous_deserialization=True permits pickle-based
# loading of the saved store. Only point this at an index folder this notebook
# wrote itself, never at one obtained from an untrusted source.
saved_index = LC_FAISS.load_local(
    "vector_store",
    embedding_model,
    allow_dangerous_deserialization=True
)
retriever = saved_index.as_retriever(search_kwargs={"k": 5})  # top 5 chunks per query

# (3) Load the FLAN-T5 model
flan_t5_checkpoint = "google/flan-t5-base"
generator_device = 0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU
generator = pipeline(
    "text2text-generation",
    model=flan_t5_checkpoint,
    tokenizer=flan_t5_checkpoint,
    max_length=512,
    device=generator_device,
    revision="main"
)

# Ask a question (with optional source metadata)
def ask_question(query, return_sources=False):
    """Answer ``query`` from retrieved context using the generator model.

    The chunks returned by the module-level ``retriever`` are joined into a
    context block, embedded in a prompt, and passed to the module-level
    ``generator`` with sampling disabled.

    Args:
        query: The user's question.
        return_sources: When true, also return the sources of the retrieved
            chunks.

    Returns:
        The stripped generated answer, or ``(answer, sources)`` when
        ``return_sources`` is true. ``sources`` lists each distinct
        ``"source"`` metadata value once, in the order the retriever returned
        the chunks; chunks without that key are reported as ``"Unknown"``.
    """
    # 1. Retrieve relevant chunks
    retrieved_docs = retriever.get_relevant_documents(query)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # 2. Construct the prompt
    prompt = (
        f"Answer the following question using only the context below:\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    # 3. Generate answer
    response = generator(prompt, do_sample=False)[0]['generated_text']
    answer = response.strip()

    if not return_sources:
        return answer

    # 4. De-duplicate sources while keeping retrieval order. dict.fromkeys
    #    preserves first-seen order, whereas a set would return the citations
    #    in an arbitrary order that can differ between runs.
    sources = list(dict.fromkeys(
        doc.metadata.get('source', 'Unknown') for doc in retrieved_docs
    ))
    return answer, sources

import time

# Ask interactively from user input
while True:
    try:
        # Strip so that inputs such as "exit " or " quit" are still recognised.
        user_query = input("\n🔍 Ask a question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop
        # cleanly instead of ending with a traceback.
        print("\n👋 Exiting the Q&A session...")
        break

    if not user_query:
        # A blank line is not a question; prompt again without calling the model.
        continue

    if user_query.lower() in ["exit", "quit"]:
        print("👋 Exiting the Q&A session...")
        time.sleep(0.5)
        break

    answer, citations = ask_question(user_query, return_sources=True)

    print("\n" + "="*60)
    print(f"❓ Question: {user_query}")
    print(f"🧠 Answer: {answer}")
    print(f"📎 Sources: {citations}")
    print("="*60)


Device set to use cuda:0



❓ Question: What is Ayden's area of interest?
🧠 Answer: Deeply intrigued by large language models, autonomous multi-agent systems, and behavioral psychology in tech. Enthusiastic about equity markets, human-centered design, and exploring startup ecosystems
📎 Sources: ['Report on IEEE DBCE Student Branch Meet.docx', 'Resume.pdf']
👋 Exiting the Q&A session...
