In [1]:

!pip install -U langchain-community faiss-cpu pymupdf python-docx nltk pandas --quiet

print("✅ All essential 5 packages installed successfully.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does 

In [5]:
# Standard Libraries
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')

# File + Web Parsing
import fitz  # PyMuPDF for PDF reading
from bs4 import BeautifulSoup
import requests
from docx import Document as DocxDocument

# Embeddings + Vector Store
from sentence_transformers import SentenceTransformer
import faiss
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS as LC_FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# LLMs
from transformers import pipeline, AutoTokenizer  # needed for Zephyr

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

print("✅ All libraries imported successfully.")


✅ All libraries imported successfully.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# 📦 Required Imports for Utility Functions
import os
import re
import requests
from bs4 import BeautifulSoup
from docx import Document as DocxDocument
import fitz  # PyMuPDF


In [6]:
# Improved Utility Functions for Multi-Document Ingestion with Error Handling & Logging

def load_pdf(file_path):
    """Extract text and metadata from PDF using PyMuPDF."""
    try:
        doc = fitz.open(file_path)
    except Exception as e:
        print(f"❌ Failed to load PDF {file_path}: {e}")
        return []

    texts = []
    for i, page in enumerate(doc):
        page_text = page.get_text()
        if page_text.strip():
            texts.append({
                "text": clean_text(page_text),
                "metadata": {
                    "source": os.path.basename(file_path),
                    "type": "pdf",
                    "page": i + 1
                }
            })
    doc.close()
    print(f"✅ Loaded PDF: {file_path}")
    return texts


def load_text_file(file_path):
    """Extract text and metadata from TXT."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
    except Exception as e:
        print(f"❌ Failed to load TXT file {file_path}: {e}")
        return []

    print(f"✅ Loaded TXT: {file_path}")
    return [{
        "text": clean_text(raw_text),
        "metadata": {
            "source": os.path.basename(file_path),
            "type": "txt"
        }
    }]


def load_docx_file(file_path):
    """Extract text and metadata from a DOCX (Word) file."""
    try:
        doc = DocxDocument(file_path)
        full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
    except Exception as e:
        print(f"❌ Failed to load DOCX file {file_path}: {e}")
        return []

    print(f"✅ Loaded DOCX: {file_path}")
    return [{
        "text": clean_text(full_text),
        "metadata": {
            "source": os.path.basename(file_path),
            "type": "docx"
        }
    }]


def load_webpage(url):
    """Extract cleaned webpage text + source metadata."""
    if not url.startswith("http"):
        print(f"❌ Invalid URL skipped: {url}")
        return []

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        raw_text = soup.get_text(separator="\n")
    except Exception as e:
        print(f"❌ Failed to load webpage {url}: {e}")
        return []

    print(f"✅ Loaded Webpage: {url}")
    return [{
        "text": clean_text(raw_text),
        "metadata": {
            "source": url,
            "type": "web"
        }
    }]


def clean_text(text):
    """Basic text cleaning (unchanged)."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.strip()


In [7]:
# Load Multiple Documents from a Folder (PDF, TXT, DOCX) + URLs

docs = []

# Only for Colab: Mount Google Drive and set the input directory
from google.colab import drive
drive.mount('/content/drive')

input_dir = "/content/drive/MyDrive/RAG2/input_docs"  # Update this if needed

# Check if input folder is empty
if not os.listdir(input_dir):
    print(f"⚠️ Warning: No files found in {input_dir}")

# Loop over all files in the directory
unsupported = []
for filename in os.listdir(input_dir):
    filepath = os.path.join(input_dir, filename)
    try:
        if filename.lower().endswith(".pdf"):
            docs.extend(load_pdf(filepath))
        elif filename.lower().endswith(".txt"):
            docs.extend(load_text_file(filepath))
        elif filename.lower().endswith(".docx"):
            docs.extend(load_docx_file(filepath))
        else:
            unsupported.append(filename)
    except Exception as e:
        print(f"⚠️ Failed to load {filename}: {e}")

# Log skipped/unsupported files (optional transparency)
if unsupported:
    print(f"\n📛 Skipped {len(unsupported)} unsupported file(s):", unsupported)

# 🔹 Load from web pages (optional)
web_urls = [
    # "https://en.wikipedia.org/wiki/Artificial_intelligence",
    # "https://example.com/some-article"
]

for url in web_urls:
    try:
        docs.extend(load_webpage(url))
    except Exception as e:
        print(f"⚠️ Failed to load webpage {url}: {e}")

# Preview result
for i, doc in enumerate(docs[:5]):  # Limit preview to first 5
    print(f"\n📄 Document {i+1} — Source: {doc['metadata']['source']}")
    print(doc['text'][:300], "...")  # Show first 300 chars only

# Summary
print(f"\n✅ Loaded {len(docs)} total documents.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loaded DOCX: /content/drive/MyDrive/RAG2/input_docs/Report on IEEE DBCE Student Branch Meet.docx
✅ Loaded PDF: /content/drive/MyDrive/RAG2/input_docs/Resume.pdf

📄 Document 1 — Source: Report on IEEE DBCE Student Branch Meet.docx
Report on IEEE DBCE Student Branch Meet & Greet Date: 26th March 2025 Organized by: IEEE Student Branch, Don Bosco College of Engineering (IEEE DBCE SB) The IEEE DBCE Student Branch conducted a Meet & Greet session to introduce committee members, foster networking, and plan future activities. The ev ...

📄 Document 2 — Source: Resume.pdf
Ayden Xavier Alvito Joanes +91 9923577502 | joanesayden@gmail.com |   Ayden Joanes |   Ayden Joanes | Bengaluru, India Machine Learning Intern USP: A self-taught AI enthusiast and active swing trader driven to integrate deep learning with finance. I offer hands-on ML experience, creative thinking, 

In [None]:
# ✅ Required imports (if not already done)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from tqdm import tqdm


In [8]:
# Smart Chunking with Metadata Preservation + Logging

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)

chunked_docs = []
empty_docs = 0

for doc in tqdm(docs, desc="🔃 Chunking documents"):
    if not doc["text"].strip():
        empty_docs += 1
        continue  # skip empty docs

    try:
        splits = text_splitter.split_text(doc["text"])
        for i, chunk in enumerate(splits):
            chunked_docs.append(Document(
                page_content=chunk,
                metadata={**doc["metadata"], "chunk": i + 1}
            ))
    except Exception as e:
        print(f"❌ Error while chunking document {doc['metadata'].get('source', 'Unknown')}: {e}")

# Summary
print(f"\n✅ Chunked into {len(chunked_docs)} total pieces.")
if empty_docs:
    print(f"⚠️ Skipped {empty_docs} empty document(s).")

# Preview sample chunk
if chunked_docs:
    print("\n📄 Sample chunk:\n", chunked_docs[0].page_content)
    print("📎 Metadata:", chunked_docs[0].metadata)
else:
    print("⚠️ No chunks generated.")


🔃 Chunking documents: 100%|██████████| 3/3 [00:00<00:00, 2521.63it/s]


✅ Chunked into 22 total pieces.

📄 Sample chunk:
 Report on IEEE DBCE Student Branch Meet & Greet Date: 26th March 2025 Organized by: IEEE Student Branch, Don Bosco College of Engineering (IEEE DBCE SB) The IEEE DBCE Student Branch conducted a Meet & Greet session to introduce committee members, foster networking, and plan future activities. The event aimed to strengthen collaboration, encourage active participation, and align with IEEE s mission of advancing technology for the benefit of society
📎 Metadata: {'source': 'Report on IEEE DBCE Student Branch Meet.docx', 'type': 'docx', 'chunk': 1}





In [9]:
# Create embeddings for the chunks and store in FAISS vector DB

from time import time

# Model Selection (easy switch later)
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Swap to 'BAAI/bge-base-en-v1.5' or others for better performance

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

if not chunked_docs:
    raise ValueError("❌ No chunks found. Please run the chunking step first.")

print(f"\n🚀 Starting embedding with model: {model_name}")
start = time()

# Embed & create FAISS vector store
vector_store = LC_FAISS.from_documents(chunked_docs, embedding_model)

# Save FAISS index
save_path = "/content/drive/MyDrive/RAG2/vector_store4"
vector_store.save_local(save_path)

print(f"✅ FAISS index created and saved to: {save_path}")
print(f"⏱️ Time taken: {round(time() - start, 2)}s")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🚀 Starting embedding with model: sentence-transformers/all-MiniLM-L6-v2
✅ FAISS index created and saved to: /content/drive/MyDrive/RAG2/vector_store4
⏱️ Time taken: 0.75s


In [10]:
import sys
import subprocess

# Install accelerate if not already available
subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate"])

# Confirm it's now accessible in this kernel
import accelerate
print("✅ accelerate version in notebook kernel:", accelerate.__version__)


✅ accelerate version in notebook kernel: 1.8.1


In [11]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("💻 Using device:", device)


💻 Using device: cuda


In [12]:
!pip install bitsandbytes accelerate transformers --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Before running this block , please restart session


from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

model_id = "HuggingFaceH4/zephyr-7b-beta"
print("📦 Loading Zephyr-7B in 4-bit quantized mode...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_skip_modules=None
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)


📦 Loading Zephyr-7B in 4-bit quantized mode...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [2]:
import torch
import json
from datetime import datetime
from transformers import pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS as LC_FAISS

# Ensure correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("💻 Using device:", device)

# Load vector store from Google Drive
retriever = LC_FAISS.load_local(
    "/content/drive/MyDrive/RAG2/vector_store4",
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    allow_dangerous_deserialization=True
).as_retriever(search_kwargs={"k": 5})

# You already loaded model and tokenizer in Step 2
# Do NOT reload model/tokenizer here again!

# Generator pipeline using already-loaded model
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=False
)

# Ask a question
def ask_question(query, return_sources=False, log_file="qa_log.jsonl"):
    retrieved_docs = retriever.get_relevant_documents(query)
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])

    # Refined prompt
    system_prompt = (
        "You are a knowledgeable and trustworthy assistant who only uses the given context to answer."
        " If the answer is not in the context, say 'I don't know'. Avoid hallucinating facts."
        " Be concise, clear, and include relevant details if needed."
    )

    prompt = (
        f"<|system|>\n{system_prompt}</s>\n"
        f"<|user|>\nContext:\n{context}\n\nQuestion: {query}</s>\n"
        f"<|assistant|>"
    )

    if len(prompt) > 4000:
        prompt = prompt[-4000:]

    raw_output = generator(prompt)[0]['generated_text']
    answer = raw_output.split("<|assistant|>")[-1].strip()

    is_low_quality = len(answer.split()) < 10 or "I'm not sure" in answer
    sources = list({doc.metadata.get("source", "Unknown") for doc in retrieved_docs})

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "question": query,
        "answer": answer,
        "sources": sources
    }
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry) + "\n")

    if return_sources:
        return answer, sources, is_low_quality
    return answer

# Mode selection: Interactive or Batch
mode = input("💡 Choose mode — 'interactive' or 'batch': ").strip().lower()

if mode == "batch":
    question_file = "/content/drive/MyDrive/RAG2/Questions3.txt"
    output_file = "/content/drive/MyDrive/RAG2/batch_log4.jsonl"

    with open(question_file, "r", encoding="utf-8") as f:
        questions = [line.strip() for line in f if line.strip()]

    print(f"🧪 Running batch mode on {len(questions)} questions...")

    for i, q in enumerate(questions):
        print(f"\n🔹 Question {i+1}: {q}")
        try:
            answer, citations, low_quality = ask_question(q, return_sources=True, log_file=output_file)
            print(f"🧠 Answer: {answer}")
            if low_quality:
                print("⚠️ Low-quality flag triggered.")
            print(f"📎 Sources: {citations}")
        except Exception as e:
            print(f"❌ Error on question {i+1}: {e}")

    print("✅ Batch run complete.")

elif mode == "interactive":
    while True:
        user_query = input("\n🔍 Ask a question (or type 'exit' to quit): ")
        if user_query.lower() in ["exit", "quit"]:
            print("👋 Exiting the Q&A session...")
            break
        try:
            answer, citations, low_quality = ask_question(user_query, return_sources=True)
            print("\n" + "=" * 60)
            print(f"❓ Question: {user_query}")
            print(f"🧠 Answer: {answer}")
            if low_quality:
                print("⚠️ Note: This answer may be incomplete or low-confidence.")
            print(f"📎 Sources: {citations}")
            print("=" * 60)
        except Exception as e:
            print(f"❌ Error during answer generation: {e}")
else:
    print("⚠️ Invalid mode selected. Please type 'interactive' or 'batch'.")


💻 Using device: cuda


  HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


💡 Choose mode — 'interactive' or 'batch': batch
🧪 Running batch mode on 3 questions...

🔹 Question 1: What is the mission of IEEE?


  retrieved_docs = retriever.get_relevant_documents(query)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧠 Answer: I don't have access to the personal beliefs or intentions of an organization, but based on the given context, the mission of ieee is not explicitly stated. However, the organization's website (https://www.ieee.org/about/whatisieee.html) states that the mission of ieee is "to advance technology for the benefit of humanity, and to promote the scientific and professional knowledge and practice of electric and electronic engineering and allied disciplines."
📎 Sources: ['Report on IEEE DBCE Student Branch Meet.docx']

🔹 Question 2: List 3 major events hosted by the DBCE Student Branch.


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧠 Answer: Based on the context provided, the three major events hosted by the DBCE Student Branch mentioned in the report are:

1. Meet & Greet session: This event was organized to introduce committee members, foster networking, and plan future activities. It aimed to strengthen collaboration, encourage active participation, and align with IEEE's mission of advancing technology for the benefit of society.

2. Workshops, hackathons, technical talks, and professional development sessions: These events were discussed during the Meet & Greet session, and they are part of the branch's plans for organizing upcoming IEEE activities. They are aimed at enhancing technical and leadership skills, promoting active participation, and encouraging students to participate in IEEE conferences and competitions.

3. Outreach efforts: The discussion during the Meet & Greet session also focused on strengthening outreach efforts to promote IEEE within the college and engage more students in its initiatives.

In [3]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

CUDA available: False
GPU name: No GPU detected


In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

CUDA available: True
GPU name: NVIDIA GeForce RTX 3050 Laptop GPU
