IMPORT

In [1]:
import faiss
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder

import traceback


  from .autonotebook import tqdm as notebook_tqdm


EMBEDDING MODEL

In [2]:
# Sentence-embedding model shared by every get_embeddings call
model = SentenceTransformer('all-MiniLM-L6-v2')


def get_embeddings(chunks):
    """Encode *chunks* with the module-level ``model`` and return the result.

    A progress bar is displayed while the batches are encoded.
    """
    return model.encode(chunks, show_progress_bar=True)

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 409.74it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


PDF TEXT EXTRACTION

In [None]:
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter

import hashlib

def pdf_fingerprint(text):
    """Return a SHA-256 hex digest of *text* that ignores whitespace layout.

    Every run of whitespace is collapsed to a single space (leading and
    trailing whitespace is dropped) before hashing, so two extractions that
    differ only in spacing or line breaks produce the same fingerprint.
    """
    normalized = " ".join(text.split())
    digest = hashlib.sha256()
    digest.update(normalized.encode("utf-8"))
    return digest.hexdigest()


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Pages for which pdfplumber yields no text are skipped; each remaining
    page's text is followed by a newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            content = page.extract_text()
            if content:
                page_texts.append(content + "\n")
    return "".join(page_texts)

def chunk_text(text, chunk_size=400, chunk_overlap=50):
    """Split *text* with a RecursiveCharacterTextSplitter and return the pieces.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)


FAISS

In [None]:
# 🔹 VectorDB with Duplicate Removal

import faiss
import numpy as np
import hashlib

def chunk_hash(chunk):
    """Return the MD5 hex digest of *chunk* (UTF-8 encoded).

    Used as a cheap identity key for spotting repeated chunks, not for
    anything security-sensitive.
    """
    encoded = chunk.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()

def pdf_hash(text):
    """Return the MD5 hex digest of *text* (UTF-8 encoded, no normalization)."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


# Module-level set, presumably meant to remember PDF hashes across uploads.
# NOTE(review): nothing visible in this notebook reads or writes it — confirm
# whether it is still needed.
uploaded_pdf_hashes = set()

class VectorDB:
    """In-memory store pairing a FAISS flat L2 index with chunk records.

    Row ``i`` of ``self.index`` corresponds to ``self.chunks[i]``. Chunks
    whose text hash was already seen are never indexed a second time.
    """

    def __init__(self, embedding_dim):
        """Create an empty index for vectors of length *embedding_dim*."""
        self.embedding_dim = embedding_dim
        self.index = faiss.IndexFlatL2(embedding_dim)
        self.chunks = []           # Stores {"text": ..., "metadata": ...}
        self.chunk_hashes = set()  # Track duplicates

    def add_chunks(self, chunks, embeddings, metadata_list=None):
        """
        Add chunks to the FAISS index, ignoring duplicates.

        Args:
            chunks: sequence of chunk texts.
            embeddings: one embedding per chunk, in the same order.
            metadata_list: optional sequence of dicts, one per chunk. A
                missing "doc_id" key is filled in as "unknown".

        Raises:
            ValueError: if the three inputs do not have the same length.
                Checked up front so a mismatch cannot leave ``self.chunks``
                and the FAISS index out of sync.
        """
        if metadata_list is None:
            metadata_list = [{} for _ in chunks]

        if not (len(chunks) == len(embeddings) == len(metadata_list)):
            raise ValueError(
                "chunks, embeddings and metadata_list must have the same length "
                f"(got {len(chunks)}, {len(embeddings)}, {len(metadata_list)})"
            )

        new_embeddings = []

        for chunk, embedding, metadata in zip(chunks, embeddings, metadata_list):
            h = chunk_hash(chunk)
            if h in self.chunk_hashes:
                continue
            self.chunk_hashes.add(h)
            new_embeddings.append(embedding)
            self.chunks.append({
                "text": chunk,
                "metadata": {
                    **metadata,
                    "doc_id": metadata.get("doc_id", "unknown"),
                },
            })

        # Add only new embeddings to FAISS
        if new_embeddings:
            self.index.add(np.array(new_embeddings).astype('float32'))

        print(f"[VectorDB] Indexed {len(new_embeddings)} chunks (Total stored: {len(self.chunks)})")

    def retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the most similar chunks for a query embedding.

        At most ``top_k`` results are returned; fewer when the index holds
        fewer vectors.

        Returns:
            - List of chunks (with text + metadata)
            - Similarity scores (``1 - distance``), aligned with the chunks
        """
        if self.index.ntotal == 0 or top_k <= 0:
            return [], []

        # FAISS pads the result with label -1 when asked for more neighbours
        # than it stores; -1 would silently index the LAST chunk in a Python
        # list, so never request more than the index holds.
        k = min(int(top_k), self.index.ntotal)
        query = np.asarray([query_embedding], dtype='float32')
        D, I = self.index.search(query, k)

        # Defensive: drop any padded slot that still slipped through.
        valid = I[0] >= 0
        results = [self.chunks[i] for i in I[0][valid]]
        # IndexFlatL2 reports squared L2 distances, so this is a
        # pseudo-similarity rather than a true cosine.
        scores = 1 - D[0][valid]
        return results, scores


JUDGE LLM

In [None]:
# Cross-Encoder Judge (lightweight & fast)
# The checkpoint name is kept in a constant; the loaded model is the
# module-level object that llm_judge calls predict() on.
JUDGE_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
judge_model = CrossEncoder(JUDGE_MODEL)

def llm_judge(query, chunks, threshold=0.6, min_good_chunks=2):
    """
    Returns True if retrieved chunks are sufficient to answer the query.

    Args:
        query: the user question.
        chunks: retrieved chunks, either records of the form
            {"text": ...} or bare strings.
        threshold: minimum cross-encoder score for a chunk to count as good.
        min_good_chunks: number of good chunks required.

    Returns:
        False for an empty/None *chunks*; otherwise whether at least
        ``min_good_chunks`` (query, chunk) pairs score >= ``threshold``.
    """
    if not chunks:
        return False

    # Accept both chunk records and plain text, mirroring generate_answer;
    # indexing a str with "text" would raise TypeError.
    texts = [c["text"] if isinstance(c, dict) else c for c in chunks]
    pairs = [(query, text) for text in texts]
    scores = judge_model.predict(pairs)

    good_chunks = sum(1 for score in scores if score >= threshold)
    return good_chunks >= min_good_chunks


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [00:00<00:00, 424.99it/s, Materializing param=classifier.weight]                                    
BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


LOAD PDF

In [None]:
def load_pdfs(pdf_files, seen_pdf_hashes=None):
    """
    Extract, de-duplicate and chunk a batch of PDFs.

    Args:
        pdf_files: iterable of uploads. Each item is either a path
            (str / os.PathLike) or an object exposing the path as ``.name``.
        seen_pdf_hashes: optional set of content hashes from earlier calls;
            it is updated in place. A fresh set is created when omitted.

    Returns:
        (all_chunks, seen_pdf_hashes): the text chunks of every PDF that was
        new and had extractable text, and the updated hash set.
    """
    import os

    if seen_pdf_hashes is None:
        seen_pdf_hashes = set()

    all_chunks = []

    for file in pdf_files:
        # A plain path is used as-is; anything else is expected to carry the
        # path in .name (Path.name would be just the basename, hence the
        # explicit PathLike check).
        path = file if isinstance(file, (str, os.PathLike)) else file.name
        text = extract_text_from_pdf(path)

        # A PDF without extractable text would register the hash of "" and
        # make every later text-less PDF look like a duplicate of it.
        if not text.strip():
            print(f"Skipping PDF with no extractable text: {path}")
            continue

        h = pdf_hash(text)

        if h in seen_pdf_hashes:
            print(f"‚ö†Ô∏è Skipping duplicate PDF: {path}")
            continue

        seen_pdf_hashes.add(h)

        chunks = chunk_text(text)
        all_chunks.extend(chunks)

        print(f"‚úÖ {path} ‚Üí {len(chunks)} chunks")

    print(f"\nTOTAL NEW CHUNKS: {len(all_chunks)}")
    return all_chunks, seen_pdf_hashes


QUERY CHECKER

In [8]:
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [9]:
%pip install python-dotenv
from dotenv import load_dotenv
import os

load_dotenv()

Note: you may need to restart the kernel to use updated packages.


True

In [10]:
import os
from openai import OpenAI

# Routing thresholds compared with the best FAISS score in adaptive_router:
# below LOW_TH -> general LLM, above HIGH_TH -> PDF-based RAG,
# anything in between goes through the retriever + judge loop.
LOW_TH = 0.25
HIGH_TH = 0.45
# Number of chunks fetched initially; the judge loop requests multiples of it.
TOP_K = 5
# Upper bound on retrieval rounds in the judge loop.
MAX_RETRIEVER_ATTEMPTS = 3

# API credentials and endpoint come from environment variables.
api_key = os.getenv("OPENAI_API_KEY")
base_url= os.getenv("BASE_URL")
from openai import OpenAI
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

def llm(prompt):
    """Send *prompt* as a single user message and return the reply text.

    Uses the module-level ``client`` with model "gpt-4.1-nano" at
    temperature 0.2. Always returns a string: a reply without text content
    is returned as "" so callers can safely call ``.strip()`` on the result.
    """
    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    # message.content is optional in the API response (e.g. refusals).
    content = response.choices[0].message.content
    return content if content is not None else ""



def generate_answer(query, retrieved_chunks):
    """
    Answer *query* with the LLM, restricted to the retrieved PDF chunks.

    Each item of *retrieved_chunks* may be a {"text": ...} record or a bare
    string; the texts are joined with blank lines to form the context.
    Returns the LLM reply with surrounding whitespace removed.
    """
    passages = []
    for item in retrieved_chunks:
        passages.append(item["text"] if isinstance(item, dict) else item)
    context = "\n\n".join(passages)

    prompt = f"""
    You are an academic assistant.
    Answer ONLY using the provided context.
    If the answer is not found, say "I don't know".

    CONTEXT:
    {context}

    QUESTION:
    {query}

    ANSWER:
    """

    return llm(prompt).strip()



# -----------------------------
# Cross-Encoder Judge for evaluation
# -----------------------------
# NOTE(review): this rebinds judge_model with the same checkpoint name that
# the "JUDGE LLM" cell already loads; it looks redundant unless this cell is
# meant to run without that one — confirm.
from sentence_transformers import CrossEncoder

judge_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def hallucination_check(answer, retrieved_chunks):
    """
    Ask the LLM whether *answer* is fully supported by *retrieved_chunks*.

    Each chunk may be a {"text": ...} record or a bare string.

    True  -> hallucinated
    False -> grounded

    Only a reply whose first word is YES (ignoring case and surrounding
    punctuation) counts as grounded. Anything else, including replies such
    as "NO." or an empty reply, is treated as hallucinated, so an
    unparseable verdict never passes as grounded.
    """
    chunk_texts = [
        chunk["text"] if isinstance(chunk, dict) else chunk
        for chunk in retrieved_chunks
    ]
    context = "\n".join(f"- {text}" for text in chunk_texts)

    prompt = f"""
You are a strict fact checker.

CONTEXT:
{context}

ANSWER:
{answer}

Question:
Is the answer fully supported by the context?
Reply with only YES or NO.
"""

    verdict = llm(prompt).strip().upper()
    print("Hallucination verdict:", verdict)

    # Models often add punctuation or a trailing explanation ("NO.",
    # "Yes, ..."); judge on the first word only.
    first_word = verdict.split()[0].strip(".,!?:;\"'*") if verdict else ""
    return first_word != "YES"



    

def correctness_check(answer, query):
    """
    Uses LLM to judge if the answer correctly answers the query.

    Returns:
    - True  -> LOW correctness (needs regeneration)
    - False -> GOOD correctness

    Only a reply whose first word is YES (ignoring case and surrounding
    punctuation) counts as good, so "Yes." is accepted while an empty or
    unparseable reply is treated as low correctness.
    """

    prompt = f"""
    You are a strict evaluator.

    Question:
    {query}

    Answer:
    {answer}

    Is the answer correct and complete?

    Respond with only one word:
    YES or NO
    """

    response = llm(prompt).strip().upper()

    print("[Correctness Judge - LLM]:", response)

    # An exact comparison would flag "YES." as incorrect and trigger a
    # needless regeneration; judge on the first word only.
    first_word = response.split()[0].strip(".,!?:;\"'*") if response else ""
    return first_word != "YES"







def adaptive_router(pdf_files, query):
    """
    Route *query* to PDF-based RAG or to the general LLM.

    Pipeline: load and chunk the PDFs, embed chunks and query, index the
    chunks in a VectorDB, then route on the best similarity score:
      - below LOW_TH  -> general LLM
      - above HIGH_TH -> PDF-based RAG with hallucination/correctness
                         re-checks (up to 3 regenerations)
      - otherwise     -> retriever + cross-encoder judge loop decides

    Args:
        pdf_files: uploaded PDFs as accepted by load_pdfs.
        query: the user question.

    Returns:
        A dict for the Gradio JSON output. On success it has the keys
        "Routing Decision", "Decision Stage", "Max Cosine Similarity",
        "Retrieved Chunks", "Answer", "Hallucinated", "Low Correctness".
        On failure it is {"Error": <message>}.
    """
    try:
        if not pdf_files:
            return {"Error": "No PDF uploaded"}

        # 1. Load PDFs. load_pdfs returns (chunks, seen_hashes); only the
        #    chunks are needed here.
        all_chunks, _ = load_pdfs(pdf_files)
        if not all_chunks:
            return {"Error": "No text extracted"}

        # 2. Embeddings
        chunk_embeddings = get_embeddings(all_chunks)
        query_embedding = get_embeddings([query])[0]

        # 3. FAISS indexing
        vector_db = VectorDB(embedding_dim=len(chunk_embeddings[0]))
        vector_db.add_chunks(all_chunks, chunk_embeddings)
        # Duplicate chunks are dropped on insert, so the index can hold fewer
        # vectors than len(all_chunks). Asking FAISS for more than it stores
        # yields padded (-1) results, so every top_k below is capped.
        indexed_total = vector_db.index.ntotal

        retrieved_chunks, scores = vector_db.retrieve(
            query_embedding, top_k=min(TOP_K, indexed_total)
        )
        max_sim = float(max(scores))

        # 4. Routing logic. The quality flags default to False so that
        #    every branch reaches the final return with them defined.
        stage = "FAISS Similarity"
        retriever_attempts = 0
        answer = ""
        is_hallucinated = False
        is_low_correctness = False

        if max_sim < LOW_TH:
            route = "üß† General LLM (Low similarity)"
            answer = llm(query)

        elif max_sim > HIGH_TH:
            route = "üìÑ PDF-based RAG (High similarity)"
            retrieved_chunks, _ = vector_db.retrieve(
                query_embedding, top_k=indexed_total
            )
            retriever_attempts = len(retrieved_chunks)
            answer = generate_answer(query, retrieved_chunks)
            is_hallucinated = hallucination_check(answer, retrieved_chunks)
            is_low_correctness = correctness_check(answer, query)

            # Regenerate while either check fails, up to max_attempts times.
            max_attempts = 3
            attempt = 0
            while (is_hallucinated or is_low_correctness) and attempt < max_attempts:
                attempt += 1

                # If hallucinated, try to widen the context (capped at the
                # number of indexed chunks).
                if is_hallucinated:
                    wider_k = min(len(retrieved_chunks) + TOP_K, indexed_total)
                    retrieved_chunks, _ = vector_db.retrieve(
                        query_embedding, top_k=wider_k
                    )

                answer = generate_answer(query, retrieved_chunks)
                is_hallucinated = hallucination_check(answer, retrieved_chunks)
                is_low_correctness = correctness_check(answer, query)

        else:
            # 5. Retriever + Judge
            stage = "Retriever + Judge"
            judge_decision = False
            final_chunks = []

            for attempt in range(MAX_RETRIEVER_ATTEMPTS):
                k = min(TOP_K * (attempt + 1), indexed_total)
                candidate_chunks, _ = vector_db.retrieve(query_embedding, top_k=k)

                # Pass the chunk records themselves: the judge reads the
                # "text" field of each one.
                if llm_judge(query, candidate_chunks):
                    judge_decision = True
                    final_chunks = candidate_chunks
                    retriever_attempts = len(candidate_chunks)
                    break

                # Everything indexed has been judged; a further round would
                # only re-score the same chunks.
                if k >= indexed_total:
                    break

            if judge_decision:
                route = "üìÑ PDF-based RAG (Judge confirmed)"
                answer = generate_answer(query, final_chunks)
                is_hallucinated = hallucination_check(answer, final_chunks)
                is_low_correctness = correctness_check(answer, query)
            else:
                route = "üß† General LLM (Judge rejected)"
                answer = llm(query)

        # This dict is what the Gradio JSON component displays.
        return {
            "Routing Decision": route,
            "Decision Stage": stage,
            "Max Cosine Similarity": round(max_sim, 3),
            "Retrieved Chunks": retriever_attempts,
            "Answer": answer,
            "Hallucinated": is_hallucinated,
            "Low Correctness": is_low_correctness
        }

    except Exception as e:
        # Top-level UI boundary: keep returning an "Error" payload, but print
        # the stack trace so the failure can be diagnosed.
        traceback.print_exc()
        return {
            "Error": str(e)
        }

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [00:00<00:00, 480.37it/s, Materializing param=classifier.weight]                                    
BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [11]:
# Manual smoke test for hallucination_check: one answer that the context
# supports and one that contradicts it.
fake_chunks = [
    {"text": "Python is a programming language created by Guido van Rossum."},
    {"text": "Python was first released in 1991."}
]

answer_ok = "Python was created by Guido van Rossum."
answer_bad = "Python was created by Elon Musk."

# Expected: False for the supported answer, True for the contradicting one.
print("Hallucinated (OK):", hallucination_check(answer_ok, fake_chunks))
print("Hallucinated (BAD):", hallucination_check(answer_bad, fake_chunks))


Hallucination verdict: YES
Hallucinated (OK): False
Hallucination verdict: NO
Hallucinated (BAD): True


In [None]:
# Manual smoke test for correctness_check: one answer that addresses the
# question and one that is on-topic but does not answer it.
query = "Who created Python?"

answer_good = "Python was created by Guido van Rossum."
answer_bad = "Python is a popular programming language."

# Expected: False (good correctness) for the first, True (low) for the second.
print("Low correctness (GOOD):", correctness_check(answer_good, query))
print("Low correctness (BAD):", correctness_check(answer_bad, query))


[Correctness Judge - LLM]: YES
Low correctness (GOOD): False
[Correctness Judge - LLM]: NO
Low correctness (BAD): True


GRADIO

In [None]:
import gradio as gr
# Web UI: a multi-file PDF upload plus a question box are passed to
# adaptive_router, and the dict it returns is rendered as JSON.
gr.Interface(
    fn=adaptive_router,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
        gr.Textbox(label="Ask a question")
    ],
    outputs=gr.JSON(label="Routing Result"),
    title="Adaptive RAG (Notebook Mode)",
    description="Cosine similarity + Retriever Grader + Judge"
).launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




C:\Users\showbiya\AppData\Local\Temp\gradio\4489c96c65070de787d873107aece9179117ef1b02e0adf9472b40e004c6f5e3\22IS601-LM-1.1.pdf ‚Üí 42 chunks

TOTAL CHUNKS: 42


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  2.02it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 36.33it/s]


[VectorDB] Indexed 42 chunks (Total stored: 42)
Hallucination verdict: YES
[Correctness Judge - LLM]: NO
C:\Users\showbiya\AppData\Local\Temp\gradio\4489c96c65070de787d873107aece9179117ef1b02e0adf9472b40e004c6f5e3\22IS601-LM-1.1.pdf ‚Üí 42 chunks

TOTAL CHUNKS: 42


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  2.54it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10.72it/s]


[VectorDB] Indexed 42 chunks (Total stored: 42)
C:\Users\showbiya\AppData\Local\Temp\gradio\4489c96c65070de787d873107aece9179117ef1b02e0adf9472b40e004c6f5e3\22IS601-LM-1.1.pdf ‚Üí 42 chunks

TOTAL CHUNKS: 42


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.17s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.92it/s]


[VectorDB] Indexed 42 chunks (Total stored: 42)


```
PDF → Chunks
     → Embeddings
     → FAISS Index
Query → Embedding
      → FAISS search
      → Threshold routing
      → (Optional) Retriever + Judge loop
      → Final decision

```

# 🔹 Adaptive RAG Flow (Query Checker + Retriever Grader)

## **1️⃣ Query Checker (Router)**

**Goal:** Determine if the user query is related to the uploaded PDFs.  

**Flow:**

1. User uploads PDFs → **extract text** → **split into chunks** → **generate embeddings** → **store in FAISS**.
2. User enters query → **convert query to embedding**.
3. **FAISS search:** Compare query embedding against all PDF chunks.
4. **Decision based on similarity thresholds:**
   - **High similarity** → PDF-based RAG  
   - **Low similarity** → General LLM  
   - **Ambiguous similarity** → **Cross-Encoder Judge** checks if the query is actually answerable from PDFs.

**Summary:**  
- Uses **FAISS** for similarity search.  
- Uses **Judge** only for ambiguous cases.  

---

## **2️⃣ Retriever Checker (Grader Loop)**

**Goal:** Ensure that the retrieved PDF chunks are sufficient to answer the query.  

**Flow:**

1. Take **top-k chunks** from FAISS (from query checker) → **candidate chunks**.
2. **Judge each batch** to check if they are actually relevant to the query.
3. If not enough relevant chunks → retrieve **next batch of top-k chunks** from FAISS (repeat up to 3 attempts max).
4. Decide:
   - **Judge confirmed** → use these chunks for PDF-based RAG  
   - **Judge rejected** → fallback to General LLM

**Summary:**  
- Uses **FAISS** to retrieve candidate chunks.  
- Uses **Judge** to validate relevance and sufficiency.  


Analogy

Think of it like a library system:
FAISS = just the shelf positions of the books (embeddings and indices).
VectorDB = shelf positions + actual books + catalog info (it keeps the text and metadata).

When you search:

- Query → embedding → FAISS → returns indices of nearest embeddings.
- VectorDB → uses these indices → returns actual text chunks so you can pass them to the LLM judge.

---
