# 📝 Basic RAG Demo

This notebook demonstrates:
- Loading sample documents
- Building embeddings with SentenceTransformers
- Storing & retrieving with FAISS
- Comparing plain LLM vs RAG responses


In [None]:
!pip install sentence-transformers faiss-cpu openai


In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Read each sample document; the context manager closes every file handle
# as soon as its contents are loaded.
doc_paths = [
    "data/documents/invoice1.txt",
    "data/documents/contract1.txt",
    "data/documents/kyc1.txt",
]
docs = []
for path in doc_paths:
    with open(path) as fh:
        docs.append(fh.read())

# Embed the documents with the sentence-transformer model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(docs)

# Inner-product index sized to the embedding dimension (second axis of `embeddings`);
# vectors are cast to float32 before being added.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings.astype("float32"))


In [None]:
# Embed the question and fetch its two best-matching documents from the index.
query = "What does the contract say about GDPR?"
q_emb = model.encode([query])
query_vec = q_emb.astype("float32")

scores, idx = index.search(query_vec, k=2)
# Only one query was searched, so its hits are in the first row of `idx`.
for j in idx[0]:
    preview = docs[j][:200]
    print(f"Doc {j} → {preview}")


# ✂️ Chunking Strategies for RAG
We test:
- Fixed size
- Overlapping
- Semantic (sentence-based)


In [None]:
import re

# Load the contract text used by the chunking examples below; the context
# manager closes the file handle once the text is read.
with open("data/documents/contract1.txt") as fh:
    text = fh.read()

def fixed_chunk(text, size=50):
    """Split ``text`` into consecutive, non-overlapping chunks of at most ``size`` words.

    Words are obtained by whitespace splitting and re-joined with single
    spaces. Returns a list of strings; the last chunk may be shorter.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), size):
        window = tokens[start:start + size]
        pieces.append(" ".join(window))
    return pieces

def overlap_chunk(text, size=50, overlap=10):
    """Split ``text`` into word chunks of at most ``size`` words that share ``overlap`` words.

    Each chunk starts ``size - overlap`` words after the previous one, so the
    last ``overlap`` words of a chunk are repeated at the start of the next.

    Args:
        text: String to chunk; words are obtained by whitespace splitting.
        size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < size``).

    Returns:
        List of chunk strings (words re-joined with single spaces); empty
        when ``text`` contains no words.

    Raises:
        ValueError: If ``size``/``overlap`` would give a non-positive step.
    """
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError("overlap_chunk requires size > 0 and 0 <= overlap < size")

    words = text.split()
    if not words:
        return []

    step = size - overlap
    # Stop `overlap` words before the end: a chunk starting any later would
    # only repeat words the previous chunk already contains. The floor of 1
    # keeps texts shorter than `overlap` from being dropped entirely.
    last_start = max(len(words) - overlap, 1)
    return [" ".join(words[i:i + size]) for i in range(0, last_start, step)]

def semantic_chunk(text, max_len=200):
    """Group whole sentences of ``text`` into chunks of roughly ``max_len`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace, so
    each sentence keeps its terminating punctuation. Sentences are appended to
    the current chunk (joined by one space) until adding the next one would
    exceed ``max_len``; that sentence then starts a new chunk.

    Args:
        text: String to chunk.
        max_len: Target maximum chunk length in characters. A single sentence
            longer than this is kept whole as its own chunk.

    Returns:
        List of non-empty, whitespace-stripped chunk strings.
    """
    # Look-behind split keeps the punctuation attached to its sentence.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text.strip())]
    chunks, cur = [], ""
    for s in sentences:
        if not s:
            continue
        # Only flush a non-empty chunk, so an over-long first sentence does
        # not produce an empty leading chunk. The +1 counts the joining space.
        if cur and len(cur) + 1 + len(s) > max_len:
            chunks.append(cur)
            cur = s
        else:
            cur = f"{cur} {s}" if cur else s
    if cur:
        chunks.append(cur)
    return chunks

# Preview the first two chunks produced by the fixed-size strategy (40 words per chunk).
fixed_preview = fixed_chunk(text, 40)[:2]
print("Fixed:", fixed_preview)

# Preview the first two chunks produced by the sentence-based strategy (default max_len).
semantic_preview = semantic_chunk(text)[:2]
print("Semantic:", semantic_preview)


# 🔍 FAISS Retrieval Demo
Explore FAISS similarity search:
- Index docs
- Run query
- Show Recall@k


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Read the three sample documents, closing each file handle right after reading.
doc_names = ["invoice1.txt", "contract1.txt", "kyc1.txt"]
docs = []
for name in doc_names:
    with open(f"data/documents/{name}") as fh:
        docs.append(fh.read())

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(docs)

# L2 index sized to the embedding dimension; vectors are cast to float32 before being added.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings.astype("float32"))

# Embed the question and retrieve its 2 nearest documents.
query = "Which document talks about payment terms?"
q_emb = model.encode([query])
scores, idx = index.search(q_emb.astype("float32"), 2)

# idx[0] holds the hit positions for the single query; show a 120-character preview of each.
print("Top results:", [docs[i][:120] for i in idx[0]])


# 🤖 Multi-Agent RAG Demo
We simulate:
- Document Agent → parses fields
- Regulation Agent → retrieves matching rules
- Governance Agent → validates & explains


In [None]:
class DocumentAgent:
    """Agent that derives simple fields from a document's text."""

    def extract(self, text):
        """Return a dict with ``has_gdpr`` (whether "GDPR" occurs in ``text``) and ``length`` (``len(text)``)."""
        mentions_gdpr = "GDPR" in text
        text_len = len(text)
        return dict(has_gdpr=mentions_gdpr, length=text_len)

class RegulationAgent:
    """Agent that returns the rule text matching a query."""

    def fetch_rules(self, query):
        """Return the hard-coded GDPR rule when "GDPR" occurs in ``query``, else "No match."."""
        matched = "GDPR" in query
        return "GDPR requires explicit consent for data transfers." if matched else "No match."

class GovernanceAgent:
    """Agent that combines extracted document fields with retrieved rules into a verdict string."""

    def validate(self, doc, rules):
        """Return an explanation embedding ``rules`` when ``doc["has_gdpr"]`` is truthy, else a no-match message."""
        # Guard clause: without a GDPR mention there is nothing to explain.
        if not doc["has_gdpr"]:
            return "No regulatory match."
        return f"Doc mentions GDPR → {rules}"

# Read the contract; the context manager closes the file handle once the text is loaded.
with open("data/documents/contract1.txt") as fh:
    doc_text = fh.read()

doc_agent, reg_agent, gov_agent = DocumentAgent(), RegulationAgent(), GovernanceAgent()

# Pipeline: extract document fields → fetch rules for the query → validate and explain.
doc_info = doc_agent.extract(doc_text)
rules = reg_agent.fetch_rules("GDPR check")
result = gov_agent.validate(doc_info, rules)
print(result)


# 🚀 Advanced RAG Patterns
We try:
- Multi-hop retrieval
- Multimodal (text + image placeholders)
- Adaptive retrieval


In [None]:
# Step 1: Refine query
# NOTE: the refinement is hard-coded; only `refined` is embedded below, `query` is kept for reference.
query = "Does the invoice mention deadlines for payment?"
refined = "What is the payment due date in the invoice?"

# Step 2: Retrieve
from sentence_transformers import SentenceTransformer
import faiss

# Read the invoice; the context manager closes the file handle once the text is loaded.
with open("data/documents/invoice1.txt") as fh:
    docs = [fh.read()]

model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(docs)
# Inner-product index sized to the embedding dimension; vectors are cast to float32 before being added.
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb.astype("float32"))

q_emb = model.encode([refined])
_, idx = index.search(q_emb.astype("float32"), 1)
# idx[0][0] is the position of the single best hit for the single query.
print("Multi-hop Answer:", docs[idx[0][0]])


👉 Multimodal RAG requires combining **text embeddings** + **image embeddings** (e.g., CLIP).  
For now, this notebook sets up placeholders for extending text RAG into multimodal pipelines.
