In [None]:
# Install required packages (use FAISS for vector store)
%pip install -q faiss-cpu langgraph langchain langchain-openai python-dotenv
%pip install -q langchain-chroma pypdf python-docx
%pip install -q langchain-community python-dotenv

In [None]:
# Imports
import os
from typing import Literal

from dotenv import load_dotenv
from IPython.display import Image, display
from langchain_community.document_loaders import PyPDFLoader
# FAISS lives in langchain_community; the legacy `langchain.vectorstores` path is
# deprecated and is not available in current `langchain` releases.
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode

print("✅ All imports successful")

In [None]:
# Load API key
# load_dotenv() runs first so that a key kept in a local .env file is visible
# to the environment lookup below.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Stop here with a clear message rather than failing later inside an API call.
if openai_api_key is None or openai_api_key == "":
    raise ValueError("OPENAI_API_KEY not found! Please set it in your .env file.")

print("✅ API key loaded")

In [None]:
# Initialize LLM
# Chat model used by the agent; the key comes from the API-key cell above.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5, api_key=openai_api_key)

print(f"✅ LLM initialized: {llm.model_name}")

In [None]:
# Load all PDFs, DOCX and TXT files from the Tax_Project folder
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
import os

# python-docx is optional: it is only needed when the folder contains .docx files.
# Only a failed import means "not available"; any other error should surface
# instead of being silently swallowed.
try:
    from docx import Document as DocxReader
    _HAS_DOCX = True
except ImportError:
    _HAS_DOCX = False

folder = 'Tax_Project'
if not os.path.isdir(folder):
    raise FileNotFoundError(f"Folder not found: {folder}. Update path accordingly.")

# Every loaded page/file ends up in this list as one Document whose
# metadata['source'] is the originating file name.
documents = []

# sorted() keeps the load order stable between runs.
for fname in sorted(os.listdir(folder)):
    fpath = os.path.join(folder, fname)
    if os.path.isdir(fpath):
        # Sub-folders are not traversed.
        continue
    lower = fname.lower()
    try:
        if lower.endswith('.pdf'):
            loader = PyPDFLoader(fpath)
            # load() returns a list of Document objects (pages)
            pages = loader.load()
            for p in pages:
                # Keep the loader's metadata, but make 'source' the bare file name
                # so a chunk can be traced back to its file.
                p.metadata = {**getattr(p, 'metadata', {}), 'source': fname}
                documents.append(p)
        elif lower.endswith('.txt'):
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.read()
            documents.append(Document(page_content=text, metadata={'source': fname}))
        elif lower.endswith('.docx'):
            if not _HAS_DOCX:
                raise ImportError('python-docx is not installed. Run: python -m pip install python-docx')
            doc = DocxReader(fpath)
            # Only paragraph text is extracted from the .docx file.
            text = '\n'.join(p.text for p in doc.paragraphs)
            documents.append(Document(page_content=text, metadata={'source': fname}))
        else:
            # skip other file types
            continue
    except Exception as e:
        # Best effort per file: report the failure and carry on with the rest.
        print(f"⚠️ Failed to load {fname}: {e}")

# Nothing to index means every later cell would fail with a less helpful error,
# so stop here with an explicit message.
if not documents:
    raise ValueError(f"No .pdf, .txt or .docx documents could be loaded from {folder}.")

print(f"✅ Loaded {len(documents)} documents from {folder}")

In [None]:
# Create text splitter (Module 2 knowledge!)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # Characters per chunk
    chunk_overlap=100     # Overlap to preserve context
)

# Split every loaded document. `documents` is the complete list built by the
# loading cell; `pages` only holds the pages of the last PDF that was read
# (and does not exist at all when the folder has no PDF).
doc_splits = text_splitter.split_documents(documents)

# Guard the preview below (and the index build later) against an empty result.
if not doc_splits:
    raise ValueError("Splitting produced no chunks - the loaded documents contain no text.")

print(f"✅ Created {len(doc_splits)} chunks")
print(f"\nSample chunk:")
print(f"{doc_splits[0].page_content[:200]}...")

In [None]:
# Initialize embeddings (using OpenAI)
# Embedding model used to vectorise the chunks; it uses the same API key as the chat model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)

print("✅ Embeddings model initialized")

In [None]:
# Create FAISS vector store
# Embed every chunk and build the FAISS index from the result.
vectorstore = FAISS.from_documents(doc_splits, embeddings)

# Write the index to a local folder.
faiss_path = "./faiss_index_agentic_rag"
vectorstore.save_local(faiss_path)

print(f" FAISS vector store created with {len(doc_splits)} chunks")
print(f"   Persisted to: {faiss_path}")

In [None]:
def _source_label(metadata: dict) -> str:
    """Build the "filename (page N)" label used to cite one retrieved chunk.

    Args:
        metadata: The chunk's metadata dict; 'source' and 'page' are read if present.

    Returns:
        "<source> (page N)" when an integer page is recorded, otherwise just "<source>".
    """
    source = metadata.get("source", "unknown source")
    page = metadata.get("page")
    # PyPDFLoader records 'page' as a zero-based index, so it is shifted to the
    # 1-based number a reader expects. Chunks from .txt/.docx files carry no page.
    if isinstance(page, int):
        return f"{source} (page {page + 1})"
    return str(source)


# NOTE: @tool exposes the docstring below to the model as the tool description,
# so its wording is part of the agent's behaviour and is deliberately kept as-is.
@tool
def retrieve_documents(query: str) -> str:
    """
    Search for relevant documents in the knowledge base.

    Use this tool when you need information from the document collection
    to answer the user's question. Do NOT use this for:
    - General knowledge questions
    - Greetings or small talk
    - Simple calculations

    Args:
        query: The search query describing what information is needed

    Returns:
        Relevant document excerpts that can help answer the question
    """
    # Use MMR (Maximum Marginal Relevance) for diverse results
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 5, "fetch_k": 10}
    )

    # Retrieve documents
    results = retriever.invoke(query)

    if not results:
        return "No relevant documents found."

    # Each excerpt is followed by its source so the model can cite it; without
    # this line the file name and page never reach the model.
    formatted = "\n\n---\n\n".join(
        f"Document {i}:\n{doc.page_content}\n— Source: {_source_label(doc.metadata or {})}"
        for i, doc in enumerate(results, start=1)
    )

    return formatted

print("✅ Retrieval tool created")

In [None]:
# (Disabled) Tax calculator tool - simple progressive example. This whole cell is commented out, so `tax_calculator` is not defined.
# @tool
# def tax_calculator(income: float, filing_status: str = 'single', year: int = 2025, deductions: float = 0.0) -> str:
#     """Calculate estimated tax using a simple progressive schedule."""
#     try:
#         income = float(income)
#         deductions = float(deductions)
#     except Exception:
#         return 'Error: income and deductions must be numeric.'

#     taxable = max(0.0, income - deductions)

#     # Example brackets (single filer approximation) - replace with jurisdiction-specific rules as needed
#     brackets = [
#         (0.10, 11000),
#         (0.12, 44725),
#         (0.22, 95375),
#         (0.24, 182100),
#         (0.32, 231250),
#         (0.35, 578125),
#         (0.37, float('inf'))
#     ]

#     prev = 0.0
#     tax = 0.0
#     breakdown = []
#     for rate, upper in brackets:
#         upper = float(upper)
#         if taxable > prev:
#             amount = min(taxable, upper) - prev
#             segment_tax = amount * rate
#             tax += segment_tax
#             breakdown.append(f"{amount:.2f} @ {int(rate*100)}% = {segment_tax:.2f}")
#         prev = upper
#         if prev >= taxable:
#             break

#     effective_rate = (tax / income) if income > 0 else 0.0
#     result = {
#         'income': round(income,2),
#         'deductions': round(deductions,2),
#         'taxable_income': round(taxable,2),
#         'tax': round(tax,2),
#         'effective_rate': round(effective_rate,4),
#         'breakdown': breakdown
#     }
#     import json
#     return json.dumps(result)

# print("✅ Tax calculator tool created")

In [None]:
# Instruction text for the agent: when to call the retrieval tool and how to cite.
SYSTEM_PROMPT_TEXT = """You are a helpful assistant with access to a document retrieval tool. Prioritize documents in the Tax_Project folder for answers and cite them.

RETRIEVAL DECISION RULES:

DO NOT retrieve for:
- Greetings, capability questions (e.g. "What can you help with?"), simple math or general knowledge, or casual conversation.

DO retrieve for:
- Questions asking for specific information that would be in documents.
- Requests for facts, definitions, or explanations about specialized topics (tax policy, statutes, guidance).
- Any question where citing sources would improve the answer.

ADDITIONAL RULES:
- Jurisdiction: ask the user if not specified; prefer documents applicable to the stated jurisdiction.
- Recency: prefer documents published after 2015 unless the user requests otherwise.
- Retrieval limits: retrieve at most 5 documents and include up to 300 characters per excerpt.
- Citation format: append " — Source: filename (page N)" to any excerpt or claim derived from a document.
- Conflicts: if documents disagree, list each source and explicitly state which you prefer and why.
- Clarify: if the query is ambiguous, ask one clarifying question before retrieving.
- Privacy: do not reveal private or sensitive content from documents unless the user explicitly permits it.

When you retrieve documents, cite them in your answer. If documents do not contain the answer, say so.
"""

# Wrapped as a SystemMessage so it can be placed in front of the conversation messages.
system_prompt = SystemMessage(content=SYSTEM_PROMPT_TEXT)

print("✅ System prompt configured")

In [None]:
# Bind tools to LLM
# Only tools that are actually defined may be listed here. `tax_calculator` lives in
# a cell that is entirely commented out, so referencing it raised NameError on a
# fresh Restart & Run All. Add it back to this list if that cell is re-enabled.
tools = [retrieve_documents]
llm_with_tools = llm.bind_tools(tools)

def assistant(state: MessagesState) -> dict:
    """Graph node that runs the tool-enabled LLM on the conversation so far.

    The system prompt is placed in front of the messages held in ``state``; the
    model's reply is returned under the 'messages' key.
    """
    conversation = [system_prompt, *state['messages']]
    return {'messages': [llm_with_tools.invoke(conversation)]}

def should_continue(state: MessagesState) -> Literal['tools', '__end__']:
    """Pick the next step: 'tools' if the newest message has tool calls, else '__end__'."""
    newest = state['messages'][-1]
    return 'tools' if newest.tool_calls else '__end__'

print("✅ Agent nodes defined")

In [None]:
# Build graph
builder = StateGraph(MessagesState)

# Two nodes: the LLM step and the executor for whatever tools it requests.
builder.add_node('assistant', assistant)
builder.add_node('tools', ToolNode(tools))

# Wiring: start at the assistant; should_continue then chooses between running
# tools and finishing; tool results always go back to the assistant.
builder.add_edge(START, 'assistant')
builder.add_conditional_edges('assistant', should_continue, {'tools': 'tools', '__end__': END})
builder.add_edge('tools', 'assistant')

# Compile with an in-memory checkpointer.
memory = MemorySaver()
agent = builder.compile(checkpointer=memory)

print("✅ Agentic RAG system compiled")

In [None]:
# Visualize the agentic RAG graph
# If rendering fails for any reason, print a text description of the flow instead.
try:
    graph_png = agent.get_graph().draw_mermaid_png()
    display(Image(graph_png))
except Exception as err:
    print(f"Could not display graph: {err}")
    print("Graph: START → assistant → [if tool_call] → tools → assistant → END")