Setup

In [4]:
# If running locally/Colab: install dependencies
%pip install -U langchain langchain-community chromadb sentence-transformers scikit-learn pypdf python-dotenv openai tiktoken fastapi uvicorn pydantic supabase


Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.17-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting openai
  Downloading openai-1.99.9-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken
  Downloading tiktoken-0.11.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting fastapi
  Using cached fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn
  Using cached uvicorn-0.35.0-py

Imports & Config

In [5]:
import os, uuid, textwrap
from pathlib import Path
from typing import List, Dict

from dotenv import load_dotenv
load_dotenv()

# LangChain community loaders & vector store
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Embeddings
from sentence_transformers import SentenceTransformer

# ML (risk classifier)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Optional LLM (OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
HAS_OPENAI = bool(OPENAI_API_KEY)

if HAS_OPENAI:
    try:
        from openai import OpenAI
        openai_client = OpenAI()
    except Exception as e:
        print("OpenAI client not available:", e)
        HAS_OPENAI = False

DATA_DIR = Path("docs")
DB_DIR = Path("chroma_db")
DATA_DIR.mkdir(exist_ok=True)
DB_DIR.mkdir(exist_ok=True)

print("OpenAI enabled:", HAS_OPENAI)


  from .autonotebook import tqdm as notebook_tqdm


OpenAI enabled: False


Sample Docs

In [6]:
SAMPLE_TXT = """
AML Policy (Sample):
Cash deposits equal to or above $10,000 require enhanced due diligence (EDD).
EDD includes source-of-funds verification and a customer declaration.
Suspicious activity must be reported per SAR procedures.

GDPR (Sample):
Personal data must be processed lawfully, fairly, and transparently (Article 5).
Special categories of personal data require explicit consent unless another legal basis applies (Article 9).
Data subjects have the right to access and rectify their data.
"""

sample_path = DATA_DIR / "sample_compliance.txt"
if not sample_path.exists():
    sample_path.write_text(SAMPLE_TXT)
    print("Created sample doc:", sample_path)
else:
    print("Sample already exists:", sample_path)


Created sample doc: docs/sample_compliance.txt


Ingest & Chunk

In [7]:
def load_docs(data_dir: Path) -> List[Dict]:
    docs = []
    # TXT
    for p in data_dir.glob("*.txt"):
        docs.append({"source": str(p), "text": Path(p).read_text(errors="ignore")})
    # PDF
    for p in data_dir.glob("*.pdf"):
        try:
            pages = PyPDFLoader(str(p)).load()
            combined = "\n".join(pg.page_content for pg in pages)
            docs.append({"source": str(p), "text": combined})
        except Exception as e:
            print("PDF load failed:", p, e)
    return docs

docs = load_docs(DATA_DIR)
print("Loaded documents:", len(docs))
assert docs, "No documents found. Put PDFs/TXT in docs/ and re-run."

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = []
for d in docs:
    for ch in splitter.split_text(d["text"]):
        chunks.append({"source": d["source"], "text": ch})

print("Total chunks:", len(chunks))
print("Example chunk:", textwrap.shorten(chunks[0]["text"], 250))


Loaded documents: 1
Total chunks: 1
Example chunk: AML Policy (Sample): Cash deposits equal to or above $10,000 require enhanced due diligence (EDD). EDD includes source-of-funds verification and a customer declaration. Suspicious activity must be reported per SAR procedures. GDPR (Sample): [...]


Embeddings & Vector Store (Chroma)

In [16]:
from langchain_community.embeddings import SentenceTransformerEmbeddings

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbeddings(model=embed_model)

texts = [c["text"] for c in chunks]
metadatas = [{"source": c["source"]} for c in chunks]

import shutil
if DB_DIR.exists():
    shutil.rmtree(DB_DIR)
vectorstore = Chroma(
    collection_name="compliance_demo",
    embedding_function=embedding_function,
    persist_directory=str(DB_DIR),
)

# Clear old & add
all_ids = vectorstore._collection.get()['ids']
if all_ids:
    vectorstore._collection.delete(ids=all_ids)
vectorstore.add_texts(texts=texts, metadatas=metadatas)
vectorstore.persist()

print("Chroma collection size:", vectorstore._collection.count())


  embedding_function = SentenceTransformerEmbeddings(model=embed_model)


ValidationError: 1 validation error for HuggingFaceEmbeddings
model
  Extra inputs are not permitted [type=extra_forbidden, input_value=SentenceTransformer(
  (0...e})
  (2): Normalize()
), input_type=SentenceTransformer]
    For further information visit https://errors.pydantic.dev/2.11/v/extra_forbidden

Retrieval test

In [None]:
def retrieve(query: str, k: int = 4):
    results = vectorstore.similarity_search(query, k=k)
    return [{"text": r.page_content, "source": r.metadata.get("source", "unknown")} for r in results]

query = "Cash deposit $12,000 — what steps per AML?"
hits = retrieve(query)
for i, h in enumerate(hits, 1):
    print(f"[{i}] source={h['source']}\n{textwrap.shorten(h['text'], 300)}\n")


Answer generation (with citations)

Uses OpenAI if OPENAI_API_KEY is set.

Otherwise, returns a concise, extractive fallback using the retrieved chunks.

In [None]:
def generate_answer(question: str, context_chunks: List[Dict]):
    citations = [c["source"] for c in context_chunks]
    context_text = "\n\n".join(c["text"] for c in context_chunks)

    if HAS_OPENAI:
        prompt = f"""
You are a warm, accurate Risk & Compliance Advisor AI.
Use only the context to answer. If unsure, say what's missing and suggest next steps.
Cite the file names used.

Question: {question}

Context:
{context_text}

Respond briefly, with citations.
"""
        try:
            resp = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
            )
            answer = resp.choices[0].message.content
        except Exception as e:
            answer = f"(LLM unavailable: {e})\n\nKey context:\n{textwrap.shorten(context_text, 500)}\n\nCitations: {sorted(set(citations))}"
    else:
        # simple extractive fallback
        answer = (
            "Based on the retrieved policy text, cash deposits at or above $10,000 require "
            "enhanced due diligence (e.g., source-of-funds and declaration). For $12,000, take EDD "
            "steps and follow SAR procedures if any red flags arise. Citations: "
            + ", ".join(sorted(set(citations)))
        )
    return answer, sorted(set(citations))

ans, cites = generate_answer(query, hits)
print(ans, "\n\nCitations:", cites)


Train a simple Risk Classifier (red flags)

In [None]:
examples = [
    ("Customer deposits $12,000 cash with unclear source of funds", 1),
    ("Multiple small cash deposits under $9,900 in a week", 1),
    ("Transfer to high-risk jurisdiction without documentation", 1),
    ("Politically exposed person requests expedited onboarding", 1),
    ("Monthly salary credited via bank transfer", 0),
    ("Customer updates address details", 0),
    ("Fixed deposit renewal with KYC completed", 0),
    ("Tax payment to government account", 0),
]
texts = [t for t, y in examples]
labels = [y for t, y in examples]

tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=3000)
X = tfidf.fit_transform(texts)
Xtr, Xte, ytr, yte = train_test_split(X, labels, test_size=0.25, random_state=42, stratify=labels)

clf = LogisticRegression(max_iter=1000)
clf.fit(Xtr, ytr)
pred = clf.predict(Xte)
print(classification_report(yte, pred, digits=3))

def score_risk(text: str):
    p = clf.predict_proba(tfidf.transform([text]))[0][1]
    level = "Low"
    if p >= 0.75:
        level = "High"
    elif p >= 0.40:
        level = "Medium"
    return {"probability": float(p), "level": level}

print("Risk demo:", score_risk("Client deposits $12,000 cash with unclear source of funds"))


End-to-end ask() (answer + citations + risk + context preview)

In [None]:
def ask(question: str, k: int = 4):
    ctx = retrieve(question, k=k)
    answer, citations = generate_answer(question, ctx)
    risk = score_risk(question)
    return {
        "question": question,
        "answer": answer,
        "citations": citations,
        "risk": risk,
        "context_preview": [textwrap.shorten(c["text"], 220) for c in ctx],
    }

demo = ask("Client from Country X wants to deposit $12,000 in cash. Do I need extra steps?")
demo


Minimal FastAPI backend

In [None]:
%%writefile main.py
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List

# --- import the notebook functions by copying them above this file or placing them in a module ---
# For quick demo, you can paste retrieve/generate_answer/score_risk/ask here.

app = FastAPI(title="Compliance Advisor API", version="1.0")

class AskBody(BaseModel):
    question: str
    k: int = 4

@app.get("/health")
def health():
    return {"ok": True}

@app.post("/ask")
def api_ask(body: AskBody):
    # call the ask() you defined above
    from __main__ import ask  # NOTE: in production, import from a module
    return ask(body.question, k=body.k)


Supabase: push doc metadata

In [None]:
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") or os.getenv("SUPABASE_ANON_KEY")
HAS_SB = bool(SUPABASE_URL and SUPABASE_KEY)
print("Supabase configured:", HAS_SB)

if HAS_SB:
    try:
        from supabase import create_client
        sb = create_client(SUPABASE_URL, SUPABASE_KEY)
        rows = [{"id": str(uuid.uuid4()), "path": c["source"]} for c in chunks[:5]]
        res = sb.table("documents").insert(rows).execute()
        print("Inserted sample rows into Supabase.documents")
    except Exception as e:
        print("Supabase error:", e)
else:
    print("Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY/ANON_KEY to enable.")
