# Ensuring compliance with regulations

In [1]:
! pip install langsmith openai langfuse
! pip install -U requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -U duckduckgo-search langchain-community ddgs
! pip install -U openevals pandas

Collecting opentelemetry-exporter-otlp-proto-common==1.37.0 (from opentelemetry-exporter-otlp-proto-http<2.0.0,>=1.33.1->langfuse)
  Downloading opentelemetry_exporter_otlp_proto_common-1.37.0-py3-none-any.whl.metadata (1.8 kB)
Collecting opentelemetry-proto==1.37.0 (from opentelemetry-exporter-otlp-proto-http<2.0.0,>=1.33.1->langfuse)
  Downloading opentelemetry_proto-1.37.0-py3-none-any.whl.metadata (2.3 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.33.1 (from langfuse)
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-api<2.0.0,>=1.33.1 (from langfuse)
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-semantic-conventions==0.58b0 (from opentelemetry-sdk<2.0.0,>=1.33.1->langfuse)
  Downloading opentelemetry_semantic_conventions-0.58b0-py3-none-any.whl.metadata (2.4 kB)
Downloading opentelemetry_exporter_otlp_proto_common-1.37.0-py3-none-any.whl (18 kB)
Downloading opentelemetry_proto-1.37.0

## Loading environment variables

In [2]:
import os

os.environ["LANGFUSE_TRACING"] = "true"
os.environ["LANGFUSE_HOST"] = "https://cloud.langfuse.com"
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["OPENAI_API_KEY"] = ""

## Building Chroma DB

In [3]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"
UA = {"User-Agent": "kb-minicrawl/0.2"}
TIMEOUT = 15
MAX_PAGES = 50

def clean_url(u):
    u = urldefrag(u)[0]
    if not u: return None
    if not u.startswith("http"): u = urljoin(BASE, u)
    if not u.startswith(START): return None
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", u, re.I): return None
    return u

def extract_text(html):
    s = BeautifulSoup(html, "lxml")
    for t in s(["script","style","noscript","svg","footer","nav","header"]): t.decompose()
    n = s.select_one("main") or s.select_one("article") or s.body or s
    return " ".join((n.get_text(" ", strip=True) if n else s.get_text(" ", strip=True)).split())

visited, queue, pages = set(), [START], []
while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited: continue
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.ok and "text/html" in r.headers.get("Content-Type",""):
            txt = extract_text(r.text)
            if len(txt) > 200:
                pages.append({"url": url, "text": txt})
            s = BeautifulSoup(r.text, "lxml")
            for a in s.find_all("a", href=True):
                u = clean_url(a["href"])
                if u and u not in visited:
                    queue.append(u)
        visited.add(url); time.sleep(0.15)
    except requests.RequestException:
        visited.add(url)

import json

# Save the crawled pages data to a file for later use
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as f:
    json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Load crawled pages from JSON file to make them available for Chroma processing
with open(pages_outfile, "r", encoding="utf-8") as f:
    pages = json.load(f)
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    for chunk in splitter.split_text(p["text"]):
        docs.append(chunk)
        metas.append({"url": p["url"]})

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
)
vs.persist()
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_dir = "chroma_kapitalbank"
collection_name = "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing/persisted Chroma vector store
vs = Chroma(
    persist_directory=persist_dir,
    embedding_function=emb,
    collection_name=collection_name
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


  vs.persist()
  vs = Chroma(


## Agent foundation

In [4]:
from typing import Any, Dict
from langchain.chat_models import init_chat_model
from langchain_core.tools import create_retriever_tool, tool, render_text_description
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.agents import create_agent

from langfuse import observe
from langfuse import get_client

langfuse = get_client()

# --- assume your vector store exists as `vs` ---
retriever = vs.as_retriever(search_kwargs={"k": 3})
retrieve_tool = create_retriever_tool(
    retriever=retriever,
    name="retrieve",
    description="Search the internal vector store for passages relevant to the user's question."
)

_ddg = DuckDuckGoSearchRun()
@tool("duckduckgo_search")
def duckduckgo_search(query: str) -> str:
    """Search the web with DuckDuckGo and return a brief summary of top results."""
    return _ddg.run(query)

model  = init_chat_model("openai:gpt-4o-mini", temperature=0)
TOOLS  = [retrieve_tool, duckduckgo_search]

@observe()  # traces the function; pair with the callback on invoke
def agentic_solution(inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    q: str = inputs["question"]
    prompt = langfuse.get_prompt("genera-qa-prompt")
    system_prompt = prompt.compile()

    # 2) One-liner agent creation (v1)
    agent = create_agent(
        model,                    # or "openai:gpt-4o-mini"
        tools=TOOLS,
        system_prompt=system_prompt,  # system prompt accepted directly
    )

    # 3) Invoke
    result = agent.invoke({"messages": [{"role": "user", "content": q}]})
    return {"answer": result["messages"][-1].content.strip()}

In [5]:
answer = agentic_solution(
    {"question": 'What HSBC is doing?'},
)
print(answer)

{'answer': "HSBC is currently undergoing significant changes and facing various challenges:\n\n1. **Restructuring and Focus Shift**: HSBC is planning to wind down its mergers and acquisitions (M&A) and some equities businesses in Europe and the Americas. This move is part of a broader strategy to shift its focus towards Asia, marking a significant retrenchment from investment banking.\n\n2. **Financial Provisions**: The bank announced it would book a $1.1 billion provision in its third-quarter results due to a long-running lawsuit related to the Bernard Madoff Ponzi scheme. This provision is expected to impact its Common Equity Tier 1 (CET1) ratio by about 15 basis points.\n\n3. **Cost-Cutting Initiatives**: HSBC is also anticipating $1.8 billion in expenses related to an overhaul initiated by its new CEO, aimed at cutting long-term costs and boosting profits amidst diverging interest rate policies and geopolitical challenges.\n\n4. **Investment Banking Operations**: There are plans to

## Adding metadata for the runs of the agent

In [6]:
from langfuse.langchain import CallbackHandler

langfuse = get_client()
langfuse_handler = CallbackHandler()

@observe()  # traces the function; pair with the callback on invoke
def agentic_solution_with_metadata(inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:

    q: str = inputs["question"]
    prompt = langfuse.get_prompt("genera-qa-prompt")
    system_prompt = prompt.compile()

    meta = {
        "jurisdiction.primary": "EU",                 # Anchor region for this run: "EU" | "US" | "UAE"
        "policy.version": "2025-10-01",               # Internal compliance policy version applied
        "purpose.id": "kb-product-info",              # Business purpose for processing this request
        "user.id": "alex",                            # Pseudonymous user identifier
        "session.id": "sess-123",                     # Session correlation ID

        "gdpr.basis": "contract",                     # EU lawful basis: "contract" | "consent" | "legitimate_interest"
        "consent.scope": ["analytics","model-improve"], # Explicit consents granted for this turn
        "data.residency": "EU",                       # Primary storage region: "EU" | "US" | "AE" | "multi-region"
        "transfer.mechanism": "SCCs",                 # Cross-border data transfer mechanism: "SCCs" | "BCR" | "adequacy"
        "retention.days": 30,                         # Log/trace retention period in days

        "pii.redaction_profile": "iban,az_national_id,phone,email", # Masking profile applied pre-LLM
        "security.encryption_at_rest": True,          # Data at rest encrypted
        "security.encryption_in_transit": True,       # TLS enforced in transit
        "security.pseudonymization": True,            # Pseudonymization used in pipelines

        "ai.risk": "limited",                         # Overall AI risk posture for this use case
        "eu.ai_act.risk_class": "limited",            # EU AI Act risk class mapping
        "us.ccpa.cpra.applicable": True,              # CCPA/CPRA applicability flag (US deployments/users)
        "us.glba.covered": True,                      # GLBA coverage (financial institution context)
        "uae.pdpl.basis": "contract",                 # UAE PDPL lawful basis (if processing in UAE)
        "uae.zone": "DIFC",                           # UAE deployment zone: "mainland" | "DIFC" | "ADGM"
        "audit.dpa_ref": "DPA-2025-001",              # Reference to the signed DPA / registry record
}

    langfuse.update_current_trace(metadata=meta)

    # 2) One-liner agent creation (v1)
    agent = create_agent(
        model,                    # or "openai:gpt-4o-mini"
        tools=TOOLS,
        system_prompt=system_prompt,  # system prompt accepted directly
    )

    # 3) Invoke
    result = agent.invoke({"messages": [{"role": "user", "content": q}]},
                          config={"callbacks": [langfuse_handler]})
    return {"answer": result["messages"][-1].content.strip()}

In [7]:
answer = agentic_solution_with_metadata(
    {"question": 'What HSBC is doing?'},
)
print(answer)

{'answer': "HSBC is currently engaged in several significant activities:\n\n1. **Financial Performance**: HSBC reported a record profit before tax of $32.3 billion in the fourth quarter of 2023, driven by higher revenue and interest income. However, their Q3 earnings were impacted by a $1.1 billion hit related to ongoing legal issues concerning their relationship with Bernard Madoff.\n\n2. **Restructuring and Strategic Growth**: The bank is focusing on restructuring and strategic growth initiatives to enhance its operations and market position.\n\n3. **Community Engagement**: HSBC is involved in various community projects aimed at improving employee well-being and inclusivity. This includes initiatives that bring together different societal groups to foster friendship and communication.\n\nThese activities reflect HSBC's commitment to both financial performance and social responsibility."}
