Milestone 3


Imports

In [9]:
from __future__ import annotations

import asyncio
import json
import logging
import os
import random
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

ROOT = Path.cwd().resolve()
ARTIFACTS_DIR = ROOT / "artifacts"

# Reproducibility
SEED = 42
random.seed(SEED)

# Avoid optional TensorFlow/JAX imports (common Windows DLL issues, and not needed here)
os.environ.setdefault("TRANSFORMERS_NO_TF", "1")
os.environ.setdefault("USE_TF", "0")
os.environ.setdefault("TRANSFORMERS_NO_FLAX", "1")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("milestone3")

def utc_now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()

def load_env_file(path: Path) -> Dict[str, str]:
    """Minimal .env loader (no extra deps).

    - Skips blanks and comments
    - Supports KEY=VALUE
    - Strips surrounding quotes
    - Does NOT override already-set env vars
    """
    loaded: Dict[str, str] = {}
    if not path.exists():
        return loaded

    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if not key:
            continue
        if os.getenv(key) is None:
            os.environ[key] = value
            loaded[key] = value
    return loaded

# Auto-load root .env if present
env_loaded = load_env_file(ROOT / ".env")

logger.info(f"ROOT={ROOT}")
logger.info(f"ARTIFACTS_DIR={ARTIFACTS_DIR} (exists={ARTIFACTS_DIR.exists()})")
logger.info(f"Loaded .env keys: {sorted(env_loaded.keys())}")
logger.info("Env guards: TRANSFORMERS_NO_TF=%s USE_TF=%s TRANSFORMERS_NO_FLAX=%s", os.getenv("TRANSFORMERS_NO_TF"), os.getenv("USE_TF"), os.getenv("TRANSFORMERS_NO_FLAX"))

Parallel Agent Execution with LangGraph

In [10]:
# Shared agent list used across the notebook
AGENT_TYPES = ["legal_agent", "compliance_agent", "finance_agent", "operations_agent"]

In [11]:


# Dependency repair for this notebook kernel (run once if imports fail)
import sys

try:
    import importlib.metadata as _md  # py3.8+
except Exception:  # pragma: no cover
    _md = None

def _v(pkg: str) -> str:
    if _md is None:
        return "(unknown)"
    try:
        return _md.version(pkg)
    except Exception:
        return "(not installed)"

print("Python executable:", sys.executable)
print("Before:")
print("- huggingface-hub:", _v("huggingface-hub"))
print("- transformers:   ", _v("transformers"))
print("- sentence-transformers:", _v("sentence-transformers"))
print("- tensorflow:     ", _v("tensorflow"))
print("- tensorflow-intel:", _v("tensorflow-intel"))

# IMPORTANT: %pip installs into the currently-running Jupyter kernel environment.
# If you still see the same import error after this, restart the kernel and rerun from Cell 3.
%pip install -U "huggingface-hub>=0.24.0,<1.0" "transformers>=4.40.0" "sentence-transformers>=2.7.0"

# If you see: "Failed to load the native TensorFlow runtime" on Windows, TensorFlow is installed but broken.
# Sentence-transformers does not require TensorFlow for embeddings, so removing it is safe for this notebook:
# %pip uninstall -y tensorflow tensorflow-intel

print("\nAfter:")
print("- huggingface-hub:", _v("huggingface-hub"))
print("- transformers:   ", _v("transformers"))
print("- sentence-transformers:", _v("sentence-transformers"))
print("- tensorflow:     ", _v("tensorflow"))
print("- tensorflow-intel:", _v("tensorflow-intel"))



Python executable: /usr/bin/python3
Before:
- huggingface-hub: 0.36.0
- transformers:    4.57.6
- sentence-transformers: 5.2.0
- tensorflow:      2.19.0
- tensorflow-intel: (not installed)

After:
- huggingface-hub: 0.36.0
- transformers:    4.57.6
- sentence-transformers: 5.2.0
- tensorflow:      2.19.0
- tensorflow-intel: (not installed)


In [12]:
 # Dependency repair for this notebook kernel (run once if imports fail)
import sys

try:
    import importlib.metadata as _md  # py3.8+
except Exception:  # pragma: no cover
    _md = None

def _v(pkg: str) -> str:
    if _md is None:
        return "(unknown)"
    try:
        return _md.version(pkg)
    except Exception:
        return "(not installed)"

print("Python executable:", sys.executable)
print("Before:")
print("- huggingface-hub:", _v("huggingface-hub"))
print("- transformers:   ", _v("transformers"))
print("- sentence-transformers:", _v("sentence-transformers"))
print("- tensorflow:     ", _v("tensorflow"))
print("- tensorflow-intel:", _v("tensorflow-intel"))

# IMPORTANT: %pip installs into the currently-running Jupyter kernel environment.
# If you still see the same import error after this, restart the kernel and rerun from Cell 3.
%pip install -U "huggingface-hub>=0.24.0,<1.0" "transformers>=4.40.0" "sentence-transformers>=2.7.0"

# If you see: "Failed to load the native TensorFlow runtime" on Windows, TensorFlow is installed but broken.
# Sentence-transformers does not require TensorFlow for embeddings, so removing it is safe for this notebook:
# %pip uninstall -y tensorflow tensorflow-intel

print("\nAfter:")
print("- huggingface-hub:", _v("huggingface-hub"))
print("- transformers:   ", _v("transformers"))
print("- sentence-transformers:", _v("sentence-transformers"))
print("- tensorflow:     ", _v("tensorflow"))
print("- tensorflow-intel:", _v("tensorflow-intel"))

Python executable: /usr/bin/python3
Before:
- huggingface-hub: 0.36.0
- transformers:    4.57.6
- sentence-transformers: 5.2.0
- tensorflow:      2.19.0
- tensorflow-intel: (not installed)

After:
- huggingface-hub: 0.36.0
- transformers:    4.57.6
- sentence-transformers: 5.2.0
- tensorflow:      2.19.0
- tensorflow-intel: (not installed)


Vector Database Connection and Embedding Setup

In [3]:
!pip install -U pinecone-client sentence-transformers transformers huggingface-hub


Collecting pinecone-client
  Using cached pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting huggingface-hub
  Using cached huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Using cached pinecone_client-6.0.0-py3-none-any.whl (6.7 kB)
Installing collected packages: pinecone-client
Successfully installed pinecone-client-6.0.0


In [1]:
!pip uninstall -y pinecone-client
!pip uninstall -y pinecone
!pip uninstall -y pinecone-core
!pip uninstall -y pinecone-text


Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
Found existing installation: pinecone 8.0.0
Uninstalling pinecone-8.0.0:
  Successfully uninstalled pinecone-8.0.0
[0m

In [2]:
!pip install -U pinecone


Collecting pinecone
  Using cached pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Using cached pinecone-8.0.0-py3-none-any.whl (745 kB)
Installing collected packages: pinecone
Successfully installed pinecone-8.0.0


In [3]:
import pinecone
print("pinecone imported OK")
print(pinecone.__file__)


pinecone imported OK
/usr/local/lib/python3.12/dist-packages/pinecone/__init__.py


In [13]:
# Pinecone connection

import os
from typing import List

# Ensure optional TensorFlow
os.environ.setdefault("TRANSFORMERS_NO_TF", "1")
os.environ.setdefault("USE_TF", "0")
os.environ.setdefault("TRANSFORMERS_NO_FLAX", "1")

# =============================================================
# 🔑 HARD-CODED CONFIG (AS REQUESTED)
# =============================================================

PINECONE_API_KEY = "pcsk_6jbLBU_DxNgioCN5BHBNM6x3S2Gd9WMY3DDVnruCFBSsEa7efABnmRWydJhEn4itJDVfG"
INDEX_NAME = "pinevs"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Optional (only needed for OLD Pinecone SDK)
PINECONE_ENV = os.getenv("PINECONE_ENV")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# =============================================================
# CONNECT TO PINECONE
# =============================================================

index = None
try:
    # New SDK
    from pinecone import Pinecone

    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(INDEX_NAME)
    logger.info(f"✅ Connected to Pinecone index '{INDEX_NAME}' (new SDK)")

except Exception as e_new:
    try:
        # Old SDK fallback
        import pinecone

        if not PINECONE_ENV:
            raise RuntimeError(
                "Legacy Pinecone SDK requires PINECONE_ENV "
                "(e.g., 'us-east1-gcp')"
            )

        pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
        index = pinecone.Index(INDEX_NAME)
        logger.info(f"✅ Connected to Pinecone index '{INDEX_NAME}' (legacy SDK)")

    except Exception as e_old:
        raise RuntimeError(f"❌ Pinecone connection failed: new={e_new} old={e_old}")

# =============================================================
# EMBEDDING MODEL
# =============================================================

from sentence_transformers import SentenceTransformer
from pathlib import Path

ROOT = Path.cwd()
CACHE_DIR = ROOT / "models_cache" / "hub"

logger.info(f"🔹 Loading embedding model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME, cache_folder=str(CACHE_DIR))

def embed_query(text: str) -> List[float]:
    return model.encode(
        [text],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0].tolist()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Context-First Agent Orchestration
**bold text**
Agents fetch relevant knowledge upfront, then execute tasks using either sequential or parallel execution strategies.

In [14]:
from typing import Dict, List, Optional, Any, Tuple
import time
import asyncio


In [15]:
AGENT_TYPES = ["legal_agent", "compliance_agent", "finance_agent", "operations_agent"]

AGENT_QUERIES: Dict[str, List[str]] = {

    "legal_agent": [
        "Can you explain the termination clauses in this contract?",
        "What are the consequences if someone breaches the contract?",
        "Which obligations relate to confidentiality and non-disclosure?",
        "What indemnity and protection responsibilities exist for parties?",
    ],

    "compliance_agent": [
        "What privacy and data protection rules apply?",
        "Which regulations must we comply with?",
        "What auditing and reporting steps are required?",
        "How long must data be kept, and when should it be deleted?",
        "What steps must be taken to report a security breach?",
        "Are there specific certifications or audits needed (SOC2/ISO/HIPAA)?",
    ],

    "finance_agent": [
        "What are the payment deadlines and conditions?",
        "How are fees, invoices, and billing handled?",
        "What penalties or late fees are applied for missed payments?",
        "What interest is charged on overdue payments?",
        "What financial liabilities does each party carry?",
    ],

    "operations_agent": [
        "What are the expected deliverables and outputs of this project?",
        "What timelines and milestones should be met?",
        "What service level agreements (SLAs) are defined?",
        "What standards of performance must be maintained?",
        "What operational responsibilities are assigned to each party?",
        "What guarantees exist for uptime and service continuity?",
    ],

}

# -------------------------
# The rest of your pipeline code remains exactly the same
# -------------------------
def pinecone_query(
    *,
    query: str,
    top_k: int = 5,
    namespace: Optional[str] = None,
    metadata_filter: Optional[Dict[str, Any]] = None,
) -> Any:
    qvec = embed_query(query)
    kwargs: Dict[str, Any] = {
        "vector": qvec,
        "top_k": top_k,
        "include_metadata": True,
    }
    if namespace is not None:
        kwargs["namespace"] = namespace
    if metadata_filter is not None:
        kwargs["filter"] = metadata_filter
    return index.query(**kwargs)

def _extract_matches(resp: Any) -> List[Dict[str, Any]]:
    matches = getattr(resp, "matches", None)
    if matches is None and isinstance(resp, dict):
        matches = resp.get("matches")
    if not matches:
        return []

    out: List[Dict[str, Any]] = []
    for m in matches:
        md = getattr(m, "metadata", None)
        score = getattr(m, "score", None)
        if md is None and isinstance(m, dict):
            md = m.get("metadata")
            score = m.get("score")
        out.append({
            "score": float(score) if score is not None else None,
            "metadata": md or {},
        })
    return out

def _confidence_from_matches(matches: List[Dict[str, Any]]) -> Optional[float]:
    scores = [m.get("score") for m in matches if isinstance(m.get("score"), (int, float))]
    if not scores:
        return None
    return float(sum(scores) / len(scores))

def run_agent_pipeline(
    *,
    agent_type: str,
    question: str,
    contract_id: str,
    top_k_per_query: int = 5,
    chunks_namespace: Optional[str] = None,
    filter_chunks_by_contract_id: bool = False,
) -> Dict[str, Any]:
    if agent_type not in AGENT_QUERIES:
        raise ValueError(f"Unknown agent_type: {agent_type}")

    t0 = time.perf_counter()
    md_filter = (
        {"contract_id": {"$eq": contract_id}}
        if filter_chunks_by_contract_id
        else None
    )

    all_matches: List[Dict[str, Any]] = []
    per_query: List[Dict[str, Any]] = []
    for q in AGENT_QUERIES[agent_type]:
        resp = pinecone_query(query=q, top_k=top_k_per_query, namespace=chunks_namespace, metadata_filter=md_filter)
        matches = _extract_matches(resp)
        per_query.append({"query": q, "matches": matches})
        all_matches.extend(matches)

    confidence = _confidence_from_matches(all_matches)
    elapsed = time.perf_counter() - t0

    return {
        "agent_type": agent_type,
        "contract_id": contract_id,
        "question": question,
        "timestamp": utc_now_iso(),
        "elapsed_seconds": elapsed,
        "confidence": confidence,
        "retrieval": {
            "top_k_per_query": top_k_per_query,
            "filter_chunks_by_contract_id": filter_chunks_by_contract_id,
            "per_query": per_query,
        },
    }

def run_sequential(
    *,
    question: str,
    contract_id: str,
    agents: List[str] = AGENT_TYPES,
    filter_chunks_by_contract_id: bool = False,
) -> Tuple[Dict[str, Any], float]:
    t0 = time.perf_counter()
    out: Dict[str, Any] = {}
    for a in agents:
        out[a] = run_agent_pipeline(
            agent_type=a,
            question=question,
            contract_id=contract_id,
            filter_chunks_by_contract_id=filter_chunks_by_contract_id,
        )
    return out, time.perf_counter() - t0

async def run_parallel(
    *,
    question: str,
    contract_id: str,
    agents: List[str] = AGENT_TYPES,
    filter_chunks_by_contract_id: bool = False,
) -> Tuple[Dict[str, Any], float]:
    t0 = time.perf_counter()
    tasks = [
        asyncio.to_thread(
            run_agent_pipeline,
            agent_type=a,
            question=question,
            contract_id=contract_id,
            filter_chunks_by_contract_id=filter_chunks_by_contract_id,
        )
        for a in agents
    ]
    results = await asyncio.gather(*tasks)
    out = {r["agent_type"]: r for r in results}
    return out, time.perf_counter() - t0


In [16]:
from datetime import datetime, timezone

def utc_now_iso() -> str:
    """Return current UTC timestamp in ISO 8601 format."""
    return datetime.now(timezone.utc).isoformat()


In [17]:
# Configure your run

# - CONTRACT_ID can be any stable identifier you choose (used for memory persistence/recall).

# - If your chunk vectors include contract_id in metadata, set FILTER_CHUNKS_BY_CONTRACT_ID=True.

CONTRACT_ID = os.getenv("CONTRACT_ID", "demo_contract")

QUESTION = "What are the payment terms, audit requirements, and uptime commitments?"

FILTER_CHUNKS_BY_CONTRACT_ID = False



seq_out, seq_s = run_sequential(

    question=QUESTION,

    contract_id=CONTRACT_ID,

    filter_chunks_by_contract_id=FILTER_CHUNKS_BY_CONTRACT_ID,

)

par_out, par_s = await run_parallel(

    question=QUESTION,

    contract_id=CONTRACT_ID,

    filter_chunks_by_contract_id=FILTER_CHUNKS_BY_CONTRACT_ID,

)



print("Sequential seconds:", round(seq_s, 3))

print("Parallel seconds:  ", round(par_s, 3))



print("\nPer-agent confidence (parallel):")

for a in AGENT_TYPES:

    conf = par_out[a].get("confidence")

    print(f"- {a}: {None if conf is None else round(conf, 4)}")

Sequential seconds: 2.095
Parallel seconds:   0.753

Per-agent confidence (parallel):
- legal_agent: 0.697
- compliance_agent: 0.5864
- finance_agent: 0.6143
- operations_agent: 0.529



4) Persist Agent Outputs as Vector Memory (Pinecone)


In [18]:
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import uuid
import json


In [24]:
# ===============================
# FULL AGENT MEMORY PERSIST CODE
# ===============================

from dataclasses import dataclass, asdict
from typing import Dict, Any, List
from datetime import datetime, timezone
import uuid
import json

# -------------------------------
# CONFIG
# -------------------------------
CONTRACT_ID = "contract_001"
QUESTION = "Identify legal, compliance, finance and ops risks"
AGENT_TYPES = ["legal", "compliance", "finance", "operations"]
AGENT_MEMORY_NAMESPACE = "agent-memory"

# -------------------------------
# MOCK PARALLEL AGENT OUTPUTS
# (Replace with real pipeline output)
# -------------------------------
par_out = {
    "legal": {
        "risks": ["Termination clause vague", "Unlimited liability"],
        "severity": "high"
    },
    "compliance": {
        "issues": ["GDPR consent missing"],
        "severity": "medium"
    },
    "finance": {
        "financial_risks": ["Late payment penalty 3% per month"],
        "severity": "high"
    },
    "operations": {
        "operational_risks": ["No SLA defined"],
        "severity": "medium"
    }
}

# -------------------------------
# DATA MODEL
# -------------------------------
@dataclass
class AgentMemoryRecord:
    contract_id: str
    agent_type: str
    timestamp: str
    question: str
    output: Dict[str, Any]

# -------------------------------
# UTILITIES
# -------------------------------
def utc_now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()

# -------------------------------
# PERSISTENCE LAYER
# (DB / Pinecone stub – SAFE)
# -------------------------------
def persist_agent_memory(
    records: List[AgentMemoryRecord],
    namespace: str = "agent-memory"
) -> List[str]:

    ids = []

    print(f"\n📦 Persisting records into namespace: '{namespace}'\n")

    for record in records:
        vector_id = f"{record.contract_id}-{record.agent_type}-{uuid.uuid4().hex[:8]}"
        payload = asdict(record)

        # 🔹 This is where Pinecone / DB upsert would go
        # index.upsert(vectors=[(vector_id, embedding, payload)], namespace=namespace)

        print(f"🧠 Stored → {vector_id}")
        print(json.dumps(payload, indent=2))
        print("-" * 60)

        ids.append(vector_id)

    return ids

# -------------------------------
# BUILD RECORDS FROM PARALLEL OUTPUT
# -------------------------------
records = [
    AgentMemoryRecord(
        contract_id=CONTRACT_ID,
        agent_type=agent,
        timestamp=utc_now_iso(),
        question=QUESTION,
        output=par_out[agent]
    )
    for agent in AGENT_TYPES
]

# -------------------------------
# PERSIST
# -------------------------------
ids = persist_agent_memory(
    records=records,
    namespace=AGENT_MEMORY_NAMESPACE
)

print(f"\n✅ Upserted {len(ids)} agent-memory vectors")
print("🔑 Example IDs:", ids[:2])



📦 Persisting records into namespace: 'agent-memory'

🧠 Stored → contract_001-legal-14daa9bc
{
  "contract_id": "contract_001",
  "agent_type": "legal",
  "timestamp": "2026-01-19T12:36:50.520978+00:00",
  "question": "Identify legal, compliance, finance and ops risks",
  "output": {
    "risks": [
      "Termination clause vague",
      "Unlimited liability"
    ],
    "severity": "high"
  }
}
------------------------------------------------------------
🧠 Stored → contract_001-compliance-a1c26cf4
{
  "contract_id": "contract_001",
  "agent_type": "compliance",
  "timestamp": "2026-01-19T12:36:50.520996+00:00",
  "question": "Identify legal, compliance, finance and ops risks",
  "output": {
    "issues": [
      "GDPR consent missing"
    ],
    "severity": "medium"
  }
}
------------------------------------------------------------
🧠 Stored → contract_001-finance-9bb25e7e
{
  "contract_id": "contract_001",
  "agent_type": "finance",
  "timestamp": "2026-01-19T12:36:50.521003+00:00",
  


Recall Stored Agent Memory (No Rerun)


In [25]:
# Recall examples (filtered by contract_id and optionally agent_type)

recall_ops = query_agent_memory(

    query="uptime commitments service credits",

    contract_id=CONTRACT_ID,

    agent_type="operations_agent",

    top_k=3,

)



recall_fin = query_agent_memory(

    query="interest charges late payment",

    contract_id=CONTRACT_ID,

    agent_type="finance_agent",

    top_k=3,

)



print("Operations memory matches:")

for m in getattr(recall_ops, "matches", [])[:3]:

    print("- score:", getattr(m, "score", None))

    print("  ts:", (getattr(m, "metadata", {}) or {}).get("timestamp"))

    print("  question:", (getattr(m, "metadata", {}) or {}).get("question"))



print("\nFinance memory matches:")

for m in getattr(recall_fin, "matches", [])[:3]:

    print("- score:", getattr(m, "score", None))

    print("  ts:", (getattr(m, "metadata", {}) or {}).get("timestamp"))

    print("  question:", (getattr(m, "metadata", {}) or {}).get("question"))

NameError: name 'query_agent_memory' is not defined

In [27]:
# ============================================================
# FULL AGENT MEMORY STORE + RECALL (SINGLE FILE)
# ============================================================

from dataclasses import dataclass, asdict
from typing import Dict, Any, List, Optional
from datetime import datetime, timezone
import uuid
import json
import math

# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
CONTRACT_ID = "contract_001"
QUESTION = "Identify legal, compliance, finance and ops risks"

AGENT_TYPES = [
    "legal_agent",
    "compliance_agent",
    "finance_agent",
    "operations_agent",
]

AGENT_MEMORY_NAMESPACE = "agent-memory"

# ------------------------------------------------------------
# IN-MEMORY VECTOR STORE (Pinecone stand-in)
# ------------------------------------------------------------
VECTOR_DB = []

# ------------------------------------------------------------
# MOCK PARALLEL AGENT OUTPUTS
# ------------------------------------------------------------
par_out = {
    "legal_agent": {
        "risks": ["Termination clause vague", "Unlimited liability"]
    },
    "compliance_agent": {
        "issues": ["GDPR consent missing"]
    },
    "finance_agent": {
        "financial_risks": ["Late payment interest 3% per month"]
    },
    "operations_agent": {
        "operational_risks": [
            "No SLA uptime commitment",
            "No service credits defined"
        ]
    }
}

# ------------------------------------------------------------
# DATA MODELS
# ------------------------------------------------------------
@dataclass
class AgentMemoryRecord:
    contract_id: str
    agent_type: str
    timestamp: str
    question: str
    output: Dict[str, Any]

@dataclass
class Match:
    score: float
    metadata: Dict[str, Any]

@dataclass
class QueryResult:
    matches: List[Match]

# ------------------------------------------------------------
# UTILITIES
# ------------------------------------------------------------
def utc_now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()

def simple_text_embedding(text: str) -> set:
    """
    Ultra-lightweight embedding:
    converts text into a token set
    """
    return set(text.lower().split())

def cosine_sim(a: set, b: set) -> float:
    if not a or not b:
        return 0.0
    return len(a & b) / math.sqrt(len(a) * len(b))

# ------------------------------------------------------------
# PERSIST MEMORY
# ------------------------------------------------------------
def persist_agent_memory(
    records: List[AgentMemoryRecord],
    namespace: str
) -> List[str]:

    ids = []

    for r in records:
        vector_id = f"{r.contract_id}-{r.agent_type}-{uuid.uuid4().hex[:8]}"
        text_blob = json.dumps(r.output)

        VECTOR_DB.append({
            "id": vector_id,
            "namespace": namespace,
            "embedding": simple_text_embedding(text_blob),
            "metadata": {
                "contract_id": r.contract_id,
                "agent_type": r.agent_type,
                "timestamp": r.timestamp,
                "question": r.question,
                "output": r.output
            }
        })

        ids.append(vector_id)

    return ids

# ------------------------------------------------------------
# QUERY / RECALL MEMORY
# ------------------------------------------------------------
def query_agent_memory(
    query: str,
    contract_id: str,
    agent_type: Optional[str] = None,
    top_k: int = 3,
    namespace: str = AGENT_MEMORY_NAMESPACE
) -> QueryResult:

    query_vec = simple_text_embedding(query)
    scored = []

    for item in VECTOR_DB:
        meta = item["metadata"]

        if meta["contract_id"] != contract_id:
            continue

        if agent_type and meta["agent_type"] != agent_type:
            continue

        score = cosine_sim(query_vec, item["embedding"])

        if score > 0:
            scored.append(
                Match(
                    score=round(score, 3),
                    metadata=meta
                )
            )

    scored.sort(key=lambda x: x.score, reverse=True)
    return QueryResult(matches=scored[:top_k])

# ------------------------------------------------------------
# BUILD & STORE PARALLEL OUTPUTS
# ------------------------------------------------------------
records = [
    AgentMemoryRecord(
        contract_id=CONTRACT_ID,
        agent_type=a,
        timestamp=utc_now_iso(),
        question=QUESTION,
        output=par_out[a],
    )
    for a in AGENT_TYPES
]

ids = persist_agent_memory(records, AGENT_MEMORY_NAMESPACE)

print(f"✅ Stored {len(ids)} agent-memory records\n")

# ------------------------------------------------------------
# RECALL EXAMPLES
# ------------------------------------------------------------
recall_ops = query_agent_memory(
    query="uptime commitments service credits",
    contract_id=CONTRACT_ID,
    agent_type="operations_agent",
    top_k=3,
)

recall_fin = query_agent_memory(
    query="interest charges late payment",
    contract_id=CONTRACT_ID,
    agent_type="finance_agent",
    top_k=3,
)

print("Operations memory matches:")
for m in recall_ops.matches:
    print("- score:", m.score)
    print("  ts:", m.metadata.get("timestamp"))
    print("  question:", m.metadata.get("question"))

print("\nFinance memory matches:")
for m in recall_fin.matches:
    print("- score:", m.score)
    print("  ts:", m.metadata.get("timestamp"))
    print("  question:", m.metadata.get("question"))


✅ Stored 4 agent-memory records

Operations memory matches:
- score: 0.5
  ts: 2026-01-19T12:38:50.539590+00:00
  question: Identify legal, compliance, finance and ops risks

Finance memory matches:
- score: 0.378
  ts: 2026-01-19T12:38:50.539585+00:00
  question: Identify legal, compliance, finance and ops risks


Pipeline / System Design Oriented

Shared Context Refinement Pipeline

Agent Memory Fusion & Iterative Refinement

Collaborative Agent Refinement Workflow

Context Aggregation → Refinement → Persistence
> Add blockquote



In [28]:
from datetime import datetime, timezone

# Fallback if earlier cells were not executed yet
AGENTS_FOR_REFINEMENT = globals().get("AGENT_TYPES") or [
    "legal_agent",
    "compliance_agent",
    "finance_agent",
    "operations_agent",
]

def _as_utc_aware(dt: datetime) -> datetime:
    """Normalize datetimes to timezone-aware UTC for safe comparisons."""
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)

def _parse_ts(ts: Optional[str]) -> datetime:
    # Always return a timezone-aware UTC datetime to avoid naive/aware comparison errors.
    if not ts:
        return datetime.min.replace(tzinfo=timezone.utc)
    try:
        # Handles ISO 8601 like: 2026-01-08T12:34:56.789+00:00 or ...Z
        dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        return _as_utc_aware(dt)
    except Exception:
        return datetime.min.replace(tzinfo=timezone.utc)

def _matches(resp: Any) -> List[Any]:
    if isinstance(resp, dict):
        return resp.get("matches") or []
    return getattr(resp, "matches", []) or []

def _md(match: Any) -> Dict[str, Any]:
    if isinstance(match, dict):
        return match.get("metadata") or {}
    return getattr(match, "metadata", {}) or {}

def _infer_risk_from_text(text: str) -> Tuple[str, str]:
    t = (text or "").lower()
    # Very simple heuristic just for milestone demonstration
    high_terms = ["penalt", "late fee", "interest", "termination", "breach", "indemn", "liability", "service credit"]
    medium_terms = ["audit", "confidential", "privacy", "retention", "notification", "sla"]

    if any(k in t for k in high_terms):
        return "high", "Contains high-impact financial/legal terms (heuristic)."
    if any(k in t for k in medium_terms):
        return "medium", "Contains standard compliance/operations terms (heuristic)."
    return "medium", "Defaulted to medium (insufficient signal in stored output)."

def fetch_latest_agent_memory(*, contract_id: str, agent_type: str, top_k: int = 10) -> Optional[Dict[str, Any]]:
    resp = query_agent_memory(query=f"{agent_type} risk assessment", contract_id=contract_id, agent_type=agent_type, top_k=top_k)
    best = None
    best_ts = datetime.min.replace(tzinfo=timezone.utc)
    for m in _matches(resp):
        md = _md(m)
        ts = _parse_ts(md.get("timestamp"))
        if ts > best_ts:
            best = md
            best_ts = ts
    return best

# 1) Retrieve latest memory per agent and build shared_context
latest_by_agent: Dict[str, Dict[str, Any]] = {}
for agent_type in AGENTS_FOR_REFINEMENT:
    md = fetch_latest_agent_memory(contract_id=CONTRACT_ID, agent_type=agent_type)
    if md is None:
        latest_by_agent[agent_type] = {
            "agent": agent_type,
            "risk_level": "unknown",
            "confidence": None,
            "timestamp": None,
            "output_json": "",
        }
        continue

    # Prefer explicit risk_level metadata, else infer from stored output_json text
    output_json = md.get("output_json") or ""
    risk_level = md.get("risk_level")
    if not isinstance(risk_level, str) or not risk_level.strip():
        risk_level, _ = _infer_risk_from_text(output_json)

    # Best-effort confidence from memory metadata
    conf = md.get("confidence")
    if not isinstance(conf, (int, float)):
        conf = None

    latest_by_agent[agent_type] = {
        "agent": md.get("agent") or md.get("agent_type") or agent_type,
        "risk_level": risk_level,
        "confidence": float(conf) if isinstance(conf, (int, float)) else None,
        "timestamp": md.get("timestamp"),
        "output_json": output_json,
    }

shared_context = "\n".join([f"{v['agent']} risk: {v['risk_level']}" for v in latest_by_agent.values()])
print(shared_context)

legal_agent risk: unknown
compliance_agent risk: unknown
finance_agent risk: unknown
operations_agent risk: unknown


In [29]:
# 2) Let compliance agent read finance output and refine (risk escalation demo)
finance = latest_by_agent.get("finance_agent", {})
compliance = latest_by_agent.get("compliance_agent", {})

finance_risk = (finance.get("risk_level") or "unknown").lower()
compliance_risk = (compliance.get("risk_level") or "unknown").lower()

# Best-effort confidence: inherit from latest compliance if available, else finance, else None
def _as_float(x: Any) -> Optional[float]:
    return float(x) if isinstance(x, (int, float)) else None

inherited_confidence = (
    _as_float(compliance.get("confidence"))
    or _as_float((compliance.get("output") or {}).get("confidence") if isinstance(compliance.get("output"), dict) else None)
    or _as_float(finance.get("confidence"))
    or _as_float((finance.get("output") or {}).get("confidence") if isinstance(finance.get("output"), dict) else None)
    or None
 )

refined_risk = compliance_risk
reason = "No escalation: finance risk not high (heuristic)."

if finance_risk == "high" and compliance_risk in {"low", "medium", "unknown"}:
    refined_risk = "high"
    reason = "Escalated to high because finance risk is high; combined exposure increases compliance risk."

refined_compliance = {
    "agent_type": "compliance_agent",
    "risk_level": refined_risk,
    "confidence": inherited_confidence,
    "reason": reason,
    "based_on": {
        "shared_context": shared_context,
        "finance_risk": finance_risk,
    },
}

print(json.dumps(refined_compliance, indent=2))

{
  "agent_type": "compliance_agent",
  "risk_level": "unknown",
  "confidence": null,
  "reason": "No escalation: finance risk not high (heuristic).",
  "based_on": {
    "shared_context": "legal_agent risk: unknown\ncompliance_agent risk: unknown\nfinance_agent risk: unknown\noperations_agent risk: unknown",
    "finance_risk": "unknown"
  }
}


In [30]:
# 3) Update Compliance memory (persist refined assessment)
refined_record = AgentMemoryRecord(
    contract_id=CONTRACT_ID,
    agent_type="compliance_agent",
    timestamp=utc_now_iso(),
    question="Cross-agent refinement: compliance reads finance output and re-evaluates risk",
    output=refined_compliance,
)

refined_ids = persist_agent_memory(records=[refined_record])
print("Upserted refined compliance memory:", refined_ids[0])

TypeError: persist_agent_memory() missing 1 required positional argument: 'namespace'

7) Final Contract-Level JSON Output (Latest Memories → Standard JSON)

In [31]:
# Define the final schema + generate the final contract-level JSON from latest Pinecone memories
from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# =========================
# 🔹 OUTPUT PATH (FIXED)
# =========================
OUTPUTS_DIR = Path("/content/drive/MyDrive/ClauseAI/data/milestone3")
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

RISK_ORDER = {"low": 0, "medium": 1, "high": 2, "unknown": 1}
HIGH_RISK_TERMS = [
    "penalt", "late fee", "interest", "termination", "breach", "indemn", "liability", "service credit",
    "audit right", "uncapped", "limitation of liability", "data breach", "incident", "non-compliance",
    "security", "subprocessor", "cross-border", "governing law", "injunction",
]

# Defensive: if this list ever becomes nested
if HIGH_RISK_TERMS and isinstance(HIGH_RISK_TERMS[0], (list, tuple, set)):
    HIGH_RISK_TERMS = [t for group in HIGH_RISK_TERMS for t in group]

FINAL_CONTRACT_SCHEMA: Dict[str, Any] = {
    "contract_id": "",
    "legal": {},
    "compliance": {},
    "finance": {},
    "operations": {},
    "overall_risk": "",
    "confidence": {
        "per_agent": {},
        "overall_avg": None,
    },
    "high_risk_clauses": [],
    "generated_at": "",
}

def _utc_now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()

def _as_utc_aware(dt: datetime) -> datetime:
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)

def _parse_ts(ts: Optional[str]) -> datetime:
    if not ts:
        return datetime.min.replace(tzinfo=timezone.utc)
    try:
        dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        return _as_utc_aware(dt)
    except Exception:
        return datetime.min.replace(tzinfo=timezone.utc)

def _matches(resp: Any) -> List[Any]:
    if isinstance(resp, dict):
        return resp.get("matches") or []
    return getattr(resp, "matches", []) or []

def _md(match: Any) -> Dict[str, Any]:
    if isinstance(match, dict):
        return match.get("metadata") or {}
    return getattr(match, "metadata", {}) or {}

def _safe_json_loads(s: str) -> Optional[Any]:
    if not isinstance(s, str) or not s.strip():
        return None
    try:
        return json.loads(s)
    except Exception:
        return None

def _extract_text_from_match_metadata(md: Dict[str, Any]) -> str:
    for key in ("text", "chunk_text", "content", "clause_text", "snippet", "page_content"):
        v = md.get(key)
        if isinstance(v, str) and v.strip():
            return v.strip()
    try:
        return json.dumps(md, ensure_ascii=False)[:800]
    except Exception:
        return str(md)[:800]

def _infer_risk_level(output: Any) -> str:
    if isinstance(output, dict):
        rl = output.get("risk_level")
        if isinstance(rl, str) and rl.strip():
            return rl.strip().lower()
    text = json.dumps(output, ensure_ascii=False).lower() if output else ""
    if any(t in text for t in HIGH_RISK_TERMS):
        return "high"
    return "medium"

def _extract_term_hits(*, text: str, agent_type: str, max_items: int = 5) -> List[Dict[str, Any]]:
    if not isinstance(text, str) or not text:
        return []
    lower = text.lower()
    out = []
    seen_terms = set()

    for term in HIGH_RISK_TERMS:
        if term in seen_terms:
            continue
        idx = lower.find(term)
        if idx < 0:
            continue
        seen_terms.add(term)
        snippet = text[max(0, idx - 120): idx + 160].replace("\n", " ")
        out.append({
            "agent": agent_type,
            "query": "(memory-text-scan)",
            "score": None,
            "snippet": snippet[:800],
            "is_high_risk": True,
            "matched_term": term,
        })
        if len(out) >= max_items:
            break
    return out

def get_latest_agent_output(*, contract_id: str, agent_type: str, top_k: int = 10) -> Dict[str, Any]:
    resp = query_agent_memory(
        query="risk",
        contract_id=contract_id,
        agent_type=agent_type,
        top_k=top_k,
    )

    matches = _matches(resp)
    if not matches:
        return {
            "agent_type": agent_type,
            "timestamp": None,
            "risk_level": "unknown",
            "confidence": None,
            "output": {"risk_level": "unknown", "note": "No memory found"},
            "_memory_metadata": {},
        }

    ranked = [(_parse_ts(_md(m).get("timestamp")), _md(m)) for m in matches]
    ranked.sort(key=lambda x: x[0], reverse=True)

    best_md = ranked[0][1]
    output_obj = _safe_json_loads(best_md.get("output_json") or "") or {
        "raw_output_json": best_md.get("output_json")
    }

    risk_level = best_md.get("risk_level") or _infer_risk_level(output_obj)

    confidence = best_md.get("confidence")
    if not isinstance(confidence, (int, float)):
        if isinstance(output_obj, dict):
            confidence = output_obj.get("confidence")

    if not isinstance(confidence, (int, float)):
        for _, md in ranked[1:]:
            c = md.get("confidence")
            if isinstance(c, (int, float)):
                confidence = c
                break

    return {
        "agent_type": agent_type,
        "timestamp": best_md.get("timestamp"),
        "risk_level": str(risk_level).lower(),
        "confidence": confidence,
        "output": output_obj,
        "_memory_metadata": best_md,
    }

def _overall_risk(agent_risks: Dict[str, str]) -> str:
    best = "low"
    for r in agent_risks.values():
        if RISK_ORDER.get(r, 1) > RISK_ORDER.get(best, 0):
            best = r
    return best if best in {"low", "medium", "high"} else "medium"


# =========================
# 🔹 BUILD FINAL OUTPUT
# =========================
latest = {a: get_latest_agent_output(contract_id=CONTRACT_ID, agent_type=a) for a in AGENT_TYPES}

final = dict(FINAL_CONTRACT_SCHEMA)
final["contract_id"] = CONTRACT_ID
final["generated_at"] = _utc_now_iso()

agent_risks = {}
conf_per_agent = {}
high_risk_clauses = []

for a in AGENT_TYPES:
    payload = latest[a]
    output_obj = payload["output"]
    agent_risks[a] = payload["risk_level"]
    conf_per_agent[a] = payload["confidence"]

    if not high_risk_clauses:
        mem_text = payload["_memory_metadata"].get("output_json", "")
        high_risk_clauses.extend(_extract_term_hits(text=mem_text, agent_type=a))

final["legal"] = latest["legal_agent"]["output"]
final["compliance"] = latest["compliance_agent"]["output"]
final["finance"] = latest["finance_agent"]["output"]
final["operations"] = latest["operations_agent"]["output"]

final["overall_risk"] = _overall_risk(agent_risks)
final["confidence"]["per_agent"] = conf_per_agent
vals = [v for v in conf_per_agent.values() if isinstance(v, (int, float))]
final["confidence"]["overall_avg"] = sum(vals) / len(vals) if vals else None
final["high_risk_clauses"] = high_risk_clauses[:20]

out_path = OUTPUTS_DIR / f"final_contract_{CONTRACT_ID}.json"
out_path.write_text(json.dumps(final, indent=2, ensure_ascii=False), encoding="utf-8")

print("✅ Saved:", out_path)
print("Overall Risk:", final["overall_risk"])
print("Confidence Avg:", final["confidence"]["overall_avg"])
print("High-risk clauses:", len(final["high_risk_clauses"]))


✅ Saved: /content/drive/MyDrive/ClauseAI/data/milestone3/final_contract_contract_001.json
Overall Risk: medium
Confidence Avg: None
High-risk clauses: 0


**Report Template Design (Human-Readable Output)**

save output of above

In [32]:
from google.colab import drive

# Use a custom, empty mount point
drive.mount("/content/my_drive", force_remount=True)

# Now your Drive is accessible at /content/my_drive/MyDrive/


Mounted at /content/my_drive


In [33]:
# ============================================================
# 0️⃣ Mount Google Drive
# ============================================================
from google.colab import drive
from pathlib import Path

# Use a fresh mount point to avoid errors
drive.mount("/content/my_drive", force_remount=True)

OUTPUT_DIR = Path("/content/my_drive/MyDrive/ClauseAI/data/milestone3")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ============================================================
# 1️⃣ Report Template Code (your existing code)
# ============================================================
from typing import List, Dict, Any

REPORT_STRUCTURE = [
    "Executive Summary",
    "Overall Risk Assessment",
    "Legal Analysis",
    "Compliance Analysis",
    "Financial Analysis",
    "Operational Analysis",
    "Conclusion & Recommendations",
]

def _bulletize(lines: List[str]) -> str:
    return "\n".join([f"- {ln}" for ln in lines if isinstance(ln, str) and ln.strip()])

def _clean_snippet(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.replace("\\n", " ").replace("\n", " ")
    s = s.replace("\\\"", '"').replace("\\/", "/")
    s = " ".join(s.split())
    return s.strip()

def _looks_like_json_fragment(s: str) -> bool:
    if not isinstance(s, str):
        return True
    t = s.strip()
    if not t:
        return True
    tl = t.lower()
    hard_markers = [
        "top_k_per_query", "filter_chunks_by_contract_id", "per_query", "matches",
        "retrieval\":", "metadata\":", "output_json", "raw_output_json"
    ]
    if any(m in tl for m in hard_markers):
        return True
    jsonish = sum(t.count(ch) for ch in ["{", "}", "[", "]", ":"])
    backslashes = t.count("\\")
    quotes = t.count('"')
    letters = sum(ch.isalpha() for ch in t)
    spaces = t.count(" ")
    if (jsonish + backslashes + quotes) >= 8 and (letters < 60 or spaces < 8):
        return True
    return False

def build_executive_summary(final_json: Dict[str, Any]) -> str:
    risk = (final_json.get("overall_risk") or "medium").lower()
    conf = final_json.get("confidence", {}).get("overall_avg")
    conf_s = "unknown" if conf is None else f"{conf:.3f}"
    n_hi = len(final_json.get("high_risk_clauses") or [])
    return (
        f"Overall risk is {risk}. "
        f"Confidence score average is {conf_s}. "
        f"We found {n_hi} high-risk clause evidence snippets to review."
    )

def build_report(final_json: Dict[str, Any]) -> Dict[str, str]:
    per_agent_conf = (final_json.get("confidence") or {}).get("per_agent") or {}
    hi = final_json.get("high_risk_clauses") or []

    report: Dict[str, str] = {}
    report["Executive Summary"] = build_executive_summary(final_json)

    report["Overall Risk Assessment"] = _bulletize([
        f"Overall risk level: {(final_json.get('overall_risk') or 'medium').lower()}",
        f"Confidence (avg): {final_json.get('confidence', {}).get('overall_avg')}",
        f"Legal confidence: {per_agent_conf.get('legal_agent')}",
        f"Compliance confidence: {per_agent_conf.get('compliance_agent')}",
        f"Finance confidence: {per_agent_conf.get('finance_agent')}",
        f"Operations confidence: {per_agent_conf.get('operations_agent')}",
    ])
    report["Legal Analysis"] = _bulletize([
        "Key legal obligations summarized from retrieval outputs.",
        "Review termination, breach, and indemnity language if present.",
    ])
    report["Compliance Analysis"] = _bulletize([
        "Key privacy/security/compliance obligations summarized from retrieval outputs.",
        "Review audit rights, incident notification, and data handling language if present.",
    ])
    report["Financial Analysis"] = _bulletize([
        "Key payment, invoicing, and late-fee obligations summarized from retrieval outputs.",
        "Review liability and penalty exposure if present.",
    ])
    report["Operational Analysis"] = _bulletize([
        "Key deliverables, timelines, and SLA obligations summarized from retrieval outputs.",
        "Review uptime commitments and service credits if present.",
    ])
    top_evidence: List[str] = []
    for item in hi:
        if not isinstance(item, dict):
            continue
        raw = item.get("snippet") or ""
        snippet = _clean_snippet(raw)
        if not snippet:
            continue
        if _looks_like_json_fragment(snippet):
            continue
        agent = item.get("agent") or "unknown_agent"
        term = item.get("matched_term")
        prefix = f"[{agent}] " + (f"(term: {term}) " if isinstance(term, str) and term else "")
        top_evidence.append((prefix + snippet)[:220])
        if len(top_evidence) >= 8:
            break
    report["Conclusion & Recommendations"] = _bulletize([
        "Prioritize review of the high-risk clauses listed below.",
        "If overall risk is high, consider negotiation points or approvals before signing.",
        *(["High-risk evidence:"] + top_evidence if top_evidence else ["No clean high-risk evidence snippets were extracted from memory."]),
    ])
    return report

# ============================================================
# 2️⃣ Demo final_json
# ============================================================
# If you don't have real final_json, you can use a dummy for testing
final = {
    "overall_risk": "high",
    "confidence": {"overall_avg": 0.82, "per_agent": {"legal_agent": 0.9,"compliance_agent":0.8,"finance_agent":0.85,"operations_agent":0.75}},
    "high_risk_clauses": [
        {"snippet":"This contract includes a penalty clause for late payment.","agent":"legal_agent","matched_term":"penalty"},
        {"snippet":"Data breach notification requirements must be followed.","agent":"compliance_agent","matched_term":"data breach"}
    ]
}

report = build_report(final)

# ============================================================
# 3️⃣ Print preview
# ============================================================
print("\n".join(["=" * 80, "REPORT PREVIEW", "=" * 80]))
for section in REPORT_STRUCTURE:
    print(f"\n## {section}\n")
    print(report.get(section, ""))

# ============================================================
# 4️⃣ SAVE OUTPUT TO DRIVE
# ============================================================
output_file = OUTPUT_DIR / "report_output.txt"
output_file.write_text(
    "\n".join(f"\n## {section}\n{report.get(section,'')}" for section in REPORT_STRUCTURE),
    encoding="utf-8"
)

print(f"\n✅ Report saved to: {output_file}")


Mounted at /content/my_drive
REPORT PREVIEW

## Executive Summary

Overall risk is high. Confidence score average is 0.820. We found 2 high-risk clause evidence snippets to review.

## Overall Risk Assessment

- Overall risk level: high
- Confidence (avg): 0.82
- Legal confidence: 0.9
- Compliance confidence: 0.8
- Finance confidence: 0.85
- Operations confidence: 0.75

## Legal Analysis

- Key legal obligations summarized from retrieval outputs.
- Review termination, breach, and indemnity language if present.

## Compliance Analysis

- Key privacy/security/compliance obligations summarized from retrieval outputs.
- Review audit rights, incident notification, and data handling language if present.

## Financial Analysis

- Key payment, invoicing, and late-fee obligations summarized from retrieval outputs.
- Review liability and penalty exposure if present.

## Operational Analysis

- Key deliverables, timelines, and SLA obligations summarized from retrieval outputs.
- Review uptime com

**Report Formatting & Tone Customization**

In [34]:
# ============================================================
# Report Formatting & Tone Customization
# ============================================================

from pathlib import Path
from typing import List, Dict

# ----------------------------
# 0️⃣ Input file (saved report)
# ----------------------------
INPUT_FILE = Path("/content/my_drive/MyDrive/ClauseAI/data/milestone3/report_output.txt")

if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Report file not found: {INPUT_FILE}")

# Read the report sections from file
raw_report_text = INPUT_FILE.read_text(encoding="utf-8")

# Split into sections (assuming ## SECTION_NAME format)
sections_raw = raw_report_text.split("\n## ")
report_dict: Dict[str, str] = {}

for sec in sections_raw:
    if not sec.strip():
        continue
    lines = sec.strip().split("\n")
    title = lines[0].strip()
    body = "\n".join(lines[1:]).strip()
    report_dict[title] = body

# ----------------------------
# 1️⃣ Tone Templates
# ----------------------------
TONE_TEMPLATES = {
    "neutral": {
        "header_prefix": "",
        "risk_marker": "⚠️",
        "style_note": "Neutral factual tone",
    },
    "executive": {
        "header_prefix": "🔹 ",
        "risk_marker": "🚨",
        "style_note": "Concise executive summary tone",
    },
    "legal": {
        "header_prefix": "§ ",
        "risk_marker": "❗",
        "style_note": "Formal legal-review tone",
    },
}

HIGH_RISK_SECTIONS = {
    "Overall Risk Assessment",
    "Legal Analysis",
    "Compliance Analysis",
    "Financial Analysis",
}

# ----------------------------
# 2️⃣ Section Formatter
# ----------------------------
def format_section(title: str, content: str, tone: str = "neutral") -> str:
    cfg = TONE_TEMPLATES[tone]
    header = f"{cfg['header_prefix']}## {title}"

    if title in HIGH_RISK_SECTIONS:
        header += f"  {cfg['risk_marker']} HIGH ATTENTION"

    # Force bullet points
    lines = []
    for ln in content.split("\n"):
        ln = ln.strip()
        if not ln:
            continue
        if not ln.startswith("-"):
            ln = f"- {ln}"
        lines.append(ln)

    body = "\n".join(lines) if lines else "- No notable findings."
    return f"{header}\n{body}\n"

# ----------------------------
# 3️⃣ Format Entire Report
# ----------------------------
def format_full_report(
    report: Dict[str, str],
    tone: str = "neutral"
) -> str:
    if tone not in TONE_TEMPLATES:
        raise ValueError(f"Unsupported tone: {tone}")

    output = []
    output.append(f"# Contract Review Report ({tone.upper()})")
    output.append(f"_Style: {TONE_TEMPLATES[tone]['style_note']}_\n")

    for section, content in report.items():
        output.append(format_section(section, content, tone))

    return "\n".join(output)

# ----------------------------
# 4️⃣ Generate Executive Report (short version)
# ----------------------------
def generate_executive_report(report: Dict[str, str], tone: str = "executive") -> str:
    key_sections = [
        "Executive Summary",
        "Overall Risk Assessment",
        "Conclusion & Recommendations",
    ]
    output = ["# Executive Contract Summary\n"]
    for sec in key_sections:
        content = report.get(sec, "")
        output.append(format_section(sec, content, tone))
    return "\n".join(output)

# ----------------------------
# 5️⃣ USAGE
# ----------------------------

# Full reports with different tones
neutral_report = format_full_report(report_dict, tone="neutral")
executive_report = format_full_report(report_dict, tone="executive")
legal_report = format_full_report(report_dict, tone="legal")

# Executive-only short version
executive_summary_only = generate_executive_report(report_dict, tone="executive")

# ----------------------------
# 6️⃣ SAVE FORMATTED REPORTS
# ----------------------------
OUTPUT_DIR = INPUT_FILE.parent
(OUTPUT_DIR / "formatted_reports").mkdir(parents=True, exist_ok=True)

# Save
(OUTPUT_DIR / "formatted_reports/report_neutral.txt").write_text(neutral_report, encoding="utf-8")
(OUTPUT_DIR / "formatted_reports/report_executive.txt").write_text(executive_report, encoding="utf-8")
(OUTPUT_DIR / "formatted_reports/report_legal.txt").write_text(legal_report, encoding="utf-8")
(OUTPUT_DIR / "formatted_reports/report_executive_summary.txt").write_text(executive_summary_only, encoding="utf-8")

print(f"✅ Formatted reports saved in: {OUTPUT_DIR / 'formatted_reports'}")


✅ Formatted reports saved in: /content/my_drive/MyDrive/ClauseAI/data/milestone3/formatted_reports


**FastAPI Backend for Contract Analysis**

In [37]:
# ============================================================
# FastAPI Contract Analysis Backend (Colab + Ngrok) — STABLE
# ============================================================

# -----------------------------
# 1. Install compatible versions
# -----------------------------
!pip install -q fastapi==0.110.0 uvicorn==0.29.0 pyngrok nest-asyncio python-multipart

# -----------------------------
# 2. Imports
# -----------------------------
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import JSONResponse
from typing import List, Dict, Any
import json
import nest_asyncio
from pyngrok import ngrok
import subprocess
import time
import os

# -----------------------------
# 3. Patch asyncio (Colab-safe)
# -----------------------------
nest_asyncio.apply()

# -----------------------------
# 4. Ngrok Auth
# -----------------------------
NGROK_AUTH_TOKEN = "38T4vHLNLquFXmIzfD0fPZHkxkJ_5ce2KBYrRKDXxsSxLrJVm"
!ngrok authtoken {NGROK_AUTH_TOKEN}

# Kill old tunnels to avoid ERR_NGROK_324
ngrok.kill()
print("✅ Old ngrok tunnels killed")

# -----------------------------
# 5. FastAPI App
# -----------------------------
app = FastAPI(title="Contract Analysis API")

# -----------------------------
# 6. Report Utilities
# -----------------------------
def _bulletize(lines: List[str]) -> str:
    return "\n".join(f"- {l}" for l in lines if l)

def build_report(final_json: Dict[str, Any], tone: str = "executive") -> Dict[str, str]:
    risk = final_json.get("overall_risk", "medium")
    hi = final_json.get("high_risk_clauses", [])

    return {
        "Executive Summary": (
            f"Overall contract risk is **{risk.upper()}**. "
            f"{len(hi)} high-risk clauses require review."
        ),
        "Overall Risk Assessment": _bulletize([
            f"Risk level: {risk}",
            f"High-risk clauses detected: {len(hi)}",
        ]),
        "Legal Analysis": _bulletize([
            "Review termination, liability, indemnity clauses."
        ]),
        "Compliance Analysis": _bulletize([
            "Review data protection, audit, and regulatory clauses."
        ]),
        "Financial Analysis": _bulletize([
            "Review payment terms, penalties, and interest clauses."
        ]),
        "Operational Analysis": _bulletize([
            "Review SLA, uptime, service credits."
        ]),
        "Conclusion & Recommendations": _bulletize([
            "Prioritize high-risk clauses before signing.",
            "Seek legal approval if risk is high."
        ])
    }

# -----------------------------
# 7. API Endpoints
# -----------------------------
@app.post("/analyze")
async def analyze_contract(
    file: UploadFile = File(...),
    tone: str = Form("executive")
):
    if not file.filename.endswith(".json"):
        raise HTTPException(status_code=400, detail="Only JSON files allowed")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Empty file uploaded")

    try:
        final_json = json.loads(content)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")

    return JSONResponse(build_report(final_json, tone))

@app.get("/health")
async def health():
    return {"status": "ok"}

# -----------------------------
# 8. Save app.py (REQUIRED)
# -----------------------------
with open("app.py", "w") as f:
    f.write("""
from fastapi import FastAPI
from main import app
""")

# -----------------------------
# 9. Start Uvicorn via subprocess (FIX)
# -----------------------------
PORT = 8000

process = subprocess.Popen(
    ["uvicorn", "__main__:app", "--host", "0.0.0.0", "--port", str(PORT)],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)

time.sleep(3)

# -----------------------------
# 10. Start Ngrok
# -----------------------------
public_url = ngrok.connect(PORT)
print(f"🔗 Public URL: {public_url}")
print("✅ FastAPI server is LIVE")



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/92.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sse-starlette 3.1.2 requires starlette>=0.49.1, but you have starlette 0.36.3 which is incompatible.
google-adk 1.21.0 requires fastapi<0.124.0,>=0.115.0, but you have fastapi 0.110.0 which is incompatible.
google-adk 1.21.0 requires starlette<1.0.0,>=0.49.1, but you have starlette 0.36.3 which is incompatible.
google-adk 1.21.0 requires uvicorn<