In [1]:
# %% [1] INGESTION: Build Policy Vector Store
import json
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# 1. Load API Key
# Read variables from a local .env file (if any) into the environment, then
# warn -- without stopping the cell -- when the Gemini key is absent or empty.
load_dotenv()
if os.environ.get("GOOGLE_API_KEY", "") == "":
    # For a quick local test the key can be assigned to
    # os.environ["GOOGLE_API_KEY"] right here; never keep a real key in a
    # saved or shared copy of the notebook.
    print("‚ùå Error: GOOGLE_API_KEY not found. Please check your .env file.")

# 2. Configuration
# Both paths are relative to the working directory. The CUAD JSON sits inside
# the 'CUAD_v1' folder; the vector store is written to its own folder.
VECTOR_DB_PATH = "policy_vectorstore"
DATASET_PATH = os.path.join("CUAD_v1", "CUAD_v1.json")

def ingest_policy_data():
    """Build and persist the FAISS policy vector store from one CUAD contract.

    Reads the CUAD JSON at ``DATASET_PATH``, takes the contract at index 0,
    joins its paragraph contexts into a single text, splits that text into
    overlapping chunks, embeds the chunks with the Gemini embedding model and
    saves the resulting FAISS index to the ``VECTOR_DB_PATH`` folder.

    Progress and problems are reported with ``print``. The function returns
    early, without creating a vector store, when the dataset file is missing,
    when the dataset holds no contract at the target index, or when the
    selected contract contains no text.

    Returns:
        None
    """
    if not os.path.exists(DATASET_PATH):
        print(f"‚ùå Error: File not found at {DATASET_PATH}")
        return

    print(f"Loading dataset from {DATASET_PATH}...")
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 3. Extract ONE Target Contract
    # Index 0 is the first contract in the dataset; it acts as the
    # "Company Policy" or "Contract" we want to audit. In the run recorded in
    # this notebook that entry was
    # "LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT".
    target_index = 0
    contracts = (data.get('data') or []) if isinstance(data, dict) else []
    if target_index >= len(contracts):
        # Same report-and-return handling as the missing-file case, instead of
        # an unexplained KeyError/IndexError.
        print(f"‚ùå Error: No contract at index {target_index} in {DATASET_PATH}")
        return

    contract_data = contracts[target_index]
    contract_title = contract_data.get('title', 'Unknown Contract')

    print(f"üìÑ Processing Document: {contract_title}")

    # 4. Extract Text Content
    # The JSON splits text into paragraphs -> context. Join them back together,
    # each followed by a blank line, in a single pass.
    full_text = "".join(
        paragraph['context'] + "\n\n"
        for paragraph in contract_data.get('paragraphs') or []
    )

    print(f"   - Extracted {len(full_text)} characters.")

    if not full_text.strip():
        # Nothing to embed: stop here rather than asking FAISS to build an
        # index from zero chunks.
        print(f"‚ùå Error: Contract '{contract_title}' contains no text to index.")
        return

    # 5. Chunking
    # Large chunks (2000 chars) because legal clauses (like Indemnification)
    # can be long; the 200-char overlap keeps a clause that straddles a chunk
    # boundary visible in both neighbouring chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200
    )

    doc_obj = Document(
        page_content=full_text,
        metadata={"source": contract_title, "type": "Commercial Agreement"}
    )

    chunks = text_splitter.split_documents([doc_obj])
    print(f"   - Split into {len(chunks)} chunks.")

    # 6. Embed & Save
    # The audit cell must load the store with the same embedding model name.
    print("üß† Generating Embeddings (talking to Gemini)...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(VECTOR_DB_PATH)

    print(f"‚úÖ Success! Policy Vector Store saved to folder: '{VECTOR_DB_PATH}'")

# Run it
# __name__ is "__main__" when this cell is executed in the notebook kernel (the
# recorded output below shows the ingestion ran) or when the code is run as a
# script; importing the code as a module skips the call.
if __name__ == "__main__":
    ingest_policy_data()

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset from CUAD_v1\CUAD_v1.json...
üìÑ Processing Document: LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT
   - Extracted 54292 characters.
   - Split into 37 chunks.
üß† Generating Embeddings (talking to Gemini)...
‚úÖ Success! Policy Vector Store saved to folder: 'policy_vectorstore'


In [3]:
# %% [2] COMPLIANCE AUDIT AGENT
import json
import pandas as pd
import time
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# 1. Define the 15 Compliance Rules (Task Requirement)
# Every rule is written as an (id, name, desc) row and expanded into a dict
# with exactly those three keys, in that order.
rules = [
    dict(zip(("id", "name", "desc"), row))
    for row in (
        ("R01", "Governing Law", "Must specify which state/country laws apply."),
        ("R02", "Termination", "Must allow termination for cause or breach."),
        ("R03", "Confidentiality", "Must include a confidentiality/non-disclosure clause."),
        ("R04", "Indemnification", "Must have an indemnification clause protecting parties."),
        ("R05", "Force Majeure", "Must excuse performance due to unforeseen events (Acts of God)."),
        ("R06", "Assignment", "Must specify if assignment to third parties is allowed/restricted."),
        ("R07", "Intellectual Property", "Must clarify IP ownership (who owns created work)."),
        ("R08", "Dispute Resolution", "Must specify arbitration, mediation, or litigation."),
        ("R09", "Payment Terms", "Must specify when payments are due (e.g., Net 30)."),
        ("R10", "Liability Cap", "Should limit liability to a specific amount."),
        ("R11", "Non-Compete", "Check for restrictions on competing with the other party."),
        ("R12", "Non-Solicit", "Restriction on poaching employees or customers."),
        ("R13", "Severability", "If one part is invalid, the rest remains in effect."),
        ("R14", "Entire Agreement", "This contract supersedes previous agreements."),
        ("R15", "Waiver", "Failure to enforce a right does not waive that right."),
    )
]

# Save rules to JSON (Deliverable): written next to the notebook so the rule
# set is available as a standalone file.
with open("compliance_rules.json", "w") as rules_file:
    rules_file.write(json.dumps(rules, indent=4))

# 2. Load the Policy Brain
# Re-open the FAISS index from the "policy_vectorstore" folder written by the
# ingestion cell. The embedding model name here is the same one used at
# ingestion time, so queries and stored chunks are embedded consistently.
print("Loading Policy Vector Store...")
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
# NOTE(review): allow_dangerous_deserialization=True presumably opts in to
# loading pickled data from disk -- confirm against the LangChain FAISS docs.
# Only load a store that this notebook created itself, never a folder received
# from an untrusted source.
vector_store = FAISS.load_local(
    "policy_vectorstore", 
    embeddings, 
    allow_dangerous_deserialization=True
)
# k=3: presumably the number of chunks handed to the LLM per rule -- TODO confirm.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# 3. Setup the Auditor LLM
llm = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash", temperature=0.1) # Low temp for strict logic

# 4. The Audit Prompt
# The template has two slots: {question} (the rule being checked) and {context}
# (the contract text supplied by the retriever). The model is asked to answer
# in a fixed three-line format: Verdict / Evidence / Reasoning.
# The bracketed options on the Verdict line are placeholders; a model may echo
# the brackets literally, so any downstream parsing should tolerate them.
audit_template = """
You are a strict Legal Compliance Auditor. 
Analyze the retrieved contract clauses to determine if the following rule is satisfied.

Rule: {question}

Instructions:
1. Search the Context for a clause that matches the Rule.
2. If found, extract the exact text and determine if it is COMPLIANT.
3. If NOT found, mark it as NON-COMPLIANT.

Context:
{context}

Format your answer EXACTLY like this:
Verdict: [COMPLIANT / NON-COMPLIANT]
Evidence: [Quote the specific clause or say 'No clause found']
Reasoning: [Brief explanation]

Answer:
"""
prompt = PromptTemplate(template=audit_template, input_variables=["context", "question"])

# Wire retriever + LLM + prompt into a single question-answering chain.
# NOTE(review): chain_type="stuff" presumably places all retrieved chunks into
# {context} in one prompt, and the value passed under "query" at invoke time
# presumably fills {question} -- confirm against the LangChain RetrievalQA docs.
audit_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

# 5. Run the Audit Loop
def _parse_verdict(raw_answer):
    """Extract the verdict from the model's formatted answer.

    Scans the answer line by line for a "Verdict" label and reads the value
    that follows it. Case, markdown emphasis (``**Verdict:**``) and literal
    brackets (``Verdict: [COMPLIANT]``) are tolerated, because the prompt's
    format line shows the options inside brackets and models often echo that
    formatting.

    Args:
        raw_answer: Text returned by the audit chain.

    Returns:
        "COMPLIANT" only when a verdict line unambiguously says so; otherwise
        "NON-COMPLIANT" (the conservative default, also used when no verdict
        line is present or when both options are echoed on the same line).
    """
    for line in str(raw_answer).splitlines():
        cleaned = line.upper()
        for junk in "*_`[]":
            cleaned = cleaned.replace(junk, "")
        pos = cleaned.find("VERDICT")
        if pos == -1:
            continue
        # Compact the value so "NON-COMPLIANT", "NON COMPLIANT" and
        # "NONCOMPLIANT" all compare equal.
        value = cleaned[pos + len("VERDICT"):]
        compact = value.replace("-", "").replace(" ", "").lstrip(":")
        if compact.startswith("NONCOMPLIANT"):
            return "NON-COMPLIANT"
        if compact.startswith("COMPLIANT"):
            # "COMPLIANT / NON-COMPLIANT" is the unfilled template, not a verdict.
            return "NON-COMPLIANT" if "NONCOMPLIANT" in compact else "COMPLIANT"
        # A line that merely mentions "verdict" without a value: keep scanning.
    return "NON-COMPLIANT"


results = []
print(f"üöÄ Starting Audit of {len(rules)} Rules...")

for rule in rules:
    print(f"   - Checking Rule {rule['id']}: {rule['name']}...")

    # We ask the RAG system to find evidence for the rule description
    query = f"Find the clause regarding '{rule['name']}'. {rule['desc']}"

    try:
        response = audit_chain.invoke({"query": query})
        raw_answer = response['result']
        verdict = _parse_verdict(raw_answer)

        results.append({
            "Rule ID": rule['id'],
            "Rule Name": rule['name'],
            "Verdict": verdict,
            "AI Analysis": raw_answer
        })

    except Exception as e:
        # Keep auditing the remaining rules; record the failure with the same
        # keys as a successful row so the report columns stay complete.
        print(f"‚ùå Error on {rule['name']}: {e}")
        results.append({
            "Rule ID": rule['id'],
            "Rule Name": rule['name'],
            "Verdict": "ERROR",
            "AI Analysis": str(e)
        })

    # Sleep to avoid hitting API rate limits. This runs after failures too,
    # since a rate-limit error is exactly when backing off matters.
    time.sleep(1.5)

# 6. Save Report
# Build the frame with an explicit column list so the report always has the
# same four columns in the same order. Without it, the columns follow the keys
# of whichever record comes first, and the "Rule Name" selection below raises
# KeyError when `results` is empty or no record carries that key.
df = pd.DataFrame(results, columns=["Rule ID", "Rule Name", "Verdict", "AI Analysis"])
df.to_csv("compliance_report.csv", index=False)

print("\n‚úÖ Audit Complete!")
print("Results saved to 'compliance_report.csv'.")
print("\nSample Output:")
# Only the first five rows are shown; the full analysis text is in the CSV.
print(df[["Rule Name", "Verdict"]].head(5))

Loading Policy Vector Store...
üöÄ Starting Audit of 15 Rules...
   - Checking Rule R01: Governing Law...
   - Checking Rule R02: Termination...
   - Checking Rule R03: Confidentiality...
   - Checking Rule R04: Indemnification...
   - Checking Rule R05: Force Majeure...
   - Checking Rule R06: Assignment...
   - Checking Rule R07: Intellectual Property...
   - Checking Rule R08: Dispute Resolution...
   - Checking Rule R09: Payment Terms...
   - Checking Rule R10: Liability Cap...
   - Checking Rule R11: Non-Compete...
   - Checking Rule R12: Non-Solicit...
   - Checking Rule R13: Severability...
   - Checking Rule R14: Entire Agreement...
   - Checking Rule R15: Waiver...

‚úÖ Audit Complete!
Results saved to 'compliance_report.csv'.

Sample Output:
         Rule Name    Verdict
0    Governing Law  COMPLIANT
1      Termination  COMPLIANT
2  Confidentiality  COMPLIANT
3  Indemnification  COMPLIANT
4    Force Majeure  COMPLIANT
