# ***Objective*** :
## *Build a mini AI agent that can read, summarise, and analyse the Universal Credit Act 2025 and return a structured JSON report.*

---
# **Steps Followed** :
1. ***Installing the Required Libraries / Setting Up the Environment***
2. ***Loading the Required LLM and Vector (Embedding) Models***
3. ***Building a Single Pipeline That Produces the Result***

---
## *Input File* : *Universal Credit Act 2025*
## *Output File* : *a structured JSON report*

---
# ***Details about the libraries Used*** :
### 1. **Docling** : *Docling is currently one of the best data extractors; it is good at extracting complex data such as tables and the complex structure of an Act, which makes it the best fit for this task.*
### 2. **Langchain** : *Used for the RAG pipeline.*
### 3. **ChromaDB** : *Used to store the vector database.*
### 4. **Transformers** : *Used so that the LLM can run locally.*
### 5. **BitsAndBytes** : *Used because I have a low-end setup. [ Note: it is optional; only use it when your setup has limited memory and GPU. ]*

# ***Details about the LLM models Used*** :
### 1. **'NOMIC-EMBED-TEXT-v1.5' [ Embedding Model ]** : *I used this model because it has an 8192-token context window, which lets it handle long sections or long data better than other models.*
### 2. **'QWEN 2.5-7B ( INSTRUCT VERSION )' [ Generator Model ]** : *I used this model because it is currently one of the best 7B models and can hold a long context, which suits this task.*
---





In [None]:
print("‚è≥ Installing Required Libararies need For this Task. Please Wait It Take some Time.....\n")
# Libraries for Data Extraction From the Raw Pdf
!pip install -q -U docling pydantic

# Libraries For the Vector Data base and LLM models
!pip install -q -U langchain langchain-chromadb langchain-community langchain-huggingface chromadb
!pip install -q -U bitsandbytes accelerate peft transformers

print("‚úÖ All libararies Install Sucessfully .")

In [None]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

def load_ai_models():
    """
    Load the two models the agent needs and return them.

    Embedding model (nomic-embed-text-v1.5): turns text chunks into numerical
    vectors so they can be stored in, and retrieved from, the vector database.

    Generator LLM (Qwen2.5-7B-Instruct): receives the retrieved chunks as
    context and generates the report text.

    Returns:
        tuple: ``(embedding_model, llm)`` where ``embedding_model`` is a
        ``HuggingFaceEmbeddings`` instance and ``llm`` is a
        ``HuggingFacePipeline`` wrapping a text-generation pipeline.
    """
    # Fall back to CPU instead of crashing when no CUDA device is visible.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"

    print("🧠 Loading the LLM Model for the Vector DataBase....")
    embedding_model = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1.5",
        model_kwargs={"trust_remote_code": True, "device": device},
    )

    print("🤖 Loading the Generator : Qwen2.5-7B-Instruct ....")

    model_kwargs = {"device_map": "auto", "trust_remote_code": True}
    if use_cuda:
        # Optional 4-bit NF4 quantization: only needed on setups with limited
        # GPU memory. It is skipped on CPU-only machines.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model_id = "Qwen/Qwen2.5-7B-Instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    # Create the text generation pipeline.
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=2048,
        temperature=0.1,
        repetition_penalty=1.1,
        # Return only the completion. The prompts sent to this model contain a
        # JSON template, so echoing the prompt back would confuse JSON parsing.
        return_full_text=False,
    )

    llm = HuggingFacePipeline(pipeline=text_pipeline)

    print("✅ Both Model loaded Sucessfull. Ready to use !")
    return embedding_model, llm

# Execute the loader once at cell run time. `em_model` and `llm` become
# module-level names that run_legal_agent() reads as globals.
em_model, llm = load_ai_models()

In [None]:
import json
import re
from google.colab import files
from docling.document_converter import DocumentConverter
from langchain_chroma import Chroma
from langchain.text_splitter import MarkdownTextSplitter

# This function extracts the JSON result from the LLM-generated output.
def clean_json_response(text, expected_type="dict"):
    """
    Extract a JSON value of the expected kind from raw LLM output.

    Conversational text around the JSON is ignored. Two strategies are tried
    in order:

    1. Fenced Markdown code blocks (```json ... ``` or ``` ... ```), in the
       order they appear.
    2. A left-to-right scan of the raw text that returns the first position
       from which a complete JSON value can be decoded. Trailing text, even
       text containing stray braces, does not break parsing.

    Args:
        text: Raw model output. Non-string input yields the empty default.
        expected_type: "list" to extract a JSON array of objects; any other
            value extracts a JSON object.

    Returns:
        The decoded ``list`` or ``dict``; an empty ``[]`` / ``{}`` when
        nothing of the expected kind could be parsed.
    """
    wanted = list if expected_type == "list" else dict

    # Non-string input (e.g. None from a failed call) has nothing to parse.
    if not isinstance(text, str):
        return wanted()

    def _parse(candidate):
        """Decode `candidate`; return None unless it is JSON of the wanted type."""
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return value if isinstance(value, wanted) else None

    # Strategy 1: fenced Markdown code blocks.
    fenced_blocks = re.finditer(
        r'```(?:json)?\s*([\[{].*?[\]}])\s*```', text, re.DOTALL
    )
    for block in fenced_blocks:
        parsed = _parse(block.group(1))
        # Compare with None: an empty {} / [] is a valid (falsy) result.
        if parsed is not None:
            return parsed

    # Strategy 2: scan the raw text. For lists we only accept an array whose
    # first element is an object, for dicts any opening curly brace.
    opener = r'\[\s*\{' if wanted is list else r'\{'
    decoder = json.JSONDecoder()
    for candidate in re.finditer(opener, text):
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so whatever follows it in the text is irrelevant.
            value, _ = decoder.raw_decode(text, candidate.start())
        except json.JSONDecodeError:
            continue
        return value

    return wanted()


# This is the main function; it handles the whole task.
'''
How the function works:
1. Upload PDF -> 2. Docling Extraction -> 3. Chroma Indexing -> 4. Qwen Analysis -> 5. Output
'''

def run_legal_agent():
    """
    Run the full pipeline: upload -> extract -> index -> analyse -> download.

    Steps:
        1. Ask the user to upload a PDF through the Colab file widget.
        2. Convert the PDF to Markdown with Docling.
        3. Split the Markdown into chunks and index them in Chroma using the
           module-level embedding model ``em_model``.
        4. Retrieve the most relevant chunks and ask the module-level ``llm``
           for the section-extraction report (Task 3) and the rule checks
           (Task 4).
        5. Merge both results, write them to ``Final_Submission_Output.json``,
           print them, and trigger a browser download.

    Returns:
        None. Returns early when no file is uploaded or nothing could be
        extracted from the document.
    """
    import hashlib  # local import: only needed to fingerprint the upload

    # --- STEP 1: UPLOAD ---
    print("\n📂 Please Upload the PDF....")
    uploaded = files.upload()

    if not uploaded:
        print("❌ No file uploaded. Exiting.")
        return
    pdf_filename = next(iter(uploaded))

    # --- STEP 2: EXTRACTION (VISION) ---
    print(f"🚀 PHASE 1: Extracting structure from {pdf_filename}...")
    # Docling is used here because it extracts tables better than plain OCR.
    converter = DocumentConverter()
    result = converter.convert(pdf_filename)
    full_markdown = result.document.export_to_markdown()
    print(f"   -> Extraction successful. Document length: {len(full_markdown)} chars.")

    # --- STEP 3: INDEXING (RAG) ---
    print("💾 PHASE 2: building RAG Memory (Vector Database)...")
    # Split text into chunks so the AI can "read" specific parts of the document.
    text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.create_documents([full_markdown], metadatas=[{"source": pdf_filename}])

    if not documents:
        # Nothing to index: the vector store and the LLM would have no context.
        print("❌ No text could be extracted from the document. Exiting.")
        return

    # The persist directory is shared between runs. Keying the collection and
    # the chunk ids on the uploaded file's content keeps different documents
    # apart and makes a rerun on the same document overwrite its chunks
    # instead of adding duplicates that would crowd the top-k retrieval.
    doc_digest = hashlib.sha256(uploaded[pdf_filename]).hexdigest()[:32]
    chunk_ids = [f"{doc_digest}-{index}" for index in range(len(documents))]

    # Index documents into ChromaDB using Nomic Embeddings
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=em_model,
        ids=chunk_ids,
        collection_name=f"act_{doc_digest}",
        persist_directory="./chroma_final",
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

    # --- STEP 4: CONTEXT RETRIEVAL ---
    # We ask the database for specific sections to feed the AI
    print("🔍 PHASE 3: Retrieving key legal sections...")
    rag_docs = retriever.invoke("definitions eligibility penalties payments obligations enforcement uplift percentage table")
    rag_text = "\n\n".join(doc.page_content for doc in rag_docs)

    # --- STEP 5: TASK 3 (SECTION EXTRACTION) ---
    print("🤖 PHASE 4: Generating Task 3 Report (Extraction)...")

    prompt_task3 = f"""
    [CONTEXT]
    {rag_text}
    [END CONTEXT]

    You are a legal AI analyst. Analyze the text above from the 'Universal Credit Act 2025'.

    IMPORTANT CONTEXT:
    This is an AMENDMENT ACT. It primarily updates rates and definitions.
    If a specific section (like Penalties or Record Keeping) is not found, do NOT invent one.
    Instead, write: "Not specified in this Amendment (refers to Principal Regulations)."

    INSTRUCTIONS:
    1. For 'payments', specifically look for the 'Relevant uplift percentage' table (e.g. 2.3%, 3.1%).
    2. For 'definitions', extract 'pre-2026 claimant' and 'severe conditions'.
    3. Use '£' for currency.

    OUTPUT: Return ONLY valid JSON.
    {{
      "definitions": "...",
      "obligations": "...",
      "responsibilities": "...",
      "eligibility": "...",
      "payments": "...",
      "penalties": "...",
      "record_keeping": "..."
    }}
    """

    response_3 = llm.invoke(prompt_task3)
    task3_json = clean_json_response(response_3, expected_type="dict")
    if not task3_json:
        # Do not let an unparseable answer pass silently as a finished report.
        print("WARNING: Task 3 response could not be parsed as a JSON object; section left empty.")

    # --- STEP 6: TASK 4 (RULE CHECKS) ---
    print("🤖 PHASE 5: Performing Task 4 (Rule Verification)...")

    prompt_task4 = f"""
    [CONTEXT]
    {rag_text}
    [END CONTEXT]

    Perform a strict compliance check on the 'Universal Credit Act 2025'.
    Return a JSON LIST of exactly 6 objects representing the rules below.

    RULES TO CHECK:
    1. Act must define key terms (Pass if 'pre-2026 claimant' is defined)
    2. Act must specify eligibility criteria (Pass if criteria for severe conditions is mentioned)
    3. Act must specify responsibilities of authority (Pass if Secretary of State powers are mentioned)
    4. Act must include enforcement or penalties (Fail if no specific penalties in this text)
    5. Act must include payment calculation (Pass if uplift percentages are found)
    6. Act must include record-keeping (Fail or Pass based on 'Information Requirements')

    OUTPUT: Return ONLY valid JSON List.
    [
      {{ "rule": "Act must define key terms", "status": "pass", "evidence": "...", "confidence": 100 }},
      ...
    ]
    """

    response_4 = llm.invoke(prompt_task4)
    task4_json = clean_json_response(response_4, expected_type="list")
    if not task4_json:
        print("WARNING: Task 4 response could not be parsed as a JSON list; section left empty.")

    # --- STEP 7: MERGE & DOWNLOAD ---
    final_output = {
        "task_3_extraction": task3_json,
        "task_4_rule_checks": task4_json
    }

    filename = "Final_Submission_Output.json"
    # Explicit UTF-8 so characters such as '£' are written readably and
    # identically on every platform.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=4, ensure_ascii=False)

    print("\n" + "="*50)
    print("🎉 SUCCESS: Report Generated & Saved.")
    print("="*50)
    print(json.dumps(final_output, indent=2, ensure_ascii=False))

    files.download(filename)

# Start the agent: runs the upload -> extraction -> indexing -> analysis ->
# download pipeline defined in run_legal_agent().
run_legal_agent()