# Compliance Report AI-Agent
---


In [1]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import cv2
import os
import shutil

In [2]:
os.makedirs("data", exist_ok=True)
os.makedirs("images/invoice", exist_ok=True)
os.makedirs("images/truck", exist_ok=True)
os.makedirs("ocr_text", exist_ok=True)

In [11]:
def pdf_to_images(pdf_path, output_dir, dpi=300):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Converting {pdf_path} to images...")
    pages = convert_from_path(pdf_path, dpi=dpi)
    image_paths = []
    for i, page in enumerate(pages):
        img_path = os.path.join(output_dir, f"page_{i + 1}.png")
        page.save(img_path, 'PNG')
        image_paths.append(img_path)
    return image_paths

def rotate_and_ocr(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Optional: Try rotating to correct orientation
    # Tesseract works better on upright text
    try:
        config = "--psm 0"
        orientation_info = pytesseract.image_to_osd(gray, config=config)
        angle = int(re.search("(?<=Rotate: )\d+", orientation_info).group(0))
        if angle != 0:
            if angle == 90:
                gray = cv2.rotate(gray, cv2.ROTATE_90_CLOCKWISE)
            elif angle == 180:
                gray = cv2.rotate(gray, cv2.ROTATE_180)
            elif angle == 270:
                gray = cv2.rotate(gray, cv2.ROTATE_90_COUNTERCLOCKWISE)
    except:
        pass  # Fallback if orientation detection fails

    text = pytesseract.image_to_string(gray)
    return text

def extract_text_from_pdf(pdf_path, image_dir, output_txt_path):
    image_paths = pdf_to_images(pdf_path, image_dir)
    full_text = ""
    for img_path in image_paths:
        text = rotate_and_ocr(img_path)
        full_text += text + "\n\n"

    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(full_text)
    
    return full_text

# Process the invoice PDF
invoice_pdf = "data/invoice (1).PDF"
invoice_text_path = "ocr_text/invoice.txt"
invoice_text = extract_text_from_pdf(invoice_pdf, "images/invoice", invoice_text_path)

# Process the truck consignment PDF
truck_pdf = "data/truck consignment, exit certificate, delivery note.pdf"
truck_text_path = "ocr_text/truck.txt"
truck_text = extract_text_from_pdf(truck_pdf, "images/truck", truck_text_path)

# Display first few lines of extracted text for verification
print("--- Invoice Text Preview ---")
print(invoice_text[:800])
print("\n--- Truck Document Text Preview ---")
print(truck_text[:800])

Converting data/invoice (1).PDF to images...
Converting data/truck consignment, exit certificate, delivery note.pdf to images...
--- Invoice Text Preview ---
Tax Invoice

Tax Registration Number QQ

Customer No 128659 Invoice No 626867-ADS1-1
Customer Name Invoice Date 12-Aug-2023
SO No SO-153412
Bill To Address -O. BOX 7946, Due Date 10-Dec-2023
Ruwi, PO No 1002475
Oman
Customer Tax Reg No Sales Man
Phone Delivery Note No 626867-ADS1-1
Fax Currency AED
Contact Person Exchange Rate l
Project Name INTERCOMPANY
Sr Item Code & Description Quantity UOM Unit Price Total Amount VAT Total Amount
No (Excluding VAT) —_Rate(%) Amount (Including VAT)
1 WEICCO Rubber Support Insert 25mm, 35.00 SET 4.82 168.70 0.00 168.70
RSI 3" X 25mm (Width-38mm)
Total Amount (Excluding VAT) 168.70
VAT Amount 0.00
Total Amount in Words AED One Hundred Sixty Eight and 70/100 Only. Total Amount (Including VAT) 168.70
Payment Terms LPO - 120 Days
Bank Details Ship to Address

--- Truck Document Text Preview ---
WOU 

In [12]:
import re

def extract_invoice_data(text):
    data = {}

    try:
        data['invoice_no'] = re.search(r'Invoice\s+No\s*[:\-]?\s*([A-Z0-9\-]+)', text, re.IGNORECASE).group(1)
    except: data['invoice_no'] = None

    try:
        data['invoice_date'] = re.search(r'Invoice\s+Date\s*[:\-]?\s*([0-9]{1,2}[-/][A-Za-z]{3,}[-/][0-9]{4})', text, re.IGNORECASE).group(1)
    except: data['invoice_date'] = None

    try:
        data['po_no'] = re.search(r'PO\s+No\s*[:\-]?\s*([0-9A-Z\-]+)', text, re.IGNORECASE).group(1)
    except: data['po_no'] = None

    try:
        data['customer_name'] = re.search(r'Customer Name\s*:\s*(.+?)\s*Bill To Address', text, re.DOTALL | re.IGNORECASE).group(1).strip()
    except: data['customer_name'] = None

    # Use same value for vendor (same line)
    data['vendor'] = data['customer_name']

    # Extract line items
    line_items = []
    pattern = re.compile(r'(?P<qty>\d+\.\d+)\s+SET\s+(?P<unit_price>\d+\.\d+)\s+(?P<total>\d+\.\d+)', re.MULTILINE)
    for match in pattern.finditer(text):
        try:
            description = "N/A"
            lines = text.split(match.group(0))[0].splitlines()
            if len(lines) >= 2:
                description_line = lines[-2].strip()
                if len(description_line) > 0:
                    description = description_line
            line_items.append({
                'description': description,
                'quantity': float(match.group("qty")),
                'unit_price': float(match.group("unit_price")),
                'total': float(match.group("total"))
            })
        except:
            continue

    data['line_items'] = line_items

    # Try extracting grand total
    try:
        data['total'] = float(re.search(r'Total Amount \(Including\s+VAT\)\s*:\s*([0-9]+\.[0-9]{2})', text).group(1))
    except:
        data['total'] = None

    return data

def extract_delivery_data(text):
    data = {}

    try:
        data['po_no'] = re.search(r'PO\s+No\s*[:\-]?\s*([A-Z0-9\-\/]+)', text, re.IGNORECASE).group(1)
    except: data['po_no'] = None

    try:
        data['delivery_date'] = re.search(r'(Delivery|Exit)?\s*Date\s*[:\-]?\s*([0-9]{1,2}[-/][A-Za-z0-9]+)', text, re.IGNORECASE).group(2)
    except: data['delivery_date'] = None

    quantity_match = re.findall(r'(\d+)\s+(SET|Box|Carton|Unit)', text)
    data['received_items'] = [{'quantity': int(q), 'unit': u} for q, u in quantity_match] if quantity_match else []

    return data

invoice_data = extract_invoice_data(invoice_text)
truck_data = extract_delivery_data(truck_text)

print("--- Extracted Invoice Data ---")
for k, v in invoice_data.items():
    print(f"{k}: {v}")

print("\n--- Extracted Delivery/Truck Data ---")
for k, v in truck_data.items():
    print(f"{k}: {v}")



--- Extracted Invoice Data ---
invoice_no: 626867-ADS1-1
invoice_date: 12-Aug-2023
po_no: 1002475
customer_name: None
vendor: None
line_items: [{'description': 'No (Excluding VAT) —_Rate(%) Amount (Including VAT)', 'quantity': 35.0, 'unit_price': 4.82, 'total': 168.7}]
total: None

--- Extracted Delivery/Truck Data ---
po_no: None
delivery_date: 15/08
received_items: []


In [13]:
import os
import ast
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

# ✅ Load API key from .env
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")

# ✅ Initialize Gemini LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    google_api_key=google_api_key
)

# ✅ Batched prompt template
template = """
You are a document compliance engine. Generate Python functions named `check_rule_N` (where N starts from 1)
that take two dictionaries: `invoice_data` and `delivery_data`.

Each function should evaluate the corresponding rule and return a tuple: (True/False, "explanation").

Please:
- Avoid unescaped quotes inside strings
- Use triple quotes for multi-line explanations if needed
- Keep each function self-contained

Rules:
{rules}
"""

prompt = PromptTemplate(
    input_variables=["rules"],
    template=template
)

# ✅ LangChain RunnableChain using | operator
chain = prompt | llm

def generate_batched_rule_function_code(rules_list):
    joined_rules = "\n".join([f"{i+1}. {rule}" for i, rule in enumerate(rules_list)])
    response = chain.invoke({"rules": joined_rules})
    return response.content

# ✅ Strip markdown ```python fencing
def clean_code_block(code_str: str) -> str:
    lines = code_str.strip().splitlines()
    if lines and lines[0].strip().startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines)

# ✅ Safe exec with syntax check
def safe_exec_code(code_str):
    try:
        ast.parse(code_str)
        local_vars = {}
        exec(code_str, {}, local_vars)
        return local_vars, None
    except SyntaxError as e:
        return None, f"Syntax Error: {e}"
    except Exception as e:
        return None, f"Execution Error: {e}"

def evaluate_compliance(invoice_data, delivery_data, rules_list):
    results = []

    try:
        print("\nEvaluating batched rules...")
        func_code = generate_batched_rule_function_code(rules_list)
        print("----- GENERATED CODE -----")
        print(func_code)

        func_code = clean_code_block(func_code)

        local_vars, error = safe_exec_code(func_code)
        if error:
            results.append(f"⚠️ Error processing batched rules – {error}")
            return results

        for idx, rule in enumerate(rules_list):
            try:
                rule_func_name = f"check_rule_{idx+1}"
                result, explanation = local_vars[rule_func_name](invoice_data, delivery_data)
                icon = "✅" if result else "❌"
                results.append(f"{icon} {rule} → {explanation}")
            except Exception as sub_e:
                results.append(f"⚠️ Error evaluating rule {idx+1}: {rule} – {str(sub_e)}")

    except Exception as e:
        results.append(f"⚠️ Error processing batched rules – {str(e)}")

    return results

# ✅ Example Input
invoice_data = {
    "po_number": "PO123",
    "date": "2024-01-15",
    "invoice_date": "2024-01-15",
    "line_items": [{"item": "AC Unit", "quantity": 5}],
    "total": 9999.00,
    "vendor": "LEMINAR AIR CONDITIONING CO. LLC"
}

truck_data = {
    "po_number": "PO123",
    "date": "2024-01-14",
    "delivery_date": "2024-01-14",
    "line_items": [{"item": "AC Unit", "quantity": 5}]
}

rules_list = [
    "Match invoice PO number with delivery note PO number.",
    "Ensure invoice date is on or after delivery note date.",
    "Each line item in the invoice should have quantity less than or equal to received quantity in the delivery note.",
    "Invoice total amount must be within 2% of sum of all line totals.",
    "Vendor name in the invoice should be LEMINAR AIR CONDITIONING CO. LLC."
]

# ✅ Run Engine
compliance_results = evaluate_compliance(invoice_data, truck_data, rules_list)

# ✅ Print Report
print("\n--- COMPLIANCE REPORT ---")
for line in compliance_results:
    print(line)





Evaluating batched rules...
----- GENERATED CODE -----
```python
def check_rule_1(invoice_data, delivery_data):
    """
    Checks if the invoice PO number matches the delivery note PO number.
    """
    invoice_po = invoice_data.get("po_number")
    delivery_po = delivery_data.get("po_number")

    if invoice_po == delivery_po:
        return True, "Invoice PO number matches delivery note PO number."
    else:
        return False, f"Invoice PO number '{invoice_po}' does not match delivery note PO number '{delivery_po}'."


def check_rule_2(invoice_data, delivery_data):
    """
    Ensures the invoice date is on or after the delivery note date.
    Assumes dates are in YYYY-MM-DD format.
    """
    invoice_date = invoice_data.get("invoice_date")
    delivery_date = delivery_data.get("delivery_date")

    if not invoice_date or not delivery_date:
        return False, "Invoice date or delivery date is missing."

    try:
        from datetime import datetime
        invoice_date_obj

In [14]:
# PHASE 5: Generate Structured Compliance Report
from datetime import datetime

def generate_compliance_report(invoice_data, delivery_data, results, file_path="compliance_report.txt"):
    status = "PASS" if all(r.startswith("✅") for r in results) else "FAIL"
    
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("====== DOCUMENT COMPLIANCE REPORT ======\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("Invoice Details:\n")
        f.write(f"  Invoice No: {invoice_data.get('invoice_no')}\n")
        f.write(f"  Invoice Date: {invoice_data.get('invoice_date')}\n")
        f.write(f"  PO No: {invoice_data.get('po_no')}\n")
        f.write(f"  Vendor: {invoice_data.get('vendor')}\n\n")

        f.write("Delivery Document Details:\n")
        f.write(f"  PO No: {delivery_data.get('po_no')}\n")
        f.write(f"  Delivery Date: {delivery_data.get('delivery_date')}\n")
        f.write(f"  Received Quantities: {delivery_data.get('received_items')}\n\n")

        f.write("Compliance Check Results:\n")
        for line in results:
            f.write(f"  {line}\n")
        
        f.write("\nFinal Status: {}\n".format("✅ COMPLIANT" if status == "PASS" else "❌ NON-COMPLIANT"))
    
    print(f"\n📄 Report saved as '{file_path}'")

# Generate the report
generate_compliance_report(invoice_data, truck_data, compliance_results)



📄 Report saved as 'compliance_report.txt'
