# Task 14.4 Solutions: Document AI Pipeline

This notebook contains solutions to the exercises in the Document AI Pipeline notebook.

---

## Challenge Solution: Document Comparison System

The challenge was to create a function that compares two documents and identifies similarities and differences.

### Approach
We use a multi-step analysis with a vision-language model (VLM):
1. **Individual Analysis**: Extract key info from each document separately
2. **Side-by-Side Composition**: Create a combined image with both documents
3. **Comparative Analysis**: Ask the VLM to compare "left" vs "right" documents
4. **Issue Detection**: Specifically prompt for discrepancies and inconsistencies

### Why Side-by-Side Works Better Than Sequential
- VLMs can directly compare visual elements (layout, fonts, formatting)
- Numerical discrepancies are more obvious when viewed together
- The model can reference "left document" vs "right document" clearly

### Practical Applications
- Invoice verification (comparing PO to invoice)
- Contract comparison (identifying changes between versions)
- Report validation (checking consistency across documents)

In [None]:
import torch
import gc
from PIL import Image, ImageDraw, ImageFont
from typing import Dict, List

def clear_gpu_memory():
    """Run Python garbage collection, then empty the CUDA cache when CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        # Nothing to release on a machine without CUDA.
        return
    torch.cuda.empty_cache()

In [None]:
# Load the VLM and its matching processor
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# One checkpoint name shared by both from_pretrained() calls so the
# processor and the weights always come from the same repository.
qwen_checkpoint = "Qwen/Qwen2-VL-7B-Instruct"

print("Loading Qwen2-VL...")
processor = AutoProcessor.from_pretrained(qwen_checkpoint)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    qwen_checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print("Loaded!")

In [None]:
def ask_about_document(image: Image.Image, question: str) -> str:
    """Send one image and one question to the loaded VLM and return its answer.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image: Document image shown to the model.
        question: Text prompt asked about that image.

    Returns:
        Decoded text of the tokens generated after the prompt.
    """
    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": question}
    conversation = [{"role": "user", "content": [image_part, text_part]}]

    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = processor(
        text=[prompt], images=[image], return_tensors="pt", padding=True
    )
    model_inputs = model_inputs.to(model.device)

    with torch.inference_mode():
        all_ids = model.generate(**model_inputs, max_new_tokens=300, temperature=0.3)

    # Keep only the token positions that come after the prompt's length.
    prompt_length = model_inputs.input_ids.shape[1]
    answer_ids = all_ids[:, prompt_length:]
    decoded = processor.batch_decode(answer_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
def _scale_to_height(image: Image.Image, target_height: int) -> Image.Image:
    """Return a copy of *image* scaled to exactly *target_height* pixels tall.

    The aspect ratio is preserved. Unlike ``Image.thumbnail`` (which only ever
    shrinks), this also enlarges images shorter than the target, so both
    documents end up at the same height and a comparable scale.
    """
    if image.height == target_height:
        return image.copy()
    # Keep at least one pixel of width so very tall, narrow inputs stay valid.
    new_width = max(1, round(image.width * target_height / image.height))
    return image.resize((new_width, target_height), Image.BICUBIC)


def _compose_side_by_side(
    left: Image.Image,
    right: Image.Image,
    target_height: int = 600,
    gap: int = 20,
) -> Image.Image:
    """Return a white RGB canvas with *left* and *right* placed next to each other."""
    left_scaled = _scale_to_height(left, target_height)
    right_scaled = _scale_to_height(right, target_height)

    canvas_width = left_scaled.width + gap + right_scaled.width
    canvas = Image.new('RGB', (canvas_width, target_height), 'white')
    canvas.paste(left_scaled, (0, 0))
    canvas.paste(right_scaled, (left_scaled.width + gap, 0))
    return canvas


def compare_documents(doc1_image: Image.Image, doc2_image: Image.Image) -> Dict[str, str]:
    """
    Compare two documents and identify similarities and differences.

    Each document is first queried on its own (type and key information);
    then both are composed into one side-by-side image that is queried for a
    comparison and for potential issues.

    Args:
        doc1_image: First document image (shown on the left)
        doc2_image: Second document image (shown on the right)

    Returns:
        Dictionary with the model's answers under the keys ``doc1_type``,
        ``doc1_key_info``, ``doc2_type``, ``doc2_key_info``, ``comparison``
        and ``potential_issues``.
    """
    type_question = "What type of document is this? (e.g., invoice, report, form)"
    key_info_question = "List the key information in this document (dates, amounts, names, etc.)"

    results = {}

    # Analyze each document separately with the same pair of questions.
    for index, image in enumerate((doc1_image, doc2_image), start=1):
        print(f"Analyzing Document {index}...")
        results[f'doc{index}_type'] = ask_about_document(image, type_question)
        results[f'doc{index}_key_info'] = ask_about_document(image, key_info_question)

    # Create side-by-side comparison at a common height.
    print("Comparing documents...")
    combined = _compose_side_by_side(doc1_image, doc2_image, target_height=600, gap=20)

    # Ask for comparison
    comparison_prompt = """These are two documents side by side (left and right).
    
Please compare them:
1. Are they the same type of document?
2. What information do they have in common?
3. What are the key differences (different dates, amounts, names, etc.)?
4. Are there any discrepancies that seem concerning?

Be specific about any numerical differences."""

    results['comparison'] = ask_about_document(combined, comparison_prompt)

    # Identify potential issues
    issues_prompt = """Looking at these two documents side by side:
    
Are there any inconsistencies, errors, or discrepancies that might be problematic?
For example: different totals, mismatched dates, or conflicting information.

If everything looks consistent, say so. Otherwise, list the specific issues."""

    results['potential_issues'] = ask_about_document(combined, issues_prompt)

    return results

print("compare_documents() function ready!")

In [None]:
# Create two sample invoices for comparison

def create_invoice(invoice_num: str, total: str, date: str) -> Image.Image:
    """Render a simple 400x500 sample invoice as an RGB image.

    The vendor block and the three line items are fixed; only the invoice
    number, the date and the printed total vary, which makes it easy to
    produce two invoices that differ in a single field.

    Args:
        invoice_num: Invoice number text drawn in the header (e.g. "#001").
        total: Total amount text drawn at the bottom (e.g. "$1,000").
        date: Date text drawn after "Date: ".

    Returns:
        The rendered invoice image.
    """
    img = Image.new('RGB', (400, 500), 'white')
    draw = ImageDraw.Draw(img)

    try:
        font_large = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 20)
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
    except (OSError, ImportError):
        # Font file missing/unreadable, or FreeType support unavailable:
        # fall back to Pillow's built-in font for both sizes. Catching only
        # these keeps KeyboardInterrupt and genuine bugs from being swallowed.
        font_large = ImageFont.load_default()
        font = ImageFont.load_default()

    # Header: title, invoice number and date
    draw.text((20, 20), "INVOICE", fill='navy', font=font_large)
    draw.text((280, 20), invoice_num, fill='black', font=font)
    draw.text((20, 60), f"Date: {date}", fill='black', font=font)

    # Vendor block
    draw.text((20, 100), "TechCorp Inc.", fill='black', font=font)
    draw.text((20, 120), "123 Main Street", fill='gray', font=font)

    # Items: shaded header row followed by one row per line item
    y = 180
    draw.rectangle([20, y, 380, y+25], fill='lightgray')
    draw.text((30, y+5), "Item", fill='black', font=font)
    draw.text((280, y+5), "Amount", fill='black', font=font)

    items = [("Service A", "$500"), ("Service B", "$300"), ("Service C", "$200")]
    y = 210
    for item, amount in items:
        draw.text((30, y), item, fill='black', font=font)
        draw.text((280, y), amount, fill='black', font=font)
        y += 30

    # Total: a rule under the last item, then the caller-supplied total
    draw.line([(20, y+10), (380, y+10)], fill='black', width=2)
    draw.text((200, y+20), "Total:", fill='black', font=font_large)
    draw.text((280, y+20), total, fill='navy', font=font_large)

    return img

# Build two invoices that share number and date; only the printed total differs.
invoice1 = create_invoice(invoice_num="#001", total="$1,000", date="Dec 15, 2024")
invoice2 = create_invoice(invoice_num="#001", total="$1,100", date="Dec 15, 2024")  # Different total!

print("Created two invoices with a discrepancy in the total")

In [None]:
# Run the comparison on the two sample invoices and print a sectioned report
comparison_results = compare_documents(invoice1, invoice2)

divider = "=" * 60
print("\n" + divider)
print("DOCUMENT COMPARISON RESULTS")
print(divider)

# Each entry is (section heading, lines printed beneath that heading).
report_sections = [
    ("\n--- Document 1 ---", [
        f"Type: {comparison_results['doc1_type']}",
        f"Key Info: {comparison_results['doc1_key_info']}",
    ]),
    ("\n--- Document 2 ---", [
        f"Type: {comparison_results['doc2_type']}",
        f"Key Info: {comparison_results['doc2_key_info']}",
    ]),
    ("\n--- Comparison ---", [comparison_results['comparison']]),
    ("\n--- Potential Issues ---", [comparison_results['potential_issues']]),
]

for heading, body_lines in report_sections:
    print(heading)
    for body_line in body_lines:
        print(body_line)

In [None]:
# Cleanup: drop the notebook's references to the processor and model,
# then collect garbage and empty the CUDA cache.
del processor
del model
clear_gpu_memory()
print("Solutions notebook complete!")