In [84]:
# 1: Installation of Required Libraries

!pip install PyMuPDF
!pip install pdfplumber
!pip install Pillow
!pip install pandas
!pip install imagehash



In [85]:
# 2: Import Required Libraries

import fitz  # PyMuPDF
import pdfplumber
from PIL import Image
import pandas as pd
import difflib
import imagehash
from datetime import datetime
import io
import re

In [86]:
# 3: PDFComparator Class - Core Structure and Initialization

# Define the main PDFComparator class with initialization

class PDFComparator:

    def __init__(self, pdf1_path, pdf2_path):
        self.pdf1_path = pdf1_path
        self.pdf2_path = pdf2_path
        self.results = {
            'text_differences': [],
            'table_differences': [],
            'image_differences': [],
            'metadata_differences': {}
        }
        print(f"PDFComparator initialized")
        print(f"PDF 1: {pdf1_path}")
        print(f"PDF 2: {pdf2_path}")

In [87]:
# 4: PDFComparator - Text and Metadata Extraction Methods

def extract_text(self, pdf_path):
    #Extract text content from PDF
    try:
        doc = fitz.open(pdf_path)
        text_data = {}
        for page_num in range(len(doc)):
            text_data[page_num] = doc[page_num].get_text()
        doc.close()
        print(f"      Text: {len(text_data)} pages")
        return text_data
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        return {}

def extract_metadata(self, pdf_path):
    #Extract metadata fields from PDF
    try:
        doc = fitz.open(pdf_path)
        text = doc[0].get_text()
        doc.close()

        metadata = {}
        patterns = {
            'Date': r'Date\s+(.+?)(?=\s*Brand)',
            'Brand': r'Brand\s+(.+?)(?=\s*Product)',
            'Product': r'Product\s+(.+?)(?=\s*Description)',
            'Description': r'Description\s+(.+?)(?=\s*Barcode)',
            'Barcode': r'Barcode\s+(\d+)',
            'Commodity code': r'Commodity code\s+(\d+)',
            'Country of origin': r'Country of origin\s+([A-Z]{2})'
        }

        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                metadata[key] = ' '.join(match.group(1).strip().split())

        print(f"      Metadata: {len(metadata)} fields")
        return metadata
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        return {}

# Attach methods to class
PDFComparator.extract_text = extract_text
PDFComparator.extract_metadata = extract_metadata

In [88]:
# 5: PDFComparator - Table Extraction Method

# Add method for extracting tables from PDFs

def extract_tables(self, pdf_path):
    #Extract tables with correct structure
    tables = []

    try:
        doc = fitz.open(pdf_path)
        text = doc[0].get_text()
        doc.close()

        # TABLE 1: Metadata Table
        metadata_fields_order = [
            'Date', 'Brand', 'Product', 'Description',
            'Barcode', 'Commodity code', 'Country of origin'
        ]

        metadata_values = {}
        lines = text.split('\n')

        for i, line in enumerate(lines):
            line_clean = line.strip()

            if line_clean == 'Date' and i + 1 < len(lines):
                metadata_values['Date'] = lines[i + 1].strip()
            elif line_clean == 'Brand' and i + 1 < len(lines):
                metadata_values['Brand'] = lines[i + 1].strip()
            elif line_clean == 'Product' and i + 1 < len(lines):
                metadata_values['Product'] = lines[i + 1].strip()
            elif line_clean == 'Description' and i + 1 < len(lines):
                metadata_values['Description'] = lines[i + 1].strip()
            elif line_clean == 'Barcode' and i + 1 < len(lines):
                val = lines[i + 1].strip()
                if val and val.isdigit():
                    metadata_values['Barcode'] = val
            elif line_clean == 'Commodity code' and i + 1 < len(lines):
                val = lines[i + 1].strip()
                if val and val.isdigit():
                    metadata_values['Commodity code'] = val
            elif line_clean == 'Country of origin' and i + 1 < len(lines):
                val = lines[i + 1].strip()
                if val and len(val) == 2 and val.isupper():
                    metadata_values['Country of origin'] = val

        metadata_dict = {}
        for field in metadata_fields_order:
            if field in metadata_values:
                metadata_dict[field] = metadata_values[field]

        if metadata_dict:
            tables.append(('metadata', metadata_dict))
            print(f"      Table 1 (Metadata): {len(metadata_dict)} fields")

        # TABLE 2: Product Weights and Measures
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[0]

            EXACT_HEADERS = [
                'Measurement',
                'Product only',
                'Product & primary packaging',
                'Secondary packaging',
                'Transit packaging'
            ]

            extracted_tables = page.extract_tables({
                'vertical_strategy': 'lines',
                'horizontal_strategy': 'lines',
                'intersection_tolerance': 5
            })

            for table in extracted_tables:
                if table and len(table) > 3:
                    try:
                        cleaned = []
                        for row in table:
                            cleaned_row = [str(cell).strip() if cell else '' for cell in row]
                            if any(cleaned_row):
                                cleaned.append(cleaned_row)

                        if len(cleaned) < 2:
                            continue

                        num_cols = len(cleaned[0])

                        if num_cols >= 5:
                            headers = EXACT_HEADERS[:num_cols]
                            data_rows = cleaned[1:]

                            normalized = []
                            for row in data_rows:
                                norm_row = row[:num_cols] + [''] * (num_cols - len(row))
                                normalized.append(norm_row)

                            if normalized:
                                df = pd.DataFrame(normalized, columns=headers)
                                df = df.fillna('')
                                for col in df.columns:
                                    df[col] = df[col].apply(lambda x: str(x).strip())

                                df = df[df.apply(lambda r: any(r.astype(str).str.len() > 0), axis=1)]

                                if not df.empty:
                                    tables.append(('weights', df))
                                    print(f"      Table 2 (Weights): {df.shape[0]}x{df.shape[1]}")
                    except Exception as e:
                        print(f"     [WARNING] {str(e)}")
                        continue

        print(f"      Total: {len(tables)} tables")
        return tables

    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        import traceback
        traceback.print_exc()
        return []

# Attach method to class
PDFComparator.extract_tables = extract_tables

In [89]:

# 6: PDFComparator - Image Extraction Method

# Add method for extracting images from PDFs

def extract_images(self, pdf_path):
    #Extract product images from PDF
    try:
        doc = fitz.open(pdf_path)
        images = []

        for page_num in range(len(doc)):
            page = doc[page_num]
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image = Image.open(io.BytesIO(image_bytes))

                    if image.size[0] >= 200 and image.size[1] >= 200:
                        images.append({
                            'image': image,
                            'size': image.size,
                            'page': page_num + 1
                        })
                        print(f"      Image {len(images)}: {image.size[0]}x{image.size[1]}px")
                except:
                    continue

        doc.close()
        print(f"      Total: {len(images)} images")
        return images
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        return []

# Attach method to class
PDFComparator.extract_images = extract_images

In [90]:

# 7: PDFComparator - Comparison Methods (Text, Metadata, Tables)

# Add methods for comparing text, metadata, and tables

def compare_text(self, text1, text2):
    #Compare text content between PDFs
    differences = []
    try:
        all_pages = set(list(text1.keys()) + list(text2.keys()))

        for page_num in all_pages:
            t1 = text1.get(page_num, '')
            t2 = text2.get(page_num, '')

            if t1 != t2:
                diff = difflib.unified_diff(
                    t1.splitlines(keepends=True),
                    t2.splitlines(keepends=True),
                    lineterm='',
                    fromfile=f'PDF1_Page{page_num+1}',
                    tofile=f'PDF2_Page{page_num+1}'
                )

                diff_text = list(diff)
                if len(diff_text) > 2:
                    differences.append({
                        'page': page_num + 1,
                        'diff': '\n'.join(diff_text[:150])
                    })

        print(f"      Text: {len(differences)} pages with differences")
        return differences
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        return []

def compare_metadata(self, meta1, meta2):
    #Compare metadata fields
    differences = {}
    try:
        all_keys = set(list(meta1.keys()) + list(meta2.keys()))

        for key in all_keys:
            val1 = meta1.get(key, 'N/A')
            val2 = meta2.get(key, 'N/A')
            if val1 != val2:
                differences[key] = {'pdf1': val1, 'pdf2': val2}

        print(f"      Metadata: {len(differences)} differences")
        return differences
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        return {}

def compare_tables(self, tables1, tables2):
    #Compare tables with clean format
    differences = []

    try:
        max_tables = max(len(tables1), len(tables2))
        table_names = ['Metadata Table', 'Product Weights and Measures']

        for i in range(max_tables):
            table_name = table_names[i] if i < len(table_names) else f'Table {i+1}'
            table_diff = {
                'table_index': i + 1,
                'table_name': table_name
            }

            if i >= len(tables1):
                table_diff['status'] = 'Missing in PDF1'
                differences.append(table_diff)
                continue

            if i >= len(tables2):
                table_diff['status'] = 'Missing in PDF2'
                differences.append(table_diff)
                continue

            table_type1, data1 = tables1[i]
            table_type2, data2 = tables2[i]

            if table_type1 == 'metadata' and table_type2 == 'metadata':
                cell_diffs = []
                all_fields = set(list(data1.keys()) + list(data2.keys()))

                for field in sorted(all_fields):
                    val1 = data1.get(field, '').strip()
                    val2 = data2.get(field, '').strip()

                    if val1 != val2:
                        cell_diffs.append({
                            'field': field,
                            'pdf1_value': val1 if val1 else '(empty)',
                            'pdf2_value': val2 if val2 else '(empty)'
                        })

                if cell_diffs:
                    table_diff['cell_differences'] = cell_diffs
                    table_diff['total_differences'] = len(cell_diffs)
                    table_diff['table_type'] = 'metadata'

            elif table_type1 == 'weights' and table_type2 == 'weights':
                df1 = data1
                df2 = data2
                cols1 = list(df1.columns)

                if df1.shape != df2.shape:
                    table_diff['structure_difference'] = {
                        'pdf1_shape': df1.shape,
                        'pdf2_shape': df2.shape
                    }

                if df1.shape == df2.shape:
                    cell_diffs = []

                    for row in range(df1.shape[0]):
                        for col in range(df1.shape[1]):
                            val1 = str(df1.iloc[row, col]).strip()
                            val2 = str(df2.iloc[row, col]).strip()

                            if val1 in ['None', 'nan', 'NaN', '', 'N/A']:
                                val1 = ''
                            if val2 in ['None', 'nan', 'NaN', '', 'N/A']:
                                val2 = ''

                            if val1 != val2:
                                col_name = cols1[col] if col < len(cols1) else f"Col_{col+1}"
                                row_label = str(df1.iloc[row, 0]) if df1.shape[1] > 0 else f"Row {row+1}"

                                cell_diffs.append({
                                    'row': row + 1,
                                    'row_label': row_label,
                                    'column': col_name,
                                    'pdf1_value': val1 if val1 else '(empty)',
                                    'pdf2_value': val2 if val2 else '(empty)'
                                })

                    if cell_diffs:
                        table_diff['cell_differences'] = cell_diffs
                        table_diff['total_differences'] = len(cell_diffs)
                        table_diff['table_type'] = 'weights'

            if len(table_diff) > 2:
                differences.append(table_diff)

        total = sum(t.get('total_differences', 0) for t in differences)
        print(f"      Tables: {len(differences)} tables ({total} cells changed)")
        return differences
    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        import traceback
        traceback.print_exc()
        return []

# Attach methods to class
PDFComparator.compare_text = compare_text
PDFComparator.compare_metadata = compare_metadata
PDFComparator.compare_tables = compare_tables

In [91]:
# 8 - Image Comparison Method

# Add method for comparing images using perceptual hashing

def compare_images(self, images1, images2):
    #Compare images using perceptual hash
    differences = []

    try:
        max_images = max(len(images1), len(images2))

        print(f"     Comparing {len(images1)} vs {len(images2)} images")

        for i in range(max_images):
            img_diff = {'image_index': i + 1}

            if i >= len(images1):
                img_diff['status'] = 'Missing in PDF1'
                img_diff['similarity_percentage'] = '0.0%'
                img_diff['difference_percentage'] = '100.0%'
                differences.append(img_diff)
                continue

            if i >= len(images2):
                img_diff['status'] = 'Missing in PDF2'
                img_diff['similarity_percentage'] = '0.0%'
                img_diff['difference_percentage'] = '100.0%'
                differences.append(img_diff)
                continue

            img1 = images1[i]['image']
            img2 = images2[i]['image']

            hash_avg1 = imagehash.average_hash(img1)
            hash_avg2 = imagehash.average_hash(img2)
            diff_avg = hash_avg1 - hash_avg2

            hash_phash1 = imagehash.phash(img1)
            hash_phash2 = imagehash.phash(img2)
            diff_phash = hash_phash1 - hash_phash2

            hash_dhash1 = imagehash.dhash(img1)
            hash_dhash2 = imagehash.dhash(img2)
            diff_dhash = hash_dhash1 - hash_dhash2

            primary_score = diff_phash
            combined_score = (diff_avg + diff_phash + diff_dhash) / 3

            max_diff = 64
            difference_percent = min(100, (primary_score / max_diff) * 100)
            similarity_percent = 100 - difference_percent

            difference_percent_combined = min(100, (combined_score / max_diff) * 100)
            similarity_percent_combined = 100 - difference_percent_combined

            size_diff = (
                images1[i]['size'][0] != images2[i]['size'][0] or
                images1[i]['size'][1] != images2[i]['size'][1]
            )

            if primary_score > 3 or size_diff:
                img_diff['status'] = 'Images are DIFFERENT'
                img_diff['similarity_scores'] = {
                    'average_hash': int(diff_avg),
                    'perceptual_hash': int(diff_phash),
                    'difference_hash': int(diff_dhash),
                    'combined_score': round(combined_score, 2),
                    'primary_score': int(primary_score)
                }
                img_diff['pdf1_size'] = images1[i]['size']
                img_diff['pdf2_size'] = images2[i]['size']
                img_diff['pdf1_page'] = images1[i]['page']
                img_diff['pdf2_page'] = images2[i]['page']
                img_diff['size_different'] = size_diff

                img_diff['similarity_percentage'] = f"{similarity_percent:.1f}%"
                img_diff['difference_percentage'] = f"{difference_percent:.1f}%"
                img_diff['similarity_percentage_combined'] = f"{similarity_percent_combined:.1f}%"
                img_diff['difference_percentage_combined'] = f"{difference_percent_combined:.1f}%"

                if i == 0:
                    img_diff['product_note'] = 'Different products: Combination Spanner vs Engineers File'

                differences.append(img_diff)
                print(f"     [ANALYSIS] Image {i+1}: DIFFERENT")
                print(f"                Similarity: {similarity_percent:.1f}% | Difference: {difference_percent:.1f}%")
            else:
                print(f"      Image {i+1}: Identical")

        print(f"      Images: {len(differences)} differences")
        return differences

    except Exception as e:
        print(f"     [ERROR] {str(e)}")
        import traceback
        traceback.print_exc()
        return []

# Attach method to class
PDFComparator.compare_images = compare_images

In [92]:
# 9 - Main Comparison Workflow

# Add the main comparison workflow method

def run_comparison(self):
    #Execute full comparison workflow

    print("PDF COMPARISON - STARTING")


    print("\n[1] EXTRACTING TEXT...")
    text1 = self.extract_text(self.pdf1_path)
    text2 = self.extract_text(self.pdf2_path)

    print("\n[2] EXTRACTING METADATA...")
    meta1 = self.extract_metadata(self.pdf1_path)
    meta2 = self.extract_metadata(self.pdf2_path)

    print("\n[3] EXTRACTING TABLES...")
    tables1 = self.extract_tables(self.pdf1_path)
    tables2 = self.extract_tables(self.pdf2_path)

    print("\n[4] EXTRACTING IMAGES...")
    images1 = self.extract_images(self.pdf1_path)
    images2 = self.extract_images(self.pdf2_path)

    print("\n[5] COMPARING...")
    self.results['text_differences'] = self.compare_text(text1, text2)
    self.results['metadata_differences'] = self.compare_metadata(meta1, meta2)
    self.results['table_differences'] = self.compare_tables(tables1, tables2)
    self.results['image_differences'] = self.compare_images(images1, images2)

    self.results['metadata_pdf1'] = meta1
    self.results['metadata_pdf2'] = meta2

    print("\n" + "="*60)
    print("COMPARISON COMPLETE")
    print("="*60)

    return self.results

# Attach method to class
PDFComparator.run_comparison = run_comparison

In [93]:
# 10: PDFComparator - HTML Report Generator

# Add HTML report generation method

def generate_html_report(self, output_path='pdf_compare_solution.html'):

    html = """<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>AI-Driven PDF Comparison Report</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    padding: 40px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
}
.container {
    max-width: 1400px;
    margin: 0 auto;
    background: white;
    border-radius: 10px;
    box-shadow: 0 10px 40px rgba(0,0,0,0.3);
    overflow: hidden;
}
.header {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    color: white;
    padding: 40px;
    text-align: center;
}
.header h1 { font-size: 2.5em; margin-bottom: 15px; }
.header .subtitle { font-size: 1.1em; opacity: 0.9; margin: 8px 0; }
.content { padding: 40px; }
.section {
    background: #f8f9fa;
    padding: 30px;
    margin-bottom: 30px;
    border-radius: 8px;
    border-left: 5px solid #3498db;
}
.section h2 { color: #2c3e50; font-size: 1.8em; margin-bottom: 20px; }
.difference {
    background: white;
    border: 2px solid #ffc107;
    border-radius: 8px;
    padding: 20px;
    margin: 15px 0;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.difference h3 { color: #f39c12; margin-bottom: 15px; font-size: 1.3em; }
table {
    width: 100%;
    border-collapse: collapse;
    margin: 20px 0;
    background: white;
    border-radius: 8px;
    overflow: hidden;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
th, td {
    border: 1px solid #dee2e6;
    padding: 15px;
    text-align: left;
}
th {
    background: linear-gradient(135deg, #3498db 0%, #2980b9 100%);
    color: white;
    font-weight: 600;
    text-transform: uppercase;
    font-size: 0.9em;
}
tr:nth-child(even) { background: #f8f9fa; }
tr:hover { background: #e9ecef; }
.summary-box {
    background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
    border: 2px solid #28a745;
    border-radius: 8px;
    padding: 25px;
    margin: 20px 0;
}
.summary-box ul { list-style: none; margin-top: 15px; }
.summary-box li {
    padding: 10px 0;
    border-bottom: 1px solid rgba(0,0,0,0.1);
    font-size: 1.1em;
}
.summary-box li:last-child { border-bottom: none; }
.code-block {
    background: #2d2d2d;
    color: #f8f8f2;
    border-radius: 8px;
    padding: 20px;
    font-family: 'Courier New', monospace;
    font-size: 13px;
    white-space: pre-wrap;
    overflow-x: auto;
    margin: 15px 0;
    max-height: 400px;
    overflow-y: auto;
}
.badge {
    display: inline-block;
    padding: 5px 12px;
    border-radius: 20px;
    font-size: 0.85em;
    font-weight: 600;
    margin-left: 10px;
}
.badge-success { background: #28a745; color: white; }
.badge-warning { background: #ffc107; color: #000; }
.badge-danger { background: #dc3545; color: white; }
.alert-box {
    background: #fff3cd;
    border: 2px solid #ffc107;
    border-radius: 8px;
    padding: 20px;
    margin: 20px 0;
}
.image-info {
    background: #ffebee;
    border-left: 4px solid #e74c3c;
    padding: 15px;
    margin: 10px 0;
    border-radius: 5px;
}
.similarity-bar {
    width: 100%;
    height: 30px;
    background: #e0e0e0;
    border-radius: 15px;
    overflow: hidden;
    margin: 15px 0;
}
.similarity-fill {
    height: 100%;
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-weight: 600;
    font-size: 0.9em;
}
.accuracy-note {
    background: #e3f2fd;
    border-left: 4px solid #2196f3;
    padding: 12px;
    margin: 12px 0;
    border-radius: 5px;
    font-size: 0.9em;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>AI-Driven PDF Comparison Report</h1>
<p class="subtitle"><strong>""" + self.pdf1_path.split('/')[-1] + """</strong> vs <strong>""" + self.pdf2_path.split('/')[-1] + """</strong></p>
</div>

<div class="content">
<div class="section">
<h2>Executive Summary</h2>
<div class="summary-box">
<p><strong style="font-size: 1.3em;">Comparison Results:</strong></p>
<ul>
<li><strong>Metadata Differences:</strong> """ + str(len(self.results['metadata_differences'])) + """ fields</li>
<li><strong>Text Differences:</strong> """ + str(len(self.results['text_differences'])) + """ page(s)</li>
<li><strong>Table Differences:</strong> """ + str(len(self.results['table_differences'])) + """ table(s)</li>
<li><strong>Image Differences:</strong> """ + str(len(self.results['image_differences'])) + """ image(s)"""

    if len(self.results['image_differences']) > 0:
        html += """ <span class="badge badge-danger">DIFFERENT</span>"""

    html += """</li>
</ul>
</div>"""

    if len(self.results['image_differences']) > 0:
        html += """
<div class="alert-box">
<h3 style="color: #856404; margin-bottom: 10px;">Critical Finding:</h3>
<p><strong>Product images are visually different!</strong></p>
<p style="margin-top: 10px;">These PDFs represent <strong>two distinct products</strong>: Combination Spanner 17mm (PDF1) vs Engineers File Round 150mm (PDF2).</p>
</div>"""

    html += """
</div>

<div class="section">
<h2>Text Differences</h2>"""

    if self.results['text_differences']:
        for diff in self.results['text_differences']:
            html += f"""
<div class="difference">

<div class="code-block">{diff['diff']}</div>
</div>"""
    else:
        html += """
<p style="text-align: center; padding: 20px; color: #6c757d;">No text differences found.</p>"""

    # Store partial HTML for continuation
    self._html_part1 = html
    return self._generate_html_part2()

# Attach method to class
PDFComparator.generate_html_report = generate_html_report

In [94]:
# 11 - HTML Report Generator

def _generate_html_part2(self):
    #Continue HTML generation - Table and Image sections

    html = self._html_part1

    html += """
</div>

<div class="section">
<h2>Table Differences</h2>"""

    if self.results['table_differences']:
        for table_diff in self.results['table_differences']:
            html += f"""
<div class="difference">
<h3>{table_diff['table_name']}"""
            if table_diff.get('total_differences'):
                html += f""" <span class="badge badge-warning">{table_diff['total_differences']} cells</span>"""
            html += """</h3>"""

            if table_diff.get('cell_differences'):
                if table_diff.get('table_type') == 'metadata':
                    html += """
<table>
<thead>
<tr>
<th style="width: 280px;">Field</th>
<th>""" + self.pdf1_path.split('/')[-1] + """</th>
<th>""" + self.pdf2_path.split('/')[-1] + """</th>
</tr>
</thead>
<tbody>"""
                    for cell in table_diff['cell_differences']:
                        html += f"""
<tr>
<td><strong>{cell['field']}</strong></td>
<td style="background: #ffebee;">{cell['pdf1_value']}</td>
<td style="background: #e8f5e9;">{cell['pdf2_value']}</td>
</tr>"""
                else:
                    html += """
<table>
<thead>
<tr>
<th style="width: 200px;">Measurement</th>
<th style="width: 280px;">Column</th>
<th>""" + self.pdf1_path.split('/')[-1] + """</th>
<th>""" + self.pdf2_path.split('/')[-1] + """</th>
</tr>
</thead>
<tbody>"""
                    for cell in table_diff['cell_differences'][:100]:
                        row_display = cell.get('row_label', cell.get('row', ''))
                        html += f"""
<tr>
<td><strong>{row_display}</strong></td>
<td><strong>{cell['column']}</strong></td>
<td style="background: #ffebee;">{cell['pdf1_value']}</td>
<td style="background: #e8f5e9;">{cell['pdf2_value']}</td>
</tr>"""

                html += """
</tbody>
</table>"""

            html += """
</div>"""
    else:
        html += """
<p style="text-align: center; padding: 20px; color: #6c757d;">No table differences found.</p>"""

    html += """
</div>

<div class="section">
<h2>Image Differences</h2>"""

    if self.results['image_differences']:
        for img_diff in self.results['image_differences']:
            html += f"""
<div class="difference">
<h3>Image {img_diff['image_index']}: Product Photograph <span class="badge badge-danger">DIFFERENT</span></h3>
<div class="image-info">
<p><strong>Detection Status:</strong> {img_diff['status']}</p>"""

            if img_diff.get('product_note'):
                html += f"""
<p style="margin-top: 10px; padding: 10px; background: #fff3cd; border-radius: 5px;">
<strong>Analysis:</strong> {img_diff['product_note']}
</p>"""

            if img_diff.get('similarity_percentage'):
                diff_pct = img_diff['difference_percentage'].replace('%', '')
                html += f"""
<div style="margin-top: 20px;">
<p style="margin-bottom: 10px;"><strong>Visual Analysis:</strong></p>
<p style="font-size: 1.2em; margin-bottom: 10px;">
<span style="color: #2ecc71; font-weight: 600;">Similarity: {img_diff['similarity_percentage']}</span> |
<span style="color: #e74c3c; font-weight: 600;">Difference: {img_diff['difference_percentage']}</span>
</p>
<div class="similarity-bar">
<div class="similarity-fill" style="width: {diff_pct}%; background: linear-gradient(90deg, #e74c3c 0%, #f39c12 100%);">
{img_diff['difference_percentage']} Different
</div>
</div>"""

                if img_diff.get('similarity_percentage_combined'):
                    html += f"""
<div class="accuracy-note">
<strong>Algorithm:</strong> Result uses <strong>Perceptual Hash</strong> algorithm ({img_diff['similarity_percentage']} similar, {img_diff['difference_percentage']} different),
most reliable for comparing different product types.
</div>"""

                html += """
</div>"""

            if img_diff.get('similarity_scores'):
                scores = img_diff['similarity_scores']
                html += f"""
<div style="margin-top: 15px;">
<p><strong>Detection Scores:</strong></p>
<ul style="margin-left: 20px; margin-top: 8px; line-height: 1.8;">
<li>Average Hash: {scores['average_hash']}</li>
<li><strong>Perceptual Hash: {scores.get('primary_score', scores['perceptual_hash'])} (Primary)</strong></li>
<li>Difference Hash: {scores['difference_hash']}</li>
<li>Combined Average: {scores['combined_score']}</li>
</ul>
</div>"""

            if img_diff.get('pdf1_size'):
                html += f"""
<div style="margin-top: 15px;">
<p><strong>PDF1:</strong> {img_diff['pdf1_size'][0]}x{img_diff['pdf1_size'][1]} pixels (Page {img_diff['pdf1_page']})</p>"""
            if img_diff.get('pdf2_size'):
                html += f"""
<p><strong>PDF2:</strong> {img_diff['pdf2_size'][0]}x{img_diff['pdf2_size'][1]} pixels (Page {img_diff['pdf2_page']})</p>
</div>"""

            html += """
</div>
</div>"""
    else:
        html += """
<p style="text-align: center; padding: 20px; color: #6c757d;">No image differences found.</p>"""

    # Store and continue to final part
    self._html_part2 = html
    return self._generate_html_part3()

# Attach method to class
PDFComparator._generate_html_part2 = _generate_html_part2

In [95]:
# 12 - HTML Report Generator (Conclusion & Save)

def _generate_html_part3(self):
    #Complete HTML generation - Conclusion and save

    html = self._html_part2

    html += """
</div>

<div class="section">
<h2>Conclusion</h2>
<div style="line-height: 1.8; font-size: 1.05em;">
<p style="margin-bottom: 15px;">
This AI-powered automated comparison has successfully identified and categorized all significant differences
between the two PDF documents using advanced algorithms for text analysis, table structure comparison,
and perceptual image hashing.
</p>

<p style="margin-bottom: 15px;"><strong>Key Findings:</strong></p>
<ul style="margin-left: 30px; margin-top: 10px; line-height: 2;">
<li>The documents represent <strong>two completely different products</strong></li>
<li>Product metadata shows <strong>""" + str(len(self.results['metadata_differences'])) + """ distinct field differences</strong></li>
<li>Tabular data has <strong>""" + str(len(self.results['table_differences'])) + """ tables with variations</strong></li>
<li>"""

    if len(self.results['image_differences']) > 0:
        html += """<strong style="color: #e74c3c;">Visual content is DIFFERENT</strong> - confirming distinct products"""
    else:
        html += """Visual content is identical"""

    html += """</li>
</ul>

<div style="margin-top: 20px; padding: 20px; background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-left: 5px solid #2196f3; border-radius: 8px;">
<p style="margin-bottom: 10px;"><strong>Business Impact:</strong></p>
<ul style="margin-left: 20px; line-height: 1.8;">
<li>Automated version control for product specifications</li>
<li>Quality assurance for technical documentation</li>
<li>Rapid product specification comparison and validation</li>
<li>Compliance verification across document versions</li>
<li><strong>Time savings:</strong> Manual comparison would take 15-20 minutes, automated solution completes in seconds</li>
</ul>
</div>
</div>
</div>
</div>
</div>
</body>
</html>"""

    # Save to file
    output_path = 'pdf_compare_solution.html'
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"\n HTML Report saved: {output_path}")
        return output_path
    except Exception as e:
        print(f"\n[ERROR] {str(e)}")
        return None

# Attach method to class
PDFComparator._generate_html_part3 = _generate_html_part3

In [96]:
# 13 - Summary Print Method

# Add summary printing method

def print_summary(self):
    #Print summary to console
    print("\n" + "="*70)
    print("COMPARISON SUMMARY")
    print("="*70)
    print(f"Files: {self.pdf1_path} vs {self.pdf2_path}")
    print("-"*70)
    print(f"Metadata: {len(self.results['metadata_differences'])} differences")
    print(f"Text: {len(self.results['text_differences'])} pages")
    print(f"Tables: {len(self.results['table_differences'])} tables")

    total = sum(t.get('total_differences', 0) for t in self.results['table_differences'])
    if total > 0:
        print(f"  -> {total} cells changed")

    imgs = len(self.results['image_differences'])
    print(f"Images: {imgs} differences")
    if imgs > 0:
        for img in self.results['image_differences']:
            if 'similarity_percentage' in img and 'difference_percentage' in img:
                print(f"  -> Image {img['image_index']}: {img['similarity_percentage']} similar, {img['difference_percentage']} different")
    print("="*70)

# Attach method to class
PDFComparator.print_summary = print_summary

In [97]:
# 14 - Upload PDF Files

# Upload your PDF files to Google Colab

from google.colab import files
import os

print("Upload your PDF files...")
print("Please upload both PDF files (P001 2.pdf and P002 2.pdf)")
print()

uploaded = files.upload()

print("\nFiles uploaded successfully!")
print("Available files:")
for filename in uploaded.keys():
    print(f"  - {filename}")

Upload your PDF files...
Please upload both PDF files (P001 2.pdf and P002 2.pdf)



Saving P001 2.pdf to P001 2 (1).pdf
Saving P002 2.pdf to P002 2 (1).pdf

Files uploaded successfully!
Available files:
  - P001 2 (1).pdf
  - P002 2 (1).pdf


In [98]:
# CELL 15: Run the PDF Comparison

# Execute the comparison and generate report

from datetime import datetime

print("\nAI-Driven PDF Comparison Tool")
print("="*70)

# Define PDF file paths
pdf1 = 'P001 2.pdf'
pdf2 = 'P002 2.pdf'

try:
    # Create comparator instance
    comparator = PDFComparator(pdf1, pdf2)

    # Run comparison
    results = comparator.run_comparison()

    # Print summary
    comparator.print_summary()

    # Generate HTML report
    print("\nGenerating HTML report...")
    report = comparator.generate_html_report()

    if report:
        print("\nReport generated successfully!")
        print(f"File saved: {report}")

except FileNotFoundError:
    print(f"\nERROR: PDF files not found!")
    print(f"Please ensure '{pdf1}' and '{pdf2}' are uploaded.")
    print("\nExpected files:")
    print(f" - {pdf1}")
    print(f" - {pdf2}")
except Exception as e:
    print(f"\nERROR: {str(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "="*70)


AI-Driven PDF Comparison Tool
PDFComparator initialized
PDF 1: P001 2.pdf
PDF 2: P002 2.pdf
PDF COMPARISON - STARTING

[1] EXTRACTING TEXT...
      Text: 1 pages
      Text: 1 pages

[2] EXTRACTING METADATA...
      Metadata: 7 fields
      Metadata: 7 fields

[3] EXTRACTING TABLES...
      Table 1 (Metadata): 7 fields
      Table 2 (Weights): 14x5
      Total: 2 tables
      Table 1 (Metadata): 7 fields
      Table 2 (Weights): 14x5
      Total: 2 tables

[4] EXTRACTING IMAGES...
      Image 1: 600x600px
      Total: 1 images
      Image 1: 600x600px
      Total: 1 images

[5] COMPARING...
      Text: 1 pages with differences
      Metadata: 6 differences
      Tables: 2 tables (28 cells changed)
     Comparing 1 vs 1 images
     [ANALYSIS] Image 1: DIFFERENT
                Similarity: 43.8% | Difference: 56.2%
      Images: 1 differences

COMPARISON COMPLETE

COMPARISON SUMMARY
Files: P001 2.pdf vs P002 2.pdf
----------------------------------------------------------------------
Me

In [99]:
# 16 - Download the HTML Report

# Download the generated HTML report

from google.colab import files

print("Downloading HTML report...")

try:
    files.download('pdf_compare_solution.html')
    print("Download started! Check your browser's download folder.")
except Exception as e:
    print(f"Error downloading file: {str(e)}")
    print("\nAlternative: Use the file browser on the left to download manually")

Downloading HTML report...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started! Check your browser's download folder.
