# PDF Page Classification Test

Test the PDF processing and page classification system in Google Colab.

## 1. Install Dependencies

In [None]:
# Refresh the apt package index so the install below sees current package versions.
!apt-get update
# System packages: the Tesseract OCR engine and Poppler's PDF utilities
# (presumably the native backends for pytesseract and pdf2image - confirm).
!apt-get install -y tesseract-ocr poppler-utils
# Python packages imported by the cells below.
!pip install pdf2image PyPDF2 pytesseract pillow numpy

## 2. Upload Your PDF

Run the cell below, then use the "Choose Files" button that appears under it to select your PDF file.

In [None]:
from google.colab import files
import os

# Upload PDF: files.upload() shows a file chooser under this cell and
# returns a mapping keyed by the uploaded filenames.
print("Please upload your PDF file:")
uploaded = files.upload()

# Nothing uploaded (for example the chooser was dismissed): fail with a clear
# message instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF file.")

# Get the uploaded filename (the first one if several files were selected)
pdf_filename = next(iter(uploaded))
print(f"\n‚úÖ Uploaded: {pdf_filename}")

## 3. PDF Processor Class

In [None]:
from pdf2image import convert_from_path
from PIL import Image
import PyPDF2
import pytesseract
import numpy as np
import base64
from io import BytesIO
from typing import Dict, List

class PDFProcessor:
    """Render a PDF page by page and classify each page with simple heuristics.

    Classification combines OCR text (keyword matching) with coarse image
    statistics (brightness, dark-pixel ratio, contrast). Progress and
    intermediate measurements are printed as the pages are processed.
    """

    # Pages with more words than this are treated as text-heavy (notes,
    # schedules); pages at or below it are treated as drawings.
    TEXT_HEAVY_WORD_THRESHOLD = 100

    # Ordered rules for text-heavy pages: (keywords, type, title, confidence).
    # The first rule with any keyword contained in the lower-cased text wins.
    _TEXT_PAGE_RULES = (
        (('note', 'specification', 'general', 'description'),
         'notes', "Notes & Specifications", 0.85),
        (('schedule', 'finish', 'door', 'window', 'room'),
         'schedule', "Schedule", 0.80),
    )

    # Ordered rules for drawing pages, same layout as above.
    _DRAWING_PAGE_RULES = (
        (('floor plan', 'plan view', 'first floor', 'second floor'),
         'floor_plan', "Floor Plan", 0.90),
        (('elevation', 'front elevation', 'side elevation'),
         'elevation', "Elevation", 0.90),
        (('electrical', 'power', 'lighting'),
         'electrical_plan', "Electrical Plan", 0.85),
    )

    def __init__(self):
        # Known page type labels; only some of them are produced by the
        # current rules, the rest are reserved names.
        self.page_types = [
            'floor_plan', 'elevation', 'section',
            'electrical_plan', 'plumbing_plan', 'hvac_plan',
            'site_plan', 'detail', 'notes', 'cover_page',
            'schedule', 'unknown'
        ]

    def process_pdf(self, pdf_path: str, output_dir: str = '/content/output') -> Dict:
        """Process PDF and classify each page.

        Each page is rendered at 300 dpi, saved as ``page_<n>.jpg`` inside
        ``output_dir`` (created if missing), classified, and shown as a
        thumbnail.

        Args:
            pdf_path: Path of the PDF file to process.
            output_dir: Directory that receives the rendered page images.

        Returns:
            A dict with ``total_pages`` (page count reported by PyPDF2) and
            ``pages``, a list with one dict per rendered page holding
            ``page_number``, ``type``, ``confidence``, ``title``,
            ``analyzable`` and ``metadata``.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Get total pages
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)

        print(f"\nüìÑ Converting {total_pages} pages to images...")

        pages_data = []

        for page_num in range(1, total_pages + 1):
            # Render one page per call: holding every 300 dpi page of a large
            # drawing set in memory at once can exhaust the notebook's RAM.
            rendered = convert_from_path(
                pdf_path, dpi=300, fmt='jpeg',
                first_page=page_num, last_page=page_num,
            )

            for image in rendered:
                print(f"\n{'='*60}")
                print(f"üìÑ Processing Page {page_num}/{total_pages}")
                print(f"{'='*60}")

                # Save image
                image_path = os.path.join(output_dir, f'page_{page_num}.jpg')
                image.save(image_path, 'JPEG', quality=95)

                # Classify page
                classification = self._classify_page(image_path, image)

                pages_data.append({
                    'page_number': page_num,
                    'type': classification['type'],
                    'confidence': classification['confidence'],
                    'title': classification['title'],
                    'analyzable': classification['analyzable'],
                    'metadata': classification.get('metadata', {})
                })

                # Display results
                print(f"\n‚úÖ Classification Result:")
                print(f"   Type: {classification['type']}")
                print(f"   Title: {classification['title']}")
                print(f"   Analyzable: {'‚úÖ YES' if classification['analyzable'] else '‚ùå NO'}")
                print(f"   Confidence: {classification['confidence']:.2f}")

                # Show thumbnail (work on a copy so the full-size image is untouched)
                thumbnail = image.copy()
                thumbnail.thumbnail((200, 200))
                display(thumbnail)

        return {
            'total_pages': total_pages,
            'pages': pages_data
        }

    def _classify_page(self, image_path: str, image: 'Image.Image') -> Dict:
        """Classify a single page from its rendered image.

        Args:
            image_path: Path of the saved page image. Kept for interface
                compatibility; the classification only reads ``image``.
            image: The page as a PIL image.

        Returns:
            A dict with ``type``, ``confidence``, ``title``, ``analyzable``
            and ``metadata`` (``word_count``, ``aspect_ratio``,
            ``is_drawing``).
        """
        # Extract text with OCR
        print("\nüîç Extracting text with OCR...")
        text = pytesseract.image_to_string(image)
        word_count = len(text.split())

        print(f"   Words extracted: {word_count}")
        if word_count > 0:
            preview = text[:150].replace('\n', ' ')
            print(f"   Text preview: {preview}...")

        # Analyze image
        print("\nüñºÔ∏è Analyzing image properties...")
        img_array = np.array(image)
        height, width = img_array.shape[:2]
        # Guard against a zero-height image so the ratio never divides by zero.
        aspect_ratio = width / height if height else 0.0

        # Drawing detection
        is_drawing = self._looks_like_drawing(img_array)

        print(f"   Aspect ratio: {aspect_ratio:.2f}")
        print(f"   Looks like drawing: {'‚úÖ YES' if is_drawing else '‚ùå NO'}")

        page_type, confidence, title, analyzable = self._apply_rules(
            text.lower(), word_count, is_drawing
        )

        return {
            'type': page_type,
            'confidence': confidence,
            'title': title,
            'analyzable': analyzable,
            'metadata': {
                'word_count': word_count,
                'aspect_ratio': aspect_ratio,
                'is_drawing': is_drawing
            }
        }

    def _apply_rules(self, text_lower: str, word_count: int, is_drawing: bool):
        """Map OCR text and the drawing flag to a classification.

        Args:
            text_lower: Lower-cased OCR text of the page.
            word_count: Number of whitespace-separated words in the text.
            is_drawing: Result of the image-based drawing detection.

        Returns:
            A ``(page_type, confidence, title, analyzable)`` tuple.
        """
        # Text-heavy pages: notes or schedules, never analyzable.
        if word_count > self.TEXT_HEAVY_WORD_THRESHOLD:
            for keywords, page_type, title, confidence in self._TEXT_PAGE_RULES:
                if any(kw in text_lower for kw in keywords):
                    return page_type, confidence, title, False
            return 'unknown', 0.5, "Unknown Page", False

        # Drawing pages: prefer an explicit keyword match.
        for keywords, page_type, title, confidence in self._DRAWING_PAGE_RULES:
            if any(kw in text_lower for kw in keywords):
                return page_type, confidence, title, True

        # No keyword matched: every low-text page falls back to an
        # auto-detected floor plan. This includes pages with exactly
        # TEXT_HEAVY_WORD_THRESHOLD words, which previously slipped between
        # the "> 100" and "< 100" checks and stayed unclassified.
        fallback_confidence = 0.55 if is_drawing else 0.50
        return 'floor_plan', fallback_confidence, "Floor Plan (Auto-detected)", True

    def _looks_like_drawing(self, img_array: np.ndarray) -> bool:
        """Detect if image looks like an architectural drawing.

        A page counts as a drawing when it is mostly white (mean brightness
        above 200), has visible content (more than 5% of pixels darker than
        200) and has contrast (brightness standard deviation above 30).

        Args:
            img_array: Pixel data as a 2-D (grayscale) array or a 3-D array
                whose last axis holds the channels.

        Returns:
            ``True`` if the page looks like a drawing, ``False`` otherwise,
            including when the array is empty or the analysis fails.
        """
        try:
            # An empty array has no pixels to measure.
            if img_array.size == 0:
                return False

            # Boolean (1-bit) pixels are 0/1; scale to the 0-255 range that
            # the thresholds below assume.
            if img_array.dtype == np.bool_:
                img_array = img_array.astype(np.uint8) * 255

            # Convert to grayscale
            if len(img_array.shape) == 3:
                channels = img_array.shape[2]
                # With 2 or 4 channels the last one is taken to be alpha; it is
                # opacity, not brightness, so keep it out of the average.
                if channels in (2, 4):
                    img_array = img_array[..., :channels - 1]
                gray = np.mean(img_array, axis=2).astype(np.uint8)
            else:
                gray = img_array.astype(np.uint8)

            # Calculate statistics
            height, width = gray.shape
            total_pixels = height * width

            avg_brightness = float(np.mean(gray))
            brightness_std = float(np.std(gray))
            dark_pixels = int(np.sum(gray < 200))
            dark_ratio = dark_pixels / total_pixels

            # Drawing characteristics
            mostly_white = avg_brightness > 200
            has_content = dark_ratio > 0.05
            has_contrast = brightness_std > 30

            # Return a native bool (not numpy.bool_) so the value stored in
            # the page metadata stays a plain Python type.
            is_drawing = bool(mostly_white and has_content and has_contrast)

            print(f"   Brightness: {avg_brightness:.1f}")
            print(f"   Dark pixel ratio: {dark_ratio:.3f}")
            print(f"   Contrast (std): {brightness_std:.1f}")

            return is_drawing
        except Exception as e:
            # Best effort: a page that cannot be measured is reported as
            # "not a drawing" instead of aborting the whole run.
            print(f"   ‚ö†Ô∏è Drawing detection failed: {e}")
            return False

# Confirmation shown once the cell defining PDFProcessor has finished running.
print("‚úÖ PDFProcessor class loaded!")

## 4. Process the PDF

In [None]:
# Build the classifier and run it over the uploaded file
processor = PDFProcessor()
result = processor.process_pdf(pdf_filename)

# Pull out the pieces the summary needs
pages = result['pages']
total = result['total_pages']
analyzable_count = len([p for p in pages if p['analyzable']])

# Headline numbers
banner = "=" * 60
print("\n" + banner)
print("üìä SUMMARY")
print(banner)
print(f"Total pages: {total}")
print(f"Analyzable pages: {analyzable_count}")
print(f"Not analyzable: {total - analyzable_count}")

# One line per page: title, analyzable flag, confidence as a percentage
print("\nüìã Page Details:")
for page in pages:
    if page['analyzable']:
        status = "‚úÖ ANALYZABLE"
    else:
        status = "‚ùå NOT ANALYZABLE"
    print(f"  Page {page['page_number']}: {page['title']} - {status} ({page['confidence']:.0%})")

## 5. Detailed Analysis of Each Page

In [None]:
import pandas as pd

# Turn every page record into one display-ready table row
df_data = []
for page in result['pages']:
    meta = page['metadata']
    analyzable_mark = '‚úÖ' if page['analyzable'] else '‚ùå'
    drawing_mark = '‚úÖ' if meta.get('is_drawing', False) else '‚ùå'
    row = {
        'Page': page['page_number'],
        'Type': page['type'],
        'Title': page['title'],
        'Analyzable': analyzable_mark,
        'Confidence': f"{page['confidence']:.0%}",
        'Words': meta.get('word_count', 0),
        'Is Drawing': drawing_mark,
        'Aspect Ratio': f"{meta.get('aspect_ratio', 0):.2f}"
    }
    df_data.append(row)

# Render the rows as a table in the notebook output
df = pd.DataFrame(df_data)
print("\nüìä Detailed Page Analysis:")
display(df)

## 6. Test Individual Page

If you want to test a specific page in detail (run section 4 first, since this cell reads the page images it saved to `/content/output`):

In [None]:
# Page to re-classify; edit this value to inspect a different page
test_page_number = 1

# Load the page image saved under /content/output
image_path = f'/content/output/page_{test_page_number}.jpg'
image = Image.open(image_path)

print(f"Testing Page {test_page_number}:")
classification = processor._classify_page(image_path, image)

# Report the outcome in a single write; the text matches separate prints
result_lines = [
    f"\n‚úÖ Result:",
    f"   Type: {classification['type']}",
    f"   Analyzable: {classification['analyzable']}",
    f"   Confidence: {classification['confidence']:.2f}",
]
print("\n".join(result_lines))

# Show the full-size page
display(image)