In [2]:
import os
import fitz  # PyMuPDF
from PIL import Image
import io
import re
import numpy as np
import cv2
import csv
import datetime

def is_likely_plot(image):
    """Heuristically decide whether an image is a plot/chart rather than a photograph.

    Three signals are each scored 0-3 and summed (0-9):

    * fraction of pixels that Canny marks as edges,
    * number of distinct colours after coarse quantisation of a 100x100
      thumbnail,
    * number of straight segments found by the probabilistic Hough transform.

    Args:
        image: A PIL image (any mode), or an RGB array-like of shape
            (height, width, 3).

    Returns:
        True when the combined score is at least 5 out of 9.

    Side effects:
        Prints the three raw measurements and the final score.
    """
    # Images embedded in PDFs are frequently palette, grayscale, RGBA or CMYK.
    # cv2.COLOR_RGB2BGR only accepts 3-channel RGB data, so normalise the mode
    # first instead of letting cvtColor raise (or misread the channels).
    to_mode = getattr(image, "convert", None)
    if callable(to_mode):
        image = to_mode("RGB")
    img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # 1. Edge density - plots have more straight lines/edges.
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    edge_ratio = float(np.count_nonzero(edges)) / edges.size

    # 2. Colour variety - plots typically use fewer colours than photos.
    # Resize so every image is compared on the same number of pixels, then
    # quantise each channel to 8 levels (steps of 32).
    thumbnail = cv2.resize(img_cv, (100, 100)).reshape((-1, 3))
    quantised = thumbnail // 32
    unique_colors = np.unique(quantised, axis=0).shape[0]

    # 3. Straight segments via the probabilistic Hough transform.
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=80,
                            minLineLength=50, maxLineGap=10)
    num_lines = 0 if lines is None else len(lines)

    # Debug print
    print(f"Edge ratio: {edge_ratio:.3f}, Unique colors: {unique_colors}, Straight lines: {num_lines}")

    is_plot_score = 0

    # Edge ratio score (0-3)
    if edge_ratio > 0.1:
        is_plot_score += 3
    elif edge_ratio > 0.05:
        is_plot_score += 2
    elif edge_ratio > 0.02:
        is_plot_score += 1

    # Color score (0-3).
    # NOTE: 8 levels per channel allows at most 8**3 = 512 distinct colours,
    # so the `< 700` tier always applies and this signal contributes >= 1.
    if unique_colors < 100:
        is_plot_score += 3
    elif unique_colors < 300:
        is_plot_score += 2
    elif unique_colors < 700:
        is_plot_score += 1

    # Line score (0-3)
    if num_lines > 20:
        is_plot_score += 3
    elif num_lines > 10:
        is_plot_score += 2
    elif num_lines > 5:
        is_plot_score += 1

    # Final decision: at least 5 out of 9 points to be considered a plot.
    print(f"Plot score: {is_plot_score}/9")
    return is_plot_score >= 5

def has_light_background(image):
    """Check if an image has a light background (not necessarily pure white).

    The outermost row/column on each of the four sides is taken as a sample
    of the background. A border pixel counts as light when its brightness
    (mean of R, G and B, or the gray value itself) exceeds 200.

    Args:
        image: A PIL image (any mode), or an array-like shaped
            (height, width) or (height, width, channels).

    Returns:
        True when more than 50% of the border pixels are light; False
        otherwise, including for empty or one-dimensional input.
    """
    # Normalise PIL images to RGB so that palette images are judged on their
    # real colours (not palette indices) and an alpha channel is never
    # averaged into the brightness.
    to_mode = getattr(image, "convert", None)
    if callable(to_mode):
        image = to_mode("RGB")

    pixels = np.asarray(image)
    if pixels.ndim < 2 or pixels.size == 0:
        return False

    # Top row, bottom row, left column, right column. Corner pixels appear
    # twice, exactly as in a side-by-side walk around the frame.
    border = np.concatenate((pixels[0], pixels[-1], pixels[:, 0], pixels[:, -1]))

    if border.ndim == 2:
        # Colour data: average the colour channels only; anything past the
        # third channel (e.g. alpha in a raw RGBA array) is ignored.
        brightness = border[:, :3].mean(axis=1)
    else:
        # Grayscale data: the pixel value already is the brightness.
        brightness = border

    light_fraction = np.count_nonzero(brightness > 200) / len(brightness)
    return bool(light_fraction > 0.5)

# Reference geometry (in pixels) of the plot charts being searched for.
_TARGET_WIDTH = 535
_TARGET_HEIGHT = 369
_TARGET_ASPECT_RATIO = _TARGET_WIDTH / _TARGET_HEIGHT  # ~1.45
_DEFAULT_TITLE = "Unknown Title"


def _collect_plot_images(doc, page, page_number, images_seen):
    """Return the embedded images of *page* that pass every plot filter.

    Args:
        doc: The open PyMuPDF document that owns *page*.
        page: The page whose embedded images are inspected.
        page_number: 1-based page number, used only in error messages.
        images_seen: Number of images already inspected in this PDF; used to
            number the images in the printed messages.

    Returns:
        ``(accepted, images_seen)`` where *accepted* is a list of dicts with
        the keys ``xref``, ``bytes``, ``ext``, ``width``, ``height``,
        ``aspect_ratio`` and ``has_light_bg``, and *images_seen* is the
        updated running count.
    """
    accepted = []
    for img in page.get_images(full=True):
        images_seen += 1
        xref = img[0]

        try:
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]

            image = Image.open(io.BytesIO(image_bytes))
            width, height = image.size

            # Size filter comes first so the ratio below never divides by zero.
            if width < 200 or height < 200:
                print(f"REJECTED: Image {images_seen} - Dimensions {width}x{height} too small")
                continue

            aspect_ratio = width / height
            if aspect_ratio < 0.5 or aspect_ratio > 2.5:
                print(f"REJECTED: Image {images_seen} - Extreme aspect ratio {aspect_ratio:.2f}")
                continue

            # A dark background is recorded (and later flagged in the file
            # name) but is not a reason to reject the image.
            background_light = has_light_background(image)
            if not background_light:
                print(f"NOTE: Image {images_seen} - Does not appear to have a light background")

            width_in_range = 0.7 * _TARGET_WIDTH <= width <= 1.3 * _TARGET_WIDTH
            height_in_range = 0.7 * _TARGET_HEIGHT <= height <= 1.3 * _TARGET_HEIGHT
            aspect_ratio_in_range = (
                0.75 * _TARGET_ASPECT_RATIO <= aspect_ratio <= 1.25 * _TARGET_ASPECT_RATIO
            )
            reasonable_size = 300 <= width <= 800 and 200 <= height <= 600

            is_plot = (
                (width_in_range and height_in_range) or
                (aspect_ratio_in_range and reasonable_size) or
                (0.8 <= aspect_ratio <= 2.0 and min(width, height) >= 250)
            )
            if not is_plot:
                print(f"REJECTED: Image {images_seen} - Not a plot chart (dimensions: {width}x{height}, ratio: {aspect_ratio:.2f})")
                continue

            # Geometry looks right; confirm with content analysis.
            if not is_likely_plot(image):
                print(f"REJECTED: Image {images_seen} - Content analysis indicates this is not a plot chart")
                continue

            accepted.append({
                "xref": xref,
                "bytes": image_bytes,
                "ext": ext,
                "width": width,
                "height": height,
                "aspect_ratio": aspect_ratio,
                "has_light_bg": background_light,
            })
        except Exception as e:
            # Best effort: one unreadable image must not stop the page.
            print(f"Error processing image {images_seen} on page {page_number}: {e}")

    return accepted, images_seen


def _find_title(page, text_blocks, xref):
    """Guess the title of the image *xref* from the text just above it.

    Args:
        page: The page that shows the image.
        text_blocks: The ``"blocks"`` list of ``page.get_text("dict")``.
        xref: Cross-reference number of the image.

    Returns:
        The cleaned title, or ``"Unknown Title"`` when the image cannot be
        located on the page or no text lies in the search area.
    """
    # An image may be placed several times on a page; use the first placement.
    img_rect = next(iter(page.get_image_rects(xref)), None)
    if not img_rect:
        return _DEFAULT_TITLE

    # Search from 150 points above the image down to 10 points below its top
    # edge, widened by 20 points on each side.
    search_area = fitz.Rect(
        img_rect.x0 - 20,
        img_rect.y0 - 150,
        img_rect.x1 + 20,
        img_rect.y0 + 10,
    )
    print(f"Searching for title in area: {search_area}")

    # Debug - capture all text in the search area
    all_text_in_area = page.get_text("text", clip=search_area)
    print(f"All text in search area:\n{all_text_in_area}")

    potential_titles = []
    for block in text_blocks:
        if block["type"] != 0:  # 0 == text block
            continue
        block_rect = fitz.Rect(block["bbox"])
        if not search_area.intersects(block_rect):
            continue
        block_text = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        )
        potential_titles.append({
            "text": block_text.strip(),
            # The joined text contains no newlines, so the line count has to
            # come from the block structure itself.
            "line_count": len(block["lines"]),
            "distance": abs(img_rect.y0 - block_rect.y1),  # gap to image top
            "y_pos": block_rect.y0,  # for top-to-bottom ordering
        })

    # Prefer a short block (1-3 lines, <= 200 chars) within 100 points of
    # the image; the closest one wins.
    short_titles = [
        t for t in potential_titles
        if t["line_count"] <= 3 and len(t["text"]) <= 200 and t["distance"] <= 100
    ]

    title = _DEFAULT_TITLE
    if short_titles:
        title = min(short_titles, key=lambda t: t["distance"])["text"]
    elif potential_titles:
        # Otherwise fall back to the topmost block, truncated if too long.
        title = min(potential_titles, key=lambda t: t["y_pos"])["text"]
        if len(title) > 200:
            title = title[:197] + "..."

    # Strip a leading "Figure 3:" / "Fig. 3." label and collapse whitespace.
    title = re.sub(r'^(Figure|Fig\.)\s+\d+[.:]\s*', '', title)
    return re.sub(r'\s+', ' ', title).strip()


def _build_image_filename(pdf_file, plot_number, title, ext, has_light_bg):
    """Build ``<pdf stem>_plot_<n>[_<title>][_DARK_BG].<ext>``."""
    # splitext keeps dots inside the name ("report.v2.pdf" -> "report.v2");
    # splitting on the first dot would make different PDFs collide.
    stem = os.path.splitext(pdf_file)[0]

    clean_title = re.sub(r'[^\w\s-]', '', title)
    clean_title = re.sub(r'\s+', '_', clean_title)[:50]  # keep file names short

    name = f"{stem}_plot_{plot_number}"
    if clean_title and clean_title != "Unknown_Title":
        name = f"{name}_{clean_title}"
    if not has_light_bg:
        name = f"{name}_DARK_BG"
    return f"{name}.{ext}"


def _extract_plots_from_pdf(doc, pdf_file, output_dir):
    """Save every plot chart found in *doc* (page 1 excluded) to *output_dir*.

    Returns:
        ``(images_seen, images_saved)`` for this PDF.
    """
    images_seen = 0
    images_saved = 0
    page_count = len(doc)

    if page_count > 0:
        print("Skipping page 1 (first page) as requested")

    for page_index in range(1, page_count):
        page = doc[page_index]
        page_number = page_index + 1

        candidates, images_seen = _collect_plot_images(doc, page, page_number, images_seen)
        if not candidates:
            continue

        try:
            text_blocks = page.get_text("dict")["blocks"]
        except Exception as e:
            # Without text the images are still worth saving, just untitled.
            print(f"Error processing text on page {page_number}: {e}")
            text_blocks = []

        for img_data in candidates:
            try:
                title = _find_title(page, text_blocks, img_data["xref"])
            except Exception as e:
                print(f"Error processing text on page {page_number}: {e}")
                title = _DEFAULT_TITLE

            image_filename = _build_image_filename(
                pdf_file, images_saved + 1, title, img_data["ext"], img_data["has_light_bg"]
            )
            image_path = os.path.join(output_dir, image_filename)

            try:
                with open(image_path, "wb") as f:
                    f.write(img_data["bytes"])
            except OSError as e:
                print(f"Error saving image {image_filename}: {e}")
                continue

            # Count only images that actually reached the disk.
            images_saved += 1

            print(f"ACCEPTED: {image_filename}")
            print(f"Dimensions: {img_data['width']}x{img_data['height']}, Ratio: {img_data['aspect_ratio']:.2f}")
            print(f"Has light background: {img_data['has_light_bg']}")
            print(f"Title: {title}")
            print("-" * 50)

    return images_seen, images_saved


def extract_images_from_all_pdfs(pdf_dir, output_dir, csv_path):
    """Extract plot-chart images from every PDF in a folder and log the counts.

    For each ``*.pdf`` file directly inside *pdf_dir* (processed in sorted
    order) the embedded images on all pages except the first are filtered by
    size, aspect ratio and content analysis. Accepted images are written to
    *output_dir*, named after the PDF, a running number and, when one is
    found, the text just above the image. One row per successfully opened PDF
    is appended to the CSV at *csv_path*.

    Args:
        pdf_dir: Folder containing the PDF files.
        output_dir: Folder for the extracted images; created if missing.
        csv_path: CSV file to append to. A header row is written when the
            file does not exist yet or is empty.

    Returns:
        None. Progress and a final summary are printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # An existing but empty file still needs its header row.
    csv_has_content = os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0

    # Sorted so that runs are reproducible regardless of directory order.
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    # One timestamp for the whole run, shared by every CSV row.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    total_pdfs_processed = 0
    total_images_found = 0
    total_images_saved = 0

    # Open CSV file in append mode - open once for all PDFs
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['date_extracted', 'pdf_filename', 'accepted_images_count']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if not csv_has_content:
            writer.writeheader()

        for pdf_number, pdf_file in enumerate(pdf_files, start=1):
            pdf_path = os.path.join(pdf_dir, pdf_file)

            print(f"\n{'='*60}")
            print(f"Processing PDF {pdf_number}/{len(pdf_files)}: {pdf_file}")
            print(f"{'='*60}\n")

            try:
                doc = fitz.open(pdf_path)
            except Exception as e:
                print(f"Error opening PDF {pdf_file}: {e}")
                continue

            try:
                pdf_total_images, pdf_accepted_images = _extract_plots_from_pdf(
                    doc, pdf_file, output_dir
                )
            finally:
                # Release the file handle even if something unexpected escapes.
                doc.close()

            total_images_found += pdf_total_images
            total_images_saved += pdf_accepted_images

            # Write a single CSV entry per PDF
            writer.writerow({
                'date_extracted': timestamp,
                'pdf_filename': pdf_file,
                'accepted_images_count': pdf_accepted_images
            })

            print(f"PDF Summary: {pdf_file}")
            print(f"Images found: {pdf_total_images}")
            print(f"Plot charts saved: {pdf_accepted_images}")
            print(f"{'='*60}\n")

            total_pdfs_processed += 1

    print(f"\nComplete! Processed {total_pdfs_processed} PDFs")
    print(f"Total images found: {total_images_found}")
    print(f"Total plot charts saved: {total_images_saved}")

# Paths
# Hard-coded Windows locations for this run. Note that output_dir is a
# subfolder of pdf_dir; only names ending in ".pdf" are picked up from
# pdf_dir, so the images folder itself is not scanned.
pdf_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs"
output_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs\Images"
csv_path = r"C:\Users\clint\Desktop\Scraping Task\pdf_image_data.csv"

# Extract images from all PDFs and update CSV
# This is a top-level call, so it runs as soon as the cell/script executes.
# The CSV is opened in append mode: repeated runs add rows rather than
# replacing earlier ones.
extract_images_from_all_pdfs(pdf_dir, output_dir, csv_path)


Processing PDF 1/80: 01_2018.pdf

Skipping page 1 (first page) as requested
REJECTED: Image 1 - Dimensions 468x63 too small
Edge ratio: 0.120, Unique colors: 52, Straight lines: 43
Plot score: 9/9
REJECTED: Image 3 - Dimensions 469x14 too small
Searching for title in area: Rect(86.0, -46.0, 520.0, 114.0)
All text in search area:
SEP
COMMERCIAL VEHICLE GU
Average Auction Hammer Price: 3- to 6-Year-Old Benchmark Sleeper Tractor (Nominal Numbers)
           

ACCEPTED: 01_2018_plot_1_Average_Auction_Hammer_Price_3-_to_6-Year-Old_Benc.png
Dimensions: 395x250, Ratio: 1.58
Has light background: True
Title: Average Auction Hammer Price: 3- to 6-Year-Old Benchmark Sleeper Tractor (Nominal Numbers)
--------------------------------------------------
REJECTED: Image 4 - Dimensions 468x63 too small
Edge ratio: 0.110, Unique colors: 34, Straight lines: 37
Plot score: 9/9
Edge ratio: 0.105, Unique colors: 32, Straight lines: 31
Plot score: 9/9
REJECTED: Image 7 - Dimensions 469x14 too small
Searchi