In [None]:
import os
import random
import fitz  # PyMuPDF
from PIL import Image
import io

def _plot_rejection_reason(width, height, target_width=535, target_height=369):
    """Return why a ``width`` x ``height`` image is not plot-like, or ``None``.

    The heuristic works on pixel dimensions only. An image is rejected when
    either side is below 200 px or its width/height ratio falls outside
    [0.5, 2.5]. Otherwise it qualifies if ANY of the following holds:

    * both sides are within 30% of the target size,
    * the ratio is within 25% of the target ratio and the size lies in
      300-800 px wide by 200-600 px high,
    * the ratio is in [0.8, 2.0] and the shorter side is at least 250 px.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        target_width: Reference chart width in pixels.
        target_height: Reference chart height in pixels.

    Returns:
        A human-readable rejection reason, or ``None`` when the image passes.
    """
    # The size guard runs first so the ratio below never divides by zero.
    if width < 200 or height < 200:
        return f"Dimensions {width}x{height} too small"

    aspect_ratio = width / height
    if aspect_ratio < 0.5 or aspect_ratio > 2.5:
        return f"Extreme aspect ratio {aspect_ratio:.2f}"

    target_aspect_ratio = target_width / target_height
    near_target_size = (
        0.7 * target_width <= width <= 1.3 * target_width
        and 0.7 * target_height <= height <= 1.3 * target_height
    )
    near_target_ratio = (
        0.75 * target_aspect_ratio <= aspect_ratio <= 1.25 * target_aspect_ratio
    )
    reasonable_size = 300 <= width <= 800 and 200 <= height <= 600

    is_plot = (
        near_target_size
        or (near_target_ratio and reasonable_size)
        or (0.8 <= aspect_ratio <= 2.0 and min(width, height) >= 250)
    )
    if not is_plot:
        return f"Not a plot chart (dimensions: {width}x{height}, ratio: {aspect_ratio:.2f})"
    return None


def extract_images_from_random_pdf(pdf_dir, output_dir):
    """Save the plot-like embedded images of one randomly chosen PDF.

    A PDF file is picked at random from ``pdf_dir``; every image embedded in
    its pages is extracted and kept only if its pixel dimensions pass the
    plot-chart heuristic. Kept images are written to ``output_dir`` as
    ``<pdf stem>_plot_<n>.<ext>`` and a per-image verdict is printed.

    Args:
        pdf_dir: Directory that is scanned (non-recursively) for ``.pdf`` files.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        None. Results are reported on stdout and as files in ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only regular files: a folder that happens to be named "*.pdf" cannot be opened.
    pdf_files = [
        name
        for name in os.listdir(pdf_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_dir, name))
    ]
    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    random_pdf = random.choice(pdf_files)
    pdf_path = os.path.join(pdf_dir, random_pdf)
    print(f"Selected PDF: {random_pdf}")

    # splitext keeps dots inside the stem ("report.v2.pdf" -> "report.v2").
    pdf_stem = os.path.splitext(random_pdf)[0]

    total_images = 0
    accepted_images = 0

    # The context manager closes the document even if an image fails mid-run.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                total_images += 1
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                ext = base_image["ext"]

                # Decode only to read the pixel size; the raw bytes are what gets saved.
                try:
                    with Image.open(io.BytesIO(image_bytes)) as image:
                        width, height = image.size
                except OSError:
                    # Pillow could not decode this stream; skip it rather than
                    # abort the whole document.
                    print(f"REJECTED: Image {total_images} - Unreadable image data (format: {ext})")
                    continue

                reason = _plot_rejection_reason(width, height)
                if reason is not None:
                    print(f"REJECTED: Image {total_images} - {reason}")
                    continue

                accepted_images += 1
                image_filename = f"{pdf_stem}_plot_{accepted_images}.{ext}"
                image_path = os.path.join(output_dir, image_filename)
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                print(f"ACCEPTED: {image_filename}")
                print(f"Dimensions: {width}x{height}, Aspect ratio: {width / height:.2f}")
                print("-" * 50)

    print(f"Total images found: {total_images}")
    print(f"Plot charts saved: {accepted_images}")

# Source folder holding the PDFs, and the folder that receives the saved charts
pdf_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs"
output_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs\Images"

# Pick one PDF at random from pdf_dir and save its plot-like images to output_dir
extract_images_from_random_pdf(pdf_dir=pdf_dir, output_dir=output_dir)

Selected PDF: 06_2019.pdf
REJECTED: Image 1 - Extreme aspect ratio 5.38
ACCEPTED: 06_2019_plot_1.jpeg
Dimensions: 534x356, Aspect ratio: 1.50
--------------------------------------------------
REJECTED: Image 3 - Dimensions 540x10 too small
REJECTED: Image 4 - Dimensions 141x45 too small
REJECTED: Image 5 - Dimensions 540x10 too small
REJECTED: Image 6 - Not a plot chart (dimensions: 557x863, ratio: 0.65)
REJECTED: Image 7 - Dimensions 30x1680 too small
REJECTED: Image 8 - Dimensions 141x45 too small
REJECTED: Image 9 - Dimensions 540x10 too small
ACCEPTED: 06_2019_plot_2.jpeg
Dimensions: 1485x953, Aspect ratio: 1.56
--------------------------------------------------
ACCEPTED: 06_2019_plot_3.jpeg
Dimensions: 1487x876, Aspect ratio: 1.70
--------------------------------------------------
REJECTED: Image 12 - Extreme aspect ratio 0.47
REJECTED: Image 13 - Dimensions 30x1918 too small
REJECTED: Image 14 - Dimensions 141x45 too small
REJECTED: Image 15 - Dimensions 540x10 too small
ACCEPTE