In [6]:
import os
import fitz  # PyMuPDF
import numpy as np
import cv2
from PIL import Image

def extract_charts_from_pdf(pdf_path, output_folder):
    """
    Extract likely chart regions from every page of a PDF and save them as PNGs.

    Each page is rendered at 4x zoom and thresholded so that pixels with a
    gray level of 240 or below become foreground. The 15 largest external
    contours are then screened by area, bounding-box aspect ratio and
    foreground density. Every accepted region is cropped and saved together
    with a small text file of its metrics. For each page an annotated
    detection image and the binary mask are written as well.

    When the contour pass accepts nothing on a page, a Hough-line fallback
    splits the page into a 3x3 grid and saves each cell that is hit by more
    than five detected line segments. If that also yields nothing (including
    the case where no lines are detected at all), the whole page is saved.

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Directory that receives all output files; it is
            created if it does not exist.

    Returns:
        None. Results are written to ``output_folder`` and progress is
        printed to stdout.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF file
    doc = fitz.open(pdf_path)

    try:
        print(f"Processing PDF: {pdf_path}")
        print(f"Total pages: {len(doc)}")

        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num+1}...")

            # Render page at high resolution
            zoom = 4
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to numpy array for OpenCV processing
            img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
            img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

            # Make a copy for visualization
            img_display = img_cv.copy()

            # Convert to grayscale for processing
            gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

            # Inverted threshold: anything that is not near-white becomes foreground
            _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)

            # Find contours
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Sort contours by area (largest first)
            contours = sorted(contours, key=cv2.contourArea, reverse=True)

            # Page area in pixels; constant for the page, so computed once
            total_area = img_cv.shape[0] * img_cv.shape[1]

            # Process large contours that might be charts
            chart_count = 0
            for contour in contours[:15]:  # Only the 15 largest contours are considered
                area = cv2.contourArea(contour)

                # Skip regions below 2% or above 95% of the page area
                if area < (total_area * 0.02) or area > (total_area * 0.95):
                    continue

                # Get bounding rectangle
                x, y, w, h = cv2.boundingRect(contour)

                # Skip extremely elongated regions
                aspect_ratio = float(w) / h
                if aspect_ratio < 0.1 or aspect_ratio > 8:
                    continue

                # Skip regions whose bounding box is almost empty
                # (fewer than 5% foreground pixels)
                roi = binary[y:y+h, x:x+w]
                density = np.count_nonzero(roi) / (w * h)
                if density < 0.05:
                    continue

                # Cut out the region
                chart_region = img_cv[y:y+h, x:x+w]

                # Draw the rectangle on the visualization image
                cv2.rectangle(img_display, (x, y), (x+w, y+h), (0, 255, 0), 10)

                # Save the chart region
                chart_count += 1
                chart_filename = f"chart_page{page_num+1}_region{chart_count}.png"
                chart_path = os.path.join(output_folder, chart_filename)
                cv2.imwrite(chart_path, chart_region)
                print(f"  Saved chart region: {chart_path}")

                # Save debug info
                debug_filename = f"chart_debug_p{page_num+1}_r{chart_count}.txt"
                debug_path = os.path.join(output_folder, debug_filename)
                with open(debug_path, 'w') as f:
                    f.write(f"Area: {area} pixels ({area/total_area*100:.2f}% of page)\n")
                    f.write(f"Dimensions: {w}x{h} pixels\n")
                    f.write(f"Aspect ratio: {aspect_ratio:.2f}\n")
                    f.write(f"Content density: {density:.2f}\n")

            # Save the visualization with rectangles around detected charts
            vis_filename = f"page{page_num+1}_detection.png"
            vis_path = os.path.join(output_folder, vis_filename)
            cv2.imwrite(vis_path, img_display)
            print(f"  Saved detection visualization: {vis_path}")

            # Also save the binary image to see what's being used for contour detection
            bin_filename = f"page{page_num+1}_binary.png"
            bin_path = os.path.join(output_folder, bin_filename)
            cv2.imwrite(bin_path, binary)
            print(f"  Saved binary image: {bin_path}")

            # If no charts detected using contour method, try an alternative approach
            if chart_count == 0:
                print(f"  No charts detected on page {page_num+1} using contour method. Trying alternative...")

                # Edge map used as input for line detection
                edges = cv2.Canny(gray, 30, 150)

                # Dilate to connect broken line fragments
                kernel = np.ones((3, 3), np.uint8)
                dilated = cv2.dilate(edges, kernel, iterations=2)

                # Find line segments using the probabilistic Hough transform
                lines = cv2.HoughLinesP(dilated, 1, np.pi/180,
                                        threshold=80,
                                        minLineLength=80,
                                        maxLineGap=30)

                if lines is not None:
                    print(f"  Found {len(lines)} lines")

                    # Draw lines on a separate image
                    line_img = np.zeros_like(img_cv)
                    for line in lines:
                        lx1, ly1, lx2, ly2 = line[0]
                        cv2.line(line_img, (lx1, ly1), (lx2, ly2), (0, 255, 0), 2)

                    # Save the line detection image
                    line_filename = f"page{page_num+1}_lines.png"
                    line_path = os.path.join(output_folder, line_filename)
                    cv2.imwrite(line_path, line_img)
                    print(f"  Saved line detection: {line_path}")

                    # Try to find chart regions using densities of lines
                    if len(lines) > 5:
                        # Simplified grid detection: divide the page into
                        # 3x3 cells and count the segments touching each cell
                        sections_h = 3
                        sections_v = 3
                        height, width = dilated.shape

                        for section_y in range(sections_v):
                            for section_x in range(sections_h):
                                # Calculate section coordinates
                                x1 = int(section_x * width / sections_h)
                                y1 = int(section_y * height / sections_v)
                                x2 = int((section_x + 1) * width / sections_h)
                                y2 = int((section_y + 1) * height / sections_v)

                                # A segment counts when one endpoint's x lies in
                                # the cell's x-range and one endpoint's y lies in
                                # the cell's y-range
                                section_lines = 0
                                for line in lines:
                                    lx1, ly1, lx2, ly2 = line[0]
                                    if (x1 <= lx1 <= x2 or x1 <= lx2 <= x2) and \
                                       (y1 <= ly1 <= y2 or y1 <= ly2 <= y2):
                                        section_lines += 1

                                # If section has more than 5 lines, it might be a chart
                                if section_lines > 5:
                                    section = img_cv[y1:y2, x1:x2]
                                    chart_count += 1
                                    sect_filename = f"chart_page{page_num+1}_section{section_y*sections_h+section_x+1}.png"
                                    sect_path = os.path.join(output_folder, sect_filename)
                                    cv2.imwrite(sect_path, section)
                                    print(f"  Saved potential chart section: {sect_path}")

                # Last resort: nothing was found by either method (this also
                # covers the case where Hough returned no lines), so keep the
                # whole page rather than silently dropping it
                if chart_count == 0:
                    page_filename = f"page{page_num+1}_whole.png"
                    page_path = os.path.join(output_folder, page_filename)
                    cv2.imwrite(page_path, img_cv)
                    print(f"  Saved whole page as fallback: {page_path}")
    finally:
        # Release the file handle even if rendering or saving fails
        doc.close()

    print(f"Processing complete. Check {output_folder} for extracted charts.")

# Input PDF that is scanned for charts
pdf_path = "C:/Users/clint/Desktop/Scraping Task/pdfs/02_2018.pdf"

# Directory that receives the cropped chart images and debug files
output_folder = "C:/Users/clint/Desktop/Scraping Task/extracted_charts"

# Run the extraction for this single PDF
extract_charts_from_pdf(pdf_path=pdf_path, output_folder=output_folder)

Processing PDF: C:/Users/clint/Desktop/Scraping Task/pdfs/02_2018.pdf
Total pages: 8
Processing page 1...
  Saved chart region: C:/Users/clint/Desktop/Scraping Task/extracted_charts\chart_page1_region1.png
  Saved detection visualization: C:/Users/clint/Desktop/Scraping Task/extracted_charts\page1_detection.png
  Saved binary image: C:/Users/clint/Desktop/Scraping Task/extracted_charts\page1_binary.png
Processing page 2...
  Saved chart region: C:/Users/clint/Desktop/Scraping Task/extracted_charts\chart_page2_region1.png
  Saved detection visualization: C:/Users/clint/Desktop/Scraping Task/extracted_charts\page2_detection.png
  Saved binary image: C:/Users/clint/Desktop/Scraping Task/extracted_charts\page2_binary.png
Processing page 3...
  Saved chart region: C:/Users/clint/Desktop/Scraping Task/extracted_charts\chart_page3_region1.png
  Saved chart region: C:/Users/clint/Desktop/Scraping Task/extracted_charts\chart_page3_region2.png
  Saved chart region: C:/Users/clint/Desktop/Scrapin

In [7]:
import os
import fitz  # PyMuPDF
import numpy as np
import cv2

def extract_charts_from_pdf(pdf_path, output_folder):
    """
    Extract likely chart regions from every page of a PDF and save them as PNGs.

    Each page is rendered at 8x zoom and thresholded so that pixels with a
    gray level of 240 or below become foreground. The 15 largest external
    contours are screened by area, bounding-box aspect ratio and foreground
    density; each accepted bounding box is cropped from the page image and
    written to ``output_folder``. Only the chart crops are saved (no debug
    images).

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Directory that receives the chart images; it is
            created if it does not exist.

    Returns:
        None. Progress and the total number of saved charts are printed.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF file
    doc = fitz.open(pdf_path)

    total_charts = 0

    try:
        print(f"Processing PDF: {pdf_path}")
        print(f"Total pages: {len(doc)}")

        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num+1}...")

            # Render page at high resolution
            zoom = 8
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to numpy array for OpenCV processing
            img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
            img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

            # Convert to grayscale for processing
            gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

            # Inverted threshold: anything that is not near-white becomes foreground
            _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)

            # Find contours
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Sort contours by area (largest first)
            contours = sorted(contours, key=cv2.contourArea, reverse=True)

            # Page area in pixels; constant for the page, so computed once
            total_area = img_cv.shape[0] * img_cv.shape[1]

            # Process large contours that might be charts
            chart_count = 0
            for contour in contours[:15]:  # Only the 15 largest contours are considered
                area = cv2.contourArea(contour)

                # Skip regions below 2% or above 95% of the page area
                if area < (total_area * 0.02) or area > (total_area * 0.95):
                    continue

                # Get bounding rectangle
                x, y, w, h = cv2.boundingRect(contour)

                # Skip extremely elongated regions
                aspect_ratio = float(w) / h
                if aspect_ratio < 0.1 or aspect_ratio > 8:
                    continue

                # Skip regions whose bounding box is almost empty
                # (fewer than 5% foreground pixels)
                roi = binary[y:y+h, x:x+w]
                density = np.count_nonzero(roi) / (w * h)
                if density < 0.05:
                    continue

                # Cut out the region
                chart_region = img_cv[y:y+h, x:x+w]

                # Save the chart region
                chart_count += 1
                total_charts += 1
                chart_filename = f"chart_page{page_num+1}_region{chart_count}.png"
                chart_path = os.path.join(output_folder, chart_filename)
                cv2.imwrite(chart_path, chart_region)
                print(f"  Saved chart: {chart_filename}")

            if chart_count == 0:
                print(f"  No charts detected on page {page_num+1}")
    finally:
        # Release the file handle even if rendering or saving fails
        doc.close()

    print(f"Processing complete. Extracted {total_charts} charts to {output_folder}")

# Input PDF that is scanned for charts
pdf_path = "C:/Users/clint/Desktop/Scraping Task/pdfs/01_2018.pdf"
# Directory that receives the cropped chart images
output_folder = "C:/Users/clint/Desktop/Scraping Task/extracted_charts"

# Run the extraction for this single PDF
extract_charts_from_pdf(pdf_path=pdf_path, output_folder=output_folder)

Processing PDF: C:/Users/clint/Desktop/Scraping Task/pdfs/01_2018.pdf
Total pages: 5
Processing page 1...
  Saved chart: chart_page1_region1.png
  Saved chart: chart_page1_region2.png
Processing page 2...
  Saved chart: chart_page2_region1.png
  Saved chart: chart_page2_region2.png
Processing page 3...
  Saved chart: chart_page3_region1.png
  Saved chart: chart_page3_region2.png
  Saved chart: chart_page3_region3.png
Processing page 4...
  Saved chart: chart_page4_region1.png
  Saved chart: chart_page4_region2.png
  Saved chart: chart_page4_region3.png
Processing page 5...
  Saved chart: chart_page5_region1.png
Processing complete. Extracted 11 charts to C:/Users/clint/Desktop/Scraping Task/extracted_charts


In [8]:
import os
import fitz  # PyMuPDF
import numpy as np
import cv2
import csv
import datetime
import re
from PIL import Image
import io

def is_chart_background(img):
    """
    Summarise the border colour of an image to judge whether it looks like a chart.

    The image is converted to HSV and 10-pixel-wide bands along all four
    edges are pooled as a sample of the background. From that sample the
    function derives three flags and two averages.

    Args:
        img: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2HSV``).

    Returns:
        dict with the keys:
            "is_light_bg": mean brightness > 220 and mean saturation < 30.
            "is_dark_bg": more than 60% of border pixels have brightness < 100.
            "has_colored_bg": mean saturation > 50 and hue standard
                deviation < 20 (a consistent, saturated colour).
            "brightness": mean V of the border pixels.
            "saturation": mean S of the border pixels.
    """
    # Convert to HSV for better color analysis
    img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Sample the edges of the image (likely background). The corner pixels
    # appear in two bands and are therefore counted twice.
    border_bands = (
        img_hsv[0:10, :, :],    # top
        img_hsv[-10:, :, :],    # bottom
        img_hsv[:, 0:10, :],    # left
        img_hsv[:, -10:, :],    # right
    )
    edges = np.vstack([band.reshape(-1, 3) for band in border_bands])

    hue = edges[:, 0]
    sat = edges[:, 1]
    val = edges[:, 2]

    # Calculate average values
    avg_v = np.mean(val)  # V in HSV (brightness)
    avg_s = np.mean(sat)  # S in HSV (saturation)

    # White/light background: high V (>220), low S (<30)
    is_white_bg = avg_v > 220 and avg_s < 30

    # Colored background: saturated, with little variation in hue
    hue_std = np.std(hue)
    has_colored_bg = avg_s > 50 and hue_std < 20

    # Dark background: more than 60% of the border pixels are dark
    dark_pixel_percent = np.mean(val < 100)
    is_dark_bg = dark_pixel_percent > 0.6

    return {
        "is_light_bg": is_white_bg,
        "is_dark_bg": is_dark_bg,
        "has_colored_bg": has_colored_bg,
        "brightness": avg_v,
        "saturation": avg_s
    }

def has_chart_elements(img):
    """
    Look for axis/grid-like straight lines in an image.

    Line segments are detected with Canny edges followed by the
    probabilistic Hough transform, then classified by their angle as
    horizontal (within 10 degrees of horizontal) or vertical (within
    10 degrees of vertical).

    Args:
        img: Image array in BGR channel order.

    Returns:
        Tuple ``(has_axis_lines, horiz_lines, vert_lines)`` where
        ``has_axis_lines`` is True when at least 3 horizontal and at least
        3 vertical segments were found. ``(False, 0, 0)`` is returned when
        no segments are detected at all.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(grayscale, 50, 150)
    segments = cv2.HoughLinesP(edge_map, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)

    if segments is None:
        return False, 0, 0

    # One row per segment: x1, y1, x2, y2
    coords = segments[:, 0, :]
    dx = coords[:, 2] - coords[:, 0]
    dy = coords[:, 3] - coords[:, 1]

    # Absolute segment angle in degrees, in the range [0, 180]
    angles = np.abs(np.arctan2(dy, dx) * 180 / np.pi)

    # The two angle windows are disjoint, so each segment falls into at most one
    horizontal_mask = (angles < 10) | (angles > 170)
    vertical_mask = (angles > 80) & (angles < 100)

    horiz_lines = int(np.count_nonzero(horizontal_mask))
    vert_lines = int(np.count_nonzero(vertical_mask))

    # Charts typically have several horizontal and vertical lines
    has_axis_lines = horiz_lines >= 3 and vert_lines >= 3

    return has_axis_lines, horiz_lines, vert_lines

def _matches_known_non_chart(w, h, tolerance=0.1):
    """Return True when (w, h) is within `tolerance` of a known non-chart size."""
    known_non_charts = [(4896, 663), (1590, 1300), (1103, 938), (1440, 1202)]
    for known_w, known_h in known_non_charts:
        w_diff = abs(w - known_w) / max(w, known_w)
        h_diff = abs(h - known_h) / max(h, known_h)
        if w_diff < tolerance and h_diff < tolerance:
            return True
    return False


def _find_chart_title(text_blocks, chart_rect):
    """
    Choose the text block that most plausibly is the title of a chart.

    Text blocks intersecting a search window that reaches 150 points above
    the chart top (and 20 points to either side, 10 points below the top)
    are candidates. A short candidate (at most 200 characters) within 100
    points of the chart wins, closest first; otherwise the topmost candidate
    is used and truncated to 200 characters.

    Args:
        text_blocks: The "blocks" list of ``page.get_text("dict")``.
        chart_rect: ``fitz.Rect`` of the chart in PDF coordinates.

    Returns:
        Tuple ``(title, title_top)``. ``title`` is the cleaned title text, or
        "Unknown_Title" when there is no candidate. ``title_top`` is the y0
        of the chosen block in PDF coordinates, or None.
    """
    search_area = fitz.Rect(
        chart_rect.x0 - 20,      # Left edge, expanded by 20 points
        chart_rect.y0 - 150,     # Top edge, look up to 150 points above
        chart_rect.x1 + 20,      # Right edge, expanded by 20 points
        chart_rect.y0 + 10       # Include a bit below the top of the image
    )

    candidates = []
    for block in text_blocks:
        if block["type"] != 0:  # Only text blocks
            continue
        block_rect = fitz.Rect(block["bbox"])
        if not search_area.intersects(block_rect):
            continue
        block_text = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        )
        candidates.append({
            "text": block_text.strip(),
            "distance": abs(chart_rect.y0 - block_rect.y1),  # Distance to chart
            "y_pos": block_rect.y0,  # Y position (for top-to-bottom ordering)
            "rect": block_rect
        })

    title = "Unknown_Title"
    title_top = None

    short_titles = [t for t in candidates if
                    len(t["text"].split('\n')) <= 3 and
                    len(t["text"]) <= 200 and
                    t["distance"] <= 100]

    if short_titles:
        # Closest short text wins
        best = min(short_titles, key=lambda t: t["distance"])
        title = best["text"]
        title_top = best["rect"].y0
    elif candidates:
        # No good short title: fall back to the topmost text in the window
        best = min(candidates, key=lambda t: t["y_pos"])
        title = best["text"]
        title_top = best["rect"].y0
        if len(title) > 200:
            title = title[:197] + "..."

    # Strip a leading "Figure N." / "Fig. N:" label and collapse whitespace
    title = re.sub(r'^(Figure|Fig\.)\s+\d+[.:]\s*', '', title)
    title = re.sub(r'\s+', ' ', title).strip()

    return title, title_top


def _save_page_charts(page, page_num, zoom, pdf_stem, output_dir, charts_so_far):
    """
    Detect, filter and save the charts found on a single page.

    Args:
        page: PyMuPDF page object.
        page_num: Zero-based page index (used for messages only).
        zoom: Render scale factor; also converts between PDF and pixel coords.
        pdf_stem: PDF file name without extension, used as filename prefix.
        output_dir: Directory that receives the chart images.
        charts_so_far: Number of charts already saved for this PDF, so that
            chart numbering continues across pages.

    Returns:
        Number of charts saved from this page.
    """
    # Render page at high resolution
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # Convert to numpy array for OpenCV processing
    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
    img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

    # Inverted threshold: anything that is not near-white becomes foreground
    _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)

    # External contours, largest first
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    # Text blocks of the page, used to locate chart titles
    text_blocks = page.get_text("dict")["blocks"]

    # Page area in pixels; constant for the page, so computed once
    total_area = img_cv.shape[0] * img_cv.shape[1]

    chart_count = 0
    for contour in contours[:15]:  # Only the 15 largest contours are considered
        area = cv2.contourArea(contour)

        # Skip regions below 2% or above 95% of the page area
        if area < (total_area * 0.02) or area > (total_area * 0.95):
            continue

        # Bounding rectangle as plain Python ints
        x, y, w, h = (int(v) for v in cv2.boundingRect(contour))

        # Plot charts are typically not extremely wide/tall
        aspect_ratio = float(w) / h
        if aspect_ratio < 0.6 or aspect_ratio > 2.0:
            print(f"REJECTED: Region with dimensions {w}x{h} - Extreme aspect ratio {aspect_ratio:.2f}")
            continue

        # Wide banners (like 4896x663)
        if w > 3000 and h < 800:
            print(f"REJECTED: Region with dimensions {w}x{h} - Looks like a banner/header")
            continue

        # Dimensions within 10% of a known non-chart example. The check is a
        # helper so that a match skips the whole region, not just one
        # iteration of the comparison loop.
        if _matches_known_non_chart(w, h):
            print(f"REJECTED: Region with dimensions {w}x{h} - Similar to known non-chart")
            continue

        # Chart position in original PDF coordinates (divide by zoom)
        orig_rect = fitz.Rect(
            x / zoom,
            y / zoom,
            (x + w) / zoom,
            (y + h) / zoom
        )
        title, title_top = _find_chart_title(text_blocks, orig_rect)

        # Extend the crop upward to include the title: by default 100 pixels
        # or 10% of the height, whichever is larger
        extend_upward = max(100, int(h * 0.1))
        if title_top is not None:
            # Title top converted back to image coordinates
            title_offset = y - int(title_top * zoom)
            if title_offset > 0:  # Title is above the detected region
                extend_upward = max(extend_upward, title_offset + 20)  # Add 20px margin

        # Clamp to the top of the image
        new_y = int(max(0, y - extend_upward))
        new_h = int(h + (y - new_y))

        # Debug print to verify types
        print(f"Coordinates: new_y={new_y} (type={type(new_y)}), new_h={new_h} (type={type(new_h)})")

        # Extended region (including potential title) is what gets saved;
        # the unextended region is what gets analysed
        extended_chart_region = img_cv[new_y:new_y+new_h, x:x+w]
        chart_region = img_cv[y:y+h, x:x+w]

        # FILTER: Background check
        bg_analysis = is_chart_background(chart_region)

        if bg_analysis["is_dark_bg"]:
            print(f"REJECTED: Region with dimensions {w}x{h} - Dark background (not typical for plot charts)")
            continue

        if bg_analysis["has_colored_bg"]:
            print(f"REJECTED: Region with dimensions {w}x{h} - Colored background (unusual for plot charts)")
            continue

        # FILTER: Check for chart elements (axes, grid lines)
        has_axes, horiz_count, vert_count = has_chart_elements(chart_region)
        if not has_axes:
            print(f"REJECTED: Region with dimensions {w}x{h} - No clear chart elements found")
            print(f"  (Found {horiz_count} horizontal lines, {vert_count} vertical lines)")
            continue

        # FILTER: foreground density must be between 5% and 40%
        roi = binary[y:y+h, x:x+w]
        density = np.count_nonzero(roi) / (w * h)
        if density < 0.05 or density > 0.4:  # Too sparse or too dense
            print(f"REJECTED: Region with dimensions {w}x{h} - Content density ({density:.3f}) not characteristic of charts")
            continue

        # Clean title for filename use
        clean_title = re.sub(r'[^\w\s-]', '', title)
        clean_title = re.sub(r'\s+', '_', clean_title)
        clean_title = clean_title[:50]  # Limit length for filename

        chart_count += 1
        chart_number = charts_so_far + chart_count

        if clean_title == "Unknown_Title" or not clean_title:
            chart_filename = f"{pdf_stem}_plot_{chart_number}.png"
        else:
            chart_filename = f"{pdf_stem}_plot_{chart_number}_{clean_title}.png"

        chart_path = os.path.join(output_dir, chart_filename)

        # Save the EXTENDED chart region (including title area)
        cv2.imwrite(chart_path, extended_chart_region)

        # Print chart info
        print(f"ACCEPTED: {chart_filename}")
        print(f"Original dimensions: {w}x{h}, Extended height: {new_h}, Ratio: {aspect_ratio:.2f}")
        print(f"Chart elements: {horiz_count} horizontal lines, {vert_count} vertical lines")
        print(f"Background brightness: {bg_analysis['brightness']:.1f}, saturation: {bg_analysis['saturation']:.1f}")
        print(f"Title: {title}")
        print(f"Extended upward by: {y - new_y} pixels")
        print("-" * 50)

    if chart_count == 0:
        print(f"  No charts detected on page {page_num+1}")

    return chart_count


def extract_charts_from_all_pdfs(pdf_dir, output_dir, csv_path):
    """
    Extract charts from every PDF in a directory and log the counts to a CSV.

    For each PDF (processed in sorted name order) every page except the first
    is rendered at 8x zoom and searched for chart regions; accepted regions
    are saved to ``output_dir`` as ``<pdf name>_plot_<n>[_<title>].png``.
    One row per PDF (timestamp, file name, number of accepted images) is
    appended to ``csv_path``; the header is written only when the CSV file
    does not exist yet. PDFs that cannot be opened are reported and skipped.

    Args:
        pdf_dir: Directory containing the PDF files.
        output_dir: Directory that receives the chart images; created if it
            does not exist.
        csv_path: Path of the CSV log file (opened in append mode).

    Returns:
        None. When ``pdf_dir`` contains no PDF files the function returns
        before the CSV file is touched.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Remember whether the CSV already exists so the header is written once
    csv_exists = os.path.isfile(csv_path)

    # List all PDF files in the directory (sorted for a deterministic order)
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

    if not pdf_files:
        print("No PDF files found in the directory.")
        return

    # Single timestamp shared by all rows of this run
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    total_pdfs_processed = 0
    total_charts_saved = 0
    zoom = 8

    # Open CSV file in append mode - open once for all PDFs
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['date_extracted', 'pdf_filename', 'accepted_images_count']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if not csv_exists:
            writer.writeheader()

        for pdf_index, pdf_file in enumerate(pdf_files, start=1):
            pdf_path = os.path.join(pdf_dir, pdf_file)
            # splitext keeps dots inside the name ("a.b.pdf" -> "a.b")
            pdf_stem = os.path.splitext(pdf_file)[0]

            print(f"\n{'='*60}")
            print(f"Processing PDF {pdf_index}/{len(pdf_files)}: {pdf_file}")
            print(f"{'='*60}\n")

            # Best effort: an unreadable PDF must not abort the whole batch
            try:
                doc = fitz.open(pdf_path)
            except Exception as e:
                print(f"Error opening PDF {pdf_file}: {e}")
                continue

            pdf_charts_found = 0
            try:
                for page_num, page in enumerate(doc):
                    # The first page is skipped deliberately
                    if page_num == 0:
                        print("Skipping page 1 (first page) as requested")
                        continue

                    print(f"Processing page {page_num+1}...")
                    pdf_charts_found += _save_page_charts(
                        page, page_num, zoom, pdf_stem, output_dir, pdf_charts_found
                    )
            finally:
                # Release the file handle even if page processing fails
                doc.close()

            total_charts_saved += pdf_charts_found

            # Write a single CSV entry per PDF
            writer.writerow({
                'date_extracted': timestamp,
                'pdf_filename': pdf_file,
                'accepted_images_count': pdf_charts_found
            })

            print(f"PDF Summary: {pdf_file}")
            print(f"Charts found and saved: {pdf_charts_found}")
            print(f"{'='*60}\n")

            total_pdfs_processed += 1

    print(f"\nComplete! Processed {total_pdfs_processed} PDFs")
    print(f"Total charts saved: {total_charts_saved}")

# Input directory with the PDFs, output directory for the chart images,
# and the CSV log that gets one row per processed PDF
pdf_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs"
output_dir = r"C:\Users\clint\Desktop\Scraping Task\pdfs\Images"
csv_path = r"C:\Users\clint\Desktop\Scraping Task\pdf_image_data.csv"

# Run the batch extraction and append the results to the CSV
extract_charts_from_all_pdfs(pdf_dir=pdf_dir, output_dir=output_dir, csv_path=csv_path)


Processing PDF 1/80: 01_2018.pdf

Skipping page 1 (first page) as requested
Processing page 2...
Coordinates: new_y=640 (type=<class 'int'>), new_h=2031 (type=<class 'int'>)
REJECTED: Region with dimensions 3152x1839 - No clear chart elements found
  (Found 1 horizontal lines, 6 vertical lines)
REJECTED: Region with dimensions 3744x501 - Extreme aspect ratio 7.47
  No charts detected on page 2
Processing page 3...
Coordinates: new_y=1163 (type=<class 'int'>), new_h=2170 (type=<class 'int'>)
REJECTED: Region with dimensions 3104x1973 - No clear chart elements found
  (Found 0 horizontal lines, 0 vertical lines)
Coordinates: new_y=3240 (type=<class 'int'>), new_h=2256 (type=<class 'int'>)
REJECTED: Region with dimensions 3112x1856 - No clear chart elements found
  (Found 0 horizontal lines, 0 vertical lines)
REJECTED: Region with dimensions 3744x501 - Extreme aspect ratio 7.47
  No charts detected on page 3
Processing page 4...
Coordinates: new_y=3308 (type=<class 'int'>), new_h=2250 (t