In [3]:
import os
import fitz  # PyMuPDF

In [4]:
def _find_block_bbox(text_blocks, search_text):
    """Return the bbox of the first text block with a span containing search_text.

    The comparison is case-insensitive and done per span. Returns None when no
    span of any block contains the text.
    """
    needle = search_text.lower()
    for block in text_blocks:
        # Blocks without a "lines" key have no spans to inspect; skip them.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                if needle in span["text"].lower():
                    return block["bbox"]
    return None


def extract_chart_near_text(pdf_path, search_texts=["Average Retail Selling Price", "Avg. Retail Selling Price"], 
                           output_folder="C:\\Users\\clint\\Desktop\\Scraping Task\\pdfs\\Images", 
                           pixels_above=20, pixels_below=400):
    """
    Extracts the chart/plot near one of the search texts by rendering a
    full-width strip of the page around the text block that contains it.

    Pages are scanned in order and, on each page, the search texts are tried in
    the order given. The first hit whose containing text block can be
    identified is rendered at 2x zoom, saved as a PNG and returned; later
    pages/texts are not examined.

    Args:
        pdf_path (str): Path to the PDF file
        search_texts (list): List of text variations to search for in the PDF
            (the default list is only read, never mutated)
        output_folder (str): Folder to save the extracted image (created if missing)
        pixels_above (int): How far above the top of the text block the capture starts
        pixels_below (int): How far below the top of the text block the capture ends

    Returns:
        str: Path to the saved image, or None if no image was found
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # splitext strips only the final extension, so "report.v2.pdf" keeps its
    # full stem instead of collapsing to "report" and clobbering siblings.
    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # The context manager closes the document on every exit path (early
    # return, fall-through, or exception) so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]

            # The block/span structure is only needed once a search text has
            # been found on this page, so it is parsed lazily.
            text_blocks = None

            for search_text in search_texts:
                if not page.search_for(search_text):
                    continue

                print(f"Found '{search_text}' on page {page_num+1}")

                if text_blocks is None:
                    text_blocks = page.get_text("dict")["blocks"]

                block_bbox = _find_block_bbox(text_blocks, search_text)
                if block_bbox is None:
                    print(f"Found text but couldn't identify the containing block on page {page_num+1}")
                    continue

                target_block_rect = fitz.Rect(block_bbox)

                # Full page width; vertical extent is measured from the TOP of
                # the containing block in both directions.
                capture_rect = fitz.Rect(
                    0,
                    target_block_rect.y0 - pixels_above,
                    page.rect.width,
                    target_block_rect.y0 + pixels_below
                )

                # Make sure we stay within page bounds
                capture_rect = capture_rect.intersect(page.rect)

                # Render the region at 2x zoom for better quality
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=capture_rect)

                # The matched text (spaces -> underscores, dots dropped) is part of the filename
                image_path = os.path.join(output_folder, f"{pdf_filename}_{search_text.replace(' ', '_').replace('.', '')}_chart.png")
                pix.save(image_path)

                print(f"Saved chart region from page {page_num+1} to {image_path}")
                return image_path

    # If we get here, we didn't find what we needed
    print(f"Could not find a suitable chart associated with any of these texts: {search_texts}")
    return None

In [5]:
def main(pdf_directory="C:\\Users\\clint\\Desktop\\Scraping Task\\pdfs",
         output_folder="C:\\Users\\clint\\Desktop\\Scraping Task\\pdfs\\Images",
         search_texts=None,
         pixels_above=20,
         pixels_below=300):
    """
    Runs extract_chart_near_text over every PDF in a directory and prints a summary.

    Calling main() with no arguments behaves as before; the parameters only
    exist so the batch can be pointed at other folders or search texts without
    editing the function body.

    Args:
        pdf_directory (str): Directory scanned (non-recursively) for *.pdf files
        output_folder (str): Folder passed through as the image destination
        search_texts (list): Texts to look for; defaults to ["Retail Selling Price"]
        pixels_above (int): Passed through to extract_chart_near_text
        pixels_below (int): Passed through to extract_chart_near_text

    Returns:
        None. Progress and the final summary are printed.
    """
    if search_texts is None:
        search_texts = ["Retail Selling Price"]

    # os.listdir raises on a missing directory; report it like the other
    # "nothing to do" case instead of crashing.
    if not os.path.isdir(pdf_directory):
        print(f"PDF directory not found: {pdf_directory}")
        return

    # Sorted so the processing order (and therefore the log) is deterministic;
    # os.listdir makes no ordering guarantee.
    pdf_files = [
        path
        for path in (os.path.join(pdf_directory, name) for name in sorted(os.listdir(pdf_directory)))
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    ]

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return

    print(f"Found {len(pdf_files)} PDF files to process")

    # Process each PDF file, recording the outcome for the summary
    results = []
    for pdf_file in pdf_files:
        print(f"\nProcessing: {os.path.basename(pdf_file)}")
        extracted_image = extract_chart_near_text(
            pdf_path=pdf_file,
            search_texts=search_texts,
            output_folder=output_folder,
            pixels_above=pixels_above,
            pixels_below=pixels_below
        )
        results.append({
            "pdf_file": pdf_file,
            "image_path": extracted_image,
            "success": extracted_image is not None
        })

    succeeded = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"]]

    # Print summary
    print("\n========= SUMMARY =========")
    print(f"Successfully extracted charts from {len(succeeded)} out of {len(pdf_files)} PDF files")

    if succeeded:
        print("\nSuccessful extractions:")
        for r in succeeded:
            print(f"- {os.path.basename(r['pdf_file'])} → {os.path.basename(r['image_path'])}")

    if failed:
        print("\nFailed extractions:")
        for r in failed:
            print(f"- {os.path.basename(r['pdf_file'])}")

# Entry point: run the batch extraction when this code is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

Found 79 PDF files to process

Processing: 01_2018.pdf
Found 'Retail Selling Price' on page 3
Saved chart region from page 3 to C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2018_Retail_Selling_Price_chart.png

Processing: 01_2019.pdf
Found 'Retail Selling Price' on page 3
Saved chart region from page 3 to C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2019_Retail_Selling_Price_chart.png

Processing: 01_2020.pdf
Found 'Retail Selling Price' on page 4
Saved chart region from page 4 to C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2020_Retail_Selling_Price_chart.png

Processing: 01_2021.pdf
Found 'Retail Selling Price' on page 6
Saved chart region from page 6 to C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2021_Retail_Selling_Price_chart.png

Processing: 01_2022.pdf
Could not find a suitable chart associated with any of these texts: ['Retail Selling Price']

Processing: 01_2023.pdf
Found 'Retail Selling Price' on page 4
Saved chart region from page 4 to C:\Users\c