In [1]:
#!/usr/bin/env python3
"""
PDF to Images to PDF Converter

This program converts a PDF file to individual images and then combines
those images back into a new PDF file.

Required packages:
- PyMuPDF (fitz): pip install PyMuPDF
- Pillow: pip install Pillow

Usage:
    python pdf_converter.py input.pdf output.pdf
"""

import argparse
import io
import os
import shutil
import sys
import tempfile
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


class PDFConverter:
    """Round-trip a PDF through raster images (PDF -> images -> PDF).

    Pages are rendered with PyMuPDF at ``dpi`` and written to disk in
    ``image_format``; Pillow then combines those images into a new PDF.
    """

    def __init__(self, dpi=300, image_format='PNG'):
        """
        Initialize the PDF converter.

        Args:
            dpi (int): Resolution for image conversion (default: 300)
            image_format (str): Image format for temporary files
                (default: 'PNG'); stored upper-cased.
        """
        self.dpi = dpi
        self.image_format = image_format.upper()

    def pdf_to_images(self, pdf_path, output_dir=None):
        """
        Convert PDF pages to images.

        Args:
            pdf_path (str): Path to input PDF file
            output_dir (str): Directory to save images (optional). When
                omitted, a fresh temporary directory is created and the
                caller is responsible for removing it.

        Returns:
            list: List of image file paths, in page order

        Raises:
            ValueError: If the configured DPI is not positive.
        """
        # Fail before touching the file system: a zero or negative scale
        # cannot produce a usable rendering.
        if self.dpi <= 0:
            raise ValueError(f"DPI must be a positive number, got {self.dpi}")

        image_paths = []

        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as pdf_document:
            # Create output directory if specified
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            else:
                output_dir = tempfile.mkdtemp()

            print(f"Converting PDF to images (DPI: {self.dpi})...")

            # PDF coordinates are 72 units per inch, so this scale factor
            # renders at self.dpi. It is the same for every page.
            zoom = self.dpi / 72
            mat = fitz.Matrix(zoom, zoom)
            extension = self.image_format.lower()
            page_count = len(pdf_document)

            for page_num in range(page_count):
                page = pdf_document.load_page(page_num)

                # Render page to image
                pix = page.get_pixmap(matrix=mat)

                image_filename = f"page_{page_num + 1:04d}.{extension}"
                image_path = os.path.join(output_dir, image_filename)

                if self.image_format == 'PNG':
                    pix.save(image_path)
                else:
                    # Hand the rendered pixels to Pillow, which knows how
                    # to encode the other formats.
                    img_data = pix.tobytes("ppm")
                    with Image.open(io.BytesIO(img_data)) as img:
                        img.save(image_path, self.image_format)

                image_paths.append(image_path)
                print(f"  Saved page {page_num + 1}/{page_count}: {image_filename}")

        print(f"PDF converted to {len(image_paths)} images in: {output_dir}")
        return image_paths

    def images_to_pdf(self, image_paths, output_pdf_path):
        """
        Combine images into a PDF file.

        Args:
            image_paths (list): List of image file paths
            output_pdf_path (str): Path for output PDF file

        Raises:
            ValueError: If ``image_paths`` is empty.
        """
        if not image_paths:
            raise ValueError("No images provided")

        print(f"Combining {len(image_paths)} images into PDF...")

        pages = []
        try:
            for img_path in image_paths:
                # convert() returns an in-memory image, so the file handle
                # can be released as soon as the page has been read.
                with Image.open(img_path) as source:
                    pages.append(source.convert('RGB'))

            # The first image drives the save; the rest are appended.
            pages[0].save(
                output_pdf_path,
                "PDF",
                resolution=self.dpi,
                save_all=True,
                append_images=pages[1:]
            )
        finally:
            # Release pixel buffers whether or not the save succeeded.
            for page_image in pages:
                page_image.close()

        print(f"PDF created: {output_pdf_path}")

    def convert_pdf_via_images(self, input_pdf, output_pdf, temp_dir=None, keep_images=False):
        """
        Complete conversion: PDF -> Images -> PDF

        Args:
            input_pdf (str): Path to input PDF
            output_pdf (str): Path to output PDF
            temp_dir (str): Directory for temporary images (optional).
                A caller-supplied directory is never deleted.
            keep_images (bool): Whether to keep temporary images when the
                directory was created by this method (default: False)
        """
        print(f"Starting conversion: {input_pdf} -> {output_pdf}")

        # Only a directory created here is eligible for automatic removal.
        if temp_dir is None:
            temp_dir = tempfile.mkdtemp()
            cleanup_temp = True
        else:
            os.makedirs(temp_dir, exist_ok=True)
            cleanup_temp = False

        try:
            # Step 1: Convert PDF to images
            image_paths = self.pdf_to_images(input_pdf, temp_dir)

            # Step 2: Convert images back to PDF
            self.images_to_pdf(image_paths, output_pdf)

            print("Conversion completed successfully!")

        finally:
            # Runs on success and on failure so temporary renders do not
            # accumulate after an error.
            if not keep_images and cleanup_temp:
                shutil.rmtree(temp_dir)
                print("Temporary files cleaned up")
            elif keep_images:
                print(f"Images saved in: {temp_dir}")


def main():
    """Command-line entry point: parse arguments and run the conversion.

    Exits with status 1 when the input path is missing or not a file, or
    when the conversion raises. Usage errors (including a non-positive
    ``--dpi``) are reported by argparse.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF to images and back to PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.pdf output.pdf
  %(prog)s input.pdf output.pdf --dpi 150
  %(prog)s input.pdf output.pdf --keep-images --temp-dir ./temp_images
        """
    )

    parser.add_argument('input_pdf', help='Input PDF file path')
    parser.add_argument('output_pdf', help='Output PDF file path')
    parser.add_argument('--dpi', type=int, default=300,
                        help='DPI for image conversion (default: 300)')
    # Upper-casing before the choices check lets users type "png" or "PNG".
    parser.add_argument('--format', default='PNG', type=str.upper,
                        choices=['PNG', 'JPEG', 'TIFF'],
                        help='Image format for temporary files (default: PNG)')
    parser.add_argument('--temp-dir',
                        help='Directory for temporary images')
    parser.add_argument('--keep-images', action='store_true',
                        help='Keep temporary image files')

    args = parser.parse_args()

    if args.dpi <= 0:
        parser.error("--dpi must be a positive integer")

    # Validate input file; diagnostics go to stderr so stdout stays clean.
    if not os.path.exists(args.input_pdf):
        print(f"Error: Input file '{args.input_pdf}' does not exist", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(args.input_pdf):
        print(f"Error: Input path '{args.input_pdf}' is not a file", file=sys.stderr)
        sys.exit(1)

    # Create output directory if it doesn't exist. exist_ok avoids the race
    # between checking for the directory and creating it.
    output_dir = os.path.dirname(args.output_pdf)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Initialize converter
    converter = PDFConverter(dpi=args.dpi, image_format=args.format)

    try:
        # Perform conversion
        converter.convert_pdf_via_images(
            args.input_pdf,
            args.output_pdf,
            temp_dir=args.temp_dir,
            keep_images=args.keep_images
        )
    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit code.
        print(f"Error during conversion: {e}", file=sys.stderr)
        sys.exit(1)


# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Example usage as a module
def example_usage():
    """
    Demonstrate both ways of driving PDFConverter from Python code.

    First runs the whole round trip in a single call, then repeats it as
    two explicit steps (render pages, then rebuild the PDF).
    """
    # A converter rendering JPEG pages at 200 DPI
    jpeg_converter = PDFConverter(image_format='JPEG', dpi=200)

    # One-shot round trip, keeping the page images in ./temp_images
    jpeg_converter.convert_pdf_via_images(
        'input.pdf',
        'output.pdf',
        temp_dir='./temp_images',
        keep_images=True,
    )

    # The same work as two explicit steps: render the pages ...
    rendered_pages = jpeg_converter.pdf_to_images('input.pdf', './images')

    # ... then stitch them back together
    jpeg_converter.images_to_pdf(rendered_pages, 'output.pdf')


# Batch processing example
def batch_convert_pdfs(input_directory, output_directory, dpi=300):
    """
    Convert all PDF files in a directory

    Each ``*.pdf`` file directly inside ``input_directory`` is converted
    and written to ``output_directory`` as ``converted_<original name>``.
    A failure on one file is reported and does not stop the batch.

    Args:
        input_directory (str): Directory to scan for PDF files
        output_directory (str): Directory for converted files; created,
            including missing parents, if it does not exist
        dpi (int): Resolution for image conversion (default: 300)
    """
    input_path = Path(input_directory)
    output_path = Path(output_directory)
    # parents=True so a nested output path does not fail on a missing parent.
    output_path.mkdir(parents=True, exist_ok=True)

    # Sort for a reproducible processing order; skip directories that
    # happen to be named "*.pdf".
    pdf_files = sorted(p for p in input_path.glob("*.pdf") if p.is_file())
    if not pdf_files:
        print(f"No PDF files found in: {input_path}")
        return

    converter = PDFConverter(dpi=dpi)

    for pdf_file in pdf_files:
        output_file = output_path / f"converted_{pdf_file.name}"
        print(f"Processing: {pdf_file.name}")

        try:
            converter.convert_pdf_via_images(str(pdf_file), str(output_file))
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"Error processing {pdf_file.name}: {e}")

ModuleNotFoundError: No module named 'fitz'

In [12]:
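# Install required packages first: pip install pdf2image Pillow
# (pdf2image also needs the Poppler utilities installed on the system)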


from pdf2image import convert_from_path
from PIL import Image
import os

def pdf_to_images(pdf_path, output_folder, dpi=500):
    """
    Render every page of a PDF to a PNG file.

    Args:
        pdf_path (str): Path to the input PDF file
        output_folder (str): Directory for the page images; created if it
            does not exist
        dpi (int): Rendering resolution passed to pdf2image (default: 500)

    Returns:
        list: Paths of the written files, in page order, named
        ``page_<n>.png`` with ``n`` starting at 1
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(output_folder, exist_ok=True)

    image_files = []
    for page_number, image in enumerate(images, start=1):
        image_file = os.path.join(output_folder, f'page_{page_number}.png')
        image.save(image_file, 'PNG')
        image_files.append(image_file)
    return image_files

def images_to_pdf(image_files, output_pdf_path, resolution=500.0):
    """
    Combine image files into a single multi-page PDF.

    Does nothing (and writes no file) when ``image_files`` is empty.

    Args:
        image_files (list): Paths of the page images, in page order
        output_pdf_path (str): Path for the output PDF file
        resolution (float): DPI of the page images, used to size the PDF
            pages. Defaults to 500 to match the rendering DPI that
            pdf_to_images uses in this file; without it Pillow assumes
            72 DPI and the pages come out many times too large.
    """
    pages = []
    try:
        for image_file in image_files:
            # convert() yields an in-memory image, so the source file
            # handle can be closed straight away.
            with Image.open(image_file) as source:
                pages.append(source.convert('RGB'))

        if pages:
            pages[0].save(
                output_pdf_path,
                "PDF",
                resolution=resolution,
                save_all=True,
                append_images=pages[1:],
            )
    finally:
        # Release pixel buffers whether or not the save succeeded.
        for page_image in pages:
            page_image.close()

# Example usage
# Every path below is derived from this base name (no extension).
file_name = 'statement_chase'
pdf_path = f'{file_name}.pdf'
output_folder = f'{file_name}_images'
output_pdf_path = f'{file_name}_imagenised.pdf'

# Render the pages to images, then rebuild an image-only PDF from them.
image_files = pdf_to_images(pdf_path, output_folder)
images_to_pdf(image_files, output_pdf_path)
