In [1]:
!pip install pypdf



In [3]:
from pypdf import PdfReader

import pypdf
print(pypdf.__version__)

5.1.0


In [4]:
import os
import re
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    
    Args:
    pdf_path (str): The path to the PDF file.
    
    Returns:
    str: The extracted text from the PDF.
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def extract_billing_address(text):
    """
    Extract the billing address following 'Bill To' in the extracted PDF text.
    
    Args:
    text (str): The extracted text from the PDF.
    
    Returns:
    str: The billing address found after 'Bill To'. If not found, returns 'Not found'.
    """
    match = re.search(r"Bill To[:\s]*(.*?)(?=\n\n|\n\s*\n|$)", text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return "Not found"

def process_pdfs(directory):
    """
    Process all PDF files in the specified directory.
    
    Args:
    directory (str): The path to the directory containing PDF files.
    """
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return
    
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            text = extract_text_from_pdf(pdf_path)
            billing_address = extract_billing_address(text)
            print(f"Billing Address in {filename}:\n{billing_address}\n")

# Set the directory containing the PDF files
pdf_directory = "/home/codespace/Extract-text-PDF-1/random_invoices"  # Replace with your actual path

# Process the PDFs in the directory
process_pdfs(pdf_directory)

Billing Address in Sharp Invoice 10003.pdf:
Active Aging Centre Golden LilyMethodist Welfare Service Simei St 51
Qty Description Unit Price Total
4 Metre Charge Black $0.50 $2.00
4 Metre Charge Colour $1.00 $4.00
$6.00Subtotal
$0.54tax
$6.54Total
$6.54Balance
Thank you for your business.

Billing Address in Sharp Invoice 10002.pdf:
Active Aging Centre Golden LilyMethodist Welfare Service Simei St 51
Qty Description Unit Price Total
3 Metre Charge Black $0.50 $1.50
4 Metre Charge Colour $1.00 $4.00
$5.50Subtotal
$0.50tax
$6.00Total
$6.00Balance
Thank you for your business.

Billing Address in Sharp Invoice 10000.pdf:
Active Aging Centre Teck GheeMethodist Welfare Services
Qty Description Unit Price Total
1 Metre Charge Black $0.50 $0.50
1 Metre Charge Colour $1.00 $1.00
$1.50Subtotal
$0.14tax
$1.64Total
$1.64Balance
Thank you for your business.

Billing Address in Sharp Invoice 10001.pdf:
Active Aging Centre Teck GheeMethodist Welfare Services
Qty Description Unit Price Total
1 Metre Ch

In [1]:
import os
import re
import shutil
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    
    Args:
    pdf_path (str): The path to the PDF file.
    
    Returns:
    str: The extracted text from the PDF.
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def extract_billing_address(text):
    """
    Extract the billing address following 'Bill To' in the extracted PDF text.
    
    Args:
    text (str): The extracted text from the PDF.
    
    Returns:
    str: The billing address found after 'Bill To'. If not found, returns 'Not found'.
    """
    match = re.search(r"Bill To[:\s]*(.*?)(?=\n\n|\n\s*\n|$)", text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return "Not found"

def sanitize_folder_name(name):
    """
    Sanitize the folder name by replacing invalid characters.
    
    Args:
    name (str): The original folder name.
    
    Returns:
    str: The sanitized folder name.
    """
    return re.sub(r'[<>:"/\\|?*]', '_', name)

def process_pdfs(directory):
    """
    Process all PDF files in the specified directory and sort them into folders based on billing address.
    
    Args:
    directory (str): The path to the directory containing PDF files.
    """
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return
    
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            text = extract_text_from_pdf(pdf_path)
            billing_address = extract_billing_address(text)
            print(f"Billing Address in {filename}:\n{billing_address}\n")
            
            if billing_address != "Not found":
                sanitized_address = sanitize_folder_name(billing_address)
                address_folder = os.path.join(directory, sanitized_address)
                
                if not os.path.exists(address_folder):
                    os.makedirs(address_folder)
                
                shutil.move(pdf_path, os.path.join(address_folder, filename))

# Set the directory containing the PDF files
pdf_directory = "/home/codespace/Extract-text-PDF-1/random_invoices"  # Replace with your actual path

# Process the PDFs in the directory
process_pdfs(pdf_directory)


Billing Address in Sharp Invoice 10003.pdf:
Active Aging Centre Golden LilyMethodist Welfare Service Simei St 51
Qty Description Unit Price Total
4 Metre Charge Black $0.50 $2.00
4 Metre Charge Colour $1.00 $4.00
$6.00Subtotal
$0.54tax
$6.54Total
$6.54Balance
Thank you for your business.

Billing Address in Sharp Invoice 10002.pdf:
Active Aging Centre Golden LilyMethodist Welfare Service Simei St 51
Qty Description Unit Price Total
3 Metre Charge Black $0.50 $1.50
4 Metre Charge Colour $1.00 $4.00
$5.50Subtotal
$0.50tax
$6.00Total
$6.00Balance
Thank you for your business.

Billing Address in Sharp Invoice 10000.pdf:
Active Aging Centre Teck GheeMethodist Welfare Services
Qty Description Unit Price Total
1 Metre Charge Black $0.50 $0.50
1 Metre Charge Colour $1.00 $1.00
$1.50Subtotal
$0.14tax
$1.64Total
$1.64Balance
Thank you for your business.

Billing Address in Sharp Invoice 10001.pdf:
Active Aging Centre Teck GheeMethodist Welfare Services
Qty Description Unit Price Total
1 Metre Ch