In [2]:
import pdfplumber
import re
from typing import List, Optional
import pyperclip

def extract_invoice_data(pdf_path):
    """
    Extracts specific invoice details from the first page of a PDF file.

    The invoice and order numbers are read from the tables on the page; the
    weight, quantity and amount totals are read from the page's plain text.
    Extraction is best-effort: a field that cannot be found is left out of
    the result, and any error is printed instead of being raised.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        dict: A dictionary containing the extracted data. Possible keys are
            'Invoice Number' and 'Order Number' (strings) and
            'Total Net Mass', 'Total Gross Mass', 'Total Packages' and
            'Total Invoice' (values produced by convert_to_float). If an
            error occurs, whatever was collected up to that point is
            returned (possibly an empty dict).
    """
    data = {}

    # A number with optional thousands commas and optional decimals, so that
    # "950", "7,669", "1,234,567" and "12,249.60" are all accepted.
    number = r'\d[\d,]*(?:\.\d+)?'
    # Output key -> regex whose group 1 is the "<number> <unit>" text.
    total_patterns = {
        'Total Net Mass': rf'Net Total Weight:\s*({number}\s*KG)',
        'Total Gross Mass': rf'Gross Total Weight:\s*({number}\s*KG)',
        'Total Packages': rf'Total Quantity:\s*({number}\s*PL)',
        'Total Invoice': rf'Total Amount:\s*({number}\s*GBP)',
    }

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # We know the required data is on the first page
            first_page = pdf.pages[0]

            # The invoice/order numbers live in tables on the page, so scan
            # every row of every table for the label cells.
            for table in first_page.extract_tables():
                for row in table:
                    # Empty cells come back as None; normalise them to ''
                    clean_row = [cell.strip() if cell else '' for cell in row]

                    if "Invoice Number" in clean_row:
                        invoice_number = extract_value(clean_row)
                        if invoice_number:
                            data['Invoice Number'] = invoice_number
                    elif "Order Number" in clean_row:
                        order_number = extract_value(clean_row)
                        # Check for a value BEFORE touching it: a row with the
                        # label but no digit-bearing cell yields None, and
                        # calling .replace() on it used to abort the whole
                        # extraction (including the totals below).
                        if order_number:
                            data['Order Number'] = order_number.replace('SI - ', 'WEPA')

            # Guard against a page without a text layer, for which
            # extract_text() may not return a string.
            text = first_page.extract_text() or ''

            for output_key, pattern in total_patterns.items():
                match = re.search(pattern, text)
                if match:
                    data[output_key] = convert_to_float(match.group(1).strip())

    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
    except Exception as e:
        # Deliberately broad: this is a best-effort extraction helper.
        print(f"An error occurred: {e}")

    return data

def extract_value(my_list: List[str]) -> Optional[str]:
    """
    Returns the first string in a list that has at least one digit in it.

    Args:
        my_list: The strings to scan, in order.

    Returns:
        The earliest element containing a digit, or None when no element
        contains one (including when the list is empty).
    """
    for candidate in my_list:
        # Stop at the first hit; later elements are never inspected.
        if re.search(r'\d', candidate) is not None:
            return candidate
    return None

def convert_to_float(value_str: str) -> Optional[float]:
    """
    Extracts the first number found in a string and converts it to a float.

    Commas are treated as thousands separators and removed before parsing.
    Only the first numeric token is used, so surrounding text such as units
    or trailing punctuation is ignored and digits belonging to separate
    numbers are never glued together. A minus sign directly in front of the
    number is honoured unless it is attached to a preceding word character
    (so the hyphen in "REF-123" is a separator, not a sign).

    Args:
        value_str: The string to convert, e.g. "12,249.60 GBP".

    Returns:
        The converted float value, or None if value_str is not a string or
        contains no number.
    """
    if not isinstance(value_str, str):
        return None

    without_commas = value_str.replace(',', '')
    # Optional sign, then either digits with an optional fraction ("12",
    # "12.", "12.5") or a bare fraction (".5"). Every such token is valid
    # input for float().
    number = re.search(
        r'(?:(?<!\w)-)?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)',
        without_commas,
    )
    if number is None:
        return None
    return float(number.group(0))

In [3]:
# Run the invoice extraction on a sample invoice.
# Replace 'SI 25818067 - Customer Invoice Aldi UK.pdf' with your file path.
# invoice_data is a dict of the fields that could be extracted; if the file
# cannot be read, an error is printed and the dict may be empty.
invoice_data = extract_invoice_data('SI 25818067 - Customer Invoice Aldi UK.pdf')

In [12]:
def format_string_with_spaces(s: str) -> str:
    """
    Regroups a string into blocks of four characters separated by spaces.

    All existing whitespace (spaces, tabs, line breaks) is discarded first,
    so the grouping depends only on the non-whitespace characters. The last
    block may be shorter than four characters.

    Args:
        s: The string to reformat, e.g. "25GB9IBF2TORTUNAR2".

    Returns:
        The regrouped string, e.g. "25GB 9IBF 2TOR TUNA R2". An empty or
        whitespace-only input yields an empty string.
    """
    # str.split() with no argument splits on any run of whitespace, so
    # joining the pieces removes tabs and newlines as well as spaces.
    compact = "".join(s.split())

    # Slice the compact string in steps of 4 and join the slices with spaces
    return " ".join(compact[i:i + 4] for i in range(0, len(compact), 4))

def extract_mrn_from_pdf(pdf_path):
    """
    Extracts the MRN number from the first page of a PDF file.

    The page text is searched for the label "MRN:" and the value that
    follows it. The value is regrouped with format_string_with_spaces and
    returned with an 'MRN:' prefix.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The formatted MRN (e.g. 'MRN:25GB 9IBF 2TOR TUNA R2'), or None
            if no MRN value is found or the file cannot be read (in which
            case an error message is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # The MRN is on the first page
            first_page = pdf.pages[0]
            # Guard against a page without a text layer, for which
            # extract_text() may not return a string.
            text = first_page.extract_text() or ''

            # "MRN:" followed by alphanumeric characters and spaces.
            # The value must start with an alphanumeric character and may
            # only continue with alphanumerics and plain spaces, so the
            # match stops at the end of the line instead of running on into
            # whatever text follows on the next lines.
            match = re.search(r'MRN:\s*([a-zA-Z0-9][a-zA-Z0-9 ]*)', text)

            if match is None:
                return None

            # Group 1 of the regex captures the number itself
            return 'MRN:' + format_string_with_spaces(match.group(1).strip())

    except FileNotFoundError:
        print(f"Error: The file '{pdf_path}' was not found.")
        return None
    except Exception as e:
        # Deliberately broad: this is a best-effort extraction helper.
        print(f"An error occurred: {e}")
        return None

# Replace 'CDS-Import-31176317-20250828-1440.pdf' with your file path
pdf_file = 'CDS-Import-31176317-20250828-1440.pdf'
# mrn_number is the formatted 'MRN:...' string, or None when no MRN was found
# or the file could not be read.
mrn_number = extract_mrn_from_pdf(pdf_file)

In [13]:
def generate_and_copy_email_message(order_number: str, mrn_number: str):
    """
    Builds the standard "import entry docs" e-mail text and puts it on the
    clipboard.

    Args:
        order_number: Reference inserted after "Your Ref".
        mrn_number: MRN line inserted into the body, followed by a period.

    Returns:
        The generated message string. It is returned whether or not the
        clipboard copy succeeded; the outcome of the copy is only printed.
    """
    message = f"""
Hello,

Your Ref {order_number}.

Please kindly find attached import entry docs.

{mrn_number}.

Thank you.
"""

    # Copying is best-effort: a clipboard failure is reported, not raised.
    try:
        pyperclip.copy(message)
    except pyperclip.PyperclipException:
        print("Could not copy to clipboard. Please check your pyperclip installation.")
    else:
        print("Message successfully copied to clipboard. You can now paste it.")

    return message



In [11]:
# Display the extracted invoice fields.
invoice_data

{'Invoice Number': '25809840 - RI',
 'Order Number': 'WEPA25818067',
 'Total Net Mass': 7669.0,
 'Total Gross Mass': 9777.0,
 'Total Packages': 66.0,
 'Total Invoice': 12249.6}

In [14]:
# Build the e-mail text, copy it to the clipboard and show it.
# The 'WEPA' prefix that extract_invoice_data substitutes for 'SI - ' is
# stripped again here so that only the bare order number is quoted.
# NOTE(review): this raises KeyError when 'Order Number' was not extracted, and
# a None mrn_number is rendered literally as "None." in the message — confirm
# both values before sending.
msg = generate_and_copy_email_message(invoice_data['Order Number'].replace('WEPA',''), mrn_number)
print(msg)

Message successfully copied to clipboard. You can now paste it.

Hello,

Your Ref 25818067.

Please kindly find attached import entry docs.

MRN:25GB 9IBF 2TOR TUNA R2.

Thank you.

