SHIPMENT DATE ETD

In [2]:
import fitz  # PyMuPDF
import re
import sys
from datetime import datetime, timedelta

def replace_shipment_date(input_pdf, output_pdf):
    """
    Replace shipment date in a PDF document with a date 15 days before the original date
    
    Args:
        input_pdf (str): Path to input PDF file
        output_pdf (str): Path to output PDF file
    """
    # Open the PDF document
    doc = fitz.open(input_pdf)
    
    # Extract text from the first page to find the shipment date
    text = doc[0].get_text("text")
    
    # Regex pattern to find the shipment date (MM-DD-YYYY format)
    # This pattern looks for "Shipment Date ETD" followed by the date
    date_pattern = r"Shipment Date ETD\s*(\d{2}-\d{2}-\d{4})"
    match = re.search(date_pattern, text)
    
    if not match:
        print("Shipment date not found in the PDF")
        doc.close()
        return False
    
    old_shipment_date = match.group(1)
    print(f"Found shipment date: {old_shipment_date}")
    
    # Parse the original date and subtract 15 days
    original_date = datetime.strptime(old_shipment_date, "%m-%d-%Y")
    new_date = original_date - timedelta(days=15)
    new_shipment_date = new_date.strftime("%m-%d-%Y")
    print(f"New shipment date (15 days before): {new_shipment_date}")
    
    # Iterate through each page to find and replace the date
    for page_num in range(len(doc)):
        page = doc[page_num]
        
        # Search for the old shipment date text
        text_instances = page.search_for(old_shipment_date)
        
        # Remove each found instance
        for inst in text_instances:
            # Add a white rectangle to cover the old date
            page.add_redact_annot(inst, fill=(1, 1, 1))  # White fill
            
        # Apply the redactions (remove the text)
        page.apply_redactions()
        
        # Now add the new shipment date at the first found location
        if text_instances:
            # Use the position of the first found instance
            rect = text_instances[0]
            
            # Insert the new text at the same position
            page.insert_text(
                (rect.x0, rect.y0+8.5),  # Position
                new_shipment_date,   # New text
                fontsize=7.5,          # Adjust font size to match original
                color=(0, 0, 0)      # Black text
            )
    
    # Save the document
    doc.save(output_pdf)
    doc.close()
    print(f"Successfully processed PDF. Saved as: {output_pdf}")
    return True

if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO.pdf"
    output_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_edited.pdf"
    
    # Replace shipment date
    replace_shipment_date(input_file, output_file)

Found shipment date: 05-30-2025
New shipment date (15 days before): 05-15-2025
Successfully processed PDF. Saved as: C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_edited.pdf


PRICES

In [4]:
import fitz  # PyMuPDF
import re
import sys
from datetime import datetime, timedelta

def replace_price(doc):
    """
    Replace all prices in the PDF document by reducing each by 0.75
    
    Args:
        doc: PyMuPDF document object
    """
    # Extract text from all pages to find all prices
    all_text = ""
    for page_num in range(len(doc)):
        all_text += doc[page_num].get_text("text") + "\n"
    
    # Regex pattern to find all prices (looks for "Price" followed by the value)
    price_pattern = r"Price\s*([\d.,]+)"
    matches = re.findall(price_pattern, all_text, re.IGNORECASE)
    
    if not matches:
        print("No prices found in the PDF")
        return False
    
    print(f"Found {len(matches)} prices: {matches}")
    
    # Create a list of price conversions
    price_conversions = []
    
    for old_price_text in matches:
        # Clean and convert the price
        cleaned_price = old_price_text.replace(',', '.')
        dot_count = cleaned_price.count('.')
        
        # If there's more than one dot, remove all but the last one
        if dot_count > 1:
            parts = [part for part in cleaned_price.split('.') if part]
            if len(parts) > 1:
                cleaned_price = ''.join(parts[:-1]) + '.' + parts[-1]
            else:
                cleaned_price = ''.join(parts)
        
        # Convert to float and reduce by 0.75
        try:
            old_price_float = float(cleaned_price)
            new_price_float = old_price_float - 0.75
            # Format back to European style (comma as decimal separator)
            new_price_text = f"{new_price_float:.2f}".replace('.', ',')
            price_conversions.append((old_price_text, new_price_text))
            print(f"Price {old_price_text} → {new_price_text}")
        except ValueError:
            print(f"Could not convert price to float: {cleaned_price}")
            continue
    
    # Sort by length (longest first) to avoid partial replacements
    price_conversions.sort(key=lambda x: len(x[0]), reverse=True)
    
    # Iterate through each page to find and replace all prices
    for page_num in range(len(doc)):
        page = doc[page_num]
        
        # First pass: find all instances and create redaction annotations
        annotations = []
        for old_price_text, new_price_text in price_conversions:
            # Search for the old price text on this page
            text_instances = page.search_for(old_price_text)
            
            for inst in text_instances:
                annotations.append((inst, new_price_text))
        
        # Apply all redactions at once
        for rect, new_text in annotations:
            page.add_redact_annot(rect, fill=(1, 1, 1))  # White fill
        
        page.apply_redactions()
        
        # Second pass: insert new text
        for rect, new_text in annotations:
            page.insert_text(
                (rect.x0, rect.y0+9),  # Position
                new_text,            # New text
                fontsize=7.5,          # Adjust font size to match original
                color=(0, 0, 0)      # Black text
            )
    
    return True

def process_pdf(input_pdf, output_pdf):
    """
    Main function to process PDF - replace all prices
    """
    # Open the PDF document
    doc = fitz.open(input_pdf)
    
    # Process all prices
    success = replace_price(doc)
    if not success:
        print("Price replacement failed")
    
    # Save the document
    doc.save(output_pdf)
    doc.close()
    print(f"Successfully processed PDF. Saved as: {output_pdf}")
    return True

if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO.pdf"
    output_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_editedqdt.pdf"
    
    # Process both shipment date and price
    process_pdf(input_file, output_file)

Found 8 prices: ['4,81', '5,11', '5,31', '5,56', '4,56', '4,81', '5,16', '5,51']
Price 4,81 → 4,06
Price 5,11 → 4,36
Price 5,31 → 4,56
Price 5,56 → 4,81
Price 4,56 → 3,81
Price 4,81 → 4,06
Price 5,16 → 4,41
Price 5,51 → 4,76
Successfully processed PDF. Saved as: C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_editedqdt.pdf


Amount

In [5]:
import fitz  # PyMuPDF
import re

def remove_amount_values_only(doc):
    """
    Remove only the numeric values following the 'Amount' text 
    (keep 'Amount' word and currency symbol)
    
    Args:
        doc: PyMuPDF document object
    """
    # Extract text from all pages to find all amounts
    all_text = ""
    for page_num in range(len(doc)):
        all_text += doc[page_num].get_text("text") + "\n"
    
    # Regex pattern to find the numeric values after "Amount"
    # This captures only the numeric part (digits, commas, dots) before the currency
    amount_pattern = r"Amount\s*([\d.,]+)\s*([A-Z]{3})"
    matches = re.findall(amount_pattern, all_text, re.IGNORECASE)
    
    if not matches:
        print("No amounts found in the PDF")
        return False
    
    print(f"Found {len(matches)} amounts to process:")
    for numeric_part, currency in matches:
        print(f"  {numeric_part} {currency}")
    
    # Extract just the numeric parts to remove (remove duplicates)
    numeric_values_to_remove = list(set([match[0] for match in matches]))
    numeric_values_to_remove.sort(key=len, reverse=True)
    
    # Iterate through each page to find and remove only the numeric values
    for page_num in range(len(doc)):
        page = doc[page_num]
        
        redaction_rects = []
        for numeric_value in numeric_values_to_remove:
            # Search for the numeric value text on this page
            text_instances = page.search_for(numeric_value)
            redaction_rects.extend(text_instances)
        
        # Apply all redactions at once
        for rect in redaction_rects:
            page.add_redact_annot(rect, fill=(1, 1, 1))  # White fill
        
        if redaction_rects:
            page.apply_redactions()
            print(f"Removed {len(redaction_rects)} numeric values from page {page_num + 1}")
    
    return True

def process_pdf(input_pdf, output_pdf):
    """
    Main function to process PDF - remove numeric values only
    """
    # Open the PDF document
    doc = fitz.open(input_pdf)
    
    # Process all numeric values
    success = remove_amount_values_only(doc)
    if not success:
        print("Numeric value removal failed")
    
    # Save the document
    doc.save(output_pdf)
    doc.close()
    print(f"Successfully processed PDF. Saved as: {output_pdf}")
    return True

if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO.pdf"
    output_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_no_values.pdf"
    
    # Process PDF to remove numeric values only
    process_pdf(input_file, output_file)

Found 8 amounts to process:
  4.713,80 USD
  16.556,40 USD
  1.991,25 USD
  17.013,60 USD
  8.116,80 USD
  48.677,20 USD
  4.179,60 USD
  16.695,30 USD
Removed 3 numeric values from page 1
Removed 5 numeric values from page 2
Successfully processed PDF. Saved as: C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_no_values.pdf


Total USD

In [6]:
import fitz  # PyMuPDF
import re

def remove_total_usd_value(doc):
    """
    Remove only the numerical value following the first 'Total USD' text
    
    Args:
        doc: PyMuPDF document object
    """
    # Extract text from all pages to find the Total USD value
    all_text = ""
    for page_num in range(len(doc)):
        all_text += doc[page_num].get_text("text") + "\n"
    
    # Regex pattern to find the numeric value after "Total USD"
    total_pattern = r"Total USD\s*([\d.,]+)"
    match = re.search(total_pattern, all_text, re.IGNORECASE)
    
    if not match:
        print("Total USD not found in the PDF")
        return False
    
    numeric_value = match.group(1)
    print(f"Found Total USD value: {numeric_value}")
    
    # Iterate through each page to find and remove the numeric value
    found = False
    for page_num in range(len(doc)):
        if found:
            break  # Stop after finding and processing the first occurrence
            
        page = doc[page_num]
        
        # Search for the numeric value on this page
        text_instances = page.search_for(numeric_value)
        
        if text_instances:
            # Apply redaction to remove the numeric value
            for rect in text_instances:
                page.add_redact_annot(rect, fill=(1, 1, 1))  # White fill
            
            page.apply_redactions()
            print(f"Removed Total USD value '{numeric_value}' from page {page_num + 1}")
            found = True
    
    if not found:
        print("Could not locate the Total USD value on any page for removal")
        return False
    
    return True

def process_pdf(input_pdf, output_pdf):
    """
    Main function to process PDF - remove Total USD numeric value
    """
    # Open the PDF document
    doc = fitz.open(input_pdf)
    
    # Process Total USD value
    success = remove_total_usd_value(doc)
    if not success:
        print("Total USD value removal failed")
    
    # Save the document
    doc.save(output_pdf)
    doc.close()
    print(f"Successfully processed PDF. Saved as: {output_pdf}")
    return True

if __name__ == "__main__":
    # Example usage
    input_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO.pdf"
    output_file = r"C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_no_total.pdf"
    
    # Process PDF to remove Total USD value
    process_pdf(input_file, output_file)

Found Total USD value: 117.943,95
Removed Total USD value '117.943,95' from page 2
Successfully processed PDF. Saved as: C:\Users\Altersense\Desktop\ERP-RPA\Sample\Other Samples\LIDL Polo_55th Round_Manami_JF-05B41688_Light Blue_DE_PO_no_total.pdf
