In [1]:
!pip install pdfplumber



In [2]:
import pdfplumber
import re

In [3]:
pdf_path = 'C:\\Users\\thaku\\OneDrive - Manipal University Jaipur\\Data Science Labs\\New folder (2)\\1000+ PDF_Invoice_Folder\\invoice_Aaron Bergman_36258.pdf'

In [4]:
with pdfplumber.open(pdf_path) as pdf:
    text=""
    for page in pdf.pages:
        text += page.extract_text() + "\n"

print("==Extracted Text==")
print(text)

# Extract invoice components using regex
invoice_data = {
    "Invoice Number": re.search(r'Invoice\s+#\s*(\d+)', text),
    "Date": re.search(r'Date:\s*([A-Za-z]{3}\s+\d{2}\s+\d{4})', text),
    "Ship Mode": re.search(r'Ship Mode:\s*(.*)', text),
    "Balance Due": re.search(r'Balance Due:\s*\$([\d.]+)', text),
    "Subtotal": re.search(r'Subtotal:\s*\$([\d.]+)', text),
    "Discount": re.search(r'Discount.*?:\s*\$([\d.]+)', text),
    "Shipping": re.search(r'Shipping:\s*\$([\d.]+)', text),
    "Total": re.search(r'Total:\s*\$([\d.]+)', text),
    "Customer Name": re.search(r'Bill To:\s*(.+)', text),
    "Ship To": re.search(r'Ship To:\s*(.+)', text),
    "Order ID": re.search(r'Order ID\s*:\s*(.+)', text)
}

# Clean extracted values
parsed_data = {}
for key, match in invoice_data.items():
    parsed_data[key] = match.group(1).strip() if match else "Not found"

# Print extracted results
print("\n=== Parsed Invoice Data ===")
for key, value in parsed_data.items():
    print(f"{key}: {value}")

==Extracted Text==
SuperStore INVOICE
# 36258
Date: Mar 06 2012
Bill To: Ship To:
Ship Mode: First Class
Aaron Bergman 98103, Seattle,
Washington, United
Balance Due: $50.10
States
Item Quantity Rate Amount
Global Push Button Manager's Chair, Indigo 1 $48.71 $48.71
Chairs, Furniture, FUR-CH-4421
Subtotal: $48.71
Discount (20%): $9.74
Shipping: $11.13
Total: $50.10
Notes:
Thanks for your business!
Terms:
Order ID : CA-2012-AB10015140-40974


=== Parsed Invoice Data ===
Invoice Number: Not found
Date: Mar 06 2012
Ship Mode: First Class
Balance Due: 50.10
Subtotal: 48.71
Discount: 9.74
Shipping: 11.13
Total: 50.10
Customer Name: Ship To:
Ship To: Ship Mode: First Class
Order ID: CA-2012-AB10015140-40974


In [21]:
import os
import pdfplumber
import re
import pandas as pd
import smtplib
from email.message import EmailMessage
from datetime import datetime

# --- STEP 1: Extract invoice data ---

pdf_folder = r"C:\Users\thaku\OneDrive - Manipal University Jaipur\Data Science Labs\New folder (2)\1000+ PDF_Invoice_Folder"

def extract_invoice_data(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    text = re.sub(r'\n+', '\n', text).strip()

    invoice_number = re.search(r'INVOICE\s*#\s*(\d+)', text, re.IGNORECASE)
    date = re.search(r'Date:\s*([A-Za-z]+\s+\d{1,2}\s+\d{4})', text)
    ship_mode = re.search(r'Ship Mode:\s*([A-Za-z ]+)', text)
    customer_name_match = re.search(r'Bill To:\s*\n?([^\n]+)', text)
    address_match = re.search(r'Ship To:\s*\n?([^\n]+)', text)
    total_amount_match = re.search(r'Total:\s*\$?([\d,]+\.\d{2})', text)

    data = {
        "File": os.path.basename(pdf_path),
        "Invoice Number": invoice_number.group(1) if invoice_number else "Not found",
        "Date": date.group(1) if date else "Not found",
        "Customer Name": customer_name_match.group(1).strip() if customer_name_match else "Not found",
        "Address": address_match.group(1).strip() if address_match else "Not found",
        "Ship Mode": ship_mode.group(1).strip() if ship_mode else "Not found",
        "Total Amount": total_amount_match.group(1).replace(",", "") if total_amount_match else "Not found"
    }
    return data

all_data = []
for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        full_path = os.path.join(pdf_folder, filename)
        invoice_info = extract_invoice_data(full_path)
        all_data.append(invoice_info)

df = pd.DataFrame(all_data)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Total Amount'] = pd.to_numeric(df['Total Amount'], errors='coerce')

summary = df.groupby('Date').agg({
    'Invoice Number': 'count',
    'Total Amount': 'sum'
}).rename(columns={
    'Invoice Number': 'Invoices Count',
    'Total Amount': 'Total Billed'
}).reset_index()

# Save Excel files
detail_file = "All_Invoices_Detail.xlsx"
summary_file = "Invoice_Summary_By_Date.xlsx"
df.to_excel(detail_file, index=False)
summary.to_excel(summary_file, index=False)

print("Invoice files generated.")

# --- STEP 2: Send email with attachments ---

YOUR_EMAIL = "aditi78sh@gmail.com"
YOUR_PASSWORD = "qlpr xqpz ytuk zgrw"  # Use App Password if using Gmail

recipients = ["thakurrahulsinghjdm123@gmail.com", "aditi829066@gmail.com", "hiirahulsingh@gmail.com"]
cc = ["rchemistryjdm@gmail.com"]
bcc = ["rahul.2314107786@mujonline.edu.in"]

msg = EmailMessage()
msg["Subject"] = "Invoice report - All vendors dated 04082025"
msg["From"] = YOUR_EMAIL
msg["To"] = ", ".join(recipients)
msg["Cc"] = ", ".join(cc)
msg["Bcc"] = ", ".join(bcc)

msg.set_content("""\
Dear All,

Please find the attached Invoice report for all the respective BU and request you to please clear the same before the due date to avoid charges.

Warm Regards,  
Technology Team
""")

# Attach files
for filepath in [detail_file, summary_file]:
    with open(filepath, "rb") as f:
        file_data = f.read()
        file_name = os.path.basename(filepath)
        msg.add_attachment(file_data, maintype="application", subtype="octet-stream", filename=file_name)

# SMTP Send
try:
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp:
        smtp.login(YOUR_EMAIL, YOUR_PASSWORD)
        smtp.send_message(msg)
    print("Email sent successfully.")
except Exception as e:
    print("Failed to send email:", e)


Invoice files generated.
Email sent successfully.
