In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
pip install PyPDF2 pdfplumber pytesseract opencv-python pdf2image



In [34]:
import os
import pandas as pd
import re
import csv
from typing import List, Dict
import glob

In [35]:
import pdfplumber
import re
from pdf2image import convert_from_path
import pytesseract

In [36]:
def extract_text_from_pdf(file_path):
    """Extract the embedded text layer from every page of a PDF.

    Args:
        file_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated text of all pages that yielded text, each page
        followed by a newline. Pages whose ``extract_text()`` result is empty
        or ``None`` contribute nothing.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract once per page and reuse the result; calling
            # extract_text() twice repeats the whole page extraction.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts)

In [37]:
def extract_text_from_scanned_pdf(file_path):
    """OCR every page of a PDF and return the recognised text.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the PDF to render and OCR.

    Returns:
        The OCR output of all pages in order, each page followed by a newline.
    """
    page_images = convert_from_path(file_path)
    ocr_chunks = [
        pytesseract.image_to_string(page_image) + "\n"
        for page_image in page_images
    ]
    return "".join(ocr_chunks)

In [38]:
def extract_text(file_path):
    """Return the text of a PDF, using OCR when no text layer is found.

    The first page is probed with pdfplumber. If it yields text, the whole
    document is read with ``extract_text_from_pdf``; otherwise (no pages, an
    empty first page, or any ``Exception`` while probing/extracting) the
    document is read with ``extract_text_from_scanned_pdf``.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The extracted text as a single string.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    with open(file_path, "rb") as f:
        try:
            # The context manager closes the pdfplumber document once the
            # probe is finished instead of leaving it open.
            with pdfplumber.open(f) as pdf:
                pages = pdf.pages
                has_text_layer = bool(pages) and bool(pages[0].extract_text())
            if has_text_layer:
                return extract_text_from_pdf(file_path)
        except Exception:
            # Deliberate best-effort: any parsing failure means "treat it as a
            # scan". Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
    # OCR runs outside the try block so a failing OCR pass is not retried.
    return extract_text_from_scanned_pdf(file_path)

In [39]:
def extract_between(text: str, start: str, end: str) -> str:
    """Return the stripped text found between two literal markers.

    Both markers are matched literally (regex-escaped). The capture is
    non-greedy and may span newlines, so the first ``end`` after the first
    ``start`` closes the span.

    Args:
        text: Text to search.
        start: Literal marker that opens the span.
        end: Literal marker that closes the span.

    Returns:
        The whitespace-stripped text between the markers, or ``''`` when the
        markers are not found in that order.
    """
    between = re.compile(re.escape(start) + '(.*?)' + re.escape(end), re.DOTALL)
    found = between.search(text)
    if found is None:
        return ''
    return found.group(1).strip()

In [40]:
def extract_invoice_data(text: str) -> Dict[str, str]:
    """Pull the invoice header fields out of extracted invoice text.

    Each output column is described by one entry of ``field_specs``:

    * a regex string: the first capture group of the first match is used;
    * a ``(start, end)`` tuple: the text between the two literal markers is
      used (via ``extract_between``).

    Args:
        text: Full text of one invoice.

    Returns:
        A dict with one entry per field, in the order the fields are declared
        below. Fields that cannot be found map to ``''``.
    """
    field_specs = {
        'invoice_number': r'Invoice #:\s*(INV-\d+)',
        'invoice_date': r'Invoice Date:\s*(\d{2}\s+\w{3}\s+\d{4})',
        'due_date': r'Due Date:\s*(\d{2}\s+\w{3}\s+\d{4})',
        'gstin': r'GSTIN\s*(\S+)',
        'mobile': r'Mobile\s*(\+?\d[\d\s-]{10,})',
        'email': r'Email\s*(\S+@\S+)',
        'customer_name': ('Customer Details:', 'Place of Supply:'),
        'place_of_supply': ('Place of Supply:', '# Item'),
        'total_amount': r'Total\s*₹([\d,.]+)',
        'total_discount': r'Total Discount\s*₹([\d,.]+)',
        'total_items_qty': r'Total Items / Qty\s*:\s*([\d.]+\s*/\s*[\d.]+)',
        'amount_in_words': ('Total amount (in words):', 'Amount Paid'),
        'bank_name': ('Bank:', 'Account #:'),
        'account_number': r'Account #:\s*(\d+)',
        'ifsc_code': r'IFSC Code:\s*(\S+)',
        'branch': ('Branch:', 'Authorized Signatory'),
    }

    def resolve(spec):
        # Marker pairs go through extract_between; everything else is a regex.
        if isinstance(spec, tuple):
            opening, closing = spec
            return extract_between(text, opening, closing)
        hit = re.search(spec, text, re.MULTILINE | re.DOTALL)
        return hit.group(1).strip() if hit else ''

    # Dict comprehension keeps the declaration order of field_specs.
    return {field: resolve(spec) for field, spec in field_specs.items()}

In [41]:
def extract_item_data(text: str) -> List[Dict[str, str]]:
    """Parse the line-item table of an invoice into a list of item dicts.

    The table is the text between the header line
    ``# Item Rate / Item Qty Taxable Value Tax Amount Amount`` and the first
    following ``Taxable Amount``. Each item is one "item line" (name, rate,
    qty, qty+unit, taxable value, tax amount with percentage, amount),
    optionally followed by a "discount line" of the form
    ``[name continuation] <original rate> (<discount>%)``.

    Args:
        text: Full text of one invoice.

    Returns:
        One dict per recognised item line with the keys ``name``, ``rate``,
        ``original_rate``, ``discount``, ``qty``, ``qty_unit``,
        ``taxable_value``, ``tax_amount`` and ``amount`` (all strings).
        ``original_rate`` and ``discount`` are ``''`` when the item has no
        discount line. Returns ``[]`` when the table is not found.
    """
    item_section = re.search(
        r'# Item Rate / Item Qty Taxable Value Tax Amount Amount(.*?)Taxable Amount',
        text,
        re.DOTALL,
    )
    if not item_section:
        return []

    item_pattern = re.compile(
        r'(.*?)\s+([\d.]+)\s+(\d+)\s+(\d+\s+\w+)\s+([\d.]+)\s+([\d.]+\s+\(\d+%\))\s+([\d.]+)$'
    )
    # The name continuation is optional so a bare "100.00 (-10%)" line is
    # still recognised as a discount line.
    discount_pattern = re.compile(r'(?:(.*?)\s+)?([\d.]+)\s+\(([-\d.]+)%\)$')

    item_lines = [line.strip() for line in item_section.group(1).strip().split('\n')]
    items = []
    i = 0
    while i < len(item_lines):
        match = item_pattern.match(item_lines[i])
        if not match:
            # Not an item line: advance by one so a stray line cannot
            # desynchronise the item/discount pairing.
            i += 1
            continue

        name, rate, qty, qty_unit, taxable_value, tax_amount, amount = match.groups()

        # Reset per item so a missing discount line can neither raise
        # UnboundLocalError nor leak the previous item's values.
        original_rate = ''
        discount = ''
        consumed = 1

        if i + 1 < len(item_lines):
            discount_match = discount_pattern.match(item_lines[i + 1])
            if discount_match:
                name_continuation = (discount_match.group(1) or '').strip()
                original_rate = discount_match.group(2)
                discount = discount_match.group(3)
                # A wrapped item name continues on the discount line.
                if name_continuation:
                    name += ' ' + name_continuation
                consumed = 2

        items.append({
            'name': name,
            'rate': rate,
            'original_rate': original_rate,
            'discount': discount,
            'qty': qty,
            'qty_unit': qty_unit,
            'taxable_value': taxable_value,
            'tax_amount': tax_amount,
            'amount': amount,
        })
        # Skip the discount line only when one was actually consumed.
        i += consumed

    return items

In [42]:
def process_invoice(text: str, invoice_csv: str, items_csv: str):
    """Parse one invoice's text and write the results to two CSV files.

    Args:
        text: Full text of one invoice.
        invoice_csv: Path of the CSV that receives the header fields (a header
            row plus one data row). The file is overwritten.
        items_csv: Path of the CSV that receives the line items (a header row
            plus one row per item). Only written, and overwritten, when at
            least one item was parsed.
    """
    header_record = extract_invoice_data(text)
    line_items = extract_item_data(text)
    print(line_items)

    def dump_rows(csv_path, rows):
        # Columns come from the first row's keys, in their dict order.
        columns = rows[0].keys()
        with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            out.writerows(rows)

    dump_rows(invoice_csv, [header_record])

    if not line_items:
        return
    dump_rows(items_csv, line_items)

In [43]:
def extract_invoice(file_path, invoice_csv='invoice_data.csv', items_csv='item_data.csv'):
    """Extract one invoice PDF and write its data to CSV files.

    Args:
        file_path: Path of the invoice PDF.
        invoice_csv: Output path for the invoice header fields. Defaults to
            ``'invoice_data.csv'``.
        items_csv: Output path for the invoice line items. Defaults to
            ``'item_data.csv'``.

    Pass distinct output paths when processing several invoices; with the
    defaults every call targets the same two file names.
    """
    extracted_text = extract_text(file_path)
    process_invoice(extracted_text, invoice_csv, items_csv)

In [44]:
# Input location: either a single invoice PDF or a folder containing invoice PDFs.
folder_path = '/content/drive/MyDrive/Jan to Mar/INV-150_Bhusan Naresh.pdf'
pattern = os.path.join(folder_path, "*.pdf")

# When folder_path names a PDF file, "<file>.pdf/*.pdf" matches nothing and no
# invoice would be processed, so handle the single-file case directly.
if os.path.isfile(folder_path):
    pdf_paths = [folder_path]
else:
    pdf_paths = sorted(glob.glob(pattern))

# NOTE: extract_invoice(pdf) writes to 'invoice_data.csv' / 'item_data.csv', so
# when several PDFs are processed each one overwrites the previous output.
for pdf in pdf_paths:
    extract_invoice(pdf)