In [2]:
# Environment setup: Python packages first, then the Tesseract system binary.
# NOTE(review): none of the cells visible here import transformers, datasets,
# spacy, torch, torchvision or googletrans — confirm they are still needed.
!pip install transformers datasets
!pip install pandas openpyxl pytesseract spacy transformers torch torchvision googletrans==4.0.0-rc1
!python -m spacy download en_core_web_sm
# pytesseract only wraps the `tesseract` executable, so the binary is installed via apt.
!apt-get update
!apt-get install -y tesseract-ocr
# NOTE(review): pytesseract is already installed two commands earlier; this line looks redundant.
!pip install pytesseract
# Only opencv-python is new here; pytesseract, pandas and openpyxl repeat earlier installs.
!pip install pytesseract opencv-python pandas openpyxl
# Print the Tesseract version to confirm the binary is on PATH.
!tesseract -v

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
import cv2
import pytesseract
import pandas as pd
import re
import os
from PIL import Image

In [7]:
def preprocess_image(image_path):
    """Load an image from disk and binarise it ahead of OCR.

    Parameters
    ----------
    image_path : str
        Path passed straight to ``cv2.imread``.

    Returns
    -------
    The thresholded image produced by ``cv2.threshold``, or ``None`` (after
    printing a message) when ``cv2.imread`` returns ``None``.
    """
    source = cv2.imread(image_path)
    if source is not None:
        # Greyscale -> 5x5 Gaussian blur -> binary threshold with Otsu's flag set.
        blurred = cv2.GaussianBlur(
            cv2.cvtColor(source, cv2.COLOR_BGR2GRAY), (5, 5), 0
        )
        threshold_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
        # cv2.threshold returns a pair; the image is the second item.
        return cv2.threshold(blurred, 0, 255, threshold_mode)[1]

    print(f"Failed to load image: {image_path}")
    return None

In [6]:
def extract_text_tesseract(image_path):
    """OCR an image file with Tesseract after running ``preprocess_image`` on it.

    Parameters
    ----------
    image_path : str
        Path of the image to read.

    Returns
    -------
    The string returned by ``pytesseract.image_to_string``, or ``None`` when
    ``preprocess_image`` returned ``None``.
    """
    binarised = preprocess_image(image_path)
    # "--psm 6" selects Tesseract page-segmentation mode 6.
    return (
        None
        if binarised is None
        else pytesseract.image_to_string(binarised, config="--psm 6")
    )

def extract_invoice_fields(text):
    """Extract the known invoice fields from raw OCR text with regular expressions.

    Parameters
    ----------
    text : str or None
        OCR output for a single invoice. ``None`` or an empty string is
        accepted and produces the all-default result.

    Returns
    -------
    dict
        Keys ``invoice_no``, ``invoice_date``, ``total_amount``,
        ``base_amount``, ``tax_amount``, ``recipient_name``, ``sender_name``,
        ``gstin``, ``recipient_address``, ``sender_address`` and ``contact``.
        Each value is the whitespace-stripped first capture group of the
        field's pattern, or ``"Not Found"`` when the pattern does not match
        (or captures only whitespace).
    """
    fields = {
        "invoice_no": "Not Found",
        "invoice_date": "Not Found",
        "total_amount": "Not Found",
        "base_amount": "Not Found",
        "tax_amount": "Not Found",
        "recipient_name": "Not Found",
        "sender_name": "Not Found",
        "gstin": "Not Found",
        "recipient_address": "Not Found",
        "sender_address": "Not Found",
        "contact": "Not Found",
    }

    # Nothing to search; also avoids the TypeError re.search raises on None.
    if not text:
        return fields

    patterns = {
        "invoice_no": r"Serial No\. of Invoice:\s*(\d+)",
        "invoice_date": r"Invoice Date\s*[:\-]?\s*(\d{1,2}-[A-Za-z]{3}-\d{4})",
        "total_amount": r"GRAND TOTAL\s*([\d,]+(?:\.\d{2})?)",
        "tax_amount": r"IGST\s*[@]?\s*[\d%]+\s*([\d,]+(?:\.\d{2})?)",
        # The lookbehind stops a bare TOTAL from matching inside "GRAND TOTAL",
        # which would otherwise report the grand total as the base amount.
        "base_amount": r"(?<!GRAND\s)TOTAL\s*([\d,]+(?:\.\d{2})?)",
        # Names are captured from a single line ([\w \t], no newline) so the
        # match cannot run on into the address lines that follow.
        # NOTE(review): when OCR puts the "Billed to" and "Shipped to" headers
        # on one line (two-column layout), the capture is the neighbouring
        # header text rather than the name; that layout is not handled here.
        "recipient_name": r"Details of Receiver \(Billed to\)\s*([\w \t]+)",
        "sender_name": r"Details of Consignee \(Shipped to\)\s*([\w \t]+)",
        "gstin": r"GSTIN\s*[:\-]?\s*([\w\d]+)",
        "recipient_address": r"Details of Receiver \(Billed to\)[\s\S]*?Address\s*([\w\s,]+)",
        "sender_address": r"Details of Consignee \(Shipped to\)[\s\S]*?Address\s*([\w\s,]+)",
        "contact": r"Contact\s*[:\-]?\s*([\d,]+)",
    }

    for field, pattern in patterns.items():
        # OCR frequently gets letter case wrong (e.g. "invoice Date"), so
        # labels are matched case-insensitively.
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match is None:
            continue
        value = match.group(1).strip()
        if value:
            fields[field] = value

    return fields


In [8]:
# Maps the snake_case keys returned by extract_invoice_fields to the worksheet's
# column headers. Iteration order is the column order of a newly created sheet.
_INVOICE_COLUMNS = {
    "invoice_no": "Invoice No",
    "invoice_date": "Invoice Date",
    "total_amount": "Total Invoice Amount",
    "tax_amount": "Tax Amount",
    "base_amount": "Base Amount",
    "recipient_name": "Recipient Name",
    "sender_name": "Sender Name",
    "gstin": "GSTIN",
    "recipient_address": "Recipient Address",
    "sender_address": "Sender Address",
    "contact": "Contact",
}


def export_to_excel(extracted_data, excel_path):
    """Append one invoice record as a row of an Excel workbook.

    Parameters
    ----------
    extracted_data : dict
        Field name -> value. Keys listed in ``_INVOICE_COLUMNS`` are renamed
        to their column header; any other key is kept as its own column, so
        dicts that already use the header names still work.
    excel_path : str
        Workbook to append to. It is read first when it exists and created
        otherwise.

    Returns
    -------
    None. Any exception is caught and reported with ``print`` so a failed
    export does not abort the calling cell.
    """
    try:
        # Without this renaming the row's keys never line up with the sheet's
        # headers, and the concat yields a second, parallel set of columns.
        record = {
            _INVOICE_COLUMNS.get(key, key): value
            for key, value in extracted_data.items()
        }
        headers = list(_INVOICE_COLUMNS.values())
        headers += [key for key in record if key not in headers]
        new_row = pd.DataFrame([record], columns=headers)

        if os.path.exists(excel_path):
            df = pd.concat([pd.read_excel(excel_path), new_row], ignore_index=True)
        else:
            # Start from the row itself instead of concatenating onto an empty frame.
            df = new_row

        df.to_excel(excel_path, index=False)
        print(f"Data saved to {excel_path}")
    except Exception as e:
        print(f"Error while exporting to Excel: {e}")

In [9]:
# Entry point: OCR one invoice image, parse it, and append the result to a workbook.
def process_invoice(image_path, excel_path):
    """Run the OCR -> field extraction -> Excel export pipeline for one image.

    Parameters
    ----------
    image_path : str
        Image handed to ``extract_text_tesseract``.
    excel_path : str
        Workbook path handed to ``export_to_excel``.

    Returns
    -------
    None. Progress, the raw OCR text and the extracted fields are printed.
    When the OCR step yields nothing (``None`` or an empty string) a message
    is printed and no export is attempted.
    """
    print("Extracting text using Tesseract OCR...")
    ocr_text = extract_text_tesseract(image_path)

    if not ocr_text:
        print("No text extracted from the image.")
        return

    print("OCR Output:\n", ocr_text)
    parsed = extract_invoice_fields(ocr_text)
    print("Extracted Fields:", parsed)
    export_to_excel(parsed, excel_path)

In [11]:
# Hard-coded input image and output workbook for this run.
# NOTE(review): /content looks like a Colab workspace path — adjust when running elsewhere.
image_path = "/content/Bill 4.jpg"
excel_path = "/content/Datathon.xlsx"
# OCR the image, print the extracted fields and append them to the workbook.
process_invoice(image_path, excel_path)


Extracting text using Tesseract OCR...
OCR Output:
 (ORIGINAL FOR RECIPIENT )
Tax invoice
{As Per Section 31(1) of CGST Act 2017 & Rule 1 of invoice Rules }
ABC Company
Address 1, Address 2, Address 3, Contact : 22222222,9999999999
Email : a@a.com. Website : www.abe.com
CIN: CINO001
PAN No: BIFPS8820S P.O.No. Date: Mode of Transport :
GSTIN : 27AAAAA1234A125 Vendor Code : Payment Terms : Transporter :VTrans
Serial No. of Invoice: 1 Oelvery Note: Veh.No / LR No: MH02' 120308123
invoice Date :2-Apr-2017 FSC COC NO.: FCCODE Oate & Time of Supply :
Place Of Supply : PUNE
Details of Receiver (Billed to) Details of Consignee (Shipped to}
Zenith Enterprises Zenith Enterprises
Address 1 Address 1
Address 2 Address 2
Address 3 Address 3
Address 4 Address 4
GSTIN : 24AAAAA1234A129 GSTIN : 24AAAAA1234A129
S.No Description of Goods HSN GST No& Avg Oty Rate per Amount Disc. Taxable
Code Rate Desc Cont Amount
of = per
Pkgs = Pkgs
1 hem 1 6a8sss 18% 0 0 100 nos 1,000.00/nes nos 1,00.000.00 0 1,00,000