In [None]:
# Invoice Data Extraction and Organising Using OCR and Regex

In [None]:
!pip install pytesseract -q
!pip install pillow -q
!pip install paddleocr -q
!pip install paddlepaddle -q
!sudo apt install tesseract-ocr -q

Reading package lists...
Building dependency tree...
Reading state information...
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# "/content/drive/MyDrive/Task3/invoices/printable_invoice.pdf"
# "/content/drive/MyDrive/Task3/invoices/printable_invoice_pages-to-jpg-0001.jpg"

In [None]:
# Import Libraries
import pytesseract
from PIL import Image
import re
import json
from paddleocr import PaddleOCR

In [None]:
# Tesseract-based OCR helper
def extract_text_tesseract(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Args:
        image: Input handed unchanged to ``pytesseract.image_to_string``
            (this notebook passes a PIL image).

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for ``image``.
    """
    recognised_text = pytesseract.image_to_string(image)
    return recognised_text

In [None]:
# Function to extract text using PaddleOCR
def extract_text_paddleocr(image_path):
    """Run PaddleOCR on the image at ``image_path`` and return its text.

    A new English ``PaddleOCR`` engine (angle classifier enabled) is created
    on every call. The text of each detected line (``line[1][0]``) from the
    first result entry is joined with single newlines.

    Args:
        image_path: Image location passed straight to ``PaddleOCR.ocr``.

    Returns:
        str: Newline-joined recognised text, or ``""`` when the engine
        reports no detected text.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    result = ocr.ocr(image_path)
    # The first entry can be None/empty when nothing is detected (and the
    # outer list may itself be empty); iterating it unguarded would raise.
    if not result or not result[0]:
        return ""
    return "\n".join(line[1][0] for line in result[0])

In [None]:
# Function to extract data using regex
def extract_data_using_regex(text, patterns):
    """Extract one value per field from ``text`` using regular expressions.

    Args:
        text: OCR text to search. ``None`` is treated as empty text.
        patterns: Mapping of field name -> regex. The value is taken from the
            first capture group; a pattern without any capture group yields
            the whole match instead.

    Returns:
        dict: Field name -> stripped matched string, or ``None`` when the
        pattern does not match or its first group did not participate.
    """
    haystack = text or ""
    extracted_data = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, haystack)
        if match is None:
            extracted_data[field] = None
            continue
        # group(1) raises IndexError on a group-less pattern, so fall back to
        # the full match in that case.
        value = match.group(1) if match.re.groups else match.group(0)
        # An optional group that did not participate is None; don't .strip() it.
        extracted_data[field] = value.strip() if value is not None else None
    return extracted_data

In [None]:
# Function to save the extracted data to JSON and display it on the console
def save_to_json(data, file_path):
    """Write ``data`` to ``file_path`` as indented JSON and print it.

    Args:
        data: JSON-serialisable object.
        file_path: Destination file; overwritten if it already exists.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable. The destination file
            is left untouched in that case.
    """
    # Serialise once, before opening the file: a serialisation failure must
    # not truncate or half-write an existing output file.
    serialized = json.dumps(data, indent=4)
    with open(file_path, "w", encoding="utf-8") as json_file:
        json_file.write(serialized)
    print("Data saved to json file")
    print(serialized)

In [None]:
# Invoice image to process (a Google Drive path)
image_path = "/content/drive/MyDrive/Task3/invoices/invoice_pages-to-jpg-0001.jpg"

# Open the invoice with PIL; this image object is what the Tesseract step consumes
image = Image.open(image_path)

# Where the JSON comparison of both OCR engines is written
output_file_path = "/content/drive/MyDrive/Colab Notebooks/task8.json"

In [None]:
# Run both OCR engines on the same invoice:
# Tesseract receives the loaded image, PaddleOCR receives the file path.
extracted_text_tesseract, extracted_text_paddle = (
    extract_text_tesseract(image),
    extract_text_paddleocr(image_path),
)

[2025/01/21 09:58:21] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

In [None]:
# Define regex patterns for fields
# Each pattern exposes the wanted value as capture group 1.

# Labels that start another field; an address block ends where one of them begins a line.
_FIELD_LABELS = (
    r"(?:Billing Address|Shipping Address|Invoice Number|Order Number|"
    r"Order Date|Invoice Date|PAN No|GST Registration No|Total Amount)"
)

# An address ends at a blank line, at the next labelled field, or at the end of the text.
# Requiring a blank line alone can never match text whose lines are joined with
# single newlines (as the PaddleOCR helper in this notebook produces).
_ADDRESS_END = r"(?=\n\n|\n" + _FIELD_LABELS + r"\s*:|\Z)"

patterns = {
    "invoice_number": r"Invoice Number\s*:\s*(\S+)",
    "billing_address": r"Billing Address\s*:\s*([\s\S]*?)" + _ADDRESS_END,
    "shipping_address": r"Shipping Address\s*:\s*([\s\S]*?)" + _ADDRESS_END,
    "order_number": r"Order Number\s*:\s*(\S+)",
    "order_date": r"Order Date\s*:\s*(\S+)",
    "invoice_date": r"Invoice Date\s*:\s*(\S+)",
    "pan_no": r"PAN No\s*:\s*(\S+)",
    "gst_no": r"GST Registration No\s*:\s*(\S+)",
    # The currency marker is optional: OCR may drop the rupee sign or emit a
    # different symbol for it. NOTE(review): confirm against real OCR output.
    "total_amount": r"Total Amount\s*:\s*(?:₹|Rs\.?|INR|[^\w\s])?\s*([\d,]+(?:\.\d{2})?)",
}

In [None]:
# Apply the same field patterns to the raw text of each OCR engine
# (Tesseract first, then PaddleOCR).
extracted_data_tesseract, extracted_data_paddle = (
    extract_data_using_regex(ocr_text, patterns)
    for ocr_text in (extracted_text_tesseract, extracted_text_paddle)
)


In [None]:
# Gather the per-engine field dictionaries under one mapping, keyed by engine name
comparison_data = {}
comparison_data["Tesseract OCR"] = extracted_data_tesseract
comparison_data["PaddleOCR"] = extracted_data_paddle

# Persist the comparison as JSON and echo it to the console
save_to_json(data=comparison_data, file_path=output_file_path)


Data saved to json file
{
    "Tesseract OCR": {
        "invoice_number": "AMD2-911128",
        "billing_address": "CLICKTECH RETAIL PRIVATE LIMITED Viraj Tank\n\u201d Plot no. 120 X and part portion of plot no. 119 E/6, Sastrinagar\nW2, Gallops Industrial Park 1, Village Rajoda, RAJKOT, GUJARAT, 360004\nTaluka Bavla, District Anmedabad IN\nAhmedabad, GUJARAT, 382220 State/UT Code: 24\nIN",
        "shipping_address": "PAN No: AAJCC9783E Viraj Tank\nGST Registration No: 24AAJCC9783E1ZD Viraj Tank",
        "order_number": "404-0944623-8329157",
        "order_date": "26.09.2024",
        "invoice_date": "26.09.2024",
        "pan_no": "AAJCC9783E",
        "gst_no": "24AAJCC9783E1ZD",
        "total_amount": null
    },
    "PaddleOCR": {
        "invoice_number": "AMD2-911128",
        "billing_address": null,
        "shipping_address": null,
        "order_number": "404-0944623-8329157",
        "order_date": "26.09.2024",
        "invoice_date": "26.09.2024",
        "pan_no": "A