# Installing Dependensies

In [7]:
!pip install pytesseract



In [8]:
!pip install spacy



In [9]:
!python -m spacy download en_core_web_sm

2023-10-12 18:31:01.912302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


# OCR STRIPPING (SINCE ITS NOT HANDWRITTEN, THE COLOR CONVERSIONS ARE NOT PERFORMED

In [11]:
import pytesseract
from PIL import Image

# Specify the path to the Tesseract executable (not needed in Google Colab)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load the image
image_path = 'sample.png'
image = Image.open(image_path)

# Perform OCR
ocr_text = pytesseract.image_to_string(image)



## Let us see the result of the extraction.


In [12]:
print(ocr_text)

>

Salford & Co.

123 Anywhere St., Any City, ST 12345
Tel: +123-456-7890

INVOICE

Invoice No: 00000001 Date: 12 October, 2025

Bill to: Liceria & Co.

123 Anywhere St.,
Any City, ST 12345

     

       

Item Description Price Amount
1. Logo Design $200 $200
2. Advertising Design $500 $500
3. Poster Design $500 $500
4. Brochure Design $200 $200
5 Content Writer $500 $500

 

Total: $1900

Bank Name: Olivia Wilson
Bank Account: 0123 4567 8901

 

If you have any question please contact : hello@reallygreatsite.com



## Let us now use NER to extract the needed data and put them in a csv file.

In [13]:
import re
import csv


# Extract invoice number and date
invoice_number_match = re.search(r"Invoice No:\s*([0-9]+)", ocr_text)
invoice_date_match = re.search(r"Date:\s*([\d/]+)", ocr_text)

if invoice_number_match:
    invoice_number = invoice_number_match.group(1)
else:
    invoice_number = ""

if invoice_date_match:
    invoice_date = invoice_date_match.group(1)
else:
    invoice_date = ""

# Extract items and their descriptions, prices, and amounts
items = re.findall(r"(\d+)\.\s*(\w+\s\w+)\s*\$(\d+)\s*\$(\d+)", ocr_text)

# Calculate the total amount
total_match = re.search(r"Total:\s*\$(\d+)", ocr_text)

if total_match:
    total_amount = total_match.group(1)
else:
    total_amount = ""

# Specify CSV file path
csv_file_path = "invoice_summary.csv"

# Write extracted data to a CSV file
with open(csv_file_path, mode='w', newline='') as csv_file:
    fieldnames = ["Invoice Number", "Date", "Item Number", "Item Description", "Price", "Amount"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()  # Write CSV header

    for item in items:
        item_number, item_description, price, amount = item
        writer.writerow({
            "Invoice Number": invoice_number,
            "Date": invoice_date,
            "Item Number": item_number,
            "Item Description": item_description,
            "Price": price,
            "Amount": amount,
        })

print("CSV file created successfully.")


CSV file created successfully.


## Let us display the csv

In [14]:
import pandas as pd

In [15]:
csv=pd.read_csv("invoice_summary.csv")
csv

Unnamed: 0,Invoice Number,Date,Item Number,Item Description,Price,Amount
0,1,12,1,Logo Design,200,200
1,1,12,2,Advertising Design,500,500
2,1,12,3,Poster Design,500,500
3,1,12,4,Brochure Design,200,200


## Since it doesnt include the total amount, let us modify the code futher.

In [16]:
import re
import csv

# Initialize variables to track invoice details
current_invoice_number = ""
current_invoice_date = ""
current_invoice_total = 0

# Extracted data
extracted_data = []

# Extract items and their descriptions, prices, and amounts
items = re.findall(r"(\d+)\.\s*(\w+\s\w+)\s*\$(\d+)\s*\$(\d+)", ocr_text)

# Iterate through items and accumulate total for each invoice
for item in items:
    item_number, item_description, price, amount = item

    # Extract invoice number and date if present
    invoice_number_match = re.search(r"Invoice No:\s*([0-9]+)", ocr_text)
    invoice_date_match = re.search(r"Date:\s*([\d/]+)", ocr_text)

    if invoice_number_match:
        current_invoice_number = invoice_number_match.group(1)
    else:
        current_invoice_number = ""

    if invoice_date_match:
        current_invoice_date = invoice_date_match.group(1)
    else:
        current_invoice_date = ""

    # Accumulate the total amount
    current_invoice_total += int(amount)

    # Append data to the extracted_data list
    extracted_data.append({
        "Invoice Number": current_invoice_number,
        "Date": current_invoice_date,
        "Item Number": item_number,
        "Item Description": item_description,
        "Price": price,
        "Amount": amount,
        "Total Amount": current_invoice_total,
    })

# Specify CSV file path
csv_file_path = "invoice_summary_withTotal.csv"

# Write extracted data to a CSV file
with open(csv_file_path, mode='w', newline='') as csv_file:
    fieldnames = ["Invoice Number", "Date", "Item Number", "Item Description", "Price", "Amount", "Total Amount"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()  # Write CSV header

    for entry in extracted_data:
        writer.writerow(entry)

print("CSV file created successfully.")


CSV file created successfully.


# Now let us make the reader to read multiple invoices and append all invoices onto the same csv file

In [22]:
import pytesseract
from PIL import Image
import re
import csv
import os

# Initialize the CSV file path
csv_file_path = "invoice_summary_multiple.csv"

# Check if the CSV file already exists; if not, create it with headers
if not os.path.isfile(csv_file_path):
    with open(csv_file_path, mode='w', newline='') as csv_file:
        fieldnames = ["Invoice Number", "Date", "Item Number", "Item Description", "Price", "Amount", "Total Amount"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

# Specify the directory where invoice images are located
image_directory = "invoice_images"

# Check if the specified directory exists
if not os.path.exists(image_directory):
    print(f"The specified directory '{image_directory}' does not exist.")
else:
    # List all files in the directory
    invoice_image_files = [f for f in os.listdir(image_directory) if os.path.isfile(os.path.join(image_directory, f))]

    # Iterate through the invoice image files
    for image_file in invoice_image_files:
        # Get the full path to the image file
        image_path = os.path.join(image_directory, image_file)

        # Load the image
        image = Image.open(image_path)

        # Perform OCR
        ocr_text = pytesseract.image_to_string(image)

        # Initialize variables to track invoice details
        current_invoice_number = ""
        current_invoice_date = ""
        current_invoice_total = 0

        # Extract items and their descriptions, prices, and amounts
        items = re.findall(r"(\d+)\.\s*(\w+\s\w+)\s*\$(\d+)\s*\$(\d+)", ocr_text)

        # Extract invoice number and date if present
        invoice_number_match = re.search(r"Invoice No:\s*([0-9]+)", ocr_text)
        invoice_date_match = re.search(r"Date:\s*([\d/]+)", ocr_text)

        if invoice_number_match:
            current_invoice_number = invoice_number_match.group(1)
        else:
            current_invoice_number = ""

        if invoice_date_match:
            current_invoice_date = invoice_date_match.group(1)
        else:
            current_invoice_date = ""

        # Initialize total amount for the current invoice
        current_invoice_total = 0

        # Extracted data
        extracted_data = []

        # Iterate through items and accumulate total for the current invoice
        for item in items:
            item_number, item_description, price, amount = item
            current_invoice_total += int(amount)

            # Append data to the extracted_data list
            extracted_data.append({
                "Invoice Number": current_invoice_number,
                "Date": current_invoice_date,
                "Item Number": item_number,
                "Item Description": item_description,
                "Price": price,
                "Amount": amount,
                "Total Amount": current_invoice_total,
            })

        # Append the extracted data to the CSV file
        with open(csv_file_path, mode='a', newline='') as csv_file:
            fieldnames = ["Invoice Number", "Date", "Item Number", "Item Description", "Price", "Amount", "Total Amount"]
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

            for entry in extracted_data:
                writer.writerow(entry)

    print("Data from multiple invoices appended to the CSV file.")


Data from multiple invoices appended to the CSV file.
