In [13]:
import cv2
import pytesseract
import re
import pandas as pd
from PIL import Image
import numpy as np

In [21]:
def preprocess_image(image_path):
    """Preprocess the image for better OCR results"""

    img = cv2.imread("reccipt.png")


    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


    gray = cv2.medianBlur(gray, 3)


    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)


    kernel = np.ones((1, 1), np.uint8)
    thresh = cv2.dilate(thresh, kernel, iterations=1)

    return thresh

In [15]:
def extract_receipt_data(text):
    """Extract structured data from receipt text"""

    print("=== Raw OCR Text ===")
    print(text)
    print("=" * 50)


    result = {
        'store_name': '',
        'date': '',
        'receipt_number': '',
        'items': [],
        'subtotal': '',
        'total': '',
        'cash': '',
        'change': '',
        'cashier': ''
    }

    lines = text.split('\n')

    result['store_name'] = lines[0] if len(lines) > 0 else ""

    for line in lines:

        date_match = re.search(r'(\d{2}/\d{2}/\d{4})', line)
        if date_match:
            result['date'] = date_match.group(1)

        rcpt_match = re.search(r'Rcpt#:\s*([A-Za-z0-9]+)', line)
        if rcpt_match:
            result['receipt_number'] = rcpt_match.group(1)

        cashier_match = re.search(r'OP:\s*(\w+)', line)
        if cashier_match:
            result['cashier'] = cashier_match.group(1)

    item_lines = []
    in_items_section = False

    for line in lines:
        line = line.strip()

        if any(keyword in line for keyword in ['1', '2', '3', '4', '5']) and not in_items_section:
            in_items_section = True

        if 'SUBTOTAL' in line.upper() and in_items_section:
            in_items_section = False

        if in_items_section and line:

            if re.match(r'^\d+$', line.strip()):
                continue
            item_lines.append(line)

    for line in item_lines:

        price_match = re.search(r'(.+?)\s+(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})?)', line)

        if price_match:
            item_name = price_match.group(1).strip()
            price = price_match.group(2).replace(',', '')

            if len(item_name) > 2 and not item_name.isdigit():
                result['items'].append({
                    'name': item_name,
                    'price': price
                })


    for line in lines:
        line_upper = line.upper()


        if 'SUBTOTAL' in line_upper:
            amount_match = re.search(r'SUBTOTAL\s+(\d{1,3}(?:[.,]\d{3})*)', line)
            if amount_match:
                result['subtotal'] = amount_match.group(1).replace(',', '')


        elif 'TOTAL' in line_upper and not 'SUBTOTAL' in line_upper:
            amount_match = re.search(r'TOTAL\s+(\d{1,3}(?:[.,]\d{3})*)', line)
            if amount_match:
                result['total'] = amount_match.group(1).replace(',', '')

        elif 'CASH' in line_upper:
            amount_match = re.search(r'CASH\s+(\d{1,3}(?:[.,]\d{3})*)', line)
            if amount_match:
                result['cash'] = amount_match.group(1).replace(',', '')

        elif 'CHANGE' in line_upper:
            amount_match = re.search(r'CHANGE\s+(\d{1,3}(?:[.,]\d{3})*)', line)
            if amount_match:
                result['change'] = amount_match.group(1).replace(',', '')

    return result

In [22]:
def main():

    image_path = "image.png"

    try:
        processed_img = preprocess_image(image_path)

        custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/., :#@&'


        text = pytesseract.image_to_string(processed_img, config=custom_config)


        receipt_data = extract_receipt_data(text)

        print("=== EXTRACTED RECEIPT DATA ===")
        print(f"Store: {receipt_data['store_name']}")
        print(f"Date: {receipt_data['date']}")
        print(f"Receipt #: {receipt_data['receipt_number']}")
        print(f"Cashier: {receipt_data['cashier']}")
        print("\n--- ITEMS ---")
        for item in receipt_data['items']:
            print(f"{item['name']}: {item['price']}")
        print(f"\nSubtotal: {receipt_data['subtotal']}")
        print(f"Total: {receipt_data['total']}")
        print(f"Cash: {receipt_data['cash']}")
        print(f"Change: {receipt_data['change']}")


        if receipt_data['items']:
            df = pd.DataFrame(receipt_data['items'])
            df['Date'] = receipt_data['date']
            df['Receipt_Number'] = receipt_data['receipt_number']
            df['Store'] = receipt_data['store_name']
            df['Total_Amount'] = receipt_data['total']
            df['Cashier'] = receipt_data['cashier']

            output_csv = "receipt_analysis.csv"
            df.to_csv(output_csv, index=False, encoding='utf-8')
            print(f"\n=== Data saved to {output_csv} ===")

            print("\n=== DATAFRAME ===")
            print(df)
        else:
            print("No items found to save to CSV.")

    except Exception as e:
        print(f"Error processing image: {e}")

In [17]:
if __name__ == "__main__":
    main()

=== Raw OCR Text ===
MOMIToYs
Ceram
CtppoMatlK
mgnhateaveyadeyafae
eindmamy Tiwitter
Check3850
Paes OFGrsnan Gunawan
POSTitieCasnier
Rewse7As500000125326/01/201516.13
wosar 0
2harCheese 74.090
1IceJavaTea 16.
1Wineralkater 13,900
1BlackMhite 72.000
SUBTOTAL 175,000
TOTAL 175.00G
CASH 200,000
ngPNescecsatzezsceazenaeee
. ClosedB111
nerenenn726/01/201518149
Thankyou fortasting,
ourNeltinglyCrepe

=== EXTRACTED RECEIPT DATA ===
Store: MOMIToYs
Date: 26/01/2015
Receipt #: 
Cashier: 

--- ITEMS ---
wosar: 0
2harCheese: 74.090
1IceJavaTea: 16
1Wineralkater: 13900
1BlackMhite: 72.000
TOTAL: 175.00
CASH: 200000

Subtotal: 175000
Total: 175
Cash: 200000
Change: 

=== Data saved to receipt_analysis.csv ===

=== DATAFRAME ===
            name   price        Date Receipt_Number     Store Total_Amount  \
0          wosar       0  26/01/2015                 MOMIToYs          175   
1     2harCheese  74.090  26/01/2015                 MOMIToYs          175   
2    1IceJavaTea      16  26/01/2015    

In [24]:

def simple_receipt_reader(image_path):
    """Simple version for quick testing"""
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    text = pytesseract.image_to_string(thresh)
    print("Extracted Text:")
    print(text)

    return text
