In [None]:
# OCR on SROIE Receipt Images
# Role: OCR Specialist
# Task: Extract text and total price from receipt images using OCR
# Dataset: SROIE (train/img)

In [None]:
!pip install pytesseract pillow opencv-python pandas tqdm

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import os
import re
import json
import pytesseract
import pandas as pd
from PIL import Image
from tqdm import tqdm

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset path in IMG_DIR resolves. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory holding the receipt images to OCR (SROIE train/img), located on the
# mounted Google Drive. Edit this path when running from a different account.
IMG_DIR = "/content/drive/MyDrive/Colab Notebooks/7002 AI Tech/Group Assignment/train/img"

In [None]:
def ocr_image(image_path):
    """
    Run OCR on a receipt image and return cleaned text and text lines.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        A ``(clean_text, lines)`` tuple where ``lines`` is the list of
        non-empty OCR lines with surrounding whitespace stripped, and
        ``clean_text`` is those lines joined with newline characters.
    """
    # Open the image in a context manager so the underlying file handle is
    # released deterministically, even if Tesseract raises part-way through.
    with Image.open(image_path) as img:
        raw_text = pytesseract.image_to_string(img)

    # Basic post-processing: trim each line and drop the blank ones.
    stripped_lines = (raw_line.strip() for raw_line in raw_text.split("\n"))
    lines = [text for text in stripped_lines if text != ""]

    clean_text = "\n".join(lines)
    return clean_text, lines

In [None]:
TOTAL_KEYWORDS = [
    "total", "grand total", "net total",
    "amount due", "amount", "balance", "total amount"
]

# A monetary amount: digits (optionally with comma thousands separators)
# followed by exactly two decimals. The look-arounds stop the pattern from
# matching a fragment of something longer, e.g. "10.03" out of the date
# "10.03.2018" or "12.34" out of "12.345".
_AMOUNT_RE = re.compile(
    r"(?<!\d)(?<!\d\.)"
    r"(?:\d{1,3}(?:,\d{3})+|\d+)"
    r"\.\d{2}"
    r"(?!\.?\d)"
)

# "subtotal", "sub total", "sub-total": contains the keyword "total" but is
# not the final amount payable.
_SUBTOTAL_RE = re.compile(r"sub\s*-?\s*total")


def _find_amounts(text):
    """Return every monetary amount in ``text`` with thousands commas removed."""
    return [match.replace(",", "") for match in _AMOUNT_RE.findall(text)]


def extract_total(lines):
    """
    Extract the receipt total from OCR lines.

    Strategy, in order of preference:
    1. The first line containing one of TOTAL_KEYWORDS (case-insensitive)
       that is not a subtotal line: use the last amount on that line, or the
       first amount on the following line if the keyword line has none.
    2. The first subtotal line that yielded an amount (same lookup rule).
    3. The last monetary amount found anywhere in the receipt.

    Args:
        lines: List of OCR text lines (strings).

    Returns:
        The amount as a string with two decimals and no thousands separators
        (e.g. ``"1234.50"``), or ``None`` when no amount is found.
    """
    subtotal_candidate = None

    # 1. Keyword-based extraction
    for i, line in enumerate(lines):
        line_lower = line.lower()
        if not any(k in line_lower for k in TOTAL_KEYWORDS):
            continue

        amounts = _find_amounts(line)
        if amounts:
            candidate = amounts[-1]
        elif i + 1 < len(lines):
            # OCR often puts the value on the line after its label.
            next_amounts = _find_amounts(lines[i + 1])
            candidate = next_amounts[0] if next_amounts else None
        else:
            candidate = None

        if candidate is None:
            continue

        if _SUBTOTAL_RE.search(line_lower):
            # Remember the subtotal, but keep looking for the real total.
            if subtotal_candidate is None:
                subtotal_candidate = candidate
            continue

        return candidate

    # 2. Only a subtotal was labelled on the receipt
    if subtotal_candidate is not None:
        return subtotal_candidate

    # 3. Fallback: last amount in the receipt
    all_amounts = [amount for line in lines for amount in _find_amounts(line)]
    if all_amounts:
        return all_amounts[-1]

    return None

In [None]:
# "rm" as a standalone currency marker (e.g. "RM 5.00", "RM5.00", "(RM)"),
# not as two letters inside an ordinary word such as "PHARMACY" or "TERMS".
_RM_TOKEN_RE = re.compile(r"(?<![a-z])rm(?![a-z])")


def extract_items(lines):
    """
    Return the OCR lines that look like priced line items.

    A line is kept when it contains the currency marker "RM"
    (case-insensitive, not embedded in a longer word) and does not contain
    the word "total".

    Args:
        lines: List of OCR text lines (strings).

    Returns:
        List of the matching lines, unchanged and in their original order.
    """
    items = []
    for line in lines:
        lowered = line.lower()
        if "total" in lowered:
            continue
        if _RM_TOKEN_RE.search(lowered):
            items.append(line)
    return items

In [None]:
# Run OCR over every receipt image and collect one record per image.
results = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of `results` (and of the exported CSV) reproducible across runs.
image_files = sorted(
    f for f in os.listdir(IMG_DIR)
    if f.lower().endswith((".jpg", ".png", ".jpeg"))
)

for img_name in tqdm(image_files):
    img_path = os.path.join(IMG_DIR, img_name)

    clean_text, lines = ocr_image(img_path)

    result = {
        "image_name": img_name,
        "ocr_text": clean_text,
        "total_price": extract_total(lines),
        "items": extract_items(lines)
    }

    results.append(result)

100%|██████████| 626/626 [38:25<00:00,  3.68s/it]


In [None]:
# Inspect the first record as a sanity check of the OCR and extraction output.
results[0]

{'image_name': 'X51005441408.jpg',
 'ocr_text': '32 PUB & BISTRO own by CNU TRADING\n78, JALAN SS21/62,\nDAMANSARA UTAMA,\n47400 PETALING JAYA.\n(GST Reg. No : 000416321536)\nTax Invoice\nTable 5\nINV No. 504233 Pax(s): 0\nDate +: 10-03-2018 23:03:06\nCashier: CHEN\nDescr int ion Oty U.price Total TAX\nHEINEKE N (5 BT i)\n2% 36.00 190. 00 §k\nTotal a gst):\nGST Payable:\nTotal (Inclusive of GST):\nTOTAL: ©\nCceea: 0 11-03-2018 00: 92: 02\nServer: CHEN\nCASH : 200.00\nCHANGE : 16. 00\nGST Summary Anount (RM). "Tat (RM)\nSR (@ 6%) 71.95, 10. cI\nkK\nKK Tanke You mK\n~ Dla A.',
 'total_price': '71.95',
 'items': ['GST Summary Anount (RM). "Tat (RM)']}

In [None]:
# Build one table row per image from the collected result dicts and write it
# to a CSV file in the current working directory (no index column).
# NOTE(review): the "items" column holds Python lists, so it will be written
# as their string representation — confirm downstream readers expect that.
df = pd.DataFrame(results)
df.to_csv("sroie_ocr_output_v2.csv", index=False)

In [None]:
# Coverage report: how many images produced *some* total value.
# Note this counts non-null extractions only; it does not check the value
# against ground truth, so it is an upper bound on real accuracy.
total_images = len(df)
success_count = int(df["total_price"].notna().sum())
# Guard against an empty image folder so the report prints 0.00% instead of
# dividing by zero.
success_rate = (success_count / total_images * 100) if total_images else 0.0

print(f"Total receipt images: {total_images}")
print(f"Successfully extracted totals: {success_count}")
print(f"OCR total extraction success rate: {success_rate:.2f}%")

Total receipt images: 626
Successfully extracted totals: 592
OCR total extraction success rate: 94.57%


In [None]:
# List the working directory to confirm the CSV was written.
!ls

drive  sample_data  sroie_ocr_output_v2.csv


In [None]:
# Copy the exported CSV from the Colab working directory onto the mounted
# Google Drive.
# NOTE(review): the source path assumes the notebook's working directory is
# /content (where the relative to_csv() call wrote the file) — confirm when
# running outside Colab.
import shutil
shutil.copy(
    "/content/sroie_ocr_output_v2.csv",
    "/content/drive/MyDrive/sroie_ocr_output_v2.csv"
)
print("Saved to Google Drive")

Saved to Google Drive
