In [24]:
from PIL import Image
import cv2
import pytesseract
import re
import pandas as pd
import csv

In [2]:
from matplotlib import pyplot as plt
#https://stackoverflow.com/questions/28816046/
#displaying-different-images-with-actual-size-in-matplotlib-subplot

def display(im_path):
    """Read the image at ``im_path`` and show it in a borderless figure.

    The figure size (in inches) is derived from the image's pixel dimensions
    assuming 80 dots per inch, and a single axes fills the whole figure.

    NOTE(review): the dpi value is only used to compute ``figsize``; it is not
    passed to ``plt.figure``, so the on-screen size presumably depends on the
    active matplotlib figure dpi -- confirm this is intended.

    Parameters
    ----------
    im_path : path of the image file, handed to ``plt.imread``.
    """
    dots_per_inch = 80
    pixels = plt.imread(im_path)

    # The first two axes of the loaded array are (rows, columns).
    n_rows, n_cols = pixels.shape[:2]

    # Figure size in inches needed to fit the image at the assumed dpi.
    fig_width = n_cols / float(dots_per_inch)
    fig_height = n_rows / float(dots_per_inch)
    canvas = plt.figure(figsize=(fig_width, fig_height))

    # A single axes spanning the full figure, with spines and ticks hidden.
    axes = canvas.add_axes([0, 0, 1, 1])
    axes.axis('off')

    # Draw the image; the gray colormap is requested for display.
    axes.imshow(pixels, cmap='gray')

    plt.show()


def grayscale(image):
    """Return a grayscale version of ``image``.

    The conversion uses OpenCV's ``COLOR_BGR2GRAY`` code, i.e. the input is
    interpreted as having BGR channel order.
    """
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray_image


def noise_removal(image):
    """Run a dilate -> erode -> close -> median-blur pipeline over ``image``.

    All three morphological steps use the same 1x1 ``uint8`` kernel and the
    final median blur uses an aperture of 3. The processed image is returned.

    NOTE(review): with a 1x1 kernel the morphological steps presumably leave
    the image unchanged -- confirm whether a larger kernel was intended.
    """
    import numpy as np

    # One 1x1 structuring element shared by every morphological step.
    unit_kernel = np.ones((1, 1), np.uint8)

    dilated = cv2.dilate(image, unit_kernel, iterations=1)
    eroded = cv2.erode(dilated, unit_kernel, iterations=1)

    # Main things that get rid of noise in the background
    closed = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, unit_kernel)
    blurred = cv2.medianBlur(closed, 3)

    return blurred

# OCR Text Processing

## Extract Date
(Each receipt is expected to contain only one date, so only the first match, index `[0]` of the regex matches, is used.)

## Extract Serial Number, Item Name, Price

In [70]:
receipt_file_path = "data/costco.jpg"
receipt = cv2.imread(receipt_file_path)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside cvtColor.
if receipt is None:
    raise FileNotFoundError(f"Could not read receipt image: {receipt_file_path}")

# Binarise the grayscale receipt with a fixed threshold of 127 (max value 255);
# cv2.threshold returns the threshold value used together with the image.
thresh, receipt_bw = cv2.threshold(grayscale(receipt), 127, 255, cv2.THRESH_BINARY)

# Run Tesseract OCR on the black-and-white image to get the receipt text.
ocr_result = pytesseract.image_to_string(receipt_bw)

# Drop the "NF " / "NE " tokens from the OCR text before item matching.
# NOTE(review): presumably these are flags printed before the price on some
# lines -- confirm against a sample receipt.
new_out = ocr_result.replace("NF ", "").replace("NE ", "")

# Regex capturing (serial number, item name, price) from the OCR text:
#   serial - four or more digits
#   name   - any run of characters containing at least one ASCII letter
#   price  - optional "$", digits, a dot and exactly two decimals
# NOTE(review): the serial group is mandatory, so lines without a serial
# number are not matched by this pattern.
info_regex = r"(?P<serial>\d{4,})\s?(?P<name>.*[a-zA-Z].*)\s(?P<price>\$?\d+\.\d{2})"

# The pattern has three groups, so re.findall always yields 3-tuples of
# (serial, name, price); no other tuple length can occur.
matches = re.findall(info_regex, new_out, re.MULTILINE)

# Item names with everything except alphanumerics and spaces stripped out.
item_names = [
    "".join(c for c in name if c.isalnum() or c == " ")
    for _serial, name, _price in matches
]

# Prices converted from text such as "$12.99" to floats.
prices = [float(price.replace("$", "")) for _serial, _name, price in matches]


# Extract the first MM/DD/YYYY date from the raw OCR text.
# The pattern is a raw string so that "\/" reaches the regex engine unchanged
# instead of being an invalid string escape sequence.
date_regex = r"(0[1-9]|1[0-2])\/(0[1-9]|[12][0-9]|3[01])\/([0-9]{4})"

# re.search stops at the first match, which is the one the receipt date uses.
date_match = re.search(date_regex, ocr_result)

# Fall back to empty strings (and an empty date) when no date was recognised.
month, day, year = date_match.groups() if date_match else ("", "", "")
date = f"{month}/{day}/{year}" if date_match else ""

In [71]:
# Columns of the output table. `date` is a single string while the other two
# are per-item lists, so pandas repeats the date on every item row.
data = dict(date=date, item_name=item_names, price=prices)

df = pd.DataFrame(data=data)

In [72]:
# Append this receipt's rows to the CSV, without the index column and without
# a header row; re-running this cell appends the same rows again.
output_csv_path = 'receipt_data.csv'
df.to_csv(output_csv_path, header=False, index=False, mode='a')