In [767]:
# import libraries
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from os import listdir
from os.path import join
from natsort import natsorted # needed for sorting filenames of the receipts
from datetime import datetime
from scipy.signal import argrelmin, argrelmax
import numpy as np
import re

# Read variables from a local .env file (if one exists) into the process environment.
# load_dotenv is imported at the top of this cell but was never invoked, so a key that
# lives only in .env was invisible to os.getenv.
load_dotenv()

# Path of the Google service-account key file, taken from the environment
SA_KEY = os.getenv("GOOGLE_SA_KEY")
if not SA_KEY:
    # os.environ only accepts strings; fail with a readable message instead of the
    # TypeError that assigning None would raise.
    raise RuntimeError(
        "GOOGLE_SA_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SA_KEY

In [768]:
# Google's OCR function
def detect_text(path):
    """Run Google Cloud Vision text detection on a local image file.

    Args:
        path: Path of the image file; it is opened in binary mode and its bytes are
            sent to the Vision client.

    Returns:
        The response object returned by ``client.text_detection``. The recognised
        text is available on its ``text_annotations`` attribute.

    Raises:
        Exception: If the response carries a non-empty ``error.message``.
    """
    # Kept as a function-local import, as in the original cell.
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Surface API-side failures before handing the response to the caller.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
            )
    return response

In [769]:
# find all receipts in the specified path and sort them in ascending (natural) order
path = '../rewe_scanned/'
# Skip hidden entries (e.g. macOS' .DS_Store) by name. Dropping the first list element
# unconditionally would discard a real receipt whenever no such file is present.
files = [name for name in natsorted(listdir(path)) if not name.startswith('.')]
files

['Rewe_1.jpg',
 'Rewe_2.jpg',
 'Rewe_3.jpg',
 'Rewe_4.jpg',
 'Rewe_5.jpg',
 'Rewe_6.jpg',
 'Rewe_7.jpg',
 'Rewe_8.jpg',
 'Rewe_9.jpg',
 'Rewe_10.jpg',
 'Rewe_11.jpg',
 'Rewe_12.jpg',
 'Rewe_13.jpg',
 'Rewe_14.jpg',
 'Rewe_15.jpg',
 'Rewe_16.jpg',
 'Rewe_T5.jpg',
 'Rewe_T7.jpg',
 'Rewe_T9.jpg',
 'Rewe_T12.jpg',
 'Rewe_T16.jpg',
 'Rewe_T17.jpg',
 'Rewe_T18.jpg']

In [1216]:
    filename = 'Rewe_T12.jpg'

    # Apply the OCR function to a single receipt
    response = detect_text(join(path, filename))

    # The text_annotations contain the recognized text and the corresponding bounding boxes:
    # the first entry contains the whole text from the receipt and the consecutive entries
    # contain the text/coordinates from the individual bounding boxes
    texts = response.text_annotations

    # Column layout, where bl: bottom_left, br: bottom_right, tr: top_right, tl: top_left
    # denote the corners of the bounding boxes (BBs).
    # NOTE(review): the corner names assume vertices arrive in the order bl, br, tr, tl --
    # confirm against the Vision API; only the mean of the four y values is used below.
    columns = ["String", "x_bl", "y_bl", "x_br", "y_br", "x_tr", "y_tr", "x_tl", "y_tl"]

    # Collect one row per bounding box and build the frame in a single call instead of
    # growing it cell by cell with .loc/.iloc.
    rows = []
    for text in texts[1:]:
        vertices = text.bounding_poly.vertices
        row = [text.description]
        for j in range(4):
            row.extend((vertices[j].x, vertices[j].y))
        rows.append(row)
    df = pd.DataFrame(rows, columns=columns)

    # convert the y coords to integers for calculation of the mean BB positions
    y_columns = ['y_bl', 'y_br', 'y_tr', 'y_tl']
    df[y_columns] = df[y_columns].astype('int')
    # calculate mean BB positions
    df['mean_y'] = df.eval('(y_bl+y_br+y_tr+y_tl)/4')

    # sort DF by mean height to match text that appears in the same line
    df = df.sort_values(by=['mean_y']).reset_index(drop=True)

    # select only the block of the receipt where the products are listed:
    # it starts right after the first 'EUR' token ...
    eur_rows = df.index[df.String == 'EUR']
    if len(eur_rows) == 0:
        raise ValueError(f"No 'EUR' token found in {filename}; cannot locate the product list.")
    product_list_start_ind = int(eur_rows[0]) + 1

    # ... and ends at the first 'SUMME' token, falling back to 'SUM' when 'SUMME' is absent.
    # Checking the matches explicitly replaces a bare except around int() on the whole
    # index array, which hid unrelated errors and also failed on multiple matches.
    end_rows = df.index[df.String == 'SUMME']
    if len(end_rows) == 0:
        end_rows = df.index[df.String == 'SUM']
    if len(end_rows) == 0:
        raise ValueError(f"Neither 'SUMME' nor 'SUM' found in {filename}; cannot locate the end of the product list.")
    product_list_end_ind = int(end_rows[0])

    # Work on an independent copy: later cells add columns to df_products.
    df_products = df[product_list_start_ind:product_list_end_ind].copy()

In [1217]:
# renumber the product rows from 0 (the old index is discarded)
df_products = df_products.reset_index(drop=True)

In [1218]:
# running row number 1..n (as floats), one value per row of df_products
x = np.linspace(start=1, stop=len(df_products), num=len(df_products))

In [1219]:
# average increase of mean_y per row over the whole product block
mean_heights = df_products.mean_y
slope = (max(mean_heights) - min(mean_heights)) / len(df_products)

In [1220]:
# remove the linear trend (slope * row number) and the offset (smallest mean_y)
y_flat = df_products.mean_y.sub(slope * x).sub(min(df_products.mean_y))

In [1221]:
# positions of the relative maxima of the detrended heights; row 0 is put in front
# and the last maximum is dropped
# NOTE(review): presumably each maximum marks the first token of a receipt line and
# each minimum the last one -- confirm against a plot of y_flat
(max_ind,) = argrelmax(y_flat.to_numpy())
max_ind = np.append(0, max_ind[:-1])

# positions of the relative minima
(min_ind,) = argrelmin(y_flat.to_numpy())

In [1222]:
# label the rows of the dataframe with the corresponding lines on the receipt
if len(min_ind) < len(max_ind):
    # every start index in max_ind needs a matching end index in min_ind
    raise ValueError(
        f"Cannot label receipt lines: {len(max_ind)} line starts but only "
        f"{len(min_ind)} line ends were detected."
    )

# Build the labels in a plain array and attach them in one step. Writing through
# df_products['line'].iloc[...] is chained assignment: it raises
# SettingWithCopyWarning and leaves the frame unchanged under pandas Copy-on-Write.
# Rows not covered by any [start, end] range keep the empty-string placeholder.
line_labels = np.full(df_products.shape[0], '', dtype=object)
for i in range(len(max_ind)):
    line_labels[max_ind[i]:min_ind[i] + 1] = i
line_labels[min_ind[i] + 1:] = i + 1  # make sure the last line gets labeled as well
df_products = df_products.assign(line=line_labels)


In [1223]:
# order the tokens by receipt line first and by their x_bl coordinate second
df_products = df_products.sort_values(['line', 'x_bl'], ignore_index=True)

In [1224]:
# glue the tokens of every receipt line together, separated by single spaces
df_sorted = (
    df_products
    .groupby('line')['String']
    .apply(' '.join)
    .reset_index()
)

In [1225]:
# keep only the lines that carry a ' B' or ' A' marker, drop everything else
# NOTE(review): str.contains treats the pattern as a regex by default, so in ' A *'
# the star quantifies the preceding space and the pattern matches any ' A' -- confirm
# whether a literal ' A *' at the end of the line was intended
has_b_marker = df_sorted['String'].str.contains(' B', case=True)
has_a_marker = df_sorted['String'].str.contains(' A *', case=True)
df_sorted = df_sorted[has_b_marker | has_a_marker].reset_index(drop=True)

In [1226]:
# strip the trailing tax remarks (' B', ' A *', ' A'), applied one after another
for tax_suffix in (r' B$', r' A \*$', r' A$'):
    df_sorted['String'] = df_sorted['String'].str.replace(tax_suffix, '', regex=True)


In [1227]:
def extract_price(input_str):
    """Return the part of ``input_str`` that follows the last "letter/percent + space".

    The string is searched from its end for a letter (A-Z, a-z) or a percent sign that
    is directly followed by a space; everything after that space is returned.

    Args:
        input_str: Text of one receipt line.

    Returns:
        The trailing substring after the last such separator. If the separator sits at
        the very end of the string the result is ``''``; if the string contains no
        separator at all it is returned unchanged.
    """
    # Search the reversed string so the first hit is the last separator of the original.
    match = re.search(r' [A-Za-z%]', input_str[::-1])
    if match is None:
        # Previously this path hit an unbound local variable.
        return input_str

    # match.start() is the number of characters behind the separator. Slicing from an
    # explicit index avoids input_str[-0:], which would return the whole string.
    return input_str[len(input_str) - match.start():]

In [1228]:
# raw price text: whatever extract_price returns for each line
df_sorted['price'] = df_sorted['String'].apply(extract_price)
# dirty-fix section: remove hard-coded stray prefixes from the price, in this order
for stray_prefix in ("14 ", "2.0 ", "W. ", "102 "):
    anchored_prefix = f'^{re.escape(stray_prefix)}'
    df_sorted['price'] = df_sorted['price'].apply(lambda x: re.sub(anchored_prefix, '', x))

In [1229]:
# remove the price from the string
def replace_str(row):
    """Return ``row['String']`` with the price text in ``row['price']`` removed.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with the string fields
            ``'String'`` and ``'price'``.

    Returns:
        The string without its trailing price. When the price is not a suffix of the
        string, every occurrence of the price text is removed (the former behaviour).
        An empty price leaves the string unchanged.
    """
    text, price = row['String'], row['price']
    if not price:
        return text
    if text.endswith(price):
        # Cut only the trailing price so identical digits inside the product name
        # (e.g. a quantity) are kept.
        return text[:-len(price)]
    return text.replace(price, '')

# run replace_str on every row and store the result back in the String column
df_sorted = df_sorted.assign(String=df_sorted.apply(replace_str, axis=1))

In [1230]:
# clean up the price text and convert it to a number:
# strip leading '.'/'B' characters, turn commas into dots, remove spaces, cast to float
df_sorted['price'] = (
    df_sorted['price']
    .str.lstrip('.B')
    .str.replace(',', '.')
    .str.replace(' ', '')
    .astype('float')
)

In [1231]:
# the line label is no longer needed; the text column becomes product_abbr
df_sorted = (
    df_sorted
    .drop(columns='line')
    .rename(columns={'String': 'product_abbr'})
)

In [1232]:
# tag every row with the source file name and today's date (DD-MM-YYYY)
df_sorted = df_sorted.assign(
    receipt_id=filename,
    processing_date=datetime.now().strftime('%d-%m-%Y'),
)

In [1233]:
# display the parsed receipt (product_abbr, price, receipt_id, processing_date)
df_sorted

Unnamed: 0,product_abbr,price,receipt_id,processing_date
0,CASHEWKERNE,2.19,Rewe_T12.jpg,22-02-2024
