### Import libraries

In [188]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np

SA_KEY=os.getenv("GOOGLE_SA_KEY")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SA_KEY

In [6]:
# Googles OCR function
def detect_text(path):
    """Detects text in the file."""
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations
    print("Texts:")

    for text in texts:
        print(f'\n"{text.description}"')

        vertices = [
            f"({vertex.x},{vertex.y})" for vertex in text.bounding_poly.vertices
        ]

        print("bounds: {}".format(",".join(vertices)))

    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )
    return response

In [417]:
# Apply function to an receipt
response = detect_text("rewe_scanned/Rewe2.jpg")

Texts:

"REWE
REWE Markt
Sabine Klitsch oHG
Barbaraweg 7
BANANE
1,034 kg x 1,99 EUR/kg
ROTE BETE FRISCH
06773 Gräfenhainichen
Tel. 034953-859060
UID Nr.: DE285931933
0,406 kg x 1,19 EUR/kg
ZUCCHINI GRUEN
0, 776 kg x 2,29 EUR/kg
CHAMP. WEISS
Rabatt 30%
KRUSTENBROT
WELTM. -MEHRKORN
1,49
TORTILLA WEIZEN
2 Stk x
HONEYPOMELO PINK
ZWIEBEL LAUCH
BROCCOLI NEUTRAL
2 Stk x 1,39
EISBERGSALAT
RUCOLA
PORREE
CHERRYROMATOMATE
GURKE MINI
PAPRIKA ROT
BIO EIER OKT
PIZZA MARGHERITA
BUD & TERENCE
AUFSTRICH PAP. ZU
BROTAUFSTRICH ME
NUTELLA
STERNB. EXPORT
PFAND 3, 10 EUR
AUSLESE MILD
SCHWAMMTUCH
2 Stk x
SUMME
Geg. BAR
Rückgeld BAR
Steuer %
A= 19,0%
B= 7,0%
Gesamtbetrag
TSE-Signatur:
0,95
***X
Netto
10.50
43,62
54,12
TSE-Signaturzähler: 1981533
945156
TSE-Transaktion:
EUR
EUR
EUR
Steuer
1,99
3,05
5,04
EUR
2,06 B
0,48 B
1,78 B
1,99 B
-0, 60 B
0,76 B
1,99 B
2,98 B
2,29 B
0.95 B
2,78 B
0,89 B
1, 19 B
0,79 B
0,99 B
0,59 B
1,89 B
3. 19 B
2,79 B
4.69 B
1,35 B
2,49 B
4,59 B
7,49 A
3,10 A *
3,77 B
1,90 A
59,16
70, 1

In [418]:
# The text_annotations contain the recognized text and the corresponding bounding boxes
# the first entry contains the whole text from the receipt and the consecutive entries
# contain the text/coordinates from the individual bounding boxes
texts = response.text_annotations

In [419]:
# Build dataframe, where bl: bottom_left, br: bottom_right, tr: top_right, tl: top_left
# denote the corners of the BBs

columns = ["String", "x_bl", "y_bl", "x_br", "y_br","x_tr","y_tr","x_tl","y_tl"] # uncomment if you need x coords as well
#columns = ["String", "y_bl", "y_br","y_tr","y_tl"]
df = pd.DataFrame(columns=columns)

for i, text in enumerate(texts):
    df.loc[i, "String"] = text.description
    for j in range(4):
        df.iloc[i,2*j+1] = text.bounding_poly.vertices[j].x  # uncomment if you need x coords as well 
        #df.iloc[i,j+1] = text.bounding_poly.vertices[j].y
        df.iloc[i,2*j+2] = text.bounding_poly.vertices[j].y  # uncomment if you need x coords as well

In [420]:
# convert the coords to integers for calculation of the mean BB positions
df[['y_bl','y_br','y_tr','y_tl']] = df[['y_bl','y_br','y_tr','y_tl']].astype('int')
# calulate mean BB positions
df['mean_y'] = df.eval('(y_bl+y_br+y_tr+y_tl)/4')

# sort DF by mean height to match text that appears in the same line
df = df.sort_values(by=['mean_y']).reset_index(drop=True)

In [421]:
# select only the block of the receipt where the products are listed
product_list_start_ind = int(df[df.String== 'EUR'].index.values[0])+1
product_list_end_ind = int(df[df.String=='SUMME'].index.values)
df_products = df[product_list_start_ind:product_list_end_ind]

In [422]:
# Eine leere Liste erstellen, um die bereits angezeigten Indizes zu speichern
shown_indices = []
product_list = []
columns = ['product_name','price']
df_cleaned = pd.DataFrame(columns=columns)

counter = 1
for i in df_products['mean_y']:
    condition = (df_products['mean_y'] >= i) & (df_products['mean_y'] < i + 10)
    indices = df_products.index[condition]
    
    # Überprüfen, ob die aktuellen Indizes bereits angezeigt wurden
    if not any(idx in shown_indices for idx in indices):

        chars = ['A','B','*']
        #print(df.loc[indices].sort_values(by=['x_bl'])['String'])
        selected = df_products.loc[indices].sort_values(by=['x_bl'])['String']
        if selected.iloc[-1] in chars:
            if selected.iloc[-1] == '*':
                df_cleaned.loc[counter,'product_name'] = ' '.join(selected.iloc[:-3])
                df_cleaned.loc[counter,'price'] = selected.iloc[-3].replace(',','.')
                #print(selected.iloc[:-1])
            else:
                df_cleaned.loc[counter,'product_name'] = ' '.join(selected.iloc[:-2])
                df_cleaned.loc[counter,'price'] = selected.iloc[-2].replace(',','.')
                #print(selected)
            counter += 1
        shown_indices.extend(indices)

In [423]:
df_cleaned.price = df_cleaned.price.astype('float')

In [424]:
df_cleaned

Unnamed: 0,product_name,price
1,BANANE,2.06
2,ROTE BETE FRISCH,0.48
3,ZUCCHINI GRUEN,1.78
4,CHAMP . WEISS,1.99
5,"Rabatt 30 % -0 ,",60.0
6,KRUSTENBROT,0.76
7,WELTM . -MEHRKORN,1.99
8,TORTILLA WEIZEN,2.98
9,HONEYPOMELO PINK,2.29
10,ZWIEBEL LAUCH,0.95


In [333]:
# even out slight y-position differences
for i in range(df.shape[0]-1):
    if abs(df.mean_y[i+1]-df.mean_y[i]) > 10:
        df.loc[i,'y_position'] = df.mean_y[i]
    else:
        df.loc[i,'y_position'] = df.mean_y[i+1]
 

In [327]:
int(df[df.String=='EUR'].index.values)

TypeError: only size-1 arrays can be converted to Python scalars

In [330]:
int(df[df.String=='SUMME'].index.values)

106

In [310]:
df.iloc[:int(df[df.String=='SUMME'].index.values)+3]


Unnamed: 0,String,y_bl,y_br,y_tr,y_tl,mean_y,y_position
0,REWE,220,223,391,388,305.5,305.5
1,REWE,391,391,424,424,407.5,408.5
2,Markt,391,392,426,425,408.5,408.5
3,Sabine,430,431,464,463,447.0,447.5
4,Klitsch,430,431,465,464,447.5,447.5
...,...,...,...,...,...,...,...
103,*,1361,1361,1393,1393,1377.0,1377.0
104,-025,1361,1361,1393,1393,1377.0,1377.0
105,SUMME,1439,1440,1471,1470,1455.0,1455.0
106,EUR,1440,1441,1470,1469,1455.0,1455.0


In [335]:
df.tail(40)

Unnamed: 0,String,y_bl,y_br,y_tr,y_tl,mean_y,y_position
258,Gleich,2491,2491,2527,2527,2509.0,2509.0
259,in,2491,2491,2527,2527,2509.0,2509.0
260,der,2491,2491,2527,2527,2509.0,2509.0
261,REWE,2491,2491,2527,2527,2509.0,2509.0
262,App,2491,2491,2527,2527,2509.0,2509.0
263,oder,2491,2491,2527,2527,2509.0,2509.0
264,auf,2491,2491,2527,2527,2509.0,2509.0
265,.,2529,2529,2562,2562,2545.5,2546.0
266,anmelden,2530,2529,2562,2563,2546.0,2548.5
267,www.rewe.de/payback,2533,2530,2564,2567,2548.5,2548.5


In [290]:
for i in range(df.shape[0]-2):
    if abs(df.y_position[df.shape[0]-i] - df.y_position[df.shape[0]-i-1]) > 3:
        df.loc[df.shape[0]-i,'y_position2'] = df.y_position[df.shape[0]-i-1]
    else:
        df.loc[df.shape[0]-i,'y_position_2'] = df.y_position[df.shape[0]-i-2]

KeyError: 298

In [282]:
df.y_position[df.shape[0]-1]

nan

In [253]:
#for i in range(df.shape[0]-1):
 #  df.loc[i,'Y'] = df.mean_y[i+1] if abs(df.mean_y[i]-df.mean_y[i+1]) < 5 else df.mean_y[i]

#df = df.sort_values(by=['Y']).reset_index(drop=True)

In [194]:
#for i in range(df.shape[0]-1):
 #  df.loc[i,'line'] = (df.Y[i]+df.Y[i+1])/2 if abs(df.Y[i]-df.Y[i+1]) < 3 else df.Y[i]

In [291]:
df.head(40)

Unnamed: 0,String,y_bl,y_br,y_tr,y_tl,mean_y,y_position
0,REWE,220,223,391,388,305.5,305.5
1,REWE,391,391,424,424,407.5,408.5
2,Markt,391,392,426,425,408.5,408.5
3,Sabine,430,431,464,463,447.0,447.5
4,Klitsch,430,431,465,464,447.5,447.5
5,oHG,431,431,464,464,447.5,447.5
6,Barbaraweg,471,471,507,507,489.0,489.0
7,7,471,471,507,507,489.0,489.0
8,06773,509,510,544,543,526.5,527.5
9,Gräfenhainichen,509,511,546,544,527.5,527.5


In [56]:

response.text_annotations[-1]

description: "AURI"
bounding_poly {
  vertices {
    x: 453
    y: 3422
  }
  vertices {
    x: 814
    y: 3420
  }
  vertices {
    x: 814
    y: 3487
  }
  vertices {
    x: 453
    y: 3487
  }
}

In [None]:
import json
with open("data/response_json.json", "w") as fp:
    json.dump(response.text_annotations, fp)

TypeError: Object of type RepeatedComposite is not JSON serializable

In [None]:
# JSON-String in ein Python-Dictionary umwandeln
jdata = json.loads(response.text_annotations)

# Dictionary in DataFrame umwandeln
df = pd.DataFrame(jdata)

print(df.T)

TypeError: the JSON object must be str, bytes or bytearray, not RepeatedComposite

In [None]:
import pandas as pd

# Erstellen eines leeren DataFrames mit den gewünschten Spaltennamen
columns = ["String", "Bounds1", "Bounds2", "Bounds3", "Bounds4"]
df = pd.DataFrame(columns=columns)

# Hinzufügen der Beispielzeile
df.loc[1] = ["Tel", (238,495), (293,495), (293,529), (238,529)]

print(df)

315

In [41]:
end = len(response.text_annotations) + 1
texts = response.text_annotations[1:end]

# Erstellen eines leeren DataFrames mit den gewünschten Spaltennamen
columns = ["String", "Bounds1", "Bounds2", "Bounds3", "Bounds4"]
df = pd.DataFrame(columns=columns)

for i, text in enumerate(texts):
    print(f'\n"{text.description}"')
    df.loc[i, "String"] = text.description
    vertices = [
        f"({vertex.x},{vertex.y})" for vertex in text.bounding_poly.vertices
    ]

    print("bounds: {}".format(",".join(vertices)))
    df.loc[i, "String"] = text.description


"REWE"
bounds: (124,220),(784,223),(783,391),(123,388)

"REWE"
bounds: (305,391),(389,391),(389,424),(305,424)

"Markt"
bounds: (404,391),(505,392),(505,426),(404,425)

"Sabine"
bounds: (228,430),(347,431),(347,464),(228,463)

"Klitsch"
bounds: (368,430),(508,431),(508,465),(368,464)

"oHG"
bounds: (528,431),(588,431),(588,464),(528,464)

"Barbaraweg"
bounds: (287,471),(488,471),(488,507),(287,507)

"7"
bounds: (507,471),(529,471),(529,507),(507,507)

"1,29"
bounds: (288,721),(370,721),(370,756),(288,756)

"2"
bounds: (286,802),(308,802),(308,836),(286,836)

","
bounds: (308,802),(319,802),(319,836),(308,836)

"19"
bounds: (328,802),(370,802),(370,836),(328,836)

"VOLLKORNTORTILLA"
bounds: (27,681),(348,683),(348,719),(27,717)

"2"
bounds: (65,721),(89,721),(89,755),(65,755)

"Stk"
bounds: (107,721),(171,721),(171,755),(107,755)

"x"
bounds: (184,721),(208,721),(208,755),(184,755)

"TOFU"
bounds: (28,759),(109,761),(108,795),(27,793)

"NATUR"
bounds: (128,761),(227,763),(226,797),(127

In [None]:
text.bounding_poly.vertices

[x: 745
y: 3430
, x: 817
y: 3430
, x: 817
y: 3446
, x: 745
y: 3446
]

In [None]:
detect_text("/Users/tobiaspoetzl/coding/creamCheese_backup/images/Rewe_Bons_Scans_ToP/Rewe_T01.jpg")

Texts:

"ZIEGELSTR.
23556 LÜBECK
Tel. 0451 48065836
UID Nr.: DE277457805
JA! BUTTERKAESE
SALAT LOLLO BION
PICCOLIN. BREZEL
2 Stk x
LAUGENSTANGEN
3,49
2 Stk x 1,99
BUTTERCROISSANTS
NUTELLA 450G
CHIPSFRISCH CHIL
LIMONCELLO
1,09
0, 15
LOSCHER CLUBMATE
3 Stk x
PFAND 0, 15 EUR
3 Stk x
ROTK. ALKOHOLFREI
GLUECKSSCHWEIN
4 Stk x 0,80
FOLIENBALLONS
GIRLANDE DEKO
KONFETTI
16X FORM-SERVIET
WACHSGIESSEN
Wachsgiessen 2, 69
Funny Frisch 0,89
SUMME
Geg. VISA
Datum:
EUR
EUR
* * Kundenbeleg * *
EUR
2,79 B
2,29 B
6,98 B
3,98 B
3,69 B
3, 29 B
0,99 B
13, 79 A
3,27 A
0,45 A *
4,99 A
3,20 B
3,99 A
3,99 A
2,49 A
2,49 A
2,99 A
-0,30 A
-0,10 B
65,26
65,26
30.12.2023"
bounds: (61,0),(1655,0),(1655,2787),(61,2787)

"ZIEGELSTR"
bounds: (466,2),(803,2),(803,61),(466,61)

"."
bounds: (811,2),(824,2),(824,61),(811,61)

"23556"
bounds: (580,65),(767,64),(767,126),(580,127)

"LÜBECK"
bounds: (806,64),(1034,63),(1034,125),(806,126)

"Tel"
bounds: (390,139),(491,138),(491,201),(390,202)

"."
bounds: (502,139),(522,139),(

In [None]:
detect_text("/Users/tobiaspoetzl/coding/creamCheese_backup/images/Rewe_Bons_Scans_ToP/Rewe_T18.jpg")

Texts:

"REWE MARKT Grühn oHG
ZIEGELSTR. 7 - 13
23556 LÜBECK
Tel.: 0451 - 48065836
UID Nr.: DE277457805
SENF MITTELSCH.
TOMATENMARK
TOM.MARK KONZ.
GEHACKTE TOMATEN
3 Stk x 0,39
SFB PH86 DP
TINTEN.DESCHER
SENSEO SWITCH
SUMME
Geg EWE Guthaben
Restbetrag
Geg. VISA
Datum:
Uhrzeit:
EUR
EUR
EUR
EUR
0,49 B
0,49 B
1,49 B
1,17 B
* * Kundenbeleg * *
9,99 A
2,99 A
66,00 A
82,62
10,00
72,62
72,62
23.04.2018
20:03:31 Uhr"
bounds: (50,234),(1199,234),(1199,1977),(50,1977)

"REWE"
bounds: (258,241),(456,239),(456,289),(258,291)

"MARKT"
bounds: (486,239),(632,238),(632,288),(486,289)

"Grühn"
bounds: (660,238),(803,237),(803,287),(660,288)

"oHG"
bounds: (833,237),(919,236),(919,285),(833,286)

"ZIEGELSTR"
bounds: (343,309),(602,306),(603,360),(344,363)

"."
bounds: (610,307),(624,307),(625,360),(611,360)

"7"
bounds: (659,306),(687,306),(688,359),(660,359)

"-"
bounds: (718,306),(745,306),(746,359),(719,359)

"13"
bounds: (774,305),(835,304),(836,357),(775,358)

"23556"
bounds: (428,382),(571,380),(

"1,17"
bounds: (1004,938),(1118,938),(1118,991),(1004,991)

"GEHACKTE"
bounds: (56,943),(281,942),(281,994),(56,995)

"TOMATEN"
bounds: (310,942),(513,941),(513,993),(310,994)


"1,49"
bounds: (1005,872),(1118,870),(1119,921),(1006,923)

"TOM.MARK"
bounds: (56,873),(280,872),(280,925),(56,926)

"KONZ"
bounds: (310,871),(425,870),(425,923),(310,924)

In [None]:
detect_text("/Users/tobiaspoetzl/coding/wesplit-receipt-ocr/raw/IMG_4855.jpg")

Texts:

"Netto
Marken-Discount
6772 Zschornewitz, Straße des Friedens
WWW.NETTO-ONLINE.DE
KN Toasties sort. 335g
KN Toasties sort. 335g
Bisc. Schokobroetchen360g
Rue.veg.SchinkenspickerPap.809
Rabatt 30%
Naggi GA Gespenstersuppe 750ml
Bio BB Kontepulciano0,75L
Liebl. Sahnepudding sort. 500g
Clarkys Kesselchips sort. 150g
Spitzkohl
Sellerie
0.420 kg x 1.19 EUR/kg
Feldsalat 150g
Preisaenderung
Paprika rot 500g
2 X
Gurken
Lauchzwiebel
SUNNE [15]
SUMME
Bar EUR
A 19%
P
7%
MWST
0.45
1.15
€
ㅁ
1.19
Struer
-EUR
1,19 B
1.19 B
1,89 B
1,59 B
BRUTTO
2.79
17.64
-0.48
0,99 B
2,79 A
1,39 B
1.59 B
1.99 B
0,50 B
1.49 B
-0.75
NETTO
2,34
16.49
Nit der DeutschlandCard hätten Sie
10 Punkte erhalten!
1.89 B
2.38 B
0,79 B
20.43 -
20.43
20.43
TSE Transaktionsnummer.: 359242
Seriennr. Kasse:NTO-000-04868-3
Prüfuert:nYiITDCCGZQ/zVyvne0jsetRvU9xbCe
xizOPClazosMgPtBivSda5C5pCUKiih834Z1+CHG
le0nkcWtcBMyPS23N0e2041 Byqq0jWJcZN+13S/a
4e6eekD0gj8UI9Dvq
Signaturzähler:809479
TSE-Start:2023-11-27715:21:16.000Z
TSE-Stop 