In [1]:
import pandas as pd
import numpy as np
import pytesseract
import PIL.Image
from PIL import Image, ImageEnhance
import cv2 as cv2
import os
import re

# Pytesseract for Optical Character Recognition

In [2]:
#embed the location of tesseract.exe
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
path = r"..\Data\raw_data\sample_receipt"
list_fn = os.listdir(path)


# Set the enhancement factors
brightness_factor = 1.1
contrast_factor = 1.5
sharpness_factor = 1.6

# Loop through all the files in the directory
for filename in os.listdir(path):
    if filename.endswith(".jpg"):
        # Load the image using OpenCV
        image = cv2.imread(os.path.join(path, filename))

        # Convert the image to Pillow format
        pil_image = Image.fromarray(image)

        # Adjust the brightness of the image using Pillow
        enhancer = ImageEnhance.Brightness(pil_image)
        brightened = enhancer.enhance(brightness_factor)

        # Adjust the contrast of the image using Pillow
        enhancer = ImageEnhance.Contrast(brightened)
        contrasted = enhancer.enhance(contrast_factor)

        # Adjust the sharpness of the image using Pillow
        enhancer = ImageEnhance.Sharpness(contrasted)
        sharpened = enhancer.enhance(sharpness_factor)

        # Convert the image back to OpenCV format
        improved_image = cv2.cvtColor(np.array(sharpened), cv2.COLOR_RGB2BGR)

        # Save the improved image
        cv2.imwrite(os.path.join(path, filename.split(".")[0]+"_adjQuality.jpg"), improved_image)

print("Done")

Done


PyTesseract differentiates between psm (Page segmentation modes) and oem (OCR Engine modes):

in ***psm*** 14 modes are provided:

- Orientation and script detection (OSD) only.
- Automatic page segmentation with OSD.
- Automatic page segmentation, but no OSD, or OCR. (not implemented)
- Fully automatic page segmentation, but no OSD. (Default)
- Assume a single column of text of variable sizes.
- Assume a single uniform block of vertically aligned text.
- Assume a single uniform block of text.
- Treat the image as a single text line.
- Treat the image as a single word.
- Treat the image as a single word in a circle.
- Treat the image as a single character.
- Sparse text. Find as much text as possible in no particular order.
- Sparse text with OSD.
- Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.


in ***oem*** 4 modes are provided:

- Legacy engine only.
- Neural nets LSTM engine only.
- Legacy + LSTM engines.
- Default, based on what is available.


***psm*** is adjusted to the best fit, while ***oem*** is kept at mode 3 (default, based on what is available)

In [4]:
#test read two different image qualities
#potential modes for psm (1,3,4,5,6,11,12)
myconfig_Edeka = r"--psm 6--oem 3"


text_Edeka = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Edeka_1_adjQuality.jpg"), config=myconfig_Edeka)   ### 6 / 3
print(text_Edeka)



Edeka Fil. 2139
Ohstallee 28-30
13593 Berlin
Tel. 030 36403 330
wt
G&G Haferflocken 0,
383 Kakaorolle 1,49 A
G&G Naturjoghurt 1,09 4
BANANEN G+G 1,45 A
1,000 kg x 1,45 EUR/kg
Posten: 4 Tones
SUMME EUR 4,82
Bar EUR 5,00
Ruckge Id EUR -0,18
BevwSt =. NETTO. = MwS?-=—sUMSATZ
ms 4,50 0,32 4,82
~ flr Ihren Einkauf von 4,82 Euro auf
~» rabatt fahige Artike! hatten Sie
2 DeutschlandCard Punkte erhalten!
Helden Ste sich gleich hier im Markt
oder unter ww.deutschlandcard.de an.
alii iit
Datum Uhrveit Pos (
17.02.23 18:07 001 A at,
oteuernumier : NE266067317
ISE Trangakt LAV Abilimmens tA oe



In [5]:
myconfig_EuroShop = r"--psm 1--oem 3"
text_Euroshop = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Euroshop_1_adjQuality.jpg"), config=myconfig_EuroShop)
print(text_Euroshop)

EuroShop Berlin
Car|1-Schurz-StraBe 23
13597 Berlin
www. euroshop-on! ine. de

Menge EUR

Milka Choco Moo 200g 1 1,50
Total EUR 1,50
Bar 1,50
MwSt % Netto MwSt Brutto
7.20 1,40 0,10 1,56

17.11.2022 13:49 #:58234 0p:13294 C:1 5:62
Wir danken fdr Ihren Einkauf
USt-IdNr. DE 237 646 296

TSE Seriennummer :

TIAFOACCBC 7CC3ASODE4C77BS0BE26345F 95248864
819040 1064F 0D 7ABEA1BB1

TSE Start: 9022-11-12712:49:05 .G00Z +01 00
TSE Ende : 2022-11-12712:50:04.0002+01 00

TSE Identifikation: 62-1)
TSE Transaktionsnr.: 199095
TSE Signaturzahler: 420600

TSE Signatur: iM
DhwPCnNePIBIRFm)EPW4UL wmbXc9J7 JUL YAPIEGR3R

bFTWw4zrOB7NOIS2TatyELfZfrPSQ+/z jmeBWucdtg
SPJaZFo2hdTqMWodgt iMDEatAbJHPomBh j 2Vf2dMFz
6s



In [6]:
myconfig_Lidl = r"--psm 3--oem 3"


text_Lidl = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Lidl_1_adjQuality.jpg"), config=myconfig_Lidl)
print(text_Lidl)



L¢DE.

Seeburger Str. 18
13581 Berlin Spandau

EUR
Strauchtomaten 0,52 A
0,524 kg x 0, 99 EUR/kg
Gurken 0,59 A
Birnen 1,59 A
H-Milch 1,5% 0,99 A
Tomatenmark 0,59 A
Direxteéfte Orange 1,39 B
ZU zahlen 5 67
Bar 6,00
Riickgeld -0,33
MWST% MWST + Netto = Brutto
A 7% (0,28 4,00 4,28
B 19 Big — 0,22 Me u : "

0,50

alll Rasad otpennoseiian +a wah fj

Prifwert : ton] tert hey eT ee
pFVL2hVr Leg? 7BqWOKhovF rONncOZyVTSynBa3tKSt
B4N4mKaG IW) goX6cF f305b/uOCBmNaNdx/GKAtE fut tt
weWr47jfekXu

Signaturzahler: 246687
-2021-06-17T19:23:45.000+0200
“2021-06- 1771924; 12.000+

J 6395
“ fe




In [7]:
myconfig_Primark = r"--psm 3--oem 3"

text_Primark = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Primark_1_adjQuality.jpg"), config=myconfig_Primark)
print(text_Primark)

# text2 = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Edeka_1_sharp1.jpg"), config=myconfig)
# print(text2)

Primark

Primark Mode Ltd & CO KG
Alexanderplatz 7
10178 Berlin
USt-Id.Nr.: DE276960607

VERKAUFT

210423655 Blickdicht warm 8,00 B
210422997 Blickdicht warm 8,00 B
210413648 Langarm-Shirt 6,00 B
Summe € 22,00
‘Debit MasterCard € 22,00

Karten-Nr. KK KKKKKKKKKKB IGG

Handler-Nr. ¥¥¥**13619

Auth.-Nr. NURERS

App. Nr. © A0000000041010

Kassenterminal-ID ****0312

PAN-Laufnr.

Quelle kontaktlos

Verifizieruns nicht ausgsefuhrt

(MwSt.-Obersicht )
Cong MuSt .-Satz pp iketbetraa MwSt.-Betr.

19,00% 4 3,51
SUMMEN 18,49 3,51
Filiale:0808 Kasse:002 Trans. :007322

Datum:25.11.2022 Zeit: 15:02 Ben. :10291577

Verkaufte Artikel: 3
Zurlickgegebene Artikel: 0

a i jeu 0 el oi m= 88 8 @eeeem ome lm oe



In [8]:
myconfig_Rewe = r"--psm 3--oem 3"


text_Rewe = pytesseract.image_to_string(PIL.Image.open(path+"\\"+"Rewe_receipt_02_adjQuality.jpg"), config=myconfig_Rewe)
print(text_Rewe)

REWE

Rheinstr.

12161 Berlin
Telefon: 030 _/ 85074370
UID Nr. : DE812706034
RUCOTELLA KAESE 4°03 B
See =Handeingabe E-Bon 0, 193 kg
meee FEIGENSENFSAUCE 4,29 B
fees Versch. Sorten Bile
fees CIABATTA 0,99 B
memes PANE RUSTICO 1,39 B
ee CRACKER PAPRIKA 2,49 B
See 15 CRACK. SALT 2,49 B -
Seems HUMMUS NATUR 2,70 B
Sees ZTTRONE BIO [Pasi
me 2 otk x 20, (9
SALAT SALATTRIO 1,79 B
GURKE MINI eee
4Stkx 0,49
BIO EIER KL. M-L 2,30 8
FR. BIO HEUMI. 1,99 B
ESSIGESSENZ HELL 1,49 B
SALTLETTS _ 2, lan
UNIV MEGAPERLS 4,59 A
KLAPPBOX 8,99 A
SUMME EUR 47,54
Geg. Maestro EUR 47,54

* * Kundenbeleg * *
11. 02. 2023

Datum: :

Uhrzeit 14:01:06 Uhr

Beleg-Nr 1181

race-N 671124
Ge a aed Mey a ractless




In [9]:
print(text_Edeka.split("\n")[0])  # Customer
print(text_Euroshop.split("\n")[0])  # Customer
print(text_Lidl.split("\n")[0])  # Customer
print(text_Primark.split("\n")[0])  # Customer
print(text_Rewe.split("\n")[0])  # Customer

Edeka Fil. 2139
EuroShop Berlin
L¢DE.
Primark
REWE


In [10]:
files_list= [text_Edeka,text_Euroshop,text_Lidl,text_Primark,text_Rewe]

In [11]:
shopping_location = []

for i in files_list:

    shopping_location.append(i.split("\n")[0])

In [12]:
#Dates

In [13]:
shopping_date = []


for i in files_list:

    date_pattern = r"\d{2}\-\d{2}\-\d{4}|\d{2}\.\d{2}\.\d{4}|\d{2}\. \d{2}\. \d{4}|\d{4}\-\d{2}\-\d{2}|\d{2}\.\d{2}\.\d{2} "
    dates = re.findall(date_pattern, i)
    if dates:
        print("Date:", dates[0].replace('. ', '.'))
    else:
        print('No date found')

    shopping_date.append(dates[0])

Date: 17.02.23 
Date: 17.11.2022
Date: 2021-06-17
Date: 25.11.2022
Date: 11.02.2023


In [14]:
#converting dates to '%Y-%m-%d'
from datetime import datetime
date_format = ['%d.%m.%y', '%d.%m.%Y', '%Y-%m-%d', '%d.%m.%Y', '%d. %m. %Y']

# Convert date strings to datetime objects
datetime_list = [datetime.strptime(date_str.strip(), fmt) for date_str, fmt in zip(shopping_date, date_format)]

# Convert datetime objects to desired strftime format
strftime_format = '%Y-%m-%d'  # Example format: YYYY-MM-DD
formatted_dates = [date_obj.strftime(strftime_format) for date_obj in datetime_list]
formatted_dates

['2023-02-17', '2022-11-17', '2021-06-17', '2022-11-25', '2023-02-11']

In [15]:
# extracting the sum per receipt

In [16]:
# use regular expression to find the sum after "SUMME EUR"
sum_perReceipt = []


for i in files_list:
    match = re.search(r' EUR\s*(\d+,\d{2})|Summe € \d+,\d{2}|ZU zahlen ', i)
    if match:
        sum_str = match.group(1)
        print("Sum:", sum_str, "€")
    else:
        print("Sum not found")
    sum_perReceipt.append(sum_str)

Sum: 4,82 €
Sum: 1,50 €
Sum: None €
Sum: None €
Sum: 47,54 €


In [17]:
sum_perReceipt

['4,82', '1,50', None, None, '47,54']

In [18]:
# Create dataframe
df_OCR = pd.DataFrame({'Location': shopping_location, 'Time': formatted_dates, 'Cost': sum_perReceipt})

# Print dataframe
print(df_OCR)

          Location        Time   Cost
0  Edeka Fil. 2139  2023-02-17   4,82
1  EuroShop Berlin  2022-11-17   1,50
2            L¢DE.  2021-06-17   None
3          Primark  2022-11-25   None
4             REWE  2023-02-11  47,54
