In [333]:
# Prerequisite (Debian/Ubuntu): install the Italian language data for Tesseract:
#   sudo apt install tesseract-ocr-ita

In [334]:
import cv2
import pytesseract
import json
import easyocr
import difflib
import os

In [335]:
def load_image(image_path):
    """Read an image file from disk with OpenCV.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The decoded image exactly as returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        OSError: If the file exists but OpenCV could not decode it.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface later as an obscure error in the
        # first function that touches the image.
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path!r}")
        raise OSError(f"Could not decode image file: {image_path!r}")
    return image

In [336]:
def denoise(image):
    """Denoise a colour image with OpenCV's non-local-means filter.

    Args:
        image: Colour image passed to ``cv2.fastNlMeansDenoisingColored``.

    Returns:
        The denoised image produced by OpenCV.
    """
    # Positional arguments of fastNlMeansDenoisingColored after (src, dst).
    h, h_color = 10, 10
    template_window_size, search_window_size = 7, 21
    return cv2.fastNlMeansDenoisingColored(
        image, None, h, h_color, template_window_size, search_window_size
    )

In [337]:
def BGR2GRAY(image):
    """Convert a BGR image to grayscale using ``cv2.cvtColor``.

    Args:
        image: Image passed to ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The converted image.
    """
    conversion = cv2.COLOR_BGR2GRAY
    return cv2.cvtColor(image, conversion)

In [338]:
def get_header(image):
    """Return the top third of the image rows.

    Args:
        image: Array-like image whose first axis is indexed as rows.

    Returns:
        The slice of ``image`` covering rows ``0`` up to (not including)
        ``int(height / 3)``, with all columns kept.
    """
    height = image.shape[0]
    last_row = int(height / 3)
    return image[:last_row, :]

In [339]:
def get_body(image):
    """Return the middle band of the image rows.

    Args:
        image: Array-like image whose first axis is indexed as rows.

    Returns:
        The slice of ``image`` from row ``int(height / 3)`` up to (not
        including) row ``4 * int(height / 5)``, with all columns kept.
    """
    height = image.shape[0]
    first_row = int(height / 3)
    end_row = 4 * int(height / 5)
    return image[first_row:end_row, :]

In [340]:
def get_footer(image):
    """Return the bottom band of the image rows.

    Args:
        image: Array-like image whose first axis is indexed as rows.

    Returns:
        The slice of ``image`` from row ``4 * int(height / 5)`` to the last
        row, with all columns kept.
    """
    height = image.shape[0]
    first_row = 4 * int(height / 5)
    return image[first_row:, :]

In [341]:
def save_image(image, image_path):
    """Write an image to disk with ``cv2.imwrite``.

    Args:
        image: Image to write.
        image_path: Destination file path.

    Returns:
        None. The value returned by ``cv2.imwrite`` is not inspected.
    """
    _status = cv2.imwrite(image_path, image)  # result deliberately unused
    return None

In [342]:
def parse_header(image_header, image_data):
    """OCR the header crop and store its first lines under fixed keys.

    The text returned by Tesseract is split into lines. Blank and
    whitespace-only lines are discarded so that empty OCR lines do not shift
    (or blank out) the positional fields. The remaining lines are assigned in
    order to the keys ``nome``, ``ragione_sociale``, ``indirizzo``, ``citta``,
    ``info1`` and ``info2``. Extra lines are ignored; keys for which no line
    is available are left unset.

    Args:
        image_header: Image passed to ``pytesseract.image_to_string``.
        image_data: Dict updated in place with the extracted fields.

    Returns:
        None.
    """
    # NOTE(review): no ``lang=`` is passed, so Tesseract's default language is
    # used; the Italian language pack mentioned in the install comment at the
    # top of the file is presumably meant to be selected here -- confirm.
    text = pytesseract.image_to_string(image_header)

    stripped_lines = (line.strip() for line in text.split('\n'))
    lines = [line for line in stripped_lines if line]

    keys = ('nome', 'ragione_sociale', 'indirizzo', 'citta', 'info1', 'info2')
    for key, value in zip(keys, lines):
        image_data[key] = value

In [343]:
def parse_body(image_body, image_data):
    """Extract the product rows and the total from the body crop.

    Two OCR passes are combined:

    * Tesseract provides the non-empty text lines of the crop.
    * EasyOCR provides a flat list of text fragments. Fragments that parse as
      a finite number (comma or dot as decimal separator) are treated as
      prices; the fragments preceding a price are joined into its
      description. Fragments after the last price are discarded.

    The two sequences are paired positionally into
    ``(tesseract_line, easyocr_description, price)`` tuples. A row counts as
    the total row when any word in its Tesseract line or its EasyOCR
    description fuzzy-matches ``'TOTALE'`` (first six characters, similarity
    above 50%). If several rows match, the last one wins.

    Args:
        image_body: Image passed to both OCR engines.
        image_data: Dict updated in place with ``'totale'`` (price of the
            total row, or None when no row matches) and ``'prodotti'`` (the
            rows before the total row, or all rows when no row matches).

    Returns:
        None.
    """
    import math  # local import: only needed to reject non-finite "prices"

    tesseract_text = pytesseract.image_to_string(image_body)
    rows = [row for row in tesseract_text.strip().split('\n') if row != '']

    # NOTE: a new easyocr.Reader is constructed on every call.
    reader = easyocr.Reader(['it'])
    fragments = reader.readtext(image_body, detail=0)

    priced_items = []
    pending = ''
    for fragment in fragments:
        try:
            price = float(fragment.replace(',', '.'))
        except ValueError:
            # Not a number: this fragment is part of a description.
            price = None
        # float() also accepts "nan"/"inf"/"infinity"; such words are text,
        # not prices, and would serialise to invalid JSON.
        if price is None or not math.isfinite(price):
            pending += fragment + ' '
            continue
        priced_items.append((pending.strip(), price))
        pending = ''

    body = [
        (row, description, price)
        for row, (description, price) in zip(rows, priced_items)
    ]

    totale = None
    index_totale = None
    for index, (row, description, price) in enumerate(body):
        if _resembles_totale(row) or _resembles_totale(description):
            totale = price
            index_totale = index

    image_data['totale'] = totale
    if index_totale is not None:
        image_data['prodotti'] = body[:index_totale]
    else:
        image_data['prodotti'] = body


def _resembles_totale(text):
    """Return True if any word of ``text`` fuzzy-matches ``'TOTALE'`` above 50%."""
    return any(
        difflib.SequenceMatcher(None, 'TOTALE', word[:6]).ratio() * 100 > 50
        for word in text.upper().split()
    )

In [344]:
def parse_footer(image_footer, image_data):
    """Look for a date and a time among the EasyOCR fragments of the footer.

    A fragment containing exactly two ``/`` or exactly two ``-`` characters is
    taken as the date (dashes are replaced by slashes). A fragment containing
    at least one ``:`` is taken as the time. When several fragments qualify,
    the last one wins.

    Args:
        image_footer: Image passed to ``easyocr.Reader.readtext``.
        image_data: Dict updated in place with ``'date'`` and ``'time'``
            (each None when nothing matched).

    Returns:
        None.
    """
    ocr = easyocr.Reader(['it'])
    fragments = ocr.readtext(image_footer, detail=0)

    found_date, found_time = None, None
    for fragment in fragments:
        separator_counts = (fragment.count('/'), fragment.count('-'))
        if 2 in separator_counts:
            found_date = fragment.replace('-', '/')
        if ':' in fragment:
            found_time = fragment

    image_data['date'] = found_date
    image_data['time'] = found_time

In [345]:
def save_image_data(image_data, image_data_path):
    """Serialise the extracted data to a JSON file.

    Args:
        image_data: JSON-serialisable object to write.
        image_data_path: Destination file path; the file is opened in text
            write mode, so existing content is replaced.

    Returns:
        None.
    """
    with open(image_data_path, mode='w') as output_file:
        json.dump(image_data, output_file, indent=4)

In [346]:
def main(image_path):
    """Run the whole OCR pipeline on a single image.

    The image is loaded, denoised and converted to grayscale, then split into
    header, body and footer crops. Each crop is written as a PNG next to the
    source image, the three crops are parsed into one dict, and that dict is
    written as ``image_data.json`` in the same directory.

    Args:
        image_path: Path of the image to process.

    Returns:
        None.
    """
    output_dir = os.path.dirname(image_path)

    gray = BGR2GRAY(denoise(load_image(image_path)))
    header = get_header(gray)
    body = get_body(gray)
    footer = get_footer(gray)

    crops = (('header.png', header), ('body.png', body), ('footer.png', footer))
    for file_name, crop in crops:
        save_image(crop, os.path.join(output_dir, file_name))

    image_data = {}
    parse_header(header, image_data)
    parse_body(body, image_data)
    parse_footer(footer, image_data)

    save_image_data(image_data, os.path.join(output_dir, 'image_data.json'))

In [347]:
# Sample images to process; outputs are written next to each input file.
image_paths = [
    'test1/Esempio scontrino - ristorante XYZ.png',
    'test2/download.jpeg',
    'test3/il-conto-a-comprova-dei.jpg',
]

# Run the pipeline once per sample image, in list order.
for image_path in image_paths:
    main(image_path)