In [20]:
import cv2
import re
import json
import pytesseract
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [21]:
import cv2
import numpy as np
import pytesseract

# Se estiver no Windows, descomente a linha abaixo:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def preprocess_receipt(path):
    img = cv2.imread(path)

    if img is None:
        raise ValueError("Imagem não encontrada")
    
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    cv2.imwrite("debug_1_resize.png", img)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.imwrite("debug_2_gray.png", gray)

    blur = cv2.GaussianBlur(gray, (3,3), 0)
    cv2.imwrite("debug_3_blur.png", blur)

    thresh = cv2.adaptiveThreshold(
        blur,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        21, 
        10
    )
    cv2.imwrite("debug_7_thresh.png", thresh)

    
    cv2.imwrite("debug_final.png", thresh)

    return thresh

def extract_text(path):
    img_processed = preprocess_receipt(path)

    config = r'--oem 3 --psm 4 -l por'
    
    text = pytesseract.image_to_string(img_processed, config=config)
    return text


try:
    img_path = "comp02.jpeg"
    texto = extract_text(img_path)
    
    print("-" * 30)
    print("TEXTO EXTRAÍDO:")
    print("-" * 30)
    print(texto)
except Exception as e:
    print(f"Erro: {e}")

------------------------------
TEXTO EXTRAÍDO:
------------------------------
Tr mat
tam meneame

a

“4

VIA CLIENTE
AUTU:0bh643

= COMPRA CREDITO MASTERCARD
| os revi À CREDITO MASTERCARD

: crobasassar ea 4753 50 00

| POSTO ROCHA dad
:RUA PERU 311

MONTE CARMELD - ME CHPJ:55.327.368/0091 50
CCVSADAPTIVEPO SNPOS:PRALBAGGISIO

Des




In [22]:
import re

texto_ocr = texto

def parse_receipt_final(text):
    data = {}

    candidatos_valor = [] 

    lines = text.split('\n')
    for line in lines:
        clean_line = line.strip()
        clean_line = clean_line.replace(' ', '-')
        print(f"Analisando linha: {clean_line}")
        
        if "CNPJ" in clean_line.upper() or "CHPJ" in clean_line.upper() or "C.N.P.J" in clean_line.upper():
            cnpj_match = re.search(r'(\d{2}\.\d{3}\.\d{3})/(\d{4})-\d{2}', clean_line)
            print(f"Match CNPJ: {cnpj_match}")
            if cnpj_match:
                raiz = cnpj_match.group(1)
                filial = cnpj_match.group(2)
                
                print(f"Raiz: {raiz}, Filial: {filial}")
                if filial == '0091': 
                    filial = '0001'
                data['cnpj'] = f"{raiz}/{filial}-{cnpj_match.group()[-2:]}"
            
            continue 

        if "POSTO" in clean_line or "LTDA" in clean_line:
            data['estabelecimento'] = re.sub(r'^[^A-Z0-9]+', '', clean_line).strip()

        if "MONTE CARMEL" in clean_line:
            data['cidade'] = "MONTE CARMELO"

        price_match = re.search(r'(\d+)[.,\s](\d{2})\s*$', clean_line)
        if price_match:
            try:
                valor = float(f"{price_match.group(1)}.{price_match.group(2)}")
                # Adiciona à lista correta
                candidatos_valor.append(valor)
            except:
                pass

    if candidatos_valor:
        data['valor'] = candidatos_valor[0]

    return data

# Teste
resultado = parse_receipt_final(texto_ocr)
print(resultado)

Analisando linha: Tr-mat
Analisando linha: tam-meneame
Analisando linha: 
Analisando linha: a
Analisando linha: 
Analisando linha: “4
Analisando linha: 
Analisando linha: VIA-CLIENTE
Analisando linha: AUTU:0bh643
Analisando linha: 
Analisando linha: =-COMPRA-CREDITO-MASTERCARD
Analisando linha: |-os-revi-À-CREDITO-MASTERCARD
Analisando linha: 
Analisando linha: :-crobasassar-ea-4753-50-00
Analisando linha: 
Analisando linha: |-POSTO-ROCHA-dad
Analisando linha: :RUA-PERU-311
Analisando linha: 
Analisando linha: MONTE-CARMELD---ME-CHPJ:55.327.368/0091-50
Match CNPJ: <re.Match object; span=(24, 42), match='55.327.368/0091-50'>
Raiz: 55.327.368, Filial: 0091
Analisando linha: CCVSADAPTIVEPO-SNPOS:PRALBAGGISIO
Analisando linha: 
Analisando linha: Des
Analisando linha: 
Analisando linha: 
{'estabelecimento': 'POSTO-ROCHA-dad', 'cnpj': '55.327.368/0001-50'}
