### **Optical Character Recognition (OCR) using AWS Textract**

In [1]:
import pathlib
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
from groq import Groq
import warnings
import os
from PIL import Image
from IPython.display import display
from dotenv import load_dotenv
warnings.filterwarnings("ignore")

In [56]:
# Load variables from the ".env" file that sits two directory levels above the
# current working directory, then read each credential from the environment.
dotenv_path = pathlib.Path.cwd().resolve().parents[1] / ".env"
load_dotenv(dotenv_path=dotenv_path)

# Each lookup yields None when the corresponding variable is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

In [82]:
def create_client_groq():
    """Return a new Groq client built with the module-level ``groq_api_key``."""
    return Groq(api_key=groq_api_key)

# System prompt sent with every invoice-extraction request. The sentences are
# joined with single spaces so they do not run together in the final string
# (adjacent string literals concatenate with no separator at all).
_INVOICE_SYSTEM_PROMPT = " ".join((
    "You are a professional accountant and financier.",
    "I passed you an array of strings where the array contains the invoices data.",
    "You need to extract the information from the invoice array of strings and return a JSON object with the following keys:",
    "Número de la boleta y/o factura, RUC o identificación del vendedor, Nombre del vendedor, productos.",
    "The products keys is an array with 'product_name' and its 'price'.",
    "Only return the JSON object, do not return anything else.",
    "Do not add any other text.",
    "You must recognize the title of each item in the invoice and connect with similar items in the array of strings.",
    "The JSON object must be in Spanish language.",
    "If a product name is split into multiple lines, you must merge them before extracting its price. Usually, the price appears at the end, so assume previous lines are part of the same item.",
))


def _parse_json_reply(reply_text):
    """Decode a model reply as JSON, tolerating text wrapped around the object."""
    text = (reply_text or "").strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # The reply may be wrapped in a Markdown fence or extra prose even
        # though the prompt forbids it; retry on the outermost {...} span.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def generate_groq_query(query: str):
    """Ask the Groq chat model to extract structured invoice data.

    Args:
        query: The OCR text to analyse. The value is converted with ``str()``
            before being sent as the user message, so a list of text lines
            (which is what this notebook passes) is accepted as well.

    Returns:
        The model reply decoded from JSON.

    Raises:
        json.JSONDecodeError: If no JSON document can be decoded from the reply.
    """
    client = create_client_groq()
    chat_completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {"role": "system", "content": _INVOICE_SYSTEM_PROMPT},
            {"role": "user", "content": str(query)},
        ],
    )
    return _parse_json_reply(chat_completion.choices[0].message.content)

In [21]:
def create_session_and_client(region_name="us-east-1"):
    """Create a boto3 session and return a Textract client built from it.

    The session is given the module-level ``aws_access_key_id`` and
    ``aws_secret_access_key`` values.

    Args:
        region_name: AWS region used for both the session and the client.
            Defaults to "us-east-1", the value that used to be hard-coded in
            two places.

    Returns:
        The client returned by ``session.client("textract", ...)``.
    """
    session = boto3.Session(
        region_name=region_name,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    return session.client("textract", region_name=region_name)

In [22]:
def process_file(file_path):
    """Run Textract text detection on a local file and return the response.

    Args:
        file_path: Path of the document; it is read in binary mode and its
            bytes are sent inline in the request.

    Returns:
        Whatever ``detect_document_text`` returns for the document.
    """
    textract = create_session_and_client()
    with open(file_path, "rb") as source:
        payload = source.read()
    return textract.detect_document_text(Document={"Bytes": payload})
  

In [81]:
# Open the invoice image with Pillow; the inline preview below is left disabled.
img = Image.open(r"../files/invoice.jpeg")
# display(img)

In [24]:
# Run Textract text detection on the invoice image and keep the full response.
result = process_file(r"../files/invoice.jpeg")

In [86]:
# Keep the text of every LINE block from the Textract response, in the order
# the blocks appear in the response.
data = [
    entry["Text"]
    for entry in result["Blocks"]
    if entry["BlockType"] == "LINE"
]

In [87]:
# Show the extracted text lines.
data

['ZHONG YUSHU',
 'R.U.C. 15262071429',
 'TEL: 0',
 'Av. Los Quechuas 1013',
 'CHIFA',
 '- ATE - UMA - LIMA',
 'LONG FUNG with',
 'Casa Matriz',
 'FACTURA ELECTRÓNICA',
 'F003 N° 00002016',
 'FECHA EMISIÓN : 13/04/2025 19:53:01',
 'RUC',
 ':',
 '20516122723',
 'RAZÓN SOCIAL',
 ':',
 'GEOURBI SOCIEDAD COMERCIAL',
 'DE RESPONSABILIDAD LIMITADA',
 'DIRECCIÓN',
 ': CAL. EFESTOS NRO.497 - ATE -',
 'LIMA - LIMA',
 'T. MONEDA',
 ': SOLES',
 'FORMA PAGO',
 ': Contado',
 'FECHA PAGO',
 ': 13/04/2025',
 'CANT. U.M.',
 'DESCRIP',
 'P.U. IMPORTE',
 'Menu 4 Combinado Poll',
 '1.00 UNIDA',
 '18.00',
 '18.00',
 'o',
 'Menu 15 Chancho c/ T',
 '1.00',
 'UNIDA',
 '20.00',
 '20.00',
 'amarindo',
 '1.00',
 'UNIDA Gaseosa 1 It',
 '7.00',
 '7.00',
 'OP. GRAVADA',
 'S/',
 '40.91',
 'OP. INAFECTA',
 'S/ 0.00',
 'OP. EXONERADA',
 'S/ 0.00',
 'OP. EXPORTACIÓN',
 'S/ 0.00',
 'OP. GRATUITA',
 'S/ 0.00',
 'TOT. DSCTO',
 'S/ 0.00',
 'I.S.C.',
 'S/ 0.00',
 'I.G.V. 10%',
 'S/ 4.09',
 'OTROS CARGOS',
 'S/ 0.00',
 'OTRO

In [83]:
# Send the extracted text lines to the Groq model and display the parsed JSON reply.
res_json = generate_groq_query(query=data)
res_json

{'Número de la boleta/factura': 'F003 N° 00002016',
 'RUC o identificación del vendedor': '15262071429',
 'Nombre del vendedor': 'ZHONG YUSHU',
 'productos': [{'product_name': 'Menu 4 Combinado Poll', 'price': '18.00'},
  {'product_name': 'Menu 15 Chancho c/ T amarindo', 'price': '20.00'},
  {'product_name': 'Gaseosa 1 It', 'price': '7.00'}]}