In [2]:
from rich.pretty import pprint
import torch


def pretty_print(title: str = None, content: str = None):
    """Print ``content``, optionally preceded by a ``title`` line.

    Args:
        title: Heading printed on its own line. When ``None``, no heading is
            written and ``content`` goes through the built-in ``print``.
        content: Value to show. With a title it is rendered by ``pprint``
            (imported from ``rich.pretty``); without one it is printed as-is.
    """
    if title is not None:
        print(title)
        pprint(content)
    else:
        print(content)

# Image used for testing: https://images.app.goo.gl/oqnKNSLtzeN9hvco8

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [9]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class ImageDescriber:
    """Caption an image file with a BLIP conditional-generation model.

    The processor and model are loaded once in the constructor; calling the
    instance with an image path returns the decoded caption text.
    """

    def __init__(
        self, model_name: str, device: str, max_new_tokens: int = 1024
    ) -> None:
        """Load the processor and model and move the model to ``device``.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                processor and the model.
            device: Device the model and the processed inputs are moved to.
            max_new_tokens: Cap on the number of tokens generated per caption.
                Defaults to 1024, the value that was previously hard-coded.
        """
        self._device = device
        self._max_new_tokens = max_new_tokens

        self._processor = BlipProcessor.from_pretrained(model_name)
        self._model = BlipForConditionalGeneration.from_pretrained(model_name).to(
            device
        )

    def __call__(self, image_path: str) -> str:
        """Return the generated caption for the image stored at ``image_path``.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The first generated sequence, decoded with special tokens removed.
        """
        # Open the file in a context manager so its handle is released as soon
        # as the RGB copy exists, instead of whenever the image object is
        # garbage-collected.
        with Image.open(image_path) as image_file:
            image_obj = image_file.convert("RGB")

        inputs = self._processor(image_obj, return_tensors="pt").to(self._device)
        output = self._model.generate(max_new_tokens=self._max_new_tokens, **inputs)
        return self._processor.decode(output[0], skip_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Caption the sample image with the base-size BLIP captioning checkpoint.
image_describer = ImageDescriber(
    model_name="Salesforce/blip-image-captioning-base",
    device=device,
)
result = image_describer(image_path="assets/images/Elon-musk-biopic.webp")
result

'a receipt with a lottoo on it'

In [11]:
# Caption the same image again with the large BLIP captioning checkpoint.
image_describer = ImageDescriber(
    model_name="Salesforce/blip-image-captioning-large",
    device=device,
)
result = image_describer(image_path="assets/images/Elon-musk-biopic.webp")
result

'a close up of a receipt with a price of $ 1, 000'

In [12]:
# Show the Python type of `result`, the value returned by the last
# `image_describer(...)` call.
pprint(type(result))

import base64

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.llms.ollama import Ollama
from langchain_core.messages import HumanMessage, SystemMessage

# `HTML`/`display` are imported here so this cell runs on a fresh kernel; they
# were previously only imported by a later cell.
from IPython.display import HTML, display

# Read and base64-encode the receipt photo. The file is closed before the
# model call rather than being held open for the whole cell.
with open("assets/images/receipt/WechatIMG147.jpg", "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

# The prompt carries only the system instructions; the image is attached to
# the model via `bind(images=...)` below instead of an `image_url` message part.
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content="""As a useful aaistant you provide the user with the receipt data information related to what they have consumed, bought, and paid for. 
                    Include the product name, price, and quantity if provided. 
                    Avoid any personal information, locations, addresses (even store's), sensitive data, and numbers. 
                    ONLY format the response in a reasonable TABLE in HTML format, no other paragraphs, phgrases or sentences are allowed.
                    Above the table, give the store or the brand name.
                    """,
        ),
    ]
)
model = Ollama(
    base_url="http://localhost:11434",
    model="llava",
    temperature=0,
)
# The prompt has no template variables, so the chain is invoked with an empty
# input mapping.
res = (prompt | model.bind(images=[base64_image]) | StrOutputParser()).invoke({})
display(HTML(res))

In [None]:
import base64
from os import listdir
from os.path import isfile, join

from IPython.display import Markdown,HTML
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from tqdm.asyncio import tqdm

# Chat model used by `read_receipt`: temperature 0, replies capped at 2048 tokens.
# NOTE(review): confirm "gpt-4-vision-preview" is still served by the API — TODO verify.
model = ChatOpenAI(
    model="gpt-4-vision-preview",
    temperature=0,
    max_tokens=2048,
)

async def read_receipt(image_path: str) -> str:
    """Ask the chat model to transcribe one receipt image.

    Args:
        image_path: Path of the receipt image. Its bytes are base64-encoded
            and sent inline as a ``data:image/jpeg`` URL.

    Returns:
        The model's reply as a string (the system prompt asks for an HTML
        table with the store or brand name above it).
    """
    # Close the file as soon as its bytes are encoded; nothing below needs
    # the open handle.
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(
                content="""As a useful aaistant you provide the user with the receipt data information related to what they have consumed, bought, and paid for. 
                    Include the product name, price, and quantity if provided. 
                    Avoid any personal information, locations, addresses (even store's), sensitive data, and numbers. 
                    ONLY format the response in a reasonable TABLE in HTML format, no other paragraphs, phgrases or sentences are allowed.
                    In the table, the first row, in bold font, is the summe of the total payment, afterwards the product name, price, and quantity if provided.
                    Above the table, give the store or the brand name.
                    """,
            ),
            HumanMessage(
                content=[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ]
            ),
        ]
    )

    # Await the async entry point. A blocking `.invoke()` inside a coroutine
    # stalls the event loop, so gathered calls would run one after another.
    # The messages are already fully built, so the input mapping is empty.
    description = await (prompt | model | StrOutputParser()).ainvoke({})
    return description


image_dir = "assets/images/receipt"
image_files = [f for f in listdir(image_dir) if isfile(join(image_dir, f))]
tasks = list(
    map(lambda image_file: read_receipt(f"{image_dir}/{image_file}"), image_files)
)
descriptions = await tqdm.gather(*tasks)


In [3]:
# Pair each receipt file with its description, print the file path, and render
# the description as HTML.
for image_file, description in zip(image_files, descriptions):
    pretty_print(title="image", content=f"{image_dir}/{image_file}")
    display(HTML(data=description))

image


0,1
Total,"EUR 24,07"


image


0,1,2
Total,"68,47 EUR",
Super,"1,879 EUR/Liter","36,44 Liter"
MWST. A,"19,00%","10,93 EUR"
Summe Netto,,"57,54 EUR"


image


0,1
Total,"33,45 EUR"
Banane Chiquita,168
E. Kuklapytus Ment.,069
Broccoli 500g,099
Lauchzwiebeln,049
Lachsfilets,1198
Mü. Milchreis,267
Knoblauch,199
Eier Bodenhaltung,199
Hohes C Vitamin D,299


image


0,1
Total,"8,66 EUR"
Leergut Getränke,-190
Champ. braun 400g,080
S-Hackfleisch,270
Edamame,179
K-Pur P.Geschnetzeltes,399
Chinakohl,128
0.856 kg,


image


0,1
Total,€16.99
Ch n Sh 18er HW,€16.99
18 Hot Wings,
Uptrade Bucket,
Subtotal,€15.88
VAT (7%),€1.11
