In [1]:
import base64
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.utils.json import parse_json_markdown

In [2]:
def load_image(image_path: str) -> dict:
    """Load image from file and encode it as base64."""
    image_path = image_path
  
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
        
    image_base64 = encode_image(image_path)
    return {"image": image_base64}

In [3]:
image_dict = load_image("../data/pizza_damaged.png")
ref_image_dict = load_image("../data/pizza_incomplete.png")

In [4]:
image_dict

{'image': 'iVBORw0KGgoAAAANSUhEUgAAAYkAAAK1CAYAAAA9jRL2AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAEnQAABJ0Ad5mH3gAAP+lSURBVHhe7P1Zs2Vbdt+HrczTZ+bNvH1XtzoUUIAAokiCJNhZAikrSFumHWJIYkAPfHD40eEIhx3hB4cjrAiH/W38oE8gfQHJsoImCVIkgCoUQFSh6t6692Zzeo/fGOu319gz1z55Mm8WoIf6nxw55xxzdHPMueZqdnfnk08+uZ5+gV/gF/gFfoFfYAV35/IX+AV+gV/gF/gFnsPmTuLx5z9KRsfdu9vnkDsheee6bjwuLy+TaN65k6wbcSeEpOtQgvb39ze8q6ur5FHvuL6+nO5EGMTS9WirB92Z9jc2QLcJkN/buzMdHh1M5+fn2YaQsTw9PU35g/2j0F/8QcZFnXFfXFxMBwcHyaNvb28v7WhDHqW5oo5O52Xss+wIdSSAjzEueFtydy7Q3vQBctZ9gsOIBb+0IfMKaCN7cRHy13ezj7HR3t+vsRoD2Lt7kH3kFr/YXRsTqDFfh42Ib495wV72TGdnZ9PlVYzj7mHK7k3XYTvWzCU6MY6oIxeOp2t0DypmbOKb/B4fH0+PHz/OOXr44I3p/Mmz6fL6KvqOwt/BdB2GrqL95Omz6fyq1tP5+d3p8589jtyZ26uIi3wTIz7N9/qYNnkIpTt3iI/8R5s5SN1pOj7am+4fn+R6zmMp4r6McRH/0VGN1/mhzFxESdzmGlnyy1jlQXdiHESZ84dM5AZcRR9jh6I6XcbSQOZu2Mi/iHN/P+Y3xngV+UohBk7/hpiju5lb5vjp0ycZI36BOT88PEx54j47Iz5yuJ8yZadKQM6Zn4w3cH5+ETqnOV7sQPRVPq4zP44dv+hTf/z4ScgeZDzmDXnsuRbFycnJdO/evbDF8R1xXl1On/7ss+nZs2cRC+Mp2bsxXubZP

In [10]:
llm = ChatOpenAI(temperature=0.5, model="gpt-4o")

In [11]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class ImageInformation(BaseModel):
    """Information about an image."""
    image_description: str = Field(description="a short description of the image")
    is_complaint_genuine: bool = Field(description="Is the complaint genuine")
    count: int = Field(description="Count of images")

parser = JsonOutputParser(pydantic_object=ImageInformation)

In [19]:
vision_prompt = """
   The 1st image is the customer image and the 2nd one is the reference image
   The customer has provided us their image and complained stating ```The pizza was damaged during delivery```. Provide me the following information:
    - A description of the image
    - Is the complaint genuine
    - Count of images
   """

In [20]:
msg = llm.invoke(
             [HumanMessage(
             content=[
             {"type": "text", "text": vision_prompt},
             {"type": "text", "text": parser.get_format_instructions()},
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{ref_image_dict['image']}"}},
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{ref_image_dict['image']}"}},
             ])]
             )

In [21]:
parse_json_markdown(msg.content)

{'image_description': 'Two images of a pizza in an open cardboard box. The pizza appears to be intact with no visible damage.',
 'is_complaint_genuine': False,
 'count': 2}