In [1]:
from dotenv import load_dotenv
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage
import base64
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load settings from a local .env file (if one exists) before reading the key.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
# NOTE(review): api_key is not passed to the client below — presumably the
# client reads GOOGLE_API_KEY from the environment on its own; confirm.

# Shared chat model used by the cells that follow; temperature is pinned to 0.0.
llm = ChatGoogleGenerativeAI(
    temperature=0.0,
    convert_system_message_to_human=True,
    model="gemini-2.0-flash",
)

In [3]:
def img_to_llm(image_file_path, prompt="Describe the image I am sending."):
    """Send one image file to the module-level ``llm`` and return its text reply.

    The file is read as bytes, base64-encoded, and embedded in a single
    human message as a ``data:`` URL next to the text prompt.

    Args:
        image_file_path: Path of the image file to read.
        prompt: Text instruction sent together with the image. Defaults to
            the fixed description request this function always used.

    Returns:
        str: The model reply after it has been passed through
        ``StrOutputParser``.

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    import mimetypes  # stdlib; imported locally so this cell stands on its own

    with open(image_file_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    # Label the payload with the type implied by the file name instead of
    # always claiming PNG; fall back to PNG when the type cannot be guessed.
    mime_type, _ = mimetypes.guess_type(str(image_file_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
            },
        ]
    )

    response = llm.invoke([message])
    # Return the parser's output rather than response.content: the raw content
    # is not guaranteed to be a plain string, and callers write the result
    # straight to a text file.
    return StrOutputParser().invoke(response)

In [5]:
output_path = './output_images'
NUM_PAGES = 5  # how many page images to process: page_001.png … page_005.png

# For every page image, ask the model for a description and save the reply
# as a UTF-8 text file with the same stem in the same directory.
for page_number in range(1, NUM_PAGES + 1):
    page_stem = os.path.join(output_path, f'page_{page_number:03}')
    image_filename = f'{page_stem}.png'
    text_filename = f'{page_stem}.txt'

    llm_output = img_to_llm(image_filename)
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(llm_output)
    