<a href="https://colab.research.google.com/github/Sohum-Prime/CompRobo-VisionLLM/blob/main/Polished_Gradio_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries and Packages

In [None]:
!pip install -U gradio
# !pip install -U gradio_tools
!pip install -U openai
!pip install -U torch torchvision
# !pip install -U langchain
!pip install -U pydantic

## Import Requisite Libraries and Packages

In [None]:
import gradio as gr
import torch
from torchvision import models, transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights
from PIL import Image
from openai import OpenAI
import json

## VLN App Implementations

### OpenAI API Direct Implementation

In [None]:
# Vision Transformer (ViT-B/16) with ImageNet-1k weights, put straight into
# inference mode; eval() returns the module itself, so `model` is the network.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1).eval()

# Function to preprocess the image for Vision Transformer
def preprocess_image(image):
    """Turn a PIL image into a normalized ``(1, 3, 224, 224)`` batch for the ViT classifier.

    Args:
        image: PIL image (the Gradio input uses ``type="pil"``). Any mode is
            accepted; it is converted to RGB before the transform.

    Returns:
        Tensor with a leading batch dimension of 1.
    """
    # The IMAGENET1K_V1 checkpoint expects inputs normalized with the ImageNet
    # channel statistics; 0.5/0.5 shifts the input distribution.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # ToTensor keeps the source channel count, so RGBA or grayscale uploads
    # must be forced to three channels first.
    return transform(image.convert("RGB")).unsqueeze(0)

# Load ImageNet class labels.
# The encoding is explicit so the read does not depend on the platform default.
with open('imagenet_classes.json', encoding='utf-8') as f:
    imagenet_classes = json.load(f)

# Function to get image predictions
def predict_image(image, topk=5):
    """Classify ``image`` and return the labels of the ``topk`` highest-scoring classes.

    Args:
        image: PIL image to classify.
        topk: Number of labels to return.

    Returns:
        list: Entries from ``imagenet_classes`` for the top-scoring class
        indices, highest score first (presumably strings, since the caller
        joins them -- confirm against the JSON file).
    """
    processed_image = preprocess_image(image)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = model(processed_image)
    _, indices = outputs.topk(topk)

    def lookup(index):
        # json.load yields a list for a JSON array but a dict with *string*
        # keys for a JSON object, so an int index only works for the former.
        if isinstance(imagenet_classes, dict):
            return imagenet_classes[str(index)]
        return imagenet_classes[index]

    return [lookup(index) for index in indices[0].tolist()]

# Function to generate text response using GPT-4
def generate_text(prompt):
    """Ask the chat model for navigation guidance based on ``prompt``.

    The API key is read by the ``OpenAI`` client from the ``OPENAI_API_KEY``
    environment variable; credentials must not be stored in the notebook.

    Args:
        prompt: User message text (detected objects plus the user's instruction).

    Returns:
        str: The assistant's reply with surrounding whitespace removed, or an
        empty string if the reply carries no text content.
    """
    client = OpenAI()  # picks up OPENAI_API_KEY from the environment

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for blind and visually-impaired people, who uses POV images they take to provide detailed, natural, and human-like instructions to help them navigate their environment."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=600
    )

    # The typed response exposes the text directly; no dict dump is needed.
    content = response.choices[0].message.content
    return (content or "").strip()

# Gradio app interface
def vln_app(image, text):
    """Gradio callback: detect objects in ``image`` and answer ``text`` using them.

    Args:
        image: PIL image from the upload widget, or None if nothing was uploaded.
        text: The user's instruction.

    Returns:
        tuple: ``(labels_detected, response)`` for the two output textboxes.
    """
    # Without an upload the image component delivers None; answer with a
    # readable message instead of failing inside the image transform.
    if image is None:
        return [], "Please upload an image first."
    labels_detected = predict_image(image)
    combined_prompt = f"Objects detected: {', '.join(labels_detected)}. User instruction: {text}"
    response = generate_text(combined_prompt)
    return labels_detected, response  # Return both the labels and the response

# Wire vln_app into a two-input / two-output Gradio form.
iface = gr.Interface(
    title="Vision and Language Navigation Demo",
    description="This app detects objects in images and generates responses based on user instructions.",
    fn=vln_app,
    inputs=[
        gr.Image(label="Upload Image", type="pil"),
        gr.Textbox(label="Enter your instruction"),
    ],
    outputs=[
        gr.Textbox(label="Detected Labels", lines=2),
        gr.Textbox(label="Response", lines=10),
    ],
)

# Start serving; debug=True keeps the cell attached to the server.
iface.launch(debug=True)

### LangChain Verbose Agent Implementation

In [None]:
import gradio as gr
import torch
from torchvision import models, transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights
from PIL import Image
import json
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, initialize_agent
from gradio_tools import BarkTextToSpeechTool

In [None]:
from langchain.agents.load_tools import load_tools
# Vision Transformer (ViT-B/16) with ImageNet-1k weights, put straight into
# inference mode; eval() returns the module itself, so `model` is the network.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1).eval()

# Function to preprocess the image for Vision Transformer
def preprocess_image(image):
    """Turn a PIL image into a normalized ``(1, 3, 224, 224)`` batch for the ViT classifier.

    Args:
        image: PIL image (the Gradio input uses ``type="pil"``). Any mode is
            accepted; it is converted to RGB before the transform.

    Returns:
        Tensor with a leading batch dimension of 1.
    """
    # The IMAGENET1K_V1 checkpoint expects inputs normalized with the ImageNet
    # channel statistics; 0.5/0.5 shifts the input distribution.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # ToTensor keeps the source channel count, so RGBA or grayscale uploads
    # must be forced to three channels first.
    return transform(image.convert("RGB")).unsqueeze(0)

# Load ImageNet class labels.
# The encoding is explicit so the read does not depend on the platform default.
with open('imagenet_classes.json', encoding='utf-8') as f:
    imagenet_classes = json.load(f)

# Function to get image predictions
def predict_image(image, topk=5):
    """Classify ``image`` and return the labels of the ``topk`` highest-scoring classes.

    Args:
        image: PIL image to classify.
        topk: Number of labels to return.

    Returns:
        list: Entries from ``imagenet_classes`` for the top-scoring class
        indices, highest score first (presumably strings, since the caller
        joins them -- confirm against the JSON file).
    """
    processed_image = preprocess_image(image)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = model(processed_image)
    _, indices = outputs.topk(topk)

    def lookup(index):
        # json.load yields a list for a JSON array but a dict with *string*
        # keys for a JSON object, so an int index only works for the former.
        if isinstance(imagenet_classes, dict):
            return imagenet_classes[str(index)]
        return imagenet_classes[index]

    return [lookup(index) for index in indices[0].tolist()]

# Initialize the OpenAI model with verbose output.
# The key comes from the environment so no credential is stored in the notebook.
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
llm = ChatOpenAI(api_key=openai_api_key, model_name="gpt-4-1106-preview", temperature=0)
tools = BarkTextToSpeechTool().langchain
agent = initialize_agent([tools], llm, AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)

# Function to generate text response using OpenAI with LangChain
def generate_text(prompt):
    """Run the LangChain agent on ``prompt`` and return its final answer.

    Args:
        prompt: Text describing the detected objects and the user's instruction.

    Returns:
        str: The agent's final answer with surrounding whitespace removed.
    """
    system_message = "You are a helpful assistant for blind and visually-impaired people, who uses POV images they take to provide detailed, natural, and human-like instructions to help them navigate their environment."
    # The agent executor's run() takes the agent's single text input and
    # returns the final answer as a plain string. It accepts no OpenAI-style
    # ``messages`` list and returns no ``choices`` payload, so the assistant
    # persona is prepended to the input text instead.
    response = agent.run(f"{system_message}\n\n{prompt}")
    return response.strip()

# Gradio app interface
def vln_app(image, text):
    """Gradio callback: detect objects in ``image`` and answer ``text`` using them.

    Args:
        image: PIL image from the upload widget, or None if nothing was uploaded.
        text: The user's instruction.

    Returns:
        tuple: ``(labels_detected, response)`` for the two output textboxes.
    """
    # Without an upload the image component delivers None; answer with a
    # readable message instead of failing inside the image transform.
    if image is None:
        return [], "Please upload an image first."
    labels_detected = predict_image(image)
    combined_prompt = f"Objects detected: {', '.join(labels_detected)}. User instruction: {text}"
    response = generate_text(combined_prompt)
    return labels_detected, response

# Wire vln_app into a two-input / two-output Gradio form.
iface = gr.Interface(
    title="Vision and Language Navigation Demo",
    description="This app detects objects in images and generates responses based on user instructions.",
    fn=vln_app,
    inputs=[
        gr.Image(label="Upload Image", type="pil"),
        gr.Textbox(label="Enter your instruction"),
    ],
    outputs=[
        gr.Textbox(label="Detected Labels", lines=2),
        gr.Textbox(label="Response", lines=10),
    ],
)

# Start serving; debug=True keeps the cell attached to the server.
iface.launch(debug=True)

### LangChain Chat LCEL Implementation

In [None]:
# Vision Transformer (ViT-B/16) with ImageNet-1k weights, put straight into
# inference mode; eval() returns the module itself, so `model` is the network.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1).eval()

# Function to preprocess the image for Vision Transformer
def preprocess_image(image):
    """Turn a PIL image into a normalized ``(1, 3, 224, 224)`` batch for the ViT classifier.

    Args:
        image: PIL image (the Gradio input uses ``type="pil"``). Any mode is
            accepted; it is converted to RGB before the transform.

    Returns:
        Tensor with a leading batch dimension of 1.
    """
    # The IMAGENET1K_V1 checkpoint expects inputs normalized with the ImageNet
    # channel statistics; 0.5/0.5 shifts the input distribution.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # ToTensor keeps the source channel count, so RGBA or grayscale uploads
    # must be forced to three channels first.
    return transform(image.convert("RGB")).unsqueeze(0)

# Load ImageNet class labels.
# The encoding is explicit so the read does not depend on the platform default.
with open('imagenet_classes.json', encoding='utf-8') as f:
    imagenet_classes = json.load(f)

# Function to get image predictions
def predict_image(image, topk=5):
    """Classify ``image`` and return the labels of the ``topk`` highest-scoring classes.

    Args:
        image: PIL image to classify.
        topk: Number of labels to return.

    Returns:
        list: Entries from ``imagenet_classes`` for the top-scoring class
        indices, highest score first (presumably strings, since the caller
        joins them -- confirm against the JSON file).
    """
    processed_image = preprocess_image(image)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = model(processed_image)
    _, indices = outputs.topk(topk)

    def lookup(index):
        # json.load yields a list for a JSON array but a dict with *string*
        # keys for a JSON object, so an int index only works for the former.
        if isinstance(imagenet_classes, dict):
            return imagenet_classes[str(index)]
        return imagenet_classes[index]

    return [lookup(index) for index in indices[0].tolist()]

# Define the system and user messages for the chat prompt
import os

from langchain.prompts import ChatPromptTemplate  # was used below without being imported

system_message = "You are a helpful assistant for blind and visually-impaired people, who uses POV images they take to provide detailed, natural, and human-like instructions to help them navigate their environment."
# Named placeholders: the chain is invoked with a dict keyed by "labels" and
# "instruction", which anonymous "{}" fields cannot be filled from.
user_message_template = "Objects detected: {labels}. User instruction: {instruction}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", user_message_template),
])

# Initialize the LangChain OpenAI chat model.
# The key comes from the environment so no credential is stored in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
# `verbose` is the logging switch; `set_verbose` is not a ChatOpenAI parameter.
llm = ChatOpenAI(api_key=openai_api_key, verbose=True)

# Compose the chain
chain = chat_prompt | llm

# Function to generate text response using LangChain
def generate_text(labels, instruction):
    """Run the prompt | model chain and return the reply plus a debug dump.

    Args:
        labels: Detected object labels (a list, or an already-formatted string).
        instruction: The user's instruction.

    Returns:
        tuple: ``(response, verbose_output)`` -- the reply text and the repr of
        the full message object returned by the chain.
    """
    # Render the label list as readable text rather than a Python list repr.
    if not isinstance(labels, str):
        labels = ", ".join(str(label) for label in labels)
    # A prompt | chat-model chain returns a single message object, not a
    # (response, verbose_output) pair, so unpacking its result would fail.
    message = chain.invoke({"labels": labels, "instruction": instruction})
    return message.content, repr(message)

# Gradio app interface
def vln_app(image, text):
    """Gradio callback: detect objects in ``image`` and answer ``text`` using them.

    Args:
        image: PIL image from the upload widget, or None if nothing was uploaded.
        text: The user's instruction.

    Returns:
        tuple: ``(labels_detected, response, verbose_output)`` for the three
        output textboxes.
    """
    # Without an upload the image component delivers None; answer with a
    # readable message instead of failing inside the image transform.
    if image is None:
        return [], "Please upload an image first.", ""
    labels_detected = predict_image(image)
    # The prompt template formats the labels and instruction itself, so no
    # hand-built prompt string is needed here.
    response, verbose_output = generate_text(labels_detected, text)
    return labels_detected, response, verbose_output

# Wire vln_app into a two-input / three-output Gradio form.
iface = gr.Interface(
    title="Vision and Language Navigation Demo",
    description="This app detects objects in images and generates responses based on user instructions.",
    fn=vln_app,
    inputs=[
        gr.Image(label="Upload Image", type="pil"),
        gr.Textbox(label="Enter your instruction"),
    ],
    outputs=[
        gr.Textbox(label="Detected Labels", lines=2),
        gr.Textbox(label="Response", lines=10),
        gr.Textbox(label="Verbose Output", lines=10),
    ],
)

# Start serving; debug=True keeps the cell attached to the server.
iface.launch(debug=True)

### 2-Level Instruction-based Prompting for VLN App

In [None]:
!pip install -U gradio
!pip install -U transformers
!pip install -U torch
!pip install -U openai  # If using GPT-4

In [None]:
def generate_high_level_instructions(start_location, end_location):
    """Return a one-sentence route summary between two locations (placeholder logic)."""
    route_summary = f"Navigate from {start_location} to {end_location} through the main road."
    return route_summary

def generate_low_level_instructions(high_level_plan, frame):
    """Combine the current frame description with the high-level plan (placeholder logic)."""
    step_text = f"Based on your current position {frame}, follow the path: {high_level_plan}"
    return step_text

from openai import OpenAI
client = OpenAI()

# OpenAI() is given no key here; supply it through the OPENAI_API_KEY environment variable and never paste a key into the notebook.

def refine_instructions_with_gpt4(instructions):
    """Ask the chat model to rewrite ``instructions``.

    Args:
        instructions: Draft navigation instructions to refine.

    Returns:
        str: The model's reply with surrounding whitespace removed. If the
        request fails, the error text is returned instead (best effort, so the
        UI always has something to show).
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",  # Specify the GPT-4 Turbo model
            # The chat endpoint takes a `messages` list; `prompt` belongs to
            # the legacy completions endpoint.
            messages=[{"role": "user", "content": instructions}],
            max_tokens=100,  # Adjust as needed
            temperature=0.7  # Adjust for creativity level
        )
        # Chat choices carry their text at `message.content`, not `text`.
        return (response.choices[0].message.content or "").strip()
    except Exception as e:
        return str(e)

import gradio as gr

def vln_app(start_location, end_location, frame):
    """Gradio callback: build the high-level plan, localize it to ``frame``, then refine it."""
    return refine_instructions_with_gpt4(
        generate_low_level_instructions(
            generate_high_level_instructions(start_location, end_location),
            frame,
        )
    )

# Three free-text inputs feed vln_app; its single return value fills one textbox.
iface = gr.Interface(
    fn=vln_app,
    inputs=[
        gr.Textbox(label=caption)
        for caption in ("Start Location", "End Location", "Current Frame")
    ],
    outputs=gr.Textbox(label="Refined Instructions"),
)

iface.launch()


ImportError: ignored

In [None]:
!pip install -U openai



In [None]:
from openai import OpenAI
# The client reads the API key from the OPENAI_API_KEY environment variable;
# credentials must not be stored in the notebook.
client = OpenAI()

# Smoke test: one system + user exchange against the chat endpoint.
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message)

Collecting openai
  Using cached openai-1.3.9-py3-none-any.whl (221 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.27.7
    Uninstalling openai-0.27.7:
      Successfully uninstalled openai-0.27.7
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.
tree-of-thoughts-llm 0.1.0 requires openai==0.27.7, but you have openai 1.3.9 which is incompatible.[0m[31m
[0mSuccessfully installed openai-1.3.9
ChatCompletionMessage(content="In the land of computation, a concept reigns supreme,\nWhere functions call themselves, a coder's dream.\nThis act of looping self-reference, so sly,\nIs known as recursion, a spiral staircase to the sky.\n\nA problem to solve, both complex and vast,\nWe break i

In [None]:
from openai import OpenAI

# The client reads the API key from the OPENAI_API_KEY environment variable;
# credentials must not be stored in the notebook.
client = OpenAI()

# Smoke test: send one text part and one image URL to the vision model.
response = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image?"},
        {
          "type": "image_url",
          "image_url": {
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
          },
        },
      ],
    }
  ],
  max_tokens=300,
)

print(response.choices[0])

Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='The image shows a wooden boardwalk stretching into the distance through a lush green field. The grass is tall and the foliage dense on either side of the walkway. In the background, there is a mix of trees and shrubs under a vibrant blue sky scattered with white clouds. The overall scene conveys a sense of openness and tranquility, typical of a natural, rural landscape. The light and shadow suggest it might be a sunny day in the late afternoon.', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})


In [None]:
from openai import OpenAI

# Initialize the OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable so no credential is stored in the notebook.
client = OpenAI()

def generate_high_level_instruction(current_location, desired_destination):
    """
    Work out the overall heading and the distance between two rooms.

    Parameters:
    current_location (int): Room number the traveler is at.
    desired_destination (int): Room number the traveler wants to reach.

    Returns:
    tuple: ``(direction, distance)`` -- 'right' when the current room number is
    greater than the destination's and 'left' otherwise, followed by the gap
    in room numbers times 10 (each room is taken to be 10 m apart).
    """
    meters_per_room = 10
    rooms_apart = abs(current_location - desired_destination)

    if current_location > desired_destination:
        heading = 'right'
    else:
        heading = 'left'

    return heading, rooms_apart * meters_per_room

# # Example usage:
# print(generate_high_level_instruction(113, 128))  # Output: ('left', 150)
# print(generate_high_level_instruction(124, 116))  # Output: ('right', 80)


def process_image_and_generate_instruction(image_path, high_level_instruction):
    """
    Uses an image capture and the high-level instruction to generate a low-level,
    highly actionable set of navigation instructions.

    Parameters:
    image_path (str): Path of the image file to send to the vision model.
    high_level_instruction: Either a ``(direction, distance)`` pair, as returned
        by generate_high_level_instruction, or ready-made text.

    Returns:
    str: The model's navigation instructions.
    """
    import base64
    import mimetypes

    # The chat endpoint has no "file" content part: a local image has to be
    # inlined as a base64 data URL inside an "image_url" part.
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    # Message content must be text, so spell out a (direction, distance) pair.
    if isinstance(high_level_instruction, (tuple, list)) and len(high_level_instruction) == 2:
        direction, distance = high_level_instruction
        high_level_text = f"Head {direction} for {distance} meters."
    else:
        high_level_text = str(high_level_instruction)

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            # System content is a plain string, and the system turn comes first.
            {"role": "system", "content": high_level_text},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Generate low-level navigation instructions based on the image and high-level instruction."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
                    },
                ],
            },
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content

def augment_navigation_instruction(instruction):
    """
    Hook for improving a navigation instruction with the Six Step Method Summary.

    No enhancement is implemented yet: the instruction is returned unchanged.
    """
    # TODO: apply the Six Step Method here, for example comparison language
    # (WESSST points) and establishing positions (EYP). How it is applied
    # depends on the specifics of the instruction.
    return instruction

# Example usage
current_location = 113
desired_destination = 128
# The result is a (direction, distance) tuple, not a text string.
high_level_instruction = generate_high_level_instruction(current_location, desired_destination)

# Assume we have an image path from the capture
# NOTE(review): placeholder path -- point this at a real image before running the cell.
image_path = '/path/to/your/image.jpg'
low_level_instruction = process_image_and_generate_instruction(image_path, high_level_instruction)

# augment_navigation_instruction is still a placeholder at this point.
enhanced_instruction = augment_navigation_instruction(low_level_instruction)

# Output the enhanced instruction to the user
print(enhanced_instruction)


#### Tests

In [None]:
import openai
import os

# Set your OpenAI API key
# NOTE(review): placeholder value. The function below calls the `client` object
# created in an earlier cell, so this module-level setting does not appear to
# configure that client -- confirm before relying on it.
openai.api_key = 'your-api-key'

def process_image_and_generate_instruction(image_path, direction, distance):
    """
    Uses an image capture and the high-level instruction to generate a low-level,
    highly actionable set of navigation instructions.

    Parameters:
    image_path (str): Path of the image file to send to the vision model.
    direction (str): Overall heading to travel.
    distance (int): Distance to travel in meters.

    Returns:
    str: The model's instructions, or a message describing the API error.

    Raises:
    FileNotFoundError: If ``image_path`` does not exist.
    """
    import base64

    # Check if the image path exists to avoid errors during API call
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"The image file at {image_path} does not exist.")

    # The chat endpoint takes a local image as a base64 data URL inside an
    # "image_url" part; there is no raw-bytes "image" part.
    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")

    instruction_text = (
        "Certified orientation and mobility specialists (COMS) work with clients who are blind "
        "or visually impaired (BVI) to help them travel independently with confidence. Part of this "
        "process involves creating a narrative description of a route and using specific techniques "
        "to help the client internalize it. "
        )

    # The last fragment is an f-string so direction and distance are actually
    # filled in. The model sees the image itself, so the text refers to
    # "the image" rather than a file path.
    system_text = (
        "Imagine three different COMS are answering this question. "
        "All COMS will write down 1 step of their thinking, and then share it with the group. "
        "Then all COMS will go on to the next step, etc. "
        "If any COMS realizes they're wrong at any point then they leave. "
        f"The question is: What is the most effective and efficient manner to guide a BVI client through the current environment seen in the image as they move towards {direction} for {distance} meters?"
    )

    # Construct the message for the API call; system content is a plain string.
    messages = [
        {"role": "system", "content": system_text},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction_text},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                },
            ],
        },
    ]

    # Make the API call to OpenAI
    try:
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=messages,
            max_tokens=300
        )
    except openai.OpenAIError as e:
        # openai 1.x exposes the base error at the package top level; the
        # `openai.error` module from 0.x no longer exists.
        return f"An error occurred while calling the OpenAI API: {e}"

    # Extract the content from the API response
    if response and response.choices and response.choices[0].message:
        return response.choices[0].message.content
    else:
        return "No instruction was generated by the API."

# This function call is just for demonstration and will not actually execute here
# Replace '/path/to/image.jpg' with the actual path to your image file
# low_level_instruction = process_image_and_generate_instruction('/path/to/image.jpg', 'right', 150)
# print(low_level_instruction)

SyntaxError: ignored

### Functions

In [None]:
from openai import OpenAI
import os
import base64

# Initialize the OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable so no credential is stored in the notebook.
client = OpenAI()

def generate_high_level_instruction(current_location, desired_destination):
    """
    Work out the overall heading and the distance between two rooms.

    Parameters:
    current_location (int): Room number the traveler is at.
    desired_destination (int): Room number the traveler wants to reach.

    Returns:
    tuple: ``(direction, distance)`` -- 'right' when the current room number is
    greater than the destination's and 'left' otherwise, followed by the gap
    in room numbers times 10 (each room is taken to be 10 m apart).
    """
    meters_per_room = 10
    rooms_apart = abs(current_location - desired_destination)

    if current_location > desired_destination:
        heading = 'right'
    else:
        heading = 'left'

    return heading, rooms_apart * meters_per_room

# Function to encode the image in base64
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI()

def process_image_and_generate_instruction(image_path, direction, distance):
    """
    Uses an image capture and the high-level instruction to generate a low-level,
    highly actionable set of navigation instructions.

    Parameters:
    image_path (str): Either the path of a JPEG file on disk or the image
        already encoded as base64 text (which is what the calling cell in
        this notebook passes).
    direction (str): Overall heading to travel.
    distance (int): Distance to travel in meters.

    Returns:
    str: The model's navigation instructions.
    """
    import base64
    import os

    # Support both call styles: encode the file when the argument names one,
    # otherwise treat it as base64 text. os.path.isfile returns False (rather
    # than raising) for strings that are not usable as paths.
    if os.path.isfile(image_path):
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
    else:
        base64_image = image_path

    # Construct the message for the API call
    messages = [
        {
            "role": "system",
            # System content is a plain string, not a {"type": ..., "text": ...} dict.
            "content": (
                "Imagine three different COMS are answering this question. "
                "All COMS will write down 1 step of their thinking, and then share it with the group. "
                "Then all COMS will go on to the next step, etc. "
                "If any COMS realizes they're wrong at any point then they leave. "
                f"The question is: What is the most effective and efficient manner to guide a BVI client through the current environment seen in the image as they move forwards with an overall goal of going {direction} for {distance} meters?"
            )
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Certified orientation and mobility specialists (COMS) work with clients who are blind "
                        "or visually impaired (BVI) to help them travel independently with confidence. Part of this "
                        "process involves creating a narrative description of a route and using specific techniques "
                        "to help the client internalize it."
                    )
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                },
            ],
        },
    ]

    # Make the API call to OpenAI
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=messages,
        max_tokens=300
    )

    return response.choices[0].message.content

# Replace the 'image_path' with the actual path to the image file and provide the direction and distance.
# The function call below is just an example and will not execute here due to the lack of an actual API key and image file.
# example_instruction = process_image_and_generate_instruction('/path/to/image.jpg', 'right', 150)
# print(example_instruction)

In [None]:
direction, distance = generate_high_level_instruction(126, 113)
current_image = "/content/MAC 116.jpeg"
image_path = encode_image(current_image)
# Report the size only: printing the payload dumps the whole image into the cell output.
print(f"Encoded image: {len(image_path)} base64 characters")
hope = process_image_and_generate_instruction(image_path, direction, distance)

/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wgARCAgABgADASIAAhEBAxEB/8QAGwAAAwEBAQEBAAAAAAAAAAAAAAECAwQFBgf/xAAYAQEBAQEBAAAAAAAAAAAAAAAAAQIDBP/aAAwDAQACEAMQAAACsCAAEwrbm0s1AAAAAAAAQAACBCmpGBK6kNbxs1edGjhlksollOQpyFksblo/O9BnzT1xmm00AFYAAAwBoGAAMBMAEGmAAAAADAAAAAGJgAAAAAAMTABAxAnhqWAJgAADAAAAAAYAAAAAAADTAAAAGCGhgAAoDEniLoy2JTZIwQ0AAAAAAAAAAgAAAAKAAAAAAAACaBoGgAAALAAABDRprza2aAAAAAIAABNAmhTUjAhgKVLLqGaVnRbiinDLJZQgoQUJjcsw8P6TyTiAlYAMAAGAAAwAABgAAwAAQAAGAAAAADAAAAAABAxA0wTAGmc+uehoAIYJgAAAADEwAAAAAAAAAYAAAAAA0AwAAAFfN0cx0lSAAAAANAIpCGgAAAAAABMAABAxCMBQTAAEAAAAAAAWAAAAmCGjXTm1s0AAENAAAgATQpqRtOABRgFSyqhl1DLcUU5ZQmNyxuWU5Y5oPn59XyoYmrEwAGAAAwAAGJgAMAAEAAYAAAMQwAACSlLAGJjEwAGAAAzC4s0AAAAAAAAGgYAAAAAAMBMAAAAAYgBpgAAAoDMpW5cACaBgAAAAAMQNMILRLQNMAQAAAAAAANA0MQAAAAAADLBMENAACYIaNr5t7KEAAAAgBAglobTlABgA00GhbcUVUMtyynLKExiY3LKcsfi+

OSError: ignored

In [None]:
import base64
import requests

# OpenAI API key, taken from the environment so it is never stored in the notebook
import os

api_key = os.environ["OPENAI_API_KEY"]

# Function to encode the image
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Path to your image
image_path = "/content/MAC 116.jpeg"

# Getting the base64 string
base64_image = encode_image(image_path)

headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

# One user turn: a text question plus the image inlined as a base64 data URL.
payload = {
  "model": "gpt-4-vision-preview",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What’s in this image?"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{base64_image}"
          }
        }
      ]
    }
  ],
  "max_tokens": 300
}

# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=120)
print(response.json())

{'id': 'chatcmpl-8VPBk6OBy3Gd6LvW67BhaYHTrejqD', 'object': 'chat.completion', 'created': 1702495684, 'model': 'gpt-4-1106-vision-preview', 'usage': {'prompt_tokens': 778, 'completion_tokens': 208, 'total_tokens': 986}, 'choices': [{'message': {'role': 'assistant', 'content': 'The image shows an indoor setting that appears to be a workspace or office area. Features of this space include:\n\n1. Large windows that provide natural light.\n2. A carpeted floor with a pattern that gives the impression of tire tracks or some abstract design.\n3. Desks and tables equipped with computers and multiple monitors, suggesting this could be an area where people work with technology or perform collaborative tasks.\n4. A variety of chairs and a standing desk setup, indicating an environment that supports ergonomic working positions.\n5. Shelving units on the left-hand side with various office supplies and materials.\n6. An overhead circular lighting fixture, as well as recessed ceiling lights.\n7. Signa

In [None]:
# Show the size and a short prefix only: printing the whole string floods the
# cell output with the entire encoded image.
print(f"{len(base64_image)} base64 characters; starts with {base64_image[:60]}...")

/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wgARCAgABgADASIAAhEBAxEB/8QAGwAAAwEBAQEBAAAAAAAAAAAAAAECAwQFBgf/xAAYAQEBAQEBAAAAAAAAAAAAAAAAAQIDBP/aAAwDAQACEAMQAAACsCAAEwrbm0s1AAAAAAAAQAACBCmpGBK6kNbxs1edGjhlksollOQpyFksblo/O9BnzT1xmm00AFYAAAwBoGAAMBMAEGmAAAAADAAAAAGJgAAAAAAMTABAxAnhqWAJgAADAAAAAAYAAAAAAADTAAAAGCGhgAAoDEniLoy2JTZIwQ0AAAAAAAAAAgAAAAKAAAAAAAACaBoGgAAALAAABDRprza2aAAAAAIAABNAmhTUjAhgKVLLqGaVnRbiinDLJZQgoQUJjcsw8P6TyTiAlYAMAAGAAAwAABgAAwAAQAAGAAAAADAAAAAABAxA0wTAGmc+uehoAIYJgAAAADEwAAAAAAAAAYAAAAAA0AwAAAFfN0cx0lSAAAAANAIpCGgAAAAAABMAABAxCMBQTAAEAAAAAAAWAAAAmCGjXTm1s0AAENAAAgATQpqRtOABRgFSyqhl1DLcUU5ZQmNyxuWU5Y5oPn59XyoYmrEwAGAAAwAAGJgAMAAEAAYAAAMQwAACSlLAGJjEwAGAAAzC4s0AAAAAAAAGgYAAAAAAMBMAAAAAYgBpgAAAoDMpW5cACaBgAAAAAMQNMILRLQNMAQAAAAAAANA0MQAAAAAADLBMENAACYIaNr5t7KEAAAAgBAglobTlABgA00GhbcUVUMtyynLKExiY3LKcsfi+