In [None]:
from langchain.llms import OpenAI
from langchain.tools import BaseTool
from langchain.agents import AgentType, initialize_agent

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
from PIL import Image

## Setting the LLM

In [None]:
# Read the key inside a context manager so the file handle is closed, and
# strip surrounding whitespace: key files usually end with a newline, which
# would otherwise be passed along as part of the key.
with open("openai_api.txt", "r") as key_file:
    llm = OpenAI(
        model_name = "gpt-3.5-turbo-instruct",
        temperature = 0,
        openai_api_key = key_file.read().strip()
    )

We will take an existing open-source model that has been trained for a specific task our LLM can't do on its own. That model is `Salesforce/blip-image-captioning-large` from HuggingFace, which specializes in describing images in text.


The process goes as follows:
1. `Download` Image
2. `Open` it as a PIL object
3. Resize and Normalize it using the `processor`
4. Create a caption using the `model`


In [None]:
## Loading Model

# The processor and the model are both loaded from the same checkpoint.
checkpoint = "Salesforce/blip-image-captioning-large"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(device)

In [None]:
## Step 1 and 2

img_url = "https://images.unsplash.com/photo-1616128417859-3a984dd35f02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2372&q=80"

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(img_url, stream=True, timeout=30)
# Fail with a clear HTTP error instead of handing an error page to PIL.
response.raise_for_status()
# The raw stream is not content-decoded by default; ask for decoded bytes.
response.raw.decode_content = True

image = Image.open(response.raw).convert("RGB")
image

In [None]:
## Step 3 and 4

# Preprocess the image into PyTorch tensors and move them to the GPU when
# one is available, otherwise to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = processor(image, return_tensors="pt").to(device)

# Generate at most 20 new tokens, then decode the first generated sequence.
out = model.generate(**inputs, max_new_tokens=20)
caption = processor.decode(out[0], skip_special_tokens=True)

print(caption)

In [None]:
## Creating the Tool Class

class ImageCaptioningTool(BaseTool):
    """LangChain tool that returns a short text caption for an image URL.

    Relies on the notebook-level ``processor`` and ``model`` objects, which
    must be loaded before the tool is run.
    """

    # Explicit annotations keep these overrides valid on LangChain releases
    # that require annotated fields; the values themselves are unchanged.
    name: str = "Image Captioner"
    description: str = "Use this tool when given a URL of an image that you'd like to descibe. It will return a simple caption describing the image."

    def _run(self, url: str) -> str:
        """Download the image at ``url`` and return its generated caption.

        Args:
            url: Address of the image. Surrounding whitespace and quote
                characters are removed before the request is made.

        Returns:
            The decoded caption string (generation is capped at 20 new tokens).

        Raises:
            requests.RequestException: If the download times out, fails, or
                the server answers with an HTTP error status.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # The agent supplies the URL as free text, so trim stray whitespace
        # and quotes that would otherwise make the request fail.
        url = url.strip().strip("'\"")

        # Download and open the image; fail early on hangs and HTTP errors
        # rather than passing an error page to PIL.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        # The raw stream is not content-decoded by default.
        response.raw.decode_content = True
        image = Image.open(response.raw).convert("RGB")

        # Preprocess the image into tensors on the selected device.
        inputs = processor(image, return_tensors="pt").to(device)
        # Generate the caption token ids.
        out = model.generate(**inputs, max_new_tokens=20)
        # Return the caption as a string.
        return processor.decode(out[0], skip_special_tokens=True)

    def _arun(self, query: str):
        """Async execution is not supported; always raises NotImplementedError."""
        raise NotImplementedError("[ERROR] This tool does not support async.")

In [None]:
# The agent is given a single tool: the image captioner defined above.
captioning_tool = ImageCaptioningTool()
tools = [captioning_tool]

# Build a zero-shot ReAct agent around the LLM, with verbose output enabled.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [None]:
# Ask the agent about the current `img_url`; the URL goes on its own line.
question = "What does this image show?\n" + img_url
agent(question)

In [None]:
# Point `img_url` at a second image (adjacent literals form a single string).
img_url = (
    "https://images.unsplash.com/photo-1502680390469-be75c86b636f"
    "?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8"
    "&auto=format&fit=crop&w=2370&q=80"
)
question = "what is in this image?\n" + img_url
agent(question)

In [None]:
# Point `img_url` at a third image (adjacent literals form a single string).
img_url = (
    "https://images.unsplash.com/photo-1680382948929-2d092cd01263"
    "?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8"
    "&auto=format&fit=crop&w=2365&q=80"
)
question = "what is in this image?\n" + img_url
agent(question)