## CompRobo LLM-based Visual Transformer

In [19]:
import os
from functools import lru_cache
from tempfile import NamedTemporaryFile

import torch
from langchain.agents import initialize_agent
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection

In [20]:
@lru_cache(maxsize=None)
def _load_blip_captioner(model_name, device):
    """Load the BLIP processor and captioning model, memoized per (model_name, device).

    Instantiating the checkpoint is by far the most expensive step of
    captioning, so the pair is built once and reused by every later call.

    Args:
        model_name (str): Hugging Face checkpoint identifier.
        device (str): Device string the model is moved to (e.g. "cpu").

    Returns:
        tuple: ``(processor, model)``.
    """
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
    return processor, model


class ImageCaptionTool(BaseTool):
    """LangChain tool that returns a short BLIP-generated caption for an image file."""

    # Annotated so the overrides are also accepted by pydantic versions that
    # reject un-annotated redefinitions of inherited fields.
    name: str = "Image captioner"
    description: str = (
        "Use this tool when given the path to an image that you would like to be described. "
        "It will return a simple caption describing the image."
    )

    def _run(self, img_path: str) -> str:
        """Caption the image stored at ``img_path``.

        Args:
            img_path (str): Path of the image file to describe.

        Returns:
            str: The generated caption with special tokens stripped.
        """
        image = Image.open(img_path).convert('RGB')

        model_name = "Salesforce/blip-image-captioning-large"
        device = "cpu"  # cuda

        # Reuse the already-loaded model instead of rebuilding it on every call.
        processor, model = _load_blip_captioner(model_name, device)

        inputs = processor(image, return_tensors='pt').to(device)
        # max_new_tokens bounds the caption length.
        output = model.generate(**inputs, max_new_tokens=20)

        return processor.decode(output[0], skip_special_tokens=True)

    def _arun(self, query: str):
        """Asynchronous execution is not implemented for this tool.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("This tool does not support async")


@lru_cache(maxsize=None)
def _load_detr_detector(model_name):
    """Load the DETR image processor and detection model, memoized per checkpoint.

    Without the cache the checkpoint is re-initialized on every tool call.

    Args:
        model_name (str): Hugging Face checkpoint identifier.

    Returns:
        tuple: ``(processor, model)``.
    """
    processor = DetrImageProcessor.from_pretrained(model_name)
    model = DetrForObjectDetection.from_pretrained(model_name)
    return processor, model


class ObjectDetectionTool(BaseTool):
    """LangChain tool that lists the objects DETR detects in an image file."""

    # Annotated so the overrides are also accepted by pydantic versions that
    # reject un-annotated redefinitions of inherited fields.
    name: str = "Object detector"
    description: str = (
        "Use this tool when given the path to an image that you would like to detect objects. "
        "It will return a list of all detected objects. Each element in the list in the format: "
        "[x1, y1, x2, y2] class_name confidence_score."
    )

    def _run(self, img_path: str) -> str:
        """Detect objects in the image stored at ``img_path``.

        Args:
            img_path (str): Path of the image file to analyse.

        Returns:
            str: One newline-terminated line per detection, formatted as
            ``[x1, y1, x2, y2] class_name confidence_score``. Empty string when
            no detection passes the 0.9 threshold.
        """
        image = Image.open(img_path).convert('RGB')

        processor, model = _load_detr_detector("facebook/detr-resnet-50")

        inputs = processor(images=image, return_tensors="pt")
        # Pure inference: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**inputs)

        # PIL reports (width, height); reversed here to (height, width) for
        # post-processing. Only detections passing threshold=0.9 are returned.
        target_sizes = torch.tensor([image.size[::-1]])
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        lines = []
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            x1, y1, x2, y2 = (int(coord) for coord in box)
            class_name = model.config.id2label[int(label)]
            lines.append('[{}, {}, {}, {}] {} {}\n'.format(x1, y1, x2, y2, class_name, float(score)))

        return "".join(lines)

    def _arun(self, query: str):
        """Asynchronous execution is not implemented for this tool.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("This tool does not support async")

In [21]:
def get_image_caption(image_path):
    """
    Produce a short natural-language caption for an image file using BLIP.

    Args:
        image_path (str): Location of the image on disk.

    Returns:
        str: The decoded caption, with special tokens removed.
    """
    rgb_image = Image.open(image_path).convert('RGB')

    checkpoint = "Salesforce/blip-image-captioning-large"
    target_device = "cpu"  # cuda

    blip_processor = BlipProcessor.from_pretrained(checkpoint)
    captioner = BlipForConditionalGeneration.from_pretrained(checkpoint)
    captioner = captioner.to(target_device)

    # Encode the image, generate up to 20 new tokens, and decode the first sequence.
    encoded = blip_processor(rgb_image, return_tensors='pt').to(target_device)
    generated = captioner.generate(**encoded, max_new_tokens=20)
    return blip_processor.decode(generated[0], skip_special_tokens=True)


def detect_objects(image_path):
    """
    Detects objects in the provided image with DETR (facebook/detr-resnet-50).

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: One newline-terminated line per detected object, formatted as
        '[x1, y1, x2, y2] class_name confidence_score'. The string is empty
        when no detection passes the 0.9 threshold.
    """
    image = Image.open(image_path).convert('RGB')

    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

    inputs = processor(images=image, return_tensors="pt")
    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); reversed here to (height, width) for
    # post-processing. Only detections passing threshold=0.9 are returned.
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

    lines = []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        x1, y1, x2, y2 = (int(coord) for coord in box)
        class_name = model.config.id2label[int(label)]
        lines.append('[{}, {}, {}, {}] {} {}\n'.format(x1, y1, x2, y2, class_name, float(score)))

    return "".join(lines)

In [22]:
import openai
from getpass import getpass

# Use the key from the OPENAI_API_KEY environment variable when it is set, so
# the notebook can run unattended; otherwise prompt for it without echoing.
# Either way the secret is never written into the notebook.
openai_api_key = os.getenv("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [23]:
# Build the conversational agent from its three ingredients: LLM, memory, tools.

# Chat model that drives the agent; temperature=0 for the least random replies.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=openai_api_key,
)

# Windowed conversation memory (k=5), stored under the 'chat_history' key.
conversational_memory = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

# Vision tools the agent may call with an image path.
tools = [ImageCaptionTool(), ObjectDetectionTool()]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="chat-conversational-react-description",
    memory=conversational_memory,
    max_iterations=5,
    early_stopping_method='generate',
    verbose=True,
)

In [24]:
#download the image
!wget https://www.smartcitiesworld.net/AcuCustom/Sitename/DAM/019/Parsons_PR.jpg

--2023-10-27 14:42:51--  https://www.smartcitiesworld.net/AcuCustom/Sitename/DAM/019/Parsons_PR.jpg
Resolving www.smartcitiesworld.net (www.smartcitiesworld.net)... 108.129.13.134, 54.76.134.71
Connecting to www.smartcitiesworld.net (www.smartcitiesworld.net)|108.129.13.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 299237 (292K) [image/jpeg]
Saving to: ‘Parsons_PR.jpg’


2023-10-27 14:42:52 (1.07 MB/s) - ‘Parsons_PR.jpg’ saved [299237/299237]



In [25]:
# Resolve the downloaded image relative to the working directory instead of
# hard-coding one machine's absolute path; the agent still gets a full path.
image_path = os.path.abspath("Parsons_PR.jpg")
user_question = "generate a caption for this image"
response = agent.run(f'{user_question}, this is the image path: {image_path}')
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Image captioner",
    "action_input": "/home/sohum/CompRobo-VisionLLM/Parsons_PR.jpg"
}[0m
Observation: [36;1m[1;3mcars are driving down the street in traffic at a green light[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The caption for the image is: 'cars are driving down the street in traffic at a green light'"
}[0m

[1m> Finished chain.[0m
The caption for the image is: 'cars are driving down the street in traffic at a green light'


In [26]:
# Resolve the downloaded image relative to the working directory instead of
# hard-coding one machine's absolute path; the agent still gets a full path.
image_path = os.path.abspath("Parsons_PR.jpg")
user_question = "Please tell me what are the items present in the image."
response = agent.run(f'{user_question}, this is the image path: {image_path}')
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Object detector",
    "action_input": "/home/sohum/CompRobo-VisionLLM/Parsons_PR.jpg"
}[0m

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Observation: [33;1m[1;3m[518, 40, 582, 110] car 0.9300956726074219
[188, 381, 311, 469] car 0.9253584146499634
[1068, 223, 1104, 342] person 0.9871610403060913
[828, 233, 949, 329] car 0.9450380802154541
[1076, 263, 1106, 347] bicycle 0.9070360660552979
[635, 71, 713, 135] car 0.9211900234222412
[0, 433, 100, 603] car 0.9781932830810547
[151, 747, 339, 799] car 0.9839043617248535
[389, 267, 493, 367] car 0.9801322817802429
[192, 478, 341, 633] car 0.995318591594696
[578, 117, 828, 550] traffic light 0.9860709309577942
[802, 666, 1028, 798] car 0.982887327671051
[0, 639, 84, 799] car 0.9630054831504822
[1057, 608, 1199, 766] car 0.965281069278717
[988, 218, 1031, 347] person 0.9471637606620789
[751, 524, 909, 675] car 0.9911808371543884
[489, 560, 670, 749] car 0.9970001578330994
[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The items present in the image are: cars, persons, bicycles, and a traffic light."
}[0m

[1m> Finished chain.[0m
The items pr

In [27]:
# Resolve the downloaded image relative to the working directory instead of
# hard-coding one machine's absolute path; the agent still gets a full path.
image_path = os.path.abspath("Parsons_PR.jpg")
user_question = "Please tell me the bounding boxes of all detected objects in the image."
response = agent.run(f'{user_question}, this is the image path: {image_path}')
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Object detector",
    "action_input": "/home/sohum/CompRobo-VisionLLM/Parsons_PR.jpg"
}[0m

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Observation: [33;1m[1;3m[518, 40, 582, 110] car 0.9300956726074219
[188, 381, 311, 469] car 0.9253584146499634
[1068, 223, 1104, 342] person 0.9871610403060913
[828, 233, 949, 329] car 0.9450380802154541
[1076, 263, 1106, 347] bicycle 0.9070360660552979
[635, 71, 713, 135] car 0.9211900234222412
[0, 433, 100, 603] car 0.9781932830810547
[151, 747, 339, 799] car 0.9839043617248535
[389, 267, 493, 367] car 0.9801322817802429
[192, 478, 341, 633] car 0.995318591594696
[578, 117, 828, 550] traffic light 0.9860709309577942
[802, 666, 1028, 798] car 0.982887327671051
[0, 639, 84, 799] car 0.9630054831504822
[1057, 608, 1199, 766] car 0.965281069278717
[988, 218, 1031, 347] person 0.9471637606620789
[751, 524, 909, 675] car 0.9911808371543884
[489, 560, 670, 749] car 0.9970001578330994
[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The bounding boxes of all detected objects in the image are as follows:\n\n- Car: [518, 40, 582, 110], [188, 381, 311, 469], [82