In [12]:
import os
import requests
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
import glob
import numpy as np


img_dir           = "/WorkSpace-2/aroy/data/datasets/coco_ooc/OOC_images_selected/"
caption_dir       = "/WorkSpace-2/aroy/data/datasets/coco_ooc/OOC_images_selected_captions/"
captions_filepath = '/WorkSpace-2/aroy/data/OOC/LLM/coco_ooc_captions.npy'

os.makedirs(caption_dir, exist_ok=True)

# hyperparameters
max_new_tokens=20



In [1]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

# processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
# by default `from_pretrained` loads the weights in float32
# we load in float16 instead to save memory
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) 
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-6.7b-coco", torch_dtype=torch.float16) 
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

In [3]:


image_path_list = glob.glob(os.path.join(img_dir, '*.jpg'))
image_name_list = ['']*len(image_path_list)
caption_list    = ['']*len(image_path_list)

for indx, image_path in enumerate(image_path_list):
    
    image_name = os.path.splitext(os.path.split(image_path)[1])[0]
    image_name_list[indx] = image_name
    
    image = Image.open(image_path)
    inputs = processor(image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()    
    caption_list[indx] = caption
    
captions_dict = {'image_name_list': image_name_list, 'caption_list':caption_list}
np.save(captions_filepath, captions_dict)

In [4]:
# Language model for query

In [5]:

import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")


captions_filepath = '/WorkSpace-2/aroy/data/OOC/LLM/coco_ooc_captions.npy'
captions_dict = np.load(captions_filepath, allow_pickle=True)
captions_dict = captions_dict.item()

image_name_list = captions_dict['image_name_list']
caption_list    = captions_dict['caption_list']

In [11]:
for img_name, caption in zip(image_name_list, caption_list):
    print(image_name + ' : ' + caption)

COCO_val2014_000000004296_var1_512_0 : a zebra is standing in a room with a laptop on a table
COCO_val2014_000000004296_var1_512_0 : a tennis players are playing a game on a blue court
COCO_val2014_000000004296_var1_512_0 : a laptop computer sitting on a desk with a giraffe on it
COCO_val2014_000000004296_var1_512_0 : two men riding horses on the beach with a suitcase in the background
COCO_val2014_000000004296_var1_512_0 : a large sheep is standing in a living room with a table and chairs
COCO_val2014_000000004296_var1_512_0 : a baseball game with a slice of pizza on the field
COCO_val2014_000000004296_var1_512_0 : a baseball game with a batter, catcher, umpire and pitcher
COCO_val2014_000000004296_var1_512_0 : a bird sitting on top of a kitchen counter in a remodeled kitchen


In [6]:

query_string = " - is this normal?"
gpt_response_list = ['']*len(caption_list)
for indx, caption in enumerate(caption_list):
    
    content_str = "\"" + caption + "\"" + query_string
    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": content_str}
      ]
    )

    response = completion.choices[0].message.content
    gpt_response_list[indx] = response



In [7]:

query_string = " - is this normal? yes or no"
gpt_response_list_YN = ['']*len(caption_list)
for indx, caption in enumerate(caption_list):
    
    content_str = "\"" + caption + "\"" + query_string
    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": content_str}
      ]
    )

    response = completion.choices[0].message.content
    gpt_response_list_YN[indx] = response



In [9]:
query_string = " - is this normal? percentage of possibility"
gpt_response_list = ['']*len(caption_list)
for indx, caption in enumerate(caption_list):
    
    content_str = "\"" + caption + "\"" + query_string
    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": content_str}
      ]
    )

    response = completion.choices[0].message.content
    gpt_response_list[indx] = response
    
gpt_response_list

['It is not normal for a zebra to be standing in a room with a laptop on a table. The percentage of possibility is extremely low, probably less than 1%. Zebras are wild animals typically found in grasslands and savannas, not in human environments like rooms with technology.\n',
 "Yes, it is normal for tennis players to play a game on a blue court. Blue courts are quite common, particularly in hard court surfaces, such as those used in the Australian Open and the US Open. It's difficult to assign an exact percentage of possibility, but it is certainly a common occurrence in the world of tennis.",
 "It is not normal to see a laptop computer sitting on a desk with a giraffe on it. This could be interpreted as a small toy or figurine of a giraffe, which would be more plausible. However, if it's meant to be a real-life giraffe, that would be highly improbable. \n\nIn the case of a small toy or figurine: 75%\nIn the case of a real-life giraffe: <1%",
 "It is not common to see two men riding 

In [10]:
query_string = " - how likely is this in a scale between 0 to 10"

gpt_response_list = ['']*len(caption_list)
for indx, caption in enumerate(caption_list):
    
    content_str = "\"" + caption + "\"" + query_string
    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": content_str}
      ]
    )

    response = completion.choices[0].message.content
    gpt_response_list[indx] = response
    
gpt_response_list

['I would say this is a 1 or 2 on a scale of 0 to 10, as it is highly unlikely but still within the realm of possibility if it were set up intentionally for entertainment or an art installation.',
 '7',
 'I would say it is a 2. While it is possible to have a giraffe figurine or a giraffe sticker on a laptop, having an actual giraffe on a laptop is highly unlikely due to size and weight constraints.',
 "I would say a 3, as it's not impossible but rather uncommon to see men riding horses on the beach with a suitcase in the background.",
 '3',
 'I would say a 2, as it is quite unlikely but not impossible that someone might accidentally drop a slice of pizza on the field.',
 '10',
 "It is difficult to provide a precise number on the likelihood of this, as it depends on a number of factors such as whether the bird is a pet or managed to fly in accidentally, or if the kitchen is open to the outdoors in some way. I would say it's a 4 or 5 out of 10, as it's not completely unlikely but also no

In [None]:
query_string = " - whats is the most likely object that is out of context?"

gpt_response_list = ['']*len(caption_list)
for indx, caption in enumerate(caption_list):
    
    content_str = "\"" + caption + "\"" + query_string
    completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[
        {"role": "user", "content": content_str}
      ]
    )

    response = completion.choices[0].message.content
    gpt_response_list[indx] = response
    
gpt_response_list

In [60]:
# Language model for query

import os
import torch
import glob
import numpy as np
import openai
import time
openai.api_key = os.getenv("OPENAI_API_KEY")

# captions_filepath = "/WorkSpace-2/aroy/data/OOC/COCO_OOC/blip2_caption/out_of_context_MIT_captions_opt-6.7b-coco.npy"
captions_filepath = "/WorkSpace-2/aroy/data/OOC/COCO_OOC/blip2_caption/coco_val2014_captions_opt-6.7b-coco.npy"
response_dir      = "/WorkSpace-2/aroy/data/OOC/COCO_OOC/LLM_OP/GPT-4/out_of_context_MIT/possible_0_10_gpt4_token_1000_num"
query_string = " - how normal is this in a scale between 0 to 10? The rating is:"

start_indx = 0
end_indx   = 30

os.makedirs(response_dir, exist_ok=True)

# load captions
captions_dict = np.load(captions_filepath, allow_pickle=True)
captions_dict = captions_dict.item()
image_name_list = captions_dict['image_name_list']
caption_list    = captions_dict['caption_list']


# get LLM responses
gpt_response_list = []
for indx in range(start_indx, end_indx):


    # print(indx)
    image_name = image_name_list[indx]
    caption = caption_list[indx]
    
    if caption: # caption is not empty
        content_str = "\"" + caption + "\"" + query_string
        print (content_str)
        completion = openai.ChatCompletion.create(
          #model="gpt-3.5-turbo",
          model="gpt-4",
          messages=[{"role": "user", "content": content_str}],
          temperature = 0.5,
          max_tokens=100
        )

    response = completion.choices[0].message.content
    gpt_response_list.append(response)
    print(response)
    print('-'*40)
    # write the response
    response_filepath = os.path.join(response_dir, image_name + '.txt')
    with open(response_filepath, "w") as text_file:
        text_file.write(response)

    time.sleep(2)
# response_dict = {'image_name_list': image_name_list, 'response_list':gpt_response_list}
# np.save(response_filepath, response_dict)

"a man standing on a street corner talking on a cell phone" - how normal is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a horse pulling a small car down a road" - how normal is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how normal is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a red car parked on top of a roof next to a house" - how normal is this in a scale between 0 to 10? The rating is:
2
----------------------------------------
"a plane that is in a cemetery with people walking by" - how normal is this in a scale between 0 to 10? The rating is:
2
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how normal is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a truck is stuck in a 

KeyboardInterrupt: 

In [None]:
gpt4_list_100 = gpt_response_list


In [59]:
gpt4_list_1000 = gpt_response_list


In [52]:
gpt4_list

['5',
 '9',
 '8',
 '8',
 '10',
 '7',
 '1',
 '5',
 '7',
 '8',
 '7',
 '10',
 '7',
 '5',
 '4',
 '8',
 '8',
 '5',
 '8',
 '5',
 '6',
 '7',
 '8',
 '8',
 '8',
 '10',
 '9',
 '8',
 '10',
 '7']

In [55]:
for g3, g4 in zip(gpt4_list, gpt_response_list):
    print(g3, g4)

5 3
9 8
8 5
8 5
10 7
7 5
1 1
5 3
7 6
8 5
7 5
10 10
7 5
5 3
4 2
8 5
8 7
5 3
8 5
5 4
6 3
7 5
8 7
8 6
8 5
10 8
9 7
8 5
10 5
7 5


In [57]:
np.mean(np.array(np.float64(gpt4_list)))

5.1

In [21]:
gpt_response_list

['6',
 '10 - This is a very common setup for a living room in many households.',
 '6. It is a fairly common sight, especially in areas where citrus fruits are grown or in season. However, it may not be as common in areas where these fruits are not readily available or in areas where other types of fruits are more popular.',
 '8',
 '10 - extremely common.',
 '6',
 '2',
 '2 - While it is not impossible to see a group of three bears walking through a grassy field, it is not a common occurrence. Bears are generally solitary animals and only come together in groups during mating season or when a mother is caring for her cubs.',
 '6',
 '7',
 '7',
 '10',
 '8',
 '5. It is a somewhat common image, as skiing is a popular activity and arches are a common architectural feature. However, it is not so common that it would be immediately recognizable or iconic.',
 '2 - While it is possible for a man to be standing next to an elephant wearing a blue hat and blue shirt, it is not a particularly common 

In [None]:
query_string = " - how common is this in a scale between 0 to 10? The rating is:"

model="gpt-4",
messages=[{"role": "user", "content": content_str}],
temperature = 0.5,
max_tokens=100

"a man standing on a street corner talking on a cell phone" - how common is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a horse pulling a small car down a road" - how common is this in a scale between 0 to 10? The rating is:
1
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how common is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a red car parked on top of a roof next to a house" - how common is this in a scale between 0 to 10? The rating is:
1
----------------------------------------
"a plane that is in a cemetery with people walking by" - how common is this in a scale between 0 to 10? The rating is:
1
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how common is this in a scale between 0 to 10? The rating is:
4
----------------------------------------
"a truck is stuck in a drive thru at a fast food restaurant" - how common is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a car is in the front of a store that is on fire" - how common is this in a scale between 0 to 10? The rating is:
2
----------------------------------------
"a black sports car is upside down in a grassy area" - how common is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a toilet sitting on top of a mailbox on a street" - how common is this in a scale between 0 to 10? The rating is:
0
----------------------------------------

In [None]:
query_string = " - how likely is this in a scale between 0 to 10? The rating is:"

model="gpt-4",
messages=[{"role": "user", "content": content_str}],
temperature = 0.5,
max_tokens=100

"a man standing on a street corner talking on a cell phone" - how likely is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a horse pulling a small car down a road" - how likely is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how likely is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
"a red car parked on top of a roof next to a house" - how likely is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a plane that is in a cemetery with people walking by" - how likely is this in a scale between 0 to 10? The rating is:
3
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how likely is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
"a truck is stuck in a drive thru at a fast food restaurant" - how likely is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a car is in the front of a store that is on fire" - how likely is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a black sports car is upside down in a grassy area" - how likely is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a toilet sitting on top of a mailbox on a street" - how likely is this in a scale between 0 to 10? The rating is:
1
----------------------------------------

In [None]:
query_string = " - how possible is this in a scale between 0 to 10? The rating is:"

model="gpt-4",
messages=[{"role": "user", "content": content_str}],
temperature = 0.5,
max_tokens=100

"a man standing on a street corner talking on a cell phone" - how possible is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a horse pulling a small car down a road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how possible is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a red car parked on top of a roof next to a house" - how possible is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a plane that is in a cemetery with people walking by" - how possible is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a truck is stuck in a drive thru at a fast food restaurant" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a car is in the front of a store that is on fire" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a black sports car is upside down in a grassy area" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a toilet sitting on top of a mailbox on a street" - how possible is this in a scale between 0 to 10? The rating is:
2
----------------------------------------

In [None]:
" - how possible is this in a scale between 0 to 10? The rating is:"
model="gpt-4",
          messages=[{"role": "user", "content": content_str}],
          temperature = 0.5,
          max_tokens=1000
            
"a man standing on a street corner talking on a cell phone" - how possible is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a horse pulling a small car down a road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a red car parked on top of a roof next to a house" - how possible is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a plane that is in a cemetery with people walking by" - how possible is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a truck is stuck in a drive thru at a fast food restaurant" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a car is in the front of a store that is on fire" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a black sports car is upside down in a grassy area" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a toilet sitting on top of a mailbox on a street" - how possible is this in a scale between 0 to 10? The rating is:
2
----------------------------------------

In [None]:
"a man standing on a street corner talking on a cell phone" - how possible is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a horse pulling a small car down a road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a group of people standing around a bunk bed with a ladder" - how possible is this in a scale between 0 to 10? The rating is:
10
----------------------------------------
"a red car parked on top of a roof next to a house" - how possible is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
"a plane that is in a cemetery with people walking by" - how possible is this in a scale between 0 to 10? The rating is:
4
----------------------------------------
"a man walking down a street with a yellow machine on the road" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a truck is stuck in a drive thru at a fast food restaurant" - how possible is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
"a car is in the front of a store that is on fire" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a black sports car is upside down in a grassy area" - how possible is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
"a toilet sitting on top of a mailbox on a street" - how possible is this in a scale between 0 to 10? The rating is:
2

In [None]:
0
"a man on horseback with a flag in front of a crowd" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
1
"a living room with a couch, chair, table and a television" - how common is this in a scale between 0 to 10? The rating is:
8
----------------------------------------
2
"a pile of oranges and grapefruit sitting on a table" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
3
"a young boy swinging a baseball bat in a field" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
4
"a woman sitting at a table with a plate of food" - how common is this in a scale between 0 to 10? The rating is:
7
----------------------------------------
5
"a large black and white photo of a building with a sign" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
6
"a man standing in a brick oven with a fire burning" - how common is this in a scale between 0 to 10? The rating is:
1
----------------------------------------
7
"a group of three bears walking through a grassy field" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
8
"a desk with a laptop and a desktop computer on it" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------
9
"two small ponies standing in a field of green grass" - how common is this in a scale between 0 to 10? The rating is:
5
----------------------------------------