In [1]:
import json
import random
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib.image as mpimg
import os
from openai import OpenAI
import base64
import requests
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
import http.client
import typing
import urllib.request

import IPython.display
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps

import torch
import clip
from PIL import Image

In [2]:
# Root directory of the dataset split; every annotation file and image path
# used later in the notebook is built relative to this (relative) path.
data_path = 'magic_brush/test'

In [3]:
# Load the three annotation files that sit under `data_path`.
# Each file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed (the previous bare `open()` calls were never closed).
#
# - `instructions`: iterable of edit turns; later cells read the keys
#   "input", "output" and "instruction" from each entry.
# - `global_descriptions` / `local_descriptions`: nested mappings indexed first
#   by the image id (the part of the file name before the first "-") and then
#   by the image file name.
with open(f'{data_path}/edit_turns.json') as f:
    instructions = json.load(f)
with open(f'{data_path}/global_descriptions.json') as f:
    global_descriptions = json.load(f)
with open(f'{data_path}/local_descriptions.json') as f:
    local_descriptions = json.load(f)

In [15]:
# Sanity check: number of edit turns loaded from edit_turns.json.
len(instructions)

# Create all pairs

In [None]:
# Load the CLIP "ViT-L/14" checkpoint together with its image preprocessing
# callable. Both are used as the default arguments of `compute_similarity`.
# NOTE(review): the device is hard-coded to 'cuda', so this cell fails on a
# machine without a CUDA-capable GPU.
clip_L, clip_L_preprocess = clip.load("ViT-L/14", device='cuda')

In [40]:
def compute_similarity(image1, image2, model=clip_L, preprocess=clip_L_preprocess):
    """Return the cosine similarity between the CLIP embeddings of two images.

    Args:
        image1: Path (anything accepted by ``PIL.Image.open``) of the first image.
        image2: Path (anything accepted by ``PIL.Image.open``) of the second image.
        model: Model exposing ``encode_image``; defaults to the module-level
            ``clip_L``.
        preprocess: Callable that turns a PIL image into a tensor; defaults to
            the module-level ``clip_L_preprocess``.

    Returns:
        The cosine similarity of the two image embeddings as a Python float.
    """
    # Follow the device the model actually lives on instead of assuming 'cuda',
    # so the function keeps working if the model is loaded elsewhere.
    device = next(model.parameters()).device

    def _embed(path):
        # Close the image file deterministically once it has been preprocessed.
        with Image.open(path) as img:
            batch = preprocess(img).unsqueeze(0).to(device)
        return model.encode_image(batch)[0]

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        image1_features = _embed(image1)
        image2_features = _embed(image2)

    cos = torch.nn.CosineSimilarity(dim=0)
    return cos(image1_features, image2_features).item()

In [77]:
# Build one record per edit turn: input/output image paths, their textual
# descriptions, the edit instruction and the CLIP similarity of the two images.
count = 0
data = []
for edit_idx, sample in enumerate(instructions):
    source_name = sample["input"]
    # Images and descriptions are grouped by the id preceding the first "-".
    image_id = source_name.split("-")[0]
    image_dir = f'{data_path}/images/{image_id}'

    path_input = f'{image_dir}/{source_name}'
    input_global = f'{global_descriptions[image_id][source_name]}'

    target_name = sample["output"]
    path_output = f'{image_dir}/{target_name}'

    record = {
        "path_input": path_input,
        "input_global": input_global,
        "path_output": path_output,
        "output_local": f'{local_descriptions[image_id][target_name]}',
        "output_global": f'{global_descriptions[image_id][target_name]}',
        "instruction": sample['instruction'],
        "CLIP_similarity": compute_similarity(path_input, path_output),
    }
    data.append(record)
    count += 1

In [42]:
# Number of records appended to `data` by the pair-building loop.
count

In [48]:
# Order the pairs from most to least similar (descending CLIP similarity).
# `sorted` returns a new list, but its elements are the same dict objects as
# in `data`, so later in-place edits to a record are visible through both.
sorted_data = sorted(data, key=lambda d: d['CLIP_similarity'], reverse=True)

In [67]:
# Inspect the 500 entries at the end of the descending sort, i.e. the pairs
# with the lowest CLIP similarity.
# NOTE(review): this renders up to 500 records into the cell output; consider
# a smaller slice before sharing the notebook.
sorted_data[-500:]

In [50]:
# Persist the similarity-sorted records as JSON in the working directory.
with open('magic_brush_test.json', mode='w') as out_file:
    out_file.write(json.dumps(sorted_data))

# Setup models

In [27]:
# OpenAI credentials come from the OPENAI_API_KEY environment variable.
api_key = os.environ.get('OPENAI_API_KEY')

# HTTP headers shared by the raw `requests` calls in the gpt_inference_* helpers.
headers = {"Content-Type": "application/json"}
headers["Authorization"] = f"Bearer {api_key}"

# SDK client instance (not referenced by the helper functions defined below).
gpt = OpenAI()

In [28]:
# Read the Gemini key from the environment instead of embedding it in the
# notebook: secrets committed to source leak through version control and
# shared notebooks. Any key that was ever stored in this file should be
# revoked and replaced.
gemini_api_key = os.getenv('GOOGLE_API_KEY')
if not gemini_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=gemini_api_key)
gemini = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

In [58]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [51]:
def gpt_inference_two(image_1, image_2, prompt):
    """Send a text prompt plus two images to the OpenAI chat completions API.

    Args:
        image_1: Path of the first image file; sent as a base64 data URL.
        image_2: Path of the second image file; sent as a base64 data URL.
        prompt: Text placed before the two images in the user message.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
        RuntimeError: If the response body contains no choices.
    """
    en_1 = encode_image(image_1)
    en_2 = encode_image(image_2)
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{en_1}"
                        }
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{en_2}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )
    # Report HTTP failures (bad key, rate limit, ...) as such, rather than as a
    # confusing KeyError on 'choices' further down.
    response.raise_for_status()
    body = response.json()
    if not body.get('choices'):
        raise RuntimeError(f"OpenAI response contained no choices: {body}")
    return body['choices'][0]['message']['content']

In [54]:
def gpt_inference_one(image, prompt):
    """Send a text prompt plus one image to the OpenAI chat completions API.

    Args:
        image: Path of the image file; sent as a base64 data URL.
        prompt: Text placed before the image in the user message.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
        RuntimeError: If the response body contains no choices.
    """
    en = encode_image(image)
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{en}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )
    # Report HTTP failures (bad key, rate limit, ...) as such, rather than as a
    # confusing KeyError on 'choices' further down.
    response.raise_for_status()
    body = response.json()
    if not body.get('choices'):
        raise RuntimeError(f"OpenAI response contained no choices: {body}")
    return body['choices'][0]['message']['content']

In [56]:
# Prompt sent with each output image to obtain a comma-separated list of the
# visible components. The requested separator is spelled "commas" (it used to
# read "commons", which gave the model an ambiguous instruction).
# NOTE: any change to this text can change model outputs; regenerate stored
# GPT_option values if exact reproducibility against older runs matters.
prompt = '''
I want to extract as many components as possible from the provided images. Component examples are shown below. However, components are not limited to the following components. Please only provide the component name without any explanation and separate the component names with commas. If a human or an animal is shown in the images and hair, eye, hand, mouth, ear, and leg, etc. are visible, ensure to include them, Similarly, try to find all the components as detailed as possible. 
1. leg, 2. eye, 3. ear, 4. food, 5. pillow, 6. flower, 7. plate, 8. window, 9. door, 10. chair, 11. dining table, 12. sofa, 13. banana, 14. bowl, 15. sugar, 16. blender, 17. berry, 18. lizard, 19. watermelon, 20. motorcycle, 21. apple, 22. curtain, 23, cookies, 24, cake, 25. hair, 26, hat, 27, dresses, 28. bacon, 29. butter, 30, jam, 31, bread 32, surfboard, 33, t-shirt, 34, pants, 35, hands, 36. fridge, 37, plants, 38. cabinet, 39, sink, 40, car, 41, girl, 42, boy
'''

In [69]:
# Ask GPT for a component list only for pairs whose CLIP similarity reaches
# 0.94; every other pair gets GPT_option = None. Records are updated in place,
# so `option_data` ends up holding the same dict objects as `sorted_data`.
option_data = []
for idx, pair in enumerate(sorted_data):
    reaches_threshold = pair['CLIP_similarity'] >= 0.94
    if not reaches_threshold:
        pair['GPT_option'] = None
    else:
        # `answer` is kept as a top-level name because a later cell displays it.
        answer = gpt_inference_one(pair['path_output'], prompt)
        pair['GPT_option'] = answer
    option_data.append(pair)
    print(idx)

In [70]:
# Show the most recent GPT response produced by the option loop.
# NOTE(review): `answer` only exists if at least one pair reached the
# similarity threshold in that loop; otherwise this cell raises NameError.
answer

In [71]:
# Display every annotated record.
# NOTE(review): this renders the whole list into the cell output; a slice such
# as option_data[:5] keeps the saved notebook small.
option_data

In [81]:
# Overwrite each record's 'instruction' with the value looked up in `tmp`,
# keyed by the record's input image path.
# NOTE(review): `tmp` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All). It presumably
# was a path_input -> instruction mapping created in a since-deleted cell;
# confirm and restore its definition, or drop this cell.
# The records are modified in place, so `instruction_data` holds the very same
# dict objects as `option_data`.
instruction_data = []
for pair in option_data:
    pair['instruction'] = tmp[pair['path_input']]
    instruction_data.append(pair)

In [82]:
# Write the annotated records to magic_brush_test.json, replacing any file of
# that name already present in the working directory.
with open('magic_brush_test.json', mode='w') as out_file:
    out_file.write(json.dumps(option_data))

In [22]:
# Load the previously generated validation-split records.
# The file is opened in a `with` block so the handle is closed after parsing.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
with open('/home/marco/PhD/VL_fine-grained/magic_brush_val_94clip.json') as val_file:
    val = json.load(val_file)

In [23]:
# Count the validation records that carry a GPT component list
# (GPT_option is not None).
counter = sum(1 for entry in val if entry['GPT_option'] is not None)

In [27]:
# Inspect a single validation record (hard-coded index 314, the same record
# whose images are plotted at the end of the notebook).
val[314]

In [24]:
# Number of validation records whose GPT_option is not None.
counter

In [28]:
# Show the input image (top) and the output image (bottom) of validation
# record 314, stacked vertically.
f, axarr = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
f.tight_layout(pad=5.0)
shown_pair = val[314]
top_ax, bottom_ax = axarr
top_ax.imshow(mpimg.imread(shown_pair['path_input']))
bottom_ax.imshow(mpimg.imread(shown_pair['path_output']))