In [1]:
import base64
import replicate
import os
from PIL import Image
import json

import dotenv
# Pull variables from a local .env file into the process environment.
dotenv.load_dotenv()

# Fail fast with a readable message when the Replicate token is missing.
# Re-assigning os.environ[...] from os.getenv(...) changes nothing when the
# variable is set, and raises "TypeError: str expected, not NoneType" when it
# is not, so an explicit check is both simpler and clearer.
if not os.getenv("REPLICATE_API_TOKEN"):
    raise RuntimeError(
        "REPLICATE_API_TOKEN is not set. Export it or add it to your .env file "
        "before running this notebook."
    )

def get_user_intent(query, image_path):
    """Ask the LLaVA model what the user wants to do with the items in an image.

    Args:
        query: Free-text request from the user.
        image_path: Path of the image file sent alongside the query.

    Returns:
        The model's answer parsed from JSON. The prompt asks the model to
        follow the template below (``user_intent`` / ``items``), but the
        structure is not validated here.

    Raises:
        json.JSONDecodeError: If no JSON object can be recovered from the
            model's answer.
    """
    # Template embedded in the prompt so the model knows which shape to return.
    response_schema = {
        'user_intent': {
            'action': "What the user wants to do. One word. (enum: 'buy', 'sell')",
        },
        'items': [
            {
                'name': 'Name of the item',
                'type': "Type of the item (enum: 'furniture', 'clothing', 'electronics', 'food', 'other')",
                'description': 'Extremely detailed description of the item. DO NOT relate the description to other objects.',
                'bounding_box': '(x1, y1, height, width)',
            },
            {
                'name': 'Name of the second item... and so on'
            },
        ],
    }

    prompt = f"Please interpret the query to determine the users intent. Then, use the image to further determine what the user wants. If there is no image, then output solely based on the given text input. If the user is specific (asking for all items in the image is specific) or there is only one item in the image, please give as much separate detail about each item as possible. DO NOT include any other commentary. Please format your response in a json format: {json.dumps(response_schema)}. Query: {query}"

    # Keep the file open only for the duration of the request so the handle is
    # always released, even when the API call raises.
    with open(image_path, "rb") as image_file:
        output = replicate.run(
            "yorickvp/llava-v1.6-34b:41ecfbfb261e6c1adf3ad896c9066ca98346996d7c4045c5bc944a79d430f174",
            # Alternative model:
            # "yorickvp/llava-13b:b5f6212d032508382d61ff00469ddda3e32fd8a0e75dc39d8a4191bb742157fb",
            input={"image": image_file, "prompt": prompt},
        )
        # Join inside the `with` block in case the client yields chunks lazily.
        raw_text = "".join(output)

    # Strip newlines and markdown code fences (```json ... ```) before parsing.
    cleaned = raw_text.strip().replace('\n', '').replace('`', '').replace('json{', '{')
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback: the model sometimes wraps the object in extra text; retry
        # on the outermost {...} span and re-raise the original error if there
        # is none.
        start, end = cleaned.find('{'), cleaned.rfind('}')
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start:end + 1])

In [5]:
import torch
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights, fasterrcnn_resnet50_fpn
import torchvision.transforms as T

# Initialize Faster R-CNN with its default pretrained weights.
# The `pretrained=True` flag is deprecated in recent torchvision releases in
# favour of the explicit `weights=` enum.
model = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
# Inference only: switch off training-time behaviour.
model.eval()

# Run the module-level Faster R-CNN detector on a single image file
def detect_objects(image_path):
    """Detect objects in the image stored at ``image_path``.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A ``(image, prediction)`` pair: the image loaded as RGB, and the
        detector's output for that image (the driver cell reads its
        ``'boxes'``, ``'labels'`` and ``'scores'`` entries).
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # The model takes a batch, so add a leading batch dimension of size one.
    to_tensor = T.Compose([T.ToTensor()])
    batch = to_tensor(rgb_image).unsqueeze(0)

    # No gradients are needed for inference.
    with torch.no_grad():
        batch_predictions = model(batch)

    first_prediction = batch_predictions[0]
    return rgb_image, first_prediction



In [5]:
# Cut a detected object out of the full image
def crop_object(image, box):
    """Return the part of ``image`` covered by ``box``.

    Args:
        image: Object exposing a ``crop(box_tuple)`` method (a PIL image in
            this notebook).
        box: Indexable with at least four numeric entries; each is truncated
            to ``int`` and passed on in the same order.

    Returns:
        Whatever ``image.crop`` returns for the integer box.
    """
    first, second, third, fourth = (int(box[index]) for index in range(4))
    return image.crop((first, second, third, fourth))

In [70]:
# Function to describe the object using LLaVA
# (the name still says "gpt": the author originally intended to use GPT for this step)
def describe_object_gpt(cropped_image):
    """Ask the LLaVA model for a detailed description of a cropped object.

    Args:
        cropped_image: Image of the object; passed to
            ``encode_image_to_base64`` to obtain a binary file handle.

    Returns:
        The model's description as a single string.
    """
    prompt = "Please describe the object in the image in as much detail as possible. DO NOT include any other commentary."

    # The helper returns an open file; the context manager guarantees it is
    # closed once the request has finished (or failed).
    with encode_image_to_base64(cropped_image) as image_file:
        output = replicate.run(
            "yorickvp/llava-v1.6-34b:41ecfbfb261e6c1adf3ad896c9066ca98346996d7c4045c5bc944a79d430f174",
            # Alternative model:
            # "yorickvp/llava-v1.6-vicuna-13b:0603dec596080fa084e26f0ae6d605fc5788ed2b1a0358cd25010619487eae63",
            input={"image": image_file, "prompt": prompt},
        )
        # Join inside the `with` block in case the client yields chunks lazily.
        description = "".join(output)

    return description

In [7]:
import io
# Function to turn an image into an open binary file for the upload APIs
def encode_image_to_base64(image):
    """Return an open binary file handle for ``image``.

    Despite the name, no base64 encoding happens here: the callers only need
    a readable binary stream.

    Args:
        image: Either an image object exposing ``save(path)`` (e.g. a PIL
            image), or a filesystem path to an existing image file.

    Returns:
        A file object opened in ``"rb"`` mode. The caller is responsible for
        closing it (it can be used as a context manager).
    """
    # Callers name this argument `image_path`, so accept a real path too
    # instead of failing on the missing `.save` attribute.
    if isinstance(image, (str, bytes, os.PathLike)):
        return open(image, "rb")

    # Save the image as cropped_object1.jpg (overwritten on every call)
    image.save("cropped_object1.jpg")

    # Open it to get the Binary IO
    return open("cropped_object1.jpg", "rb")

In [8]:
# Uploading the image to imgur
import requests

def upload_image(image_path, image_name):
    """Upload an image to Imgur and return its public link.

    Args:
        image_path: Passed unchanged to ``encode_image_to_base64`` to obtain
            the binary payload (the driver cell passes a cropped PIL image).
        image_name: File name reported to Imgur for the upload.

    Returns:
        The ``data.link`` field of Imgur's JSON response.

    Raises:
        RuntimeError: If the ``IMGUR_CLIENT_ID`` environment variable is unset.
        requests.HTTPError: If Imgur answers with an error status.
    """
    # Read the credential from the environment, like the other API keys in
    # this notebook, rather than sending the literal "{{clientId}}" template
    # placeholder.
    client_id = os.getenv("IMGUR_CLIENT_ID")
    if not client_id:
        raise RuntimeError(
            "IMGUR_CLIENT_ID is not set. Export it or add it to your .env file."
        )

    url = "https://api.imgur.com/3/image"
    payload = {
        'type': 'image',
        'title': 'Simple upload',
        'description': 'This is a simple image upload in Imgur',
    }
    headers = {'Authorization': f'Client-ID {client_id}'}

    # Close the file handle as soon as the upload finishes (or fails).
    with encode_image_to_base64(image_path) as image_file:
        files = [('image', (image_name, image_file, 'image/jpeg'))]
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.post(url, headers=headers, data=payload, files=files, timeout=60)

    # Surface HTTP errors here instead of as a KeyError on the JSON lookup.
    response.raise_for_status()
    return response.json()['data']['link']

In [63]:
from multion.client import MultiOn

# Shared MultiOn client; the key is read from the MULTION_API_KEY environment variable.
multion = MultiOn(api_key=os.environ.get("MULTION_API_KEY"))

def search_multion(description, image_path=None):
    """Have the MultiOn browsing agent look for a matching product.

    Args:
        description: Text description of the object; used in the prompt when
            no image is supplied.
        image_path: Optional image reference. When truthy it is uploaded via
            ``upload_image`` and the agent is told to run a reverse image
            search with the resulting link.

    Returns:
        The object returned by ``multion.browse``.
    """
    if not image_path:
        # Text-only search driven by the description.
        start_url = "https://www.google.com/"
        instructions = f"Please find the best matching image url and site url based on the following description and it's price. Output a SINGLE tuple (url, image_url) and no other text or commentary. Description: {description}"
    else:
        # Reverse image search: host the image first so the agent can paste its link.
        image_url = upload_image(image_path, "cropped_object.jpg")
        start_url = "https://images.google.com/"
        instructions = f"Go to this url: https://images.google.com/. Click on the search by image button. In the paste image link here box, input the url: {image_url}. Please find the best matching image url and site url based on the initial given image (left half of the page when you load the results. DO NOT OUTPUT THE GIVEN URL) and it's price. Output a SINGLE tuple (url, image_url) and no other text or commentary. If at any time you are redirected to the main google search page, please restart the process. DO NOT PASTE THE URL INTO THE SEARCH BAR."

    browse_arguments = {"cmd": instructions, "url": start_url, "local": True}
    return multion.browse(**browse_arguments)

In [4]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import requests
from io import BytesIO

# CLIP checkpoint shared by the preprocessing step and the model
model_name = "openai/clip-vit-base-patch32"
# The processor prepares images for the model; the model produces the embeddings
clip_processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)

# Embed an image with the module-level CLIP model
def get_image_embedding(image):
    """Compute the CLIP image-feature tensor for ``image``.

    Args:
        image: Image (or images) accepted by ``clip_processor``.

    Returns:
        The tensor returned by ``clip_model.get_image_features``, computed
        with gradient tracking disabled.
    """
    processed = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        return clip_model.get_image_features(**processed)

# Cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Each embedding is divided by its L2 norm along the last dimension, then
    the first is matrix-multiplied with the transpose of the second. The
    product must contain exactly one element, because ``.item()`` is called
    on it.
    """
    def _unit_length(vector):
        # Scale to unit L2 norm along the feature axis.
        return vector / vector.norm(p=2, dim=-1, keepdim=True)

    unit_first = _unit_length(embedding1)
    unit_second = _unit_length(embedding2)
    return torch.matmul(unit_first, unit_second.T).item()

# Function to verify the best match using CLIP
def verify_image_similarity(cropped_image, search_results):
    """Pick the search result whose image looks most like ``cropped_image``.

    Args:
        cropped_image: Reference image of the object.
        search_results: Iterable of image URLs to compare against.

    Returns:
        The URL with the highest cosine similarity to the reference, or
        ``None`` when no candidate could be downloaded and decoded or none
        scored above 0.
    """
    cropped_embedding = get_image_embedding(cropped_image)
    best_match = None
    # Starting at 0 doubles as a minimum bar: candidates with non-positive
    # similarity are never selected.
    highest_similarity = 0

    for result in search_results:
        try:
            # A timeout keeps one unresponsive host from stalling the loop.
            response = requests.get(result, timeout=10)
            if response.status_code != 200:
                continue
            search_image = Image.open(BytesIO(response.content)).convert("RGB")
            search_embedding = get_image_embedding(search_image)
            similarity = calculate_similarity(cropped_embedding, search_embedding)
        except Exception as error:
            # Best effort: skip candidates that cannot be fetched or decoded.
            # Catching Exception (not a bare except) keeps Ctrl-C and kernel
            # interrupts working.
            print(f"Skipping candidate {result!r}: {error}")
            continue

        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = result

    return best_match

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-06-07 06:33:58.756710: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(


In [45]:
def _extract_image_urls(search_response):
    """Turn a search response into a list of candidate image URLs.

    Lists and tuples are passed through unchanged. Otherwise the text (the
    value itself if it is a string, else its ``message`` attribute) is split
    on commas and stripped of brackets, quotes and whitespace. The search
    prompt asks for ``(url, image_url)``, so every second entry is treated as
    an image URL; if there is only one entry it is returned as is.
    """
    if search_response is None:
        return []
    if isinstance(search_response, (list, tuple)):
        return list(search_response)

    if isinstance(search_response, str):
        message = search_response
    else:
        message = getattr(search_response, "message", None)
    if not isinstance(message, str):
        return []

    parts = [part.strip(" \t\r\n()[]'\"") for part in message.split(",")]
    parts = [part for part in parts if part]
    return parts[1::2] or parts


def feedback_loop(cropped_image, initial_results, feature_extractor, preprocess, max_retries=3):
    """Verify search results against the cropped image, retrying by description.

    Args:
        cropped_image: Reference image of the object being searched for.
        initial_results: Candidate image URLs from the first search.
        feature_extractor: Unused; kept so existing callers keep working.
        preprocess: Unused; kept so existing callers keep working.
        max_retries: Maximum number of additional description-based searches.

    Returns:
        The best matching URL, or ``None`` if every attempt failed.
    """
    best_match = verify_image_similarity(cropped_image, initial_results)
    retries = 0
    print(f"Feedback (#{retries}): {best_match}")

    while best_match is None and retries < max_retries:
        retries += 1
        # A fresh description is requested on every retry.
        new_response = search_multion(describe_object_gpt(cropped_image))
        # search_multion returns a response object, not a URL list, so the
        # URLs have to be extracted before they can be verified.
        new_results = _extract_image_urls(new_response)
        best_match = verify_image_similarity(cropped_image, new_results)
        print(f"Feedback (#{retries}): {best_match}")

    return best_match

In [3]:
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
# Only used to translate the detector's numeric labels into category names.
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT

image_path = "imgs/furniture1.jpg"
query = "Please buy the couch and the table for me."
print(f"Query: {query}")

# Deciphering user intent
json_output = get_user_intent(query, image_path)

# Object Detection
image, predictions = detect_objects(image_path)
predicted_boxes = predictions['boxes']
predicted_labels = predictions['labels']
decoded_labels = [weights.meta["categories"][label] for label in predicted_labels]
predicted_scores = predictions['scores']

# Processing the user intent
user_intent = json_output['user_intent']['action']
items = json_output['items']

if user_intent == "buy":
    for item in items:
        item_name = item.get('name') or ''
        print(f"Processing item: {item_name}")

        # An empty name is a substring of every label and would silently
        # match the first detection, so skip such items.
        if not item_name.strip():
            print("Skipping item without a name")
            continue

        # Find the corresponding box for the item: the first detection whose
        # label contains the item name.
        # NOTE(review): presumably detections arrive sorted by score, making
        # this the most confident match - confirm.
        item_box = None
        for box, label in zip(predicted_boxes, decoded_labels):
            if item_name.lower() in label.lower():
                item_box = box
                break

        if item_box is None:
            print(f"No object found for {item_name}")
            continue

        cropped_image = crop_object(image, item_box)
        description = describe_object_gpt(cropped_image)
        search_results = search_multion(description, cropped_image)

        # The agent is asked to answer "(url, image_url)". Split the message,
        # strip brackets/quotes/whitespace and keep every second entry (the
        # image URLs); if there is only one entry, use it as is.
        message = getattr(search_results, "message", None) or ""
        url_parts = [part.strip(" \t\r\n()[]'\"") for part in message.split(',')]
        url_parts = [part for part in url_parts if part]
        search_res_imgs = url_parts[1::2] or url_parts

        if not search_res_imgs:
            print(f"No search results found for {item_name}")
            continue

        best_match = feedback_loop(cropped_image, search_res_imgs, get_image_embedding, clip_processor)
        if best_match:
            # best_match is a URL string, so print it directly.
            print(f"Best match for {item_name}: {best_match}")
        else:
            print(f"No suitable match found for {item_name}")
else:
    print(f"Unsupported intent {user_intent!r}; only 'buy' is handled.")

Query: Please buy the couch and the table for me.
Best match for couch: https://www.kardiel.com/swoosh-modular-60-2-seater-arm-left-fossil-velvet/
Best match for table: https://www.cgtrader.com/3d-models/furniture/table/cylinder-dining-table-set-by-davidson-london
