<a href="https://colab.research.google.com/github/SoumyaTeotia/Triplets/blob/main/OWLV2_Triplets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from PIL import Image, ImageDraw
import requests
import torch
from nltk import ngrams
import matplotlib.pyplot as plt
import numpy as np
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

In [None]:
# Load the OWLv2 processor and detection model from one shared checkpoint name,
# so the two can never drift apart when the checkpoint is changed.
MODEL_CHECKPOINT = "google/owlv2-base-patch16-ensemble"
processor = Owlv2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Owlv2ForObjectDetection.from_pretrained(MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/620M [00:00<?, ?B/s]

In [None]:
# Download the demo image and define the caption whose phrases will be grounded.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP failures here instead of letting PIL choke
# later on the bytes of an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
caption = "Cats are sleeping with remotes on a sofa"
# Bare last expression: the notebook renders the image inline.
image

In [None]:
# Filter candidate n-grams down to the phrases worth sending to the detector
def filter_ngrams(ngrams_list):
    """Join and return the n-grams that pass the word filters.

    An n-gram is dropped when any of its words is an uninformative word, when
    its first word is a banned leading word, or when its last word is a banned
    trailing word. All comparisons are exact (case-sensitive) string matches.

    Args:
        ngrams_list: Iterable of n-gram collections; each collection yields
            tuples of words.

    Returns:
        list[str]: The surviving n-grams, each joined with single spaces, in
        the order they were encountered.
    """
    banned_anywhere = ("image", "jpg", "background", "wallpaper")
    banned_first = ("of", "on", "in")
    banned_last = ("A", "a", "the", "to", "on")

    kept = []
    for group in ngrams_list:
        for words in group:
            # Reject as soon as one rule fails; order mirrors the rule list above.
            if any(word in banned_anywhere for word in words):
                continue
            if words[0] in banned_first or words[-1] in banned_last:
                continue
            kept.append(" ".join(words))
    return kept

In [None]:
# Build every n-gram of the caption, from single words up to 8-word phrases,
# then keep only the ones that pass filter_ngrams.
caption_tokens = caption.split()
unigrams_to_eightgrams = [ngrams(caption_tokens, size) for size in range(1, 9)]
filtered_ngrams = filter_ngrams(unigrams_to_eightgrams)
print("Filtered n-grams:", filtered_ngrams)

In [None]:
# Use OWLv2 to get bounding boxes: every filtered n-gram is a text query
# against the single input image.
inputs = processor(text=filtered_ngrams, images=image, return_tensors="pt")

# Inference only: disabling gradient tracking avoids building an autograd graph
# and the memory that comes with it.
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports size as (width, height); the post-processor expects (height, width).
# NOTE(review): OWLv2 pads the image to a square before resizing. Depending on
# the installed transformers version, boxes rescaled with the original
# (height, width) may be off along the padded axis — confirm.
target_sizes = torch.Tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)

In [None]:
# Generate triplets
print("Before generating triplets:")
print("Filtered n-grams length:", len(filtered_ngrams))
print("Results length:", len(results))
print("Triplets generated:")

# Build one (image, bounding_box, caption) triplet per detected box.
# The post-processed results contain one entry per input image, not one per
# text query, so they must not be zipped against the n-gram list (that would
# caption every box with the first n-gram). Each box instead carries a label:
# the index of the text query it matched, which selects the n-gram.
triplets = []
for result in results:
    for box, label in zip(result["boxes"], result["labels"]):
        triplets.append({
            "image": image,
            "bounding_box": box,
            "caption": filtered_ngrams[int(label)],
        })

print("Triplets length:", len(triplets))


In [None]:
# Recover a viewable image from the processor's normalized pixel values
def get_preprocessed_image(pixel_values):
    """Undo the CLIP mean/std normalization and return a PIL image.

    Args:
        pixel_values: Tensor of normalized pixel values. After squeezing it is
            treated as channels-first with one channel per entry of
            OPENAI_CLIP_MEAN / OPENAI_CLIP_STD (presumably a (1, 3, H, W)
            batch holding a single image — confirm against the caller).

    Returns:
        PIL.Image.Image: The unnormalized image as 8-bit, channels-last data.
    """
    channels_first = pixel_values.squeeze().numpy()
    # Reshape the per-channel statistics so they broadcast over height and width.
    std = np.array(OPENAI_CLIP_STD)[:, None, None]
    mean = np.array(OPENAI_CLIP_MEAN)[:, None, None]
    as_uint8 = ((channels_first * std + mean) * 255).astype(np.uint8)
    # PIL expects channels last.
    return Image.fromarray(np.moveaxis(as_uint8, 0, -1))


In [None]:
# Rebuild the image exactly as the model saw it and rescale the predicted boxes
# to that image's size, so boxes and pixels share one coordinate frame.
unnormalized_image = get_preprocessed_image(inputs.pixel_values)
padded_width, padded_height = unnormalized_image.size
target_sizes = torch.Tensor([[padded_height, padded_width]])
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)

In [None]:
# Sanity check: show the detections and the size of the image they refer to.
for heading, value in (
    ("Results:", results),
    ("Unnormalized image size:", unnormalized_image.size),
):
    print(heading, value)

In [None]:
# Draw on a copy of the model-view image so `unnormalized_image` stays clean.
# Despite its name, `original_image` is this copy, not the downloaded image.
original_image = unnormalized_image.copy()

# Create a draw object
draw = ImageDraw.Draw(original_image)

# Draw every detected box with the phrase it actually matched.
# The results contain one entry per input image, so zipping them with the
# n-gram list would label every box with the first n-gram; the per-box label
# is the index of the matched text query.
for result in results:
    for box, label in zip(result["boxes"], result["labels"]):
        x1, y1, x2, y2 = (round(coord, 2) for coord in box.tolist())
        draw.rectangle(xy=((x1, y1), (x2, y2)), outline="red")
        draw.text(xy=(x1, y1), text=filtered_ngrams[int(label)], fill="red")

# Show the image with bounding boxes
original_image.show()

In [None]:
# Show the annotated image using matplotlib.
# The on-screen size is set through the figure rather than by upscaling the
# pixels: resizing to 10000x10000 allocated a 100-megapixel image that
# matplotlib then had to downsample again for display.
fig, ax = plt.subplots(figsize=(10, 10))  # Adjust the size as needed
ax.imshow(original_image)
ax.axis('off')  # Hide axis
plt.show()