How are we achieving image to prompts?

Using the pretrained BLIP (Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation)[https://huggingface.co/docs/transformers/main/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.forward.example] model from huggingface. Originally from LAVIS (Language-Vision Intelligence)[https://github.com/salesforce/LAVIS] by Salesforce

In [5]:
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
import keras

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", 
)
model.to(device)
url = "/Users/rajathdb/Image2Prompts/sdip-images/images/20057f34d.png"
image = Image.open(url)

inputs = processor(images=image, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



a large circular hole in the ground


Following the pipeline, we pass it to a question generator from the generated caption, which is used to further enhance the prompt by focusing on the potential region of interest.

In [7]:
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
)
model.to(device)
url = "/Users/rajathdb/Image2Prompts/sdip-images/images/20057f34d.png"
image = Image.open(url)

prompt = f"Question: {generated_text}? Answer:"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device,)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

yes


In [4]:
generated_text

''

In [2]:
import glob
import os
import numpy as np


comp_path = 'sdip-images'
imag_path = glob.glob(f'{comp_path}/images/*.png')
images = os.listdir(os.path.join(comp_path, 'images'))
imgIds = [i.split('.')[0] for i in images]

EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

imgId_eId = [
    '_'.join(map(str, i)) for i in zip(
        np.repeat(imgIds, EMBEDDING_LENGTH),
        np.tile(range(EMBEDDING_LENGTH), len(imgIds)))
]

In [21]:
imag_path

['sdip-images/images/d8edf2e40.png',
 'sdip-images/images/a4e1c55a9.png',
 'sdip-images/images/227ef0887.png',
 'sdip-images/images/20057f34d.png',
 'sdip-images/images/92e911621.png',
 'sdip-images/images/f27825b2c.png',
 'sdip-images/images/c98f79f71.png']

In [6]:
import tensorflow as tf

def read_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.io.decode_png(image, 3)
    image = tf.image.resize(image, (384, 384))
    return image

def tf_preprocess(images):
    x = keras.layers.Rescaling(
        scale=processor.image_processor.rescale_factor
    )(images)
    x = keras.layers.Normalization(
        mean=processor.image_processor.image_mean,
        variance=[x ** 2 for x in processor.image_processor.image_std],
        axis=3
    )(x)
    x = keras.layers.Permute(
        dims=(3, 1, 2)
    )(x)
    return x

def dataloader(
    image_paths, batch_size=1
):
    dataset = tf.data.Dataset.from_tensor_slices(
        (image_paths)
    )
    dataset = dataset.map(read_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=False)
    dataset = dataset.map(tf_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.prefetch(tf.data.AUTOTUNE)

In [8]:
test_ds = dataloader(
    imag_path, batch_size=32
)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-04-09 06:08:37.129889: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-09 06:08:37.133349: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
test_ds

<PrefetchDataset element_spec=TensorSpec(shape=(None, 3, 384, 384), dtype=tf.float32, name=None)>

In [10]:
from sentence_transformers import SentenceTransformer, models
st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [19]:
def predict(test_ds):

    global pred_prompts
    submissions = []

    for i, batch in enumerate(test_ds):
        inputs = processor(images=batch, return_tensors="pt").to(device)
        output = model.generate(**inputs)
        pred_prompts = processor.batch_decode(output, skip_special_tokens=True)
        
        submissions.append(
            model.encode(pred_prompts)
        )
    
    return submissions

In [20]:
prompt_embeddings = predict(test_ds)