## Importing Libraries and Paths


In [1]:
import numpy as np 
import pandas as pd 


import requests
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import torch
from transformers import AutoProcessor, BlipForConditionalGeneration

import os
import sys
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer, models

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Set Up Paths and Device

In [2]:
# Root of the competition dataset; folder_path is not used by the cells visible here.
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')
folder_path = "/kaggle/working/"

# Full path of every file found under images/, sorted by name within each directory.
image_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input/stable-diffusion-image-to-prompts/images/')
    for name in sorted(names)
]

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Image ids are the file names in images/ cut at the first '.', in sorted order.
images = os.listdir(comp_path / 'images')
images.sort()
imgIds = [file_name.split('.')[0] for file_name in images]

EMBEDDING_LENGTH = 384
eIds = list(range(EMBEDDING_LENGTH))

# One '<imgId>_<eId>' key per embedding component, grouped image by image.
imgId_eId = [
    f'{img_id}_{e_id}'
    for img_id in imgIds
    for e_id in range(EMBEDDING_LENGTH)
]

## Loading the Sentence-Transformer Model

In [4]:
# Load the sentence-embedding model from a local dataset directory.
# NOTE(review): EMBEDDING_LENGTH is set to 384 elsewhere in this notebook and
# presumably matches this model's output size; confirm if the model changes.
st_model = SentenceTransformer('/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2')

## Image Captioning with BLIP

In [5]:
# The preprocessor and the captioning weights are read from the same local checkpoint directory.
blip_checkpoint = "/kaggle/input/salesforceblip-image-caption"
processor = AutoProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

# Move the weights to the selected device; as the last expression this also displays the model.
model.to(device)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-23): 24 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1024,),

In [6]:
def image_to_prompt(raw_image: "Image.Image", max_new_tokens: int = 32) -> str:
    """Caption a single image with the BLIP model loaded in this notebook.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        raw_image: Image to caption; it is passed straight to ``processor``.
        max_new_tokens: Upper bound on the number of tokens generated for the
            caption. Defaults to 32, the value that used to be hard-coded.

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Preprocess into PyTorch tensors and move them to `device`.
    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Only one image is passed in, so the first decoded string is the caption.
    generated_prompt = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
    return generated_prompt

In [7]:
# Caption every image in image_files order, so generated_prompts[k] belongs to image_files[k].
generated_prompts = []

for file in tqdm(image_files, desc='Generating prompts'):
    # Open inside a context manager so the file handle is released as soon as
    # the caption has been produced, instead of one handle leaking per image.
    with Image.open(file) as image:
        prompt = image_to_prompt(image)

    generated_prompts.append(prompt)
    

Generating prompts: 100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


### Prompts generated

In [8]:
# Display the generated captions, one per entry of image_files and in the same order.
generated_prompts

['arafed view of a circular hole in the middle of a desert',
 'a close up of a wooden plate with a swirl design on it',
 'cartoon dinosaur with a piece of cheese in its mouth in a forest',
 'there is a drawing of a robot holding a hammer',
 'painting of a man with a lizard on his head and a lizard on his shoulder',
 'arafed astronaut walking down a path in a park with cherry trees',
 'there is a man standing in front of a counter with a pizza']

### Embedding Calculation

In [9]:

# Encode all captions, then lay the per-image vectors end to end as one long 1-D array.
prompt_embeddings = st_model.encode(generated_prompts).flatten()

# Single 'val' column keyed by the '<imgId>_<eId>' labels, with the index named 'imgId_eId'.
submission = pd.DataFrame(
    {'val': prompt_embeddings},
    index=pd.Index(imgId_eId, name='imgId_eId'),
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Display the submission frame for a quick visual check of ids and values.
submission

Unnamed: 0_level_0,val
imgId_eId,Unnamed: 1_level_1
20057f34d_0,0.058347
20057f34d_1,0.081329
20057f34d_2,-0.042682
20057f34d_3,0.033359
20057f34d_4,0.017850
...,...
f27825b2c_379,0.089900
f27825b2c_380,-0.010372
f27825b2c_381,-0.007558
f27825b2c_382,-0.021953


## Final File Submission

In [11]:
# Write the frame to submission.csv in the current working directory; the index is
# written as the first column alongside 'val'.
submission.to_csv('submission.csv')