## Name- Keshatwar Pratham Naresh
## 20115052


# Install libraries 


In [98]:
!pip install transformers
!pip install git+https://github.com/openai/CLIP.git
!pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-8hj3k6cx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-8hj3k6cx
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25ldone
[0mhuggingface/tokenizers: The current pr

In [99]:
import clip
import os
from torch import nn
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as nnf
import sys
from typing import Tuple, List, Union, Optional
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
from sentence_transformers import SentenceTransformer
import skimage.io as io
import PIL.Image
from IPython.display import Image 
from sentence_transformers import SentenceTransformer


N = type(None)
V = np.array
ARRAY = np.ndarray
ARRAYS = Union[Tuple[ARRAY, ...], List[ARRAY]]
VS = Union[Tuple[V, ...], List[V]]
VN = Union[V, N]
VNS = Union[VS, N]
T = torch.Tensor
TS = Union[Tuple[T, ...], List[T]]
TN = Optional[T]
TNS = Union[Tuple[TN, ...], List[TN]]
TSN = Optional[TS]
TA = Union[T, ARRAY]


D = torch.device
CPU = torch.device('cpu')


def get_device(device_id: int) -> D:
    if not torch.cuda.is_available():
        return CPU
    device_id = min(torch.cuda.device_count() - 1, device_id)
    return torch.device(f'cuda:{device_id}')


CUDA = get_device


In [100]:
is_gpu = True #use gpu

# Model

In [101]:
class MLP(nn.Module):

    def forward(self, x: T) -> T:
        return self.model(x)

    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        super(MLP, self).__init__()
        layers = []
        for i in range(len(sizes) -1):
            layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=bias))
            if i < len(sizes) - 2:
                layers.append(act())
        self.model = nn.Sequential(*layers)


class ClipCaptionModel(nn.Module):

    #@functools.lru_cache #FIXME
    def get_dummy_token(self, batch_size: int, device: D) -> T:
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def forward(self, tokens: T, prefix: T, mask: Optional[T] = None, labels: Optional[T] = None):
        embedding_text = self.gpt.transformer.wte(tokens)
        prefix_projections = self.clip_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
        #print(embedding_text.size()) #torch.Size([5, 67, 768])
        #print(prefix_projections.size()) #torch.Size([5, 1, 768])
        embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((dummy_token, tokens), dim=1)
        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out

    def __init__(self, prefix_length: int, prefix_size: int = 512):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        if prefix_length > 10:  # not enough memory
            self.clip_project = nn.Linear(prefix_size, self.gpt_embedding_size * prefix_length)
        else:
            self.clip_project = MLP((prefix_size, (self.gpt_embedding_size * prefix_length) // 2, self.gpt_embedding_size * prefix_length))


class ClipCaptionPrefix(ClipCaptionModel):

    def parameters(self, recurse: bool = True):
        return self.clip_project.parameters()

    def train(self, mode: bool = True):
        super(ClipCaptionPrefix, self).train(mode)
        self.gpt.eval()
        return self

In [102]:
def generate_beam(model, tokenizer, beam_size: int = 5, prompt=None, embed=None,
                  entry_length=67, temperature=1., stop_token: str = '.'):

    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]
    tokens = None
    scores = None
    device = next(model.parameters()).device
    seq_lengths = torch.ones(beam_size, device=device)
    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
    with torch.no_grad():
        if embed is not None:
            generated = embed
        else:
            if tokens is None:
                tokens = torch.tensor(tokenizer.encode(prompt))
                tokens = tokens.unsqueeze(0).to(device)
                generated = model.gpt.transformer.wte(tokens)
        for i in range(entry_length):
            outputs = model.gpt(inputs_embeds=generated)
            logits = outputs.logits
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()
            if scores is None:
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]
            next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break
    scores = scores / seq_lengths
    output_list = tokens.cpu().numpy()
    output_texts = [tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
    order = scores.argsort(descending=True)
    output_texts = [output_texts[i] for i in order]
    return output_texts

In [103]:
device = CUDA(0) if is_gpu else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [104]:
model_path="/kaggle/input/conceptual-weights/conceptual_weights (1).pt" #load weights

# Load weights

In [105]:
prefix_length = 10

model = ClipCaptionModel(prefix_length)

model.load_state_dict(torch.load(model_path), strict=False)

model = model.eval() 
device = CUDA(0) if is_gpu else "cpu"
model = model.to(device)


In [106]:
from pathlib import Path
comp_path = Path('/kaggle/input/stable-diffusion-image-to-prompts/')

In [107]:
st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')#model to create sentencce encodings

# Create Dataframe

In [108]:
images = os.listdir(comp_path / 'images')
data = {'imgId_eId': [],
        'val':[]
       }
df = pd.DataFrame(data)
for image in images:
    img=image.split('.')[0]
    image = io.imread(comp_path/'images'/image)
    pil_image = PIL.Image.fromarray(image)
    #pil_img = Image(filename=UPLOADED_FILE
    image = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)
        generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
#     print('\n')
#     print(generated_text_prefix)
    embeddings=st_model.encode(generated_text_prefix)
    
    for i in range(384):
        df1={'imgId_eId':[f"{img}_{i}"],
                          'val':[embeddings[i]]}
        df=df.append(df1,ignore_index=True)
# print(df)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.append(df1,ignore_index=True)
  df=df.appe

In [109]:
df.head()

Unnamed: 0,imgId_eId,val
0,[f27825b2c_0],[0.009334797]
1,[f27825b2c_1],[0.028381407]
2,[f27825b2c_2],[-0.010940161]
3,[f27825b2c_3],[0.054522116]
4,[f27825b2c_4],[-0.07354231]


In [110]:
df.to_csv('submission.csv')
df = pd.read_csv('/kaggle/working/submission.csv')
df = df.drop(df.columns[0], axis=1)

In [111]:
df.head()

Unnamed: 0,imgId_eId,val
0,['f27825b2c_0'],[0.009334797]
1,['f27825b2c_1'],[0.028381407]
2,['f27825b2c_2'],[-0.010940161]
3,['f27825b2c_3'],[0.054522116]
4,['f27825b2c_4'],[-0.07354231]


In [112]:
df.to_csv('submission.csv')#final submission file