In [None]:
import torch
import torch.nn as nn
from transformers import BlipForConditionalGeneration, AutoProcessor, ViTModel

class CustomBLIP(nn.Module):
    def __init__(self, vision_encoder_name="vit-small-patch16-224",
                 base_blip_model="blip-image-captioning-base"):
        super().__init__()

        # Load pretrained BLIP
        self.blip = BlipForConditionalGeneration.from_pretrained(base_blip_model)

        # Load a new smaller vision encoder
        self.new_vision_encoder = ViTModel.from_pretrained(vision_encoder_name)
        
        blip_proj_in = self.blip.vision_model.encoder.config.projection_dim
        # Replace vision encoder
        self.blip.vision_model = self.new_vision_encoder

        # Match dimensions if needed
        vision_hidden_size = self.new_vision_encoder.config.hidden_size

        if vision_hidden_size != blip_proj_in:
            print(f"Projecting vision features from {vision_hidden_size} -> {blip_proj_in}")
            self.blip.vision_model.encoder.config.projection_dim = vision_hidden_size

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        return self.blip(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )

    def generate(self, pixel_values, **gen_kwargs):
        return self.blip.generate(pixel_values=pixel_values, **gen_kwargs)

In [39]:
from PIL import Image
from transformers import AutoProcessor
import requests

#image = Image.open("example.jpg").convert("RGB")
url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
processor = AutoProcessor.from_pretrained("blip-image-captioning-base")
processor.image_processor = AutoProcessor.from_pretrained("vit-small-patch16-224")

inputs = processor(images=image, return_tensors="pt").to("cuda")
model = CustomBLIP().to("cuda")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTModel were not initialized from the model checkpoint at vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Projecting vision features from 384 -> 512


In [40]:
out = model.generate(pixel_values=inputs["pixel_values"])
caption = processor.tokenizer.batch_decode(out, skip_special_tokens=True)[0]
print("Caption:", caption)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (197x384 and 768x768)

In [53]:
blip = BlipForConditionalGeneration.from_pretrained("blip-image-captioning-base").to("cuda")
#blip.vision_model.encoder.config.projection_dim
#blip.vision_model.encoder.add_module(
#    "projection",
#    nn.Linear(blip.vision_model.encoder.config.hidden_size, blip.vision_model.encoder.config.projection_dim)
#)

In [54]:
print(blip)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [30]:
print(blip.text_decoder.bert.embeddings.position_embeddings.weight.shape)

torch.Size([512, 768])


In [35]:
print(blip.vision_model.embeddings.patch_embedding)

Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))


In [51]:
processor = AutoProcessor.from_pretrained("blip-image-captioning-base")
print(processor.image_processor)

BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "BlipProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 384,
    "width": 384
  }
}



In [110]:
vit = ViTModel.from_pretrained("vit-small-patch16-224").to("cuda")

print(vit)

Some weights of ViTModel were not initialized from the model checkpoint at vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=384, out_features=1536, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): ViTOutput(
          (d

In [124]:
vit_processor = AutoProcessor.from_pretrained("vit-small-patch16-224")
inputs = vit_processor(images=image, return_tensors="pt").to("cuda")
vit_out = vit(**inputs)

layernorm_proj = nn.Linear(vit_out[0].shape[-1], 768).to("cuda")

vit_out.last_hidden_state = layernorm_proj(vit_out[0])

vit_out[0].shape, vit_out[1].shape

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


(torch.Size([1, 197, 768]), torch.Size([1, 384]))

In [69]:
inputs = processor(images=image, return_tensors="pt").to("cuda")
blip_out = blip.vision_model(**inputs)
blip_out[0].shape

torch.Size([1, 577, 768])

In [83]:
type(blip_out)

transformers.modeling_outputs.BaseModelOutputWithPooling

In [125]:
image_embeds = vit_out[0]
image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

input_ids = (
    torch.LongTensor([[blip.decoder_input_ids, blip.config.get_text_config().eos_token_id]])
    .to(image_embeds.device)
)

input_ids[:, 0] = blip.config.get_text_config().bos_token_id
attention_mask = None

In [126]:
outputs = blip.text_decoder.generate(
    input_ids=input_ids[:, :-1],
    eos_token_id=blip.config.get_text_config().sep_token_id,
    pad_token_id=blip.config.get_text_config().pad_token_id,
    attention_mask=attention_mask,
    encoder_hidden_states=image_embeds,
    encoder_attention_mask=image_attention_mask,
)

In [127]:
processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

'sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc sc'