In [5]:
import torch
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration, Blip2VisionModel

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b", cache_dir="./cache")

raw_image = Image.open("./images/demo.jpg").convert('RGB')

question = "how many dogs are in the picture?"
inputs = processor(raw_image, question, return_tensors="pt")

### BLIP2

#### Vision models

In [6]:
vision_model = Blip2VisionModel.from_pretrained("blip2-opt-2.7b-vision", device_map='cpu', torch_dtype=torch.float32, cache_dir="./cache")

In [7]:
vision_model = vision_model.eval()
vision_model

Blip2VisionModel(
  (embeddings): Blip2VisionEmbeddings(
    (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (encoder): Blip2Encoder(
    (layers): ModuleList(
      (0-38): 39 x Blip2EncoderLayer(
        (self_attn): Blip2Attention(
          (dropout): Dropout(p=0.0, inplace=False)
          (qkv): Linear(in_features=1408, out_features=4224, bias=True)
          (projection): Linear(in_features=1408, out_features=1408, bias=True)
        )
        (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Blip2MLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        )
        (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
)

In [17]:
input_pixels = inputs['pixel_values']
with torch.no_grad():
    vision_embeddings = vision_model(input_pixels)
print(f'Input pixels shape: {input_pixels.shape}')
print(f'Vision embeddings shape: {vision_embeddings.last_hidden_state.shape}')

Input pixels shape: torch.Size([1, 3, 224, 224])
Vision embeddings shape: torch.Size([1, 257, 1408])


In [19]:
input_pixels = inputs['pixel_values']
print(f'Shape before extend: {input_pixels.shape}') # [batch_size, 3, height, width]
# extend to [batch_size, 1, 3, height, width]
input_pixels = input_pixels.unsqueeze(1)
print(f'Shape after extend: {input_pixels.shape}')
input_pixels = input_pixels.repeat(1, 5, 1, 1, 1)
print(f'Shape after repeat: {input_pixels.shape}')

Shape before extend: torch.Size([1, 3, 224, 224])
Shape after extend: torch.Size([1, 1, 3, 224, 224])
Shape after repeat: torch.Size([1, 5, 3, 224, 224])


In [20]:
with torch.no_grad():
    vision_embeddings = vision_model(input_pixels)
print(f'Input pixels shape: {input_pixels.shape}')
print(f'Vision embeddings shape: {vision_embeddings.last_hidden_state.shape}')

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 5, 3, 224, 224]

In [3]:
with torch.no_grad():
    vision_embeddings = vision_model(inputs['pixel_values'].cpu())
vision_embeddings.last_hidden_state.shape

torch.Size([1, 257, 1408])

In [2]:
generative_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float32, device_map="cpu", cache_dir='./cache')

# out = generative_model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True).strip())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Generative models

In [4]:
generative_model.vision_model.save_pretrained("./blip2-opt-2.7b-vision")

In [4]:
processor.push_to_hub("blip2-opt-2.7b-vision", private=True, token="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")

CommitInfo(commit_url='https://huggingface.co/tmnam20/blip2-opt-2.7b-vision/commit/4f8a5e1ca831a240e52812b6ad99bcff5c17e9bc', commit_message='Upload processor', commit_description='', oid='4f8a5e1ca831a240e52812b6ad99bcff5c17e9bc', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
generative_model.vision_model.push_to_hub("tmnam20/blip2-opt-2.7b-vision", private=True, token="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")

model.safetensors:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tmnam20/blip2-opt-2.7b-vision/commit/265bfa5f488f9ceca1c296f63fbeee30b70497d8', commit_message='Upload model', commit_description='', oid='265bfa5f488f9ceca1c296f63fbeee30b70497d8', pr_url=None, pr_revision=None, pr_num=None)

### CLIP image features

In [1]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("images/demo.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)

x.shape (before transformer) = torch.Size([50, 1, 768])
x.shape (after transformer) = torch.Size([1, 50, 768])
x.shape (before transformer) = torch.Size([50, 1, 768])
x.shape (after transformer) = torch.Size([1, 50, 768])
Label probs: [[0.0123  0.962   0.02563]]


In [2]:
image_features.shape

torch.Size([1, 512])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoLlama(nn.Module):

    def __init__(self):
        pass
    
    def forward(self, x):
        pass