In [17]:
import torch
import torch.nn as nn
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration, Blip2VisionModel

# Demo image used by the experiments in this notebook, converted to RGB mode.
raw_image = Image.open("./images/demo.jpg").convert("RGB")

# Text prompt that is passed to the processors together with the image.
question = "how many dogs are in the picture?"

### BLIP2

In [None]:
# Load the BLIP-2 processor for the OPT-2.7b checkpoint; downloads are cached under ./cache.
processor = Blip2Processor.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    cache_dir="./cache",
)

# Run the processor on the demo image and the question, requesting PyTorch tensors ("pt").
inputs = processor(
    raw_image,
    question,
    return_tensors="pt",
)

#### Conditional Generation model

In [None]:
# Load the full BLIP-2 conditional-generation model in float32 on the CPU.
# Its `vision_model` sub-module is what gets saved and pushed in the following cells.
generative_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    cache_dir='./cache',
    device_map="cpu",
    torch_dtype=torch.float32,
)

In [None]:
# Save only the vision sub-module of the generative model to a local directory.
generative_model.vision_model.save_pretrained(
    save_directory="./blip2-opt-2.7b-vision",
)

- Push vision model to hub

In [None]:
import os

# SECURITY: never embed a Hugging Face access token in the notebook. The token that used to
# be hardcoded on this line is exposed to anyone who can read the file and should be revoked.
# The token is now read from the HF_TOKEN environment variable. When the variable is unset,
# None is passed and the library is expected to fall back to the credentials stored by
# `huggingface-cli login` (see the push_to_hub documentation).
hf_token = os.environ.get("HF_TOKEN")

# Upload the vision sub-module to a private Hub repository.
generative_model.vision_model.push_to_hub(
    "tmnam20/blip2-opt-2.7b-vision",
    private=True,
    token=hf_token,
)

- Push processor to hub

In [None]:
import os

# SECURITY: never embed a Hugging Face access token in the notebook. The token that used to
# be hardcoded on this line is exposed to anyone who can read the file and should be revoked.
# The token is now read from the HF_TOKEN environment variable. When the variable is unset,
# None is passed and the library is expected to fall back to the credentials stored by
# `huggingface-cli login` (see the push_to_hub documentation).
hf_token = os.environ.get("HF_TOKEN")

# Upload the processor files to the same private Hub repository as the vision model.
processor.push_to_hub(
    "tmnam20/blip2-opt-2.7b-vision",
    private=True,
    token=hf_token,
)

#### Vision models

In [2]:
# Reload the processor and the stand-alone vision encoder from the Hub repository.
# NOTE(review): torch_dtype is presumably irrelevant for a processor (it holds no weights);
# confirm and drop the argument if so.
vision_processor = Blip2Processor.from_pretrained(
    "tmnam20/blip2-opt-2.7b-vision",
    torch_dtype=torch.float32,
    cache_dir="./cache",
)
vision_model = Blip2VisionModel.from_pretrained(
    "tmnam20/blip2-opt-2.7b-vision",
    device_map='cpu',
    torch_dtype=torch.float32,
    cache_dir="./cache",
)

# Put the encoder in evaluation mode, then display its module tree as the cell output.
vision_model = vision_model.eval()
vision_model

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/708 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Blip2VisionModel(
  (embeddings): Blip2VisionEmbeddings(
    (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (encoder): Blip2Encoder(
    (layers): ModuleList(
      (0-38): 39 x Blip2EncoderLayer(
        (self_attn): Blip2Attention(
          (dropout): Dropout(p=0.0, inplace=False)
          (qkv): Linear(in_features=1408, out_features=4224, bias=True)
          (projection): Linear(in_features=1408, out_features=1408, bias=True)
        )
        (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Blip2MLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        )
        (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      )
    )
  )
  (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
)

In [6]:
# Preprocess the demo image (and the question) with the reloaded processor.
# Only the 'pixel_values' entry of the result is read by the cells that follow.
inputs = vision_processor(
    raw_image,
    question,
    return_tensors="pt",
)

In [7]:
# Encode the single preprocessed image. Gradients are disabled because this cell only
# inspects the output shapes.
input_pixels = inputs["pixel_values"]
with torch.no_grad():
    vision_embeddings = vision_model(pixel_values=input_pixels)

print(f'Input pixels shape: {input_pixels.shape}')
print(f'Vision embeddings shape: {vision_embeddings.last_hidden_state.shape}')

Input pixels shape: torch.Size([1, 3, 224, 224])
Vision embeddings shape: torch.Size([1, 257, 1408])


In [8]:
# Build a fake "video" batch from the single image: insert a frame axis, then tile the
# tensor to 2 batch items x 4 frames.
input_pixels = inputs["pixel_values"]  # [batch_size, 3, height, width]
print(f'Shape before extend: {input_pixels.shape}')

# Insert a new axis at position 1 -> [batch_size, 1, 3, height, width]
input_pixels = torch.unsqueeze(input_pixels, dim=1)
print(f'Shape after extend: {input_pixels.shape}')

# Repeat the batch axis twice and the new frame axis four times; other axes are untouched.
input_pixels = input_pixels.repeat(2, 4, 1, 1, 1)
print(f'Shape after repeat: {input_pixels.shape}')

Shape before extend: torch.Size([1, 3, 224, 224])
Shape after extend: torch.Size([1, 1, 3, 224, 224])
Shape after repeat: torch.Size([2, 4, 3, 224, 224])


In [24]:
# Switch to training mode. The forward passes below run with autograd enabled (there is no
# no_grad context), so the stacked result can be back-propagated in the next cell.
vision_model.train()

# Encode each batch item on its own: one item is a stack of frames that the vision model
# receives as its `pixel_values` batch.
per_item_states = []
for batch_idx, item in enumerate(input_pixels):
    print(batch_idx)
    item = item.contiguous()
    encoded_item = vision_model(pixel_values=item)
    per_item_states.append(encoded_item.last_hidden_state)

# Re-assemble the per-item outputs along a new leading axis.
encoded = torch.stack(per_item_states, dim=0)

0
1


- Check that gradients can be back-propagated through the vision model (backward pass)

In [13]:
# Smoke test for the backward pass: reduce the stacked encoder outputs to a scalar with
# sum() and call backward() on it. The cell passes if no exception is raised.
encoded.sum().backward()

In [25]:
# Show the shape of the stacked per-item encoder outputs.
print(f'Shape of encoded: {encoded.shape}')

Shape of encoded: torch.Size([2, 4, 257, 1408])


In [26]:
# Layout of `encoded`: [batch_size, num_frames, num_patches, hidden_size]
# Keep only the token at index 0 of the num_patches axis for every frame.
# NOTE(review): index 0 is presumably the class token of the vision encoder; confirm.
# NOTE: this overwrites `encoded`, so re-running this cell requires re-running the
# encoding cell first (a second run would index a tensor that no longer has four axes).
encoded = encoded[:, :, 0, :]
encoded.shape

torch.Size([2, 4, 1408])

In [27]:
# Linear layer that maps each hidden_size-dimensional feature vector to 5 outputs.
projection_layer = nn.Linear(
    in_features=vision_model.config.hidden_size,
    out_features=5,
)

In [28]:
# Apply the projection layer to the per-frame features; nn.Linear acts on the last axis,
# so the leading axes of `encoded` are carried through unchanged.
scaled_encoded_features = projection_layer(encoded)
print(f'Projected image features shape = {scaled_encoded_features.shape}')

Projected image features shape = torch.Size([2, 4, 5])


### CLIP image features

In [None]:
import torch
import clip
from PIL import Image

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)

# Preprocess the demo image, add a leading batch axis and move it to the chosen device.
image = preprocess(Image.open("images/demo.jpg"))
image = image.unsqueeze(0).to(device)

# Tokenize the candidate captions that the image is scored against.
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    # Stand-alone feature vectors; image_features is inspected in the next cell.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Joint forward pass: image-to-text logits, turned into probabilities with a softmax
    # over the last axis.
    logits_per_image, logits_per_text = model(image, text)
    probs = torch.softmax(logits_per_image, dim=-1).cpu().numpy()

print("Label probs:", probs)

In [None]:
# Display the shape of the CLIP image features computed in the previous cell.
image_features.shape

### Llama 2

In [34]:
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM

In [35]:
# Load the Llama-2 7B tokenizer and causal-LM weights, using the same download cache and
# CPU placement as the other loaders in this notebook.
#
# Two keyword arguments were misspelled before:
#   * `cache=` is not the cache-location argument of from_pretrained; `cache_dir=` is.
#     The tokenizer therefore did not use ./cache as intended.
#   * `device=` is not a from_pretrained placement argument; `device_map=` is. The unknown
#     keyword is forwarded to the model constructor instead of controlling placement.
# NOTE(review): "meta-llama/Llama-2-7b-hf" is presumably a gated repository, so an
# authenticated Hugging Face session may be required; confirm.
llama_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    cache_dir="./cache",
)
llama_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="cpu",
    cache_dir="./cache",
)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

### Video Llama

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoLlama(nn.Module):
    """Skeleton of the VideoLlama model; the forward pass is not implemented yet."""

    def __init__(self):
        # nn.Module keeps its parameter/sub-module/hook registries on the instance and
        # creates them in its own __init__. Without this call the object cannot be
        # called, inspected with parameters(), or have layers assigned to it.
        super().__init__()

    def forward(self, frames, tokenized_input, **kwargs):
        """Forward pass of VideoLlama.

        Args:
            frames (torch.Tensor): Input in shape
                (batch_size, seq_len, channels, height, width). The height-before-width
                order is assumed from the [batch_size, 3, height, width] layout used for
                pixel tensors elsewhere in this notebook; TODO confirm.
            tokenized_input (dict): input dictionary of the prompts
            **kwargs: Additional keyword arguments; currently unused.

        Returns:
            None: placeholder until the forward pass is implemented.
        """
        # TODO: implement the forward pass.
        return None