## 使用HuggingFace Transformers进行推理

In [1]:
import transformers
model = transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [2]:
!wget https://cdn.britannica.com/79/232779-050-6B0411D7/German-Shepherd-dog-Alsatian.jpg

--2024-10-09 03:29:41--  https://cdn.britannica.com/79/232779-050-6B0411D7/German-Shepherd-dog-Alsatian.jpg
Resolving cdn.britannica.com (cdn.britannica.com)... 18.164.154.38, 18.164.154.49, 18.164.154.6, ...
Connecting to cdn.britannica.com (cdn.britannica.com)|18.164.154.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 344524 (336K) [image/jpeg]
Saving to: ‘German-Shepherd-dog-Alsatian.jpg’


2024-10-09 03:29:41 (11.5 MB/s) - ‘German-Shepherd-dog-Alsatian.jpg’ saved [344524/344524]



In [4]:
ls

German-Shepherd-dog-Alsatian.jpg  [0m[01;34msample_data[0m/


In [18]:
from PIL import Image

images = [Image.open("./German-Shepherd-dog-Alsatian.jpg")]
possible_classes = ["an image of a bird", "an image of a cat", "an image of a dog"]

In [19]:
import torch

with torch.no_grad():
    inputs = processor(text=possible_classes, images=images, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)

In [28]:
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

在 CLIP (Contrastive Language–Image Pretraining) 模型中，输出字典中的各个键对应不同的模型输出信息。以下是这些键的具体含义：

1. **`logits_per_image`**:
   - 这是一个张量，形状为 `[batch_size, num_texts]`。
   - 表示每个图像对每个文本的相似度分数。这些分数可以用来计算图像和文本之间的匹配概率。

2. **`logits_per_text`**:
   - 这是一个张量，形状为 `[num_texts, batch_size]`。
   - 表示每个文本对每个图像的相似度分数。这些分数可以用来计算文本和图像之间的匹配概率。

3. **`text_embeds`**:
   - 这是一个张量，形状为 `[num_texts, embed_dim]`。
   - 表示每个文本的嵌入向量（embedding）。这些嵌入向量是文本的固定长度表示，用于后续的对比学习任务。

4. **`image_embeds`**:
   - 这是一个张量，形状为 `[batch_size, embed_dim]`。
   - 表示每个图像的嵌入向量（embedding）。这些嵌入向量是图像的固定长度表示，用于后续的对比学习任务。

5. **`text_model_output`**:
   - 这是一个包含文本模型输出的字典。
   - 具体内容取决于文本模型的实现，通常包括中间层的输出、注意力权重等。

6. **`vision_model_output`**:
   - 这是一个包含视觉模型输出的字典。
   - 具体内容取决于视觉模型的实现，通常包括中间层的输出、特征图等。

In [20]:
dot_products_per_image = outputs.logits_per_image
dot_products_per_text = outputs.logits_per_text

In [21]:
dot_products_per_image

tensor([[17.7861, 20.5859, 26.9440]])

In [22]:
dot_products_per_text

tensor([[17.7861],
        [20.5859],
        [26.9440]])

In [23]:
# 表示哪个文本与图像的相似度最高
probabilities = dot_products_per_image.softmax(dim=1)
max_indices = torch.argmax(probabilities, dim=1)

print(max_indices)

tensor([2])


In [29]:
# 表示每个文本与所有图像之间相似度最高的图像的索引。在这个例子中，由于只有一个图像，所以每个文本的最大相似度索引都是 0。
probabilities_text = dot_products_per_text.softmax(dim=1)
max_indices_text = torch.argmax(probabilities_text, dim=1)

print(max_indices_text)

tensor([0, 0, 0])


## 多模态模型中的visual encoder探究

参考：https://github.com/jingyaogong/minimind-v/blob/master/model/vision_utils.py

In [37]:
import warnings
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch

warnings.filterwarnings('ignore')


def get_vision_model():
    # 加载预训练的CLIP模型和处理器
    model_path = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_path)
    processor = CLIPProcessor.from_pretrained(model_path)
    return (model, processor)


def get_img_process(image, processor):
    image = image[0]
    # 将图像调整为224*224大小
    image = image.resize((224, 224))
    # 使用CLIPProcessor处理每个patch
    inputs = processor(images=image, return_tensors="pt", clean_up_tokenization_spaces=False)
    return inputs

def get_img_embedding(batch_encoding, vision_model):
    embeddings = []

    def hook_fn(module, input, output):
        # 将特征添加到 embeddings 列表中
        embeddings.append(output.last_hidden_state)

    # 从 BatchEncoding 中提取图像张量
    image_tensor = batch_encoding['pixel_values']  # 假设图像张量在 'pixel_values' 键下

    # 如果图像张量的形状是5维，则无需添加额外维度
    if len(image_tensor.shape) == 4:
        image_tensor = image_tensor.unsqueeze(0)  # 添加批次维度

    # 获取批次大小
    batch_size = image_tensor.size(0)

    with torch.no_grad():
        # 注册 hook 到模型的目标层（例如 vision_model 的倒数第二层）
        layer = vision_model.vision_model.encoder
        hook = layer.register_forward_hook(hook_fn)

        for i in range(batch_size):
            # 取出当前批次中的单个图像
            single_image = image_tensor[i]  # 添加批次维度
            # 调用 get_image_features 来获取图像特征
            _ = vision_model.get_image_features(single_image)
        # 取消 hook
        hook.remove()

    # 拼接所有特征向量成为一个张量
    all_embeddings = torch.stack(embeddings, dim=0).squeeze()
    return all_embeddings

if __name__=="__main__":
    (vision_model, preprocess) = get_vision_model()
    vision_model = vision_model.to("cuda")

    image = [Image.open("./German-Shepherd-dog-Alsatian.jpg")]
    image_process = get_img_process(image, processor)
    image_process = image_process.to("cuda")

    image_encoders = get_img_embedding(image_process, vision_model)
    print(image_encoders.shape)

torch.Size([50, 768])


In [36]:
import warnings
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch

warnings.filterwarnings('ignore')

def get_vision_model():
    model_path = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_path)
    processor = CLIPProcessor.from_pretrained(model_path)
    return model, processor

def get_img_process(image, processor):
    image = image[0]
    image = image.resize((224, 224))
    inputs = processor(images=image, return_tensors="pt", clean_up_tokenization_spaces=False)
    return inputs

def get_img_embedding(batch_encoding, vision_model):
    # 从 BatchEncoding 中提取图像张量
    image_tensor = batch_encoding['pixel_values']

    # 移动数据到 GPU
    image_tensor = image_tensor.to("cuda")

    with torch.no_grad():
        # 直接获取图像特征
        image_features = vision_model.get_image_features(image_tensor)

    return image_features

if __name__ == "__main__":
    vision_model, processor = get_vision_model()
    vision_model = vision_model.to("cuda")

    image = [Image.open("./German-Shepherd-dog-Alsatian.jpg")]
    image_process = get_img_process(image, processor)

    image_encoders = get_img_embedding(image_process, vision_model)
    print(image_encoders.shape)

torch.Size([1, 512])


In [38]:
import warnings
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

warnings.filterwarnings('ignore')

def get_vision_model():
    model_path = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_path)
    processor = CLIPProcessor.from_pretrained(model_path)
    return model, processor

def get_img_process(image, processor):
    image = image[0]
    image = image.resize((224, 224))
    inputs = processor(images=image, return_tensors="pt", clean_up_tokenization_spaces=False)
    return inputs

def get_img_embedding(batch_encoding, vision_model):
    # 从 BatchEncoding 中提取图像张量
    image_tensor = batch_encoding['pixel_values']

    # 移动数据到 GPU
    image_tensor = image_tensor.to("cuda")

    with torch.no_grad():
        # 获取模型的视觉部分
        vision_model = vision_model.vision_model

        # 获取模型的嵌入层输出
        pixel_values = image_tensor
        embeddings = vision_model.embeddings(pixel_values)

        # 获取模型的编码器输出
        encoder_outputs = vision_model.encoder(embeddings)

        # 获取最终的中间层输出
        last_hidden_state = encoder_outputs.last_hidden_state

        # 打印中间层输出的形状
        print("Intermediate Layer Output Shape:", last_hidden_state.shape)

        # 返回中间层输出
        return last_hidden_state

if __name__ == "__main__":
    vision_model, processor = get_vision_model()
    vision_model = vision_model.to("cuda")

    image = [Image.open("./German-Shepherd-dog-Alsatian.jpg")]
    image_process = get_img_process(image, processor)

    image_encoders = get_img_embedding(image_process, vision_model)
    print(image_encoders)

Intermediate Layer Output Shape: torch.Size([1, 50, 768])
tensor([[[ 0.0261,  0.2113, -0.3151,  ..., -0.1471,  0.3880,  0.4089],
         [ 0.3228,  1.0228, -1.0904,  ...,  0.2487,  0.8785, -0.3540],
         [ 0.4168,  0.2263,  0.0276,  ...,  0.3988,  1.1786, -0.8797],
         ...,
         [ 0.4512,  0.4214, -0.1235,  ...,  0.8371, -0.3764,  0.3441],
         [ 0.2888,  0.1619, -0.2719,  ..., -0.3733,  0.5227,  0.1510],
         [ 0.4708,  0.5472, -0.5883,  ...,  0.0569, -0.2649,  0.8119]]],
       device='cuda:0')
