In [1]:
import numpy as np
import torch
from pkg_resources import packaging
import sys
sys.path.append("..")
from pathlib import Path
import clip.clip as clip
from PIL import Image

device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

# model, preprocess = clip.load("ViT-B/32")
model, preprocess = clip.load(name="/datassd2/sswang/NFT_Search/CLIP/models/ViT-L-14-336px.pt", device=device)
# 将模型加载到GPU中并切换到评估模式
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 427,944,193
Input resolution: 336
Context length: 77
Vocab size: 49408


In [115]:

def slide_window_tokenizer(text, window_size, step_size) -> list:
    """
    为了处理长度超过77的句子，这里设计滑动窗口分词

    Args:
        text (str): 将要被拆分的句子

    Returns:
        list: 拆分后的句子列表
    """
    words = text.split()
    sentences = []
    slide_window_list = [i for i in range(0, len(words) - 1, step_size) if i + step_size < len(words) - 1]
    for i in slide_window_list:
        sentence = ' '.join(words[i:i+window_size])
        sentences.append(sentence)
    return sentences

def split_text(text, para_num) -> list:
    """
    将长句子拆分成指定段数的短句子

    Args:
        text (str): 将要被拆分的文本

    Returns:
        list: 拆分后的句子列表
    """
    sentences = []
    words = text.split()
    step_size = len(words) // para_num
    for i in range(0, len(words) - 1, step_size):
        sentence = ' '.join(words[i:i+step_size])
        sentences.append(sentence)
    return sentences

def tensorlize_texts(model, text_tokens_list) -> torch.Tensor:
    """
    使用模型提取文本特征，返回文本特征向量列表

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text_tokens_list (str): 输入的文本列表

    Returns:
        torch.Tensor: 文本特征向量列表
    """
    text_tokens = clip.tokenize(text_tokens_list, context_length = 77).cuda( device=device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens).float().cpu()
        return text_features
    

def tensorlize_texts_slideWindow(model, text, window_size, step_size):
    """
    将长文本分块，然后对每块文本进行向量化，最后将这些向量平均。

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text (str): 输入的文本。

    Returns:
        torch.Tensor: 文本特征向量。
    """

    chunks = slide_window_tokenizer(text, window_size, step_size)
    tensor_list = []
    for chunk in chunks:
        tokens = clip.tokenize([chunk]).cuda(device=device)
        with torch.no_grad():
            tensor_list.append(model.encode_text(tokens).float().cpu())
    # 使用所有块的平均值作为文本的表示
    avg_tensor = torch.mean(torch.stack(tensor_list), dim=0)
    return avg_tensor


def tensorlize_texts_chunked(model, text, para_num):
    """
    将长文本分块，然后对每块文本进行向量化，最后将这些向量平均。

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text (str): 输入的文本。

    Returns:
        torch.Tensor: 文本特征向量。
    """

    chunks = split_text(text, para_num)
    tensor_list = []

    for chunk in chunks:
        tokens = clip.tokenize([chunk]).cuda(device=device)
        with torch.no_grad():
            tensor_list.append(model.encode_text(tokens).float().cpu())

    #使用所有块的平均值作为文本的表示
    avg_tensor = torch.mean(torch.stack(tensor_list), dim=0)
    return avg_tensor


def calculate_cosine_similarity(tensor1, tensor2):
    """
    计算两个张量之间的余弦相似度

    Args:
        tensor1 (torch.Tensor): 第一个张量
        tensor2 (torch.Tensor): 第二个张量

    Returns:
        float: 余弦相似度
    """
    tensor1 /= tensor1.norm(dim=-1, keepdim=True)
    tensor2 /= tensor2.norm(dim=-1, keepdim=True)
    similarity = tensor1.cpu().numpy() @ tensor2.cpu().numpy().T
    return similarity.item()


def tensorlize_valid_subsentence(model, text):
    text_tensor = None
    # 标记为False时，表示该句子无法被模型处理，需要进行拆分
    flag = False
    words = text.split()
    text_length = len(words)
    while not flag:
        try:
            text_tensor = tensorlize_texts(model, text)
            flag = True
        except:
            text_length -= 1
            text = ' '.join(words[:text_length])
    return text_tensor

In [116]:
str1 = "A picture of a triangular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth.A picture of a triangular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth."
print(tensorlize_valid_subsentence(model, str1))

tensor([[-7.2388e-02,  5.1953e-01, -6.0394e-02, -4.6143e-02,  3.8135e-01,
          1.3843e-01,  1.7908e-01,  9.1171e-03,  4.7949e-01, -4.5630e-01,
          1.8396e-01, -6.1426e-01, -8.7219e-02,  8.6621e-01, -8.5596e-01,
         -4.9658e-01,  6.4990e-01, -1.2109e-01,  3.0396e-01, -6.9971e-01,
          9.5654e-01, -1.0101e-02, -3.4473e-01, -7.3181e-02,  1.0187e-01,
         -3.7256e-01, -5.1172e-01, -3.1543e-01,  1.6309e-01, -9.9487e-02,
         -1.6418e-01, -1.7651e-01, -6.4648e-01,  2.1774e-02, -2.5439e-01,
          6.1523e-01,  5.5273e-01,  4.2938e-02, -4.5142e-01, -3.5010e-01,
          4.0771e-01,  5.2307e-02, -1.2421e-01, -2.5488e-01,  1.9836e-01,
          8.8074e-02,  2.8833e-01, -7.6477e-02,  1.0168e-01, -5.0439e-01,
         -5.1727e-02,  9.5596e-03,  1.0925e-01,  4.8901e-01,  3.0273e-01,
          3.1738e-02,  5.8740e-01, -3.9551e-01,  1.3171e-01, -1.8152e-01,
          3.9697e-01, -2.7267e-02,  9.0576e-02,  8.3191e-02, -6.8665e-02,
          2.5171e-01,  4.6875e-01, -5.

In [82]:
list1 = [
        "A picture of a triangular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth.",
        "A picture of a rectangular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth.",
        "A picture of a circular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth."
    ]

list2 = [
        "A picture of a Prime Ape on their planet wearing a silver sweater, with green eyes and a rainbow-colored skin, showing an excited expression. They have white teeth and a leather jacket, and are wearing 3D glasses while holding a slice of pizza in their mouth.",
        "A snapshot of Prime Ape Planet featuring a rainbow-colored creature with green eyes, wearing a silver sweater and a leather jacket. They are grinning widely, showing off their white teeth, and holding a slice of pizza in their mouth. The 3D glasses on their face add to their excited expression.",
        "A visual of Prime Ape Planet showcasing a creature with a rainbow-colored skin, wearing a silver sweater and a leather jacket. They have green eyes and a wide grin, revealing their white teeth, and are holding a slice of pizza in their mouth. The 3D glasses on their face add to their excited expression."
    ]

list3 = [
        "A picture of a 3D model wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The skin of the model is lava, and it has a half bull ring nose and cyclops silver glasses.",
        "A 3D model is depicted in the picture, wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The model's skin is lava, and it has a half bull ring nose and cyclops silver glasses.",
        "The picture shows a 3D model wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The model's skin is lava, and it has a half bull ring nose and cyclops silver glasses, making it a unique and striking image."
    ]

text_list = [list1, list2, list3]
for texts in text_list:
    for text in texts:
        print(len(text.split()))
    print("=========================================")


45
45
45
46
50
54
38
38
43


In [117]:
list1 = [
        "A picture of a triangular shape with a background of 2, covered in fur with a hat with a value of 6, with eyes having a value of 14, not wearing any with eyes having a value of 14, not wearing, and without a mouth A picture of a triangular shape with a covered in fur with a value of 6",
        "A picture of a rectangular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat with a value of 6, with eyes having a value of 14, not wearing any clothes, and without a mouth not wearing any clothes, and without a mouth and without a mouth.",
        "A picture of a circular shape with a background of 2, covered in fur with a value of 6, without any earrings, wearing a hat  with eyes having a value of 14, not wearing any clothes, and without a mouth eyes having a value of 14, not wearing any clothes, and without a mouth."
    ]

list2 = [
        "A picture of a Prime Ape on their planet wearing a silver sweater, with green eyes and a rainbow-colored skin, showing an excited expression. They have white teeth and a leather jacket, and are wearing 3D glasses while holding a slice of pizza in their mouth holding a slice of pizza in their mouth.",
        "A snapshot of Prime Ape Planet featuring a rainbow-colored creature with green eyes, wearing a silver sweater and a leather jacket. They are grinning widely, showing off their white teeth, and holding a slice of pizza in their mouth. The 3D glasses on their face add to their excited expression add to their excited expression.",
        "A visual of Prime Ape Planet showcasing a creature with a rainbow-colored skin, wearing a silver sweater and a leather jacket. They have green eyes and a wide grin, revealing their white teeth, and are holding a slice of pizza in their mouth. The 3D glasses on their face add to their excited expression."
    ]

list3 = [
        "A picture of a 3D model wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The skin of the model is lava, and it has a half bull ring nose and cyclops silver glasses The skin of the model is lava, and it has a half bull ring nose and cyclops silver glasses",
        "A 3D model is depicted in the picture, wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The model's skin is lava, and it has a half bull ring nose and cyclops silver glasses and it has a half bull ring nose and cyclops silver glasses ring nose and cyclops silver glasses.",
        "The picture shows a 3D model wearing a turtleneck white shirt, a king silver necklace, and a propeller hat. The model's skin is lava, and it has a half bull ring nose and cyclops silver glasses, making it a unique and striking image ring nose and cyclops silver glasses, making it a unique and striking image."
    ]

text_list = [list1, list2, list3]
for texts in text_list:
    for text in texts:
        print(len(text.split()))
    print("\n====================\n")


61
57
54


54
55
54


57
56
56




In [118]:
# 直接舍弃后面部分的内容
for text in text_list:
    full_text_tensor = tensorlize_texts(model, text)
    text_tensor = []  # 3个tensor的列表

    for full_text_tensor_item1, full_text_tensor_item2 in zip(full_text_tensor, full_text_tensor):
        print(calculate_cosine_similarity(full_text_tensor_item1, full_text_tensor_item1))
    print("\n====================\n")

1.0000001192092896
1.0000001192092896
1.0000001192092896


1.0
1.0000001192092896
0.9999998807907104


1.0000001192092896
1.0
1.0




In [119]:
# 直接舍弃后面部分的内容
for text in text_list:
    full_text_tensor = tensorlize_texts(model, text)
    text_tensor = []  # 3个tensor的列表
    for str in text:
        substr = str.split()[:50]
        sub_sentence = " ".join(substr)
        text_tensor.append(tensorlize_texts(model, sub_sentence))
    for full_text_tensor_item, split_text_tensor in zip(full_text_tensor, text_tensor):
        print(calculate_cosine_similarity(full_text_tensor_item, split_text_tensor))
    print("\n====================\n")

0.9695712327957153
0.7536751627922058
0.981804609298706


0.8682611584663391
0.9875789880752563
0.8751397132873535


0.9794880747795105
0.9585369229316711
0.9852710962295532




In [120]:
# 计算使用分割后求平均的的文本的余弦相似度
# 将段落拆分成2段，然后计算平均后的tensor

for text in text_list:
    full_text_tensor = tensorlize_texts(model, text)
    text_tensor = []  # 3个tensor的列表
    for str in text:
        text_tensor.append(tensorlize_texts_chunked(model, str, 2))

    for full_text_tensor_item, split_text_tensor in zip(full_text_tensor, text_tensor):
        print(calculate_cosine_similarity(full_text_tensor_item, split_text_tensor))
    print("\n====================\n")

0.9279798269271851
0.7160555124282837
0.8789778351783752


0.8942962288856506
0.8469402194023132
0.8564712405204773


0.7474461793899536
0.6091711521148682
0.6622057557106018




In [121]:
# 计算使用分割后求平均的的文本的余弦相似度
# 使用滑动窗口将段落拆分之后，计算平均后的tensor
text_list = [list1, list2, list3]

for text in text_list:
    full_text_tensor = tensorlize_texts(model, text)
    text_tensor = []  # 3个tensor的列表
    for str in text:
        text_tensor.append(tensorlize_texts_slideWindow(model, str, 50, 20))

    for full_text_tensor_item, split_text_tensor in zip(full_text_tensor, text_tensor):
        print(calculate_cosine_similarity(full_text_tensor_item, split_text_tensor))
    print("\n====================\n")

0.9504979848861694
0.7229177951812744
0.9135902523994446


0.8502402305603027
0.8935145735740662
0.821176290512085


0.8585348129272461
0.8364741802215576
0.8617141246795654


