In [1]:
import numpy as np
import torch
from pkg_resources import packaging
import sys
sys.path.append("..")
from pathlib import Path
import clip.clip as clip
from PIL import Image
import os 
device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")

# model, preprocess = clip.load("ViT-B/32")
model, preprocess = clip.load(name="/datassd2/sswang/NFT_Search/CLIP/models/ViT-L-14-336px.pt", device=device)
# 将模型加载到GPU中并切换到评估模式
# model.cuda(device).eval()
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

preprocess

Model parameters: 427,944,193
Input resolution: 336
Context length: 77
Vocab size: 49408


Compose(
    Resize(size=336, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(336, 336))
    <function _convert_image_to_rgb at 0x7fcd3c232dd0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

np.stack(images) 函数将一个由多个图像组成的列表 images 沿着一个新轴（默认为 0）进行连接，生成一个新的数组。这个新的数组的维度比原来的数组多了一个维度，用于存储连接后的图像。如果 images 中的每个图像的尺寸都相同，那么连接后的数组的第一个维度将是 len(images)，第二个维度将是图像的高度，第三个维度将是图像的宽度，第四个维度将是图像的通道数。

这行代码是从一个图像列表中创建了一个 PyTorch 张量 image_input。np.stack(images) 函数沿着一个新轴（默认为 0）将图像列表进行连接。然后，torch.tensor() 将连接后的图像列表转换为 PyTorch 张量。.cuda() 方法将张量移动到 GPU 上进行加速运算。

In [3]:
import json

def check_dir(dir_path):
    """
    检查文件夹路径是否存在，不存在则创建

    Args:
        dir_path (str): 待检查的文件夹路径
    """
    if not os.path.exists(dir_path):
        try:
            os.makedirs(dir_path)
        except Exception as e:
            raise e

def load_json(json_path):
    """
    以只读的方式打开json文件

    Args:
        config_path: json文件路径

    Returns:
        A dictionary

    """
    with open(json_path, 'r', encoding='UTF-8') as f:
        return json.load(f)
    
def save_json(save_path, data):
    """
    Saves the data to a file with the given filename in the given path

    Args:
        :param save_path: The path to the folder where you want to save the file
        :param filename: The name of the file to save
        :param data: The data to be saved

    """
    with open(save_path, 'w', encoding='UTF-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def is_str_Length_valid(str_list) -> bool:
    """
    判断字符串长度是否超过77

    Args:
        str_list (list): 字符串列表

    Returns:
        bool: 是否超过77
    """
    try:
        for str in str_list:
            clip.tokenize(str)
        return True
    except:
        return False



def tensorlize_imgs(model, img_path_list) -> torch.Tensor:
    """
    使用模型提取图片特征，返回图片特征向量列表

    Args:
        img_path_list (list): 图片路径列表

    Returns:
        torch.Tensor: 图片特征向量列表
    """

    images = []
    for img_path in img_path_list:

        image = Image.open(img_path).convert("RGB")
            # 首先将图片预处理成模型需要的格式
        images.append(preprocess(image))
        # 把图片加载进cuda中
    image_input = torch.tensor(np.stack(images)).cuda(device=device)
    with torch.no_grad():
        image_features = model.encode_image(image_input).float()
        # 将image_features从GPU移动到CPU，并返回
        return image_features.cpu()
            

def tensorlize_texts(model, text_tokens_list) -> torch.Tensor:
    """
    使用模型提取文本特征，返回文本特征向量列表

    Args:
        text_tokens_list (list): 文本列表

    Returns:
        torch.Tensor: 文本特征向量列表
    """
    text_tokens = clip.tokenize(text_tokens_list).cuda(device=device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens).float()
        # 将text_features从GPU移动到CPU，并返回
        return text_features.cpu().numpy().tolist()
    
def load_img_tensor(device, imgTensor_path):
    """
    加载图片的tensor 向量到指定cuda中

    Args:
        device (str): cuda
        img_path (str): 图片tensor路径

    Returns:
        torch.Tensor: 图片特征向量
    """
    # 加载json文件
    NFT_tensor_data = load_json(imgTensor_path)
    image_features = NFT_tensor_data['image_features']
    image_tensors = torch.tensor(image_features).to(device)
    return image_tensors

def load_des_tensor(device, desTensor_path):
    """
    加载描述的tensor 向量到指定cuda中

    Args:
        device (str): cuda
        img_path (str): 描述的tensor路径

    Returns:
        torch.Tensor: 图片特征向量
    """
    # 加载json文件
    NFT_tensor_data = load_json(desTensor_path)
    des_tensors = [list(x) for x in zip(*NFT_tensor_data['des_tensors'])]
    des_features = torch.tensor(des_tensors).to(device)
    return des_features

def slide_window_tokenizer(text, window_size, step_size) -> list:
    """
    为了处理长度超过77的句子，这里设计滑动窗口分词

    Args:
        text (str): 将要被拆分的句子

    Returns:
        list: 拆分后的句子列表
    """
    words = text.split()
    sentences = []
    slide_window_list = [i for i in range(0, len(words) - 1, step_size) if i + step_size < len(words) - 1]
    for i in slide_window_list:
        sentence = ' '.join(words[i:i+window_size])
        sentences.append(sentence)
    return sentences

def calculate_cosine_similarity_topk(img_features, des_features, k = 10) -> tuple:
    """
    计算图片特征和描述特征的余弦相似度，并返回topk的结果

    Args:
        img_features (torch.tensor): 图像特征向量
        des_features (torch.tensor): 描述特征向量
        k (int, optional): 前k位结果. Defaults to 10.

    Returns:
        tuple: (topk的相似度，topk的索引)
    """
    # 归一化图片特征
    img_features /= img_features.norm(dim=-1, keepdim=True)

    # 归一化描述特征
    des_features /= des_features.norm(dim=-1, keepdim=True)
    # similarity = des_features.cpu().numpy() @ img_features.cpu().numpy().T
    # 计算余弦相似度
    text_probs = (100.0 * img_features @ des_features.T).softmax(dim=-1)
    top_probs, top_labels = text_probs.cpu().topk(k, dim=-1)
    return top_probs, top_labels


def slide_window_tokenizer(text, window_size, step_size) -> list:
    """
    为了处理长度超过77的句子，这里设计滑动窗口分词

    Args:
        text (str): 将要被拆分的句子

    Returns:
        list: 拆分后的句子列表
    """
    words = text.split()
    sentences = []
    slide_window_list = [i for i in range(0, len(words) - 1, step_size) if i + step_size < len(words) - 1]
    for i in slide_window_list:
        sentence = ' '.join(words[i:i+window_size])
        sentences.append(sentence)
    return sentences

def tensorlize_texts_slideWindow(model, text, window_size, step_size):
    """
    将长文本分块，然后对每块文本进行向量化，最后将这些向量平均。

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text (str): 输入的文本。

    Returns:
        torch.Tensor: 文本特征向量。
    """

    chunks = slide_window_tokenizer(text, window_size, step_size)
    tensor_list = []
    for chunk in chunks:
        tokens = clip.tokenize([chunk]).cuda(device=device)
        with torch.no_grad():
            tensor_list.append(model.encode_text(tokens).float().cpu())
    # 使用所有块的平均值作为文本的表示
    avg_tensor = torch.mean(torch.stack(tensor_list), dim=0)
    return avg_tensor.numpy().tolist()


def tensorlize_valid_subsentence(model, text) -> torch.Tensor:
    """
    截取有效的子句，然后求特征向量值

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text (str): 输入的文本。

    Returns:
        torch.Tensor: 文本特征向量。
    """
    text_tensor = None
    # 标记为False时，表示该句子无法被模型处理，需要进行拆分
    flag = False
    words = text.split()
    text_length = len(words)
    while not flag:
        try:
            text_tensor = tensorlize_texts(model, text)
            flag = True
        except:
            text_length -= 1
            text = ' '.join(words[:text_length])
    return text_tensor

def handle_long_texts(model, text_list, window_size, step_size) -> list:
    """
    处理长文本，将长文本分块，然后对每块文本进行向量化，最后将这些向量平均。

    Args:
        model (CLIP): 使用的 CLIP 模型。
        text_list (list): 输入的文本列表。
        window_size (int): 窗口宽度
        step_size (int): 窗口移动的步长

    Returns:
        list: 文本特征向量列表。
    """
    text_tensor_list = []
    for text in text_list:
        try:
            text_tensor_list.append(tensorlize_texts(model, text))
        except:
            # text_tensor_list.append(tensorlize_texts_slideWindow(model, text, window_size, step_size))
            text_tensor_list.append(tensorlize_valid_subsentence(model, text))
    return text_tensor_list

In [4]:
dataset_base_path = Path("/datassd2/sswang/dataset/mini100/")
target_dataset_path = Path("/datassd2/sswang/dataset/mini100_tensor_V2/")

In [5]:
import re
import copy

NFT_list_dict = load_json(dataset_base_path.joinpath("NFT_list.json"))

collection_list_copy = copy.deepcopy(NFT_list_dict["collection_list_copy"])

for key, value in collection_list_copy.items():
    NFT_collection_path = dataset_base_path.joinpath(value)
    print("开始处理：", NFT_collection_path.name, "...")
    check_dir(target_dataset_path.joinpath(NFT_collection_path.name))

    NFT_tensor_data = {}
    img_path_list = list(NFT_collection_path.joinpath("img").iterdir())
    # 提取图片名称
    img_name_list = [img_path.stem for img_path in img_path_list]

    # 提取图片特征向量
    image_features_CPU = tensorlize_imgs(model, img_path_list)

    # 提取文本特征向量
    des_tensor = []

    des_query_dict = load_json(NFT_collection_path.joinpath("description.json"))

    for img_name in img_name_list:
        # 去掉 description 中的序号（1. 2. 3.）
        des_list = [re.sub(r'^\d+\. ', '', des) for des in des_query_dict[img_name]]
        # 判断描述的长度有没有超过77
        if is_str_Length_valid(des_list) == False:
            # 如果超过77，就使用滑动窗口分词
            des_tensor.append(handle_long_texts(model, des_list, 50, 20))
        else:
            des_tensor.append(tensorlize_texts(model, des_list))

    NFT_tensor_data["img_name_list"] = img_name_list
    NFT_tensor_data["image_features"] = image_features_CPU.numpy().tolist()
    NFT_tensor_data["des_tensors"] = des_tensor
    save_json(target_dataset_path.joinpath(NFT_collection_path.name, "NFT_tensor_data.json"), NFT_tensor_data)
    print("处理完成：", NFT_collection_path.name)
    
    # 将已经处理完成的项目从列表中删除
    del NFT_list_dict["collection_list_copy"][key]
    # 将处理完成的项目列表保存到json文件中
    save_json(dataset_base_path.joinpath("NFT_list.json"), NFT_list_dict)
    

开始处理： CryptoPunks ...
处理完成： CryptoPunks
开始处理： BoredApeYachtClub ...
处理完成： BoredApeYachtClub
开始处理： MutantApeYachtClub ...
处理完成： MutantApeYachtClub
开始处理： Azuki ...
处理完成： Azuki
开始处理： CLONEX ...
处理完成： CLONEX
开始处理： Moonbirds ...
处理完成： Moonbirds
开始处理： Doodles ...
处理完成： Doodles
开始处理： BoredApeKennelClub ...
处理完成： BoredApeKennelClub
开始处理： Meebits ...
处理完成： Meebits
开始处理： PudgyPenguins ...
处理完成： PudgyPenguins
开始处理： Cool Cats ...
处理完成： Cool Cats
开始处理： Beanz ...
处理完成： Beanz
开始处理： MechMinds ...
处理完成： MechMinds
开始处理： World of Women ...
处理完成： World of Women
开始处理： CrypToadz ...
处理完成： CrypToadz
开始处理： 0N1 Force ...
处理完成： 0N1 Force
开始处理： mfers ...
处理完成： mfers
开始处理： Karafuru ...
处理完成： Karafuru
开始处理： HAPE PRIME ...
处理完成： HAPE PRIME
开始处理： MekaVerse ...
处理完成： MekaVerse
开始处理： projectPXN ...
处理完成： projectPXN
开始处理： FLUF ...
处理完成： FLUF
开始处理： Hashmasks ...
处理完成： Hashmasks
开始处理： Moonbirds Oddities ...
处理完成： Moonbirds Oddities
开始处理： Creature World ...
处理完成： Creature World
开始处理： 3Landers ...
处理完成： 3Landers
开始处理： Phan



处理完成： VeeFriends Series 2
开始处理： Lazy Lions ...
处理完成： Lazy Lions
开始处理： World of Women Galaxy ...
处理完成： World of Women Galaxy
开始处理： ALIENFRENS ...
处理完成： ALIENFRENS
开始处理： Prime Ape Planet ...
处理完成： Prime Ape Planet
开始处理： The Doge Pound ...
处理完成： The Doge Pound
开始处理： Sappy Seals ...
处理完成： Sappy Seals
开始处理： CyberKongz ...
处理完成： CyberKongz
开始处理： DigiDaigaku ...
处理完成： DigiDaigaku
开始处理： CoolmansUniverse ...
处理完成： CoolmansUniverse
开始处理： VOX Series 1 ...
处理完成： VOX Series 1
开始处理： Capsule ...
处理完成： Capsule
开始处理： Murakami.Flowers ...
处理完成： Murakami.Flowers
开始处理： SupDucks ...
处理完成： SupDucks
开始处理： Valhalla ...
处理完成： Valhalla
开始处理： DEGEN TOONZ ...
处理完成： DEGEN TOONZ
开始处理： Lives of Asuna ...
处理完成： Lives of Asuna
开始处理： Nakamigos ...
处理完成： Nakamigos
开始处理： Sneaky Vampire Syndicate ...
处理完成： Sneaky Vampire Syndicate
开始处理： Killer GF ...
处理完成： Killer GF
开始处理： Adam Bomb Squad ...
处理完成： Adam Bomb Squad
开始处理： Impostors Genesis ...
处理完成： Impostors Genesis
开始处理： CryptoSkulls ...
处理完成： CryptoSkulls
开始处理： MURI ...
处

In [None]:
# 计算图像特征和描述特征的余弦相似度
image_features = load_img_tensor(device, target_dataset_path.joinpath("Prime Ape Planet", "NFT_tensor_data.json"))
des_features = load_des_tensor(device, target_dataset_path.joinpath("Prime Ape Planet", "NFT_tensor_data.json"))
des_features1, des_features2, des_feature3 = des_features
top_probs, top_labels = calculate_cosine_similarity_topk(image_features, des_features1, 10)
print(top_probs.shape)
print(top_probs)
print(top_labels.shape)
print(top_labels)


In [21]:
str1 = "A picture of Pixel, a male character with frumpy hair and small shades. He has a lanky and gangly build, with long limbs and a thin frame. He is wearing a hoodie and baggy pants, which add to his unkempt and disheveled appearance. His hair is a wild and unruly mess, with stray strands sticking up in all directions. He has a carefree and rebellious spirit, and seems to reject convention and authority. He is not particularly clean-cut or well-groomed, but has a raw and unpolished charm."
str2 = "Input A stunning figure adorned in a Pearl Kitsune mask, their hair a long and flowing Dreads of Obsidian. They stand tall in a Void head, looking out with a Half-open Jasper gaze, wearing a Track Jacket of Citrine, their body clad in an Azurite cloak, their Type is Y0K-A1, their Background is Azurite, their Body is Azurite, their Eyes are Half Open (Jasper), their Mouth is Neutral, their Wear is Track Jacket (Citrine), their Hair is Dreads (Obsidian), their Face is Kitsune Mask (Pearl), their Head is Void, their extra is Loop Earring, their Style is 5, their Strength is 3, their Spirit is 4."

try:
    tensor1 = tensorlize_texts(model, str1)
except Exception as e:
    tensor1 = tensorlize_texts_slideWindow(model, str1, 45, 20)

print(tensor1)
# print(len(tensor1))

[[0.295318603515625, 0.4466552734375, -0.07173919677734375, 0.2760963439941406, 0.3887939453125, -0.59649658203125, -0.046417236328125, 0.4308662414550781, -0.075347900390625, 0.325439453125, 0.0817413330078125, -0.1146240234375, -0.04766082763671875, -0.0941925048828125, -0.21435546875, 0.02606201171875, 0.373687744140625, -0.107086181640625, 0.05908942222595215, -0.501800537109375, 0.3129425048828125, 0.059429168701171875, 0.35211181640625, 0.110443115234375, 0.188812255859375, -0.057033538818359375, -0.52545166015625, 0.26125335693359375, 0.04547119140625, -0.31732177734375, 0.24681854248046875, 0.156768798828125, -0.246673583984375, -0.10150527954101562, 0.6119384765625, 0.16094970703125, 0.472076416015625, -0.0641334056854248, -0.10406494140625, -0.3747711181640625, 0.0521240234375, -0.090972900390625, 0.4623870849609375, 0.2958984375, -0.37103271484375, 0.06674909591674805, 0.4700927734375, -0.16729736328125, -0.356719970703125, 0.0161285400390625, 0.3125, -0.0677947998046875, 0.

In [27]:
str1 = "A picture of a mysterious, half-open eye stares back at you from the shadows. A man in a Jasper jasper background, with a frown on his face, wears a T-shirt (pearl) and has long (turquoise) hair, a pair of glasses (obsidian) and headphones (rose) adorn his head. The 0N1 logo pin (black) is pinned to his chest. He exudes a strong, confident presence with a spirit (5) of 10."
print(str1.split())
splited_str1 = slide_window_tokenizer(str1, 50, 20)
for i in splited_str1:
    print(i)
    print(len(i.split()))

['A', 'picture', 'of', 'a', 'mysterious,', 'half-open', 'eye', 'stares', 'back', 'at', 'you', 'from', 'the', 'shadows.', 'A', 'man', 'in', 'a', 'Jasper', 'jasper', 'background,', 'with', 'a', 'frown', 'on', 'his', 'face,', 'wears', 'a', 'T-shirt', '(pearl)', 'and', 'has', 'long', '(turquoise)', 'hair,', 'a', 'pair', 'of', 'glasses', '(obsidian)', 'and', 'headphones', '(rose)', 'adorn', 'his', 'head.', 'The', '0N1', 'logo', 'pin', '(black)', 'is', 'pinned', 'to', 'his', 'chest.', 'He', 'exudes', 'a', 'strong,', 'confident', 'presence', 'with', 'a', 'spirit', '(5)', 'of', '10.']
A picture of a mysterious, half-open eye stares back at you from the shadows. A man in a Jasper jasper background, with a frown on his face, wears a T-shirt (pearl) and has long (turquoise) hair, a pair of glasses (obsidian) and headphones (rose) adorn his head. The 0N1 logo
50
background, with a frown on his face, wears a T-shirt (pearl) and has long (turquoise) hair, a pair of glasses (obsidian) and headphones 

In [31]:
str2 = "background, with a frown on his face, wears a T-shirt (pearl) and has long (turquoise) hair, a pair of glasses (obsidian) and headphones (rose) adorn his head. The 0N1 logo pin (black) is pinned to his chest. He exudes a strong, confident presence with a spirit (5) of 10."
tensorlize_texts(model, str1)

RuntimeError: Input A picture of a mysterious, half-open eye stares back at you from the shadows. A man in a Jasper jasper background, with a frown on his face, wears a T-shirt (pearl) and has long (turquoise) hair, a pair of glasses (obsidian) and headphones (rose) adorn his head. The 0N1 logo pin (black) is pinned to his chest. He exudes a strong, confident presence with a spirit (5) of 10. is too long for context length 77