In [3]:
!ls /mnt/d/HuggingFaceModels/models*

/mnt/d/HuggingFaceModels/models--IDEA-CCNL--Taiyi-CLIP-Roberta-102M-Chinese:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--OFA-Sys--chinese-clip-vit-base-patch16:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--Qwen--Qwen1.5-1.8B:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--Qwen--Qwen3-4B:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--bert-base-chinese:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--hfl--chinese-roberta-wwm-ext:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--openai--clip-vit-base-patch32:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--timm--convnext_tiny.in12k_ft_in1k:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--timm--resnet50.a1_in1k:
blobs  refs  snapshots

/mnt/d/HuggingFaceModels/models--unsloth--Qwen3-4B-Base:
blobs  refs  snapshots


In [2]:
# Directory passed as `cache_dir=` to every `from_pretrained(...)` call in this notebook.
# NOTE(review): this is a machine-specific absolute path — adjust it when running elsewhere.
cache_dir = "/mnt/d/HuggingFaceModels/"

In [4]:
from PIL import Image
import requests
import clip
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import CLIPProcessor, CLIPModel
import numpy as np

# Candidate captions to score against the image; replace them with any Chinese text.
query_texts = ["一只猫", "一只狗",'两只猫', '两只老虎','一只老虎']
# Load the Taiyi Chinese text encoder; its classification logits are used as the text embedding.
text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese", cache_dir=cache_dir)
text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese", cache_dir=cache_dir).eval()
# NOTE(review): only `input_ids` are forwarded, so padded positions are attended to
# (transformers emits a warning about this). Confirm how the checkpoint was trained
# before switching to an explicit attention_mask, since that changes the scores.
text = text_tokenizer(query_texts, return_tensors='pt', padding=True)['input_ids']

# Any image URL can be substituted here.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Load CLIP's image encoder together with its matching preprocessor.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", cache_dir=cache_dir)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=cache_dir)
# Bound the download time and fail fast on HTTP errors instead of handing an error
# page to PIL; the context manager releases the streamed connection afterwards.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # PIL decodes lazily, so the image must be consumed while the stream is still open.
    image = processor(images=Image.open(response.raw), return_tensors="pt")

with torch.no_grad():
    image_features = clip_model.get_image_features(**image)
    text_features = text_encoder(text).logits
    # L2-normalise both sides so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    # Cosine similarity scaled by CLIP's learned temperature (logit_scale).
    logit_scale = clip_model.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()
    # Softmax over the captions: one probability row per image.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print(np.around(probs, 3))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[[0.012 0.    0.987 0.001 0.   ]]


## 评估：Recall@1/5/10 与 MeanRecall
- 在验证集上构建图像索引（特征）
- 对每条查询计算相似度并统计召回（优先使用 FAISS；不可用则 Torch 回退）


In [None]:
# Build the validation-set image index: load tokenizer, queries and images.
clip_model.eval()
tokenizer_eval = AutoTokenizer.from_pretrained(
    'IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese',
    cache_dir=cache_dir,
    local_files_only=True,
)
valid_df = loader.load_queries(split='valid')
valid_imgs = loader.create_img_id_to_image_dict(split='valid')


def _as_query_pair(record):
    """Turn one row of the validation frame into ``(query_text, [item_id, ...])``.

    Item ids are stringified. Returns ``None`` when the row has no usable
    query text or when ``item_ids`` is not a non-empty list.
    """
    raw_ids = record.get('item_ids', [])
    item_ids = [str(item) for item in raw_ids] if isinstance(raw_ids, list) else []
    query_text = record.get('query_text', None)
    return (query_text, item_ids) if query_text and item_ids else None


valid_queries = []
if 'item_ids' in valid_df.columns:
    candidate_pairs = (_as_query_pair(record) for _, record in valid_df.iterrows())
    valid_queries = [pair for pair in candidate_pairs if pair is not None]
print(f'Usable valid queries: {len(valid_queries)}')

# Batched image feature extraction.
# Preprocessing: fixed 224x224 resize followed by per-channel normalisation.
eval_image_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
])
all_image_ids = list(valid_imgs.keys())
image_tensors, ids_kept = [], []
for k in all_image_ids:
    img = valid_imgs[k]
    # Skip entries whose image is missing so ids and features stay aligned.
    if img is None:
        continue
    ids_kept.append(k)
    image_tensors.append(eval_image_tf(img.convert('RGB')))

# The batches below are moved to `device`, so the model has to live there too;
# `from_pretrained` leaves it on CPU and nothing earlier in the notebook moves it.
clip_model.to(device)

with torch.no_grad():
    if len(image_tensors) == 0:
        # Keep the empty index shape-compatible with real features: take the embedding
        # width from the model config and fall back to 512 if it is not exposed.
        feat_dim = getattr(getattr(clip_model, 'config', None), 'projection_dim', 512)
        all_image_feats = torch.empty((0, feat_dim))
    else:
        feats = []
        bs = 128  # images per forward pass
        for s in tqdm(range(0, len(image_tensors), bs), desc='Build image index'):
            batch = torch.stack(image_tensors[s:s+bs]).to(device)
            with autocast(enabled=(device.type=='cuda')):
                img_feats = clip_model.get_image_features(pixel_values=batch)
            # L2-normalise so inner products between features are cosine similarities.
            img_feats = torch.nn.functional.normalize(img_feats, p=2, dim=1)
            # Collect on CPU so the full index does not have to stay on the accelerator.
            feats.append(img_feats.detach().cpu())
        all_image_feats = torch.cat(feats, dim=0)
# From here on all_image_ids[i] is the id of row i in all_image_feats.
all_image_ids = ids_kept

# Optional FAISS index: an exact inner-product index over the image features.
# It stays None when FAISS is unavailable or there is nothing to index.
faiss_index = None
if HAS_FAISS and len(all_image_feats) > 0:
    d = all_image_feats.shape[1]
    faiss_index = faiss.IndexFlatIP(d)
    # The vectors are handed to FAISS as a float32 NumPy array.
    index_vectors = all_image_feats.numpy().astype('float32')
    faiss_index.add(index_vectors)

# Recall evaluation.
all_image_feats = all_image_feats.to(device)
# Queries are tokenized with the Taiyi tokenizer, so they must be embedded by the
# Taiyi text encoder (as in the demo cell), not by CLIP's own text tower.
text_encoder.to(device)


def compute_recall_at_k(k_values, queries):
    """Compute text-to-image Recall@K over the validation image index.

    A query counts as a hit at K when at least one of its ground-truth image ids
    is among the K indexed images most similar to the query text.

    Args:
        k_values: iterable of integer cut-offs K.
        queries: iterable of ``(query_text, gt_ids)`` pairs.

    Returns:
        dict mapping each K to the fraction of queries that were hit
        (0.0 for every K when there are no queries; ``{}`` when ``k_values`` is empty).
    """
    k_values = list(k_values)
    if not k_values:
        return {}
    # Never request more neighbours than the index holds: torch.topk raises in that
    # case and FAISS pads the result with -1 labels.
    max_k = min(max(k_values), all_image_feats.size(0))
    recalls = {k: 0 for k in k_values}
    total = 0
    for q_text, gt_ids in tqdm(queries, desc='Evaluate'):
        tok = tokenizer_eval(q_text, return_tensors='pt', padding='max_length', truncation=True, max_length=32).to(device)
        with torch.no_grad():
            with autocast(enabled=(device.type=='cuda')):
                # Same call convention as the demo cell: input_ids in, logits out.
                # NOTE(review): padded positions are not masked here either — confirm
                # against how the checkpoint was trained.
                q_feat = text_encoder(tok['input_ids']).logits
            q_feat = torch.nn.functional.normalize(q_feat, p=2, dim=1)
        if max_k == 0:
            top_ids = []
        elif faiss_index is not None:
            q_np = q_feat.detach().cpu().numpy().astype('float32')
            _, I = faiss_index.search(q_np, max_k)
            # Drop -1 placeholders so they cannot index all_image_ids from the end.
            top_ids = [str(all_image_ids[i]) for i in I[0].tolist() if i >= 0]
        else:
            # Align dtypes in case the index was built under a different autocast setting.
            sims = torch.matmul(q_feat.to(all_image_feats.dtype), all_image_feats.t())
            _, top_idx = torch.topk(sims[0], k=max_k)
            top_ids = [str(all_image_ids[i]) for i in top_idx.tolist()]
        total += 1
        # Compare ids as strings on both sides so int and str keys still match.
        gt = {str(g) for g in gt_ids}
        for k in k_values:
            if not gt.isdisjoint(top_ids[:k]):
                recalls[k] += 1
    if total == 0:
        return {k: 0.0 for k in k_values}
    return {k: recalls[k] / total for k in k_values}

# Evaluate at K = 1, 5, 10 and report the unweighted mean of the three recalls.
metrics = compute_recall_at_k([1, 5, 10], valid_queries)
if len(metrics) > 0:
    mean_recall = sum(metrics.values()) / len(metrics)
else:
    mean_recall = 0.0
print(f"Recall@1={metrics[1]:.4f}, Recall@5={metrics[5]:.4f}, Recall@10={metrics[10]:.4f}, MeanRecall={mean_recall:.4f} (N={len(valid_queries)})")
