## Flickr30k-CN统计
Flickr30k-CN数据集下载：https://aistudio.baidu.com/datasetdetail/210101

In [1]:
import os
import glob

# Dataset locations. The trailing slashes are significant: later cells build
# paths by plain string concatenation (e.g. f"{flickr_data_path}*.jpg").
flickr_data_path = "datasets/Flicker30k_Dataset/"
flickr_anno_path = "datasets/Flickr30k-CNA/"

# One annotation file per split, each stored under <anno root>/<split>/.
train_label, test_label, val_label = (
    os.path.join(flickr_anno_path, split_dir, label_name)
    for split_dir, label_name in (
        ("train", "flickr30k_cna_train.txt"),
        ("test", "flickr30k_cn_test.txt"),
        ("val", "flickr30k_cna_val.txt"),
    )
)

In [2]:
# Dataset statistics


def _read_label_lines(label_path):
    """Return every line of a UTF-8 annotation file (line endings kept)."""
    with open(label_path, encoding='utf-8') as label_file:
        return label_file.readlines()


# Count the .jpg files sitting directly in the image directory.
img_list = glob.glob(f"{flickr_data_path}*.jpg")
total_num = len(img_list)  # 31783 in the recorded run
print(f"=> 图片总数：{total_num}张")

# NOTE: the numbers printed below are annotation *line* counts
# (len of readlines()), not distinct image counts.
train_label_list = _read_label_lines(train_label)
print(f"=> train: {len(train_label_list)}张")

test_label_list = _read_label_lines(test_label)
print(f"=> test: {len(test_label_list)}张")

val_label_list = _read_label_lines(val_label)
print(f"=> val: {len(val_label_list)}张")

=> 图片总数：31783张
=> train: 148909张
=> test: 5000张
=> val: 5000张


## N张子集抽取

In [3]:
# Extract the first NUM distinct images, together with ALL of their captions,
# into a small stand-alone subset directory.
# NOTE(review): the original heading said "val", but the annotations are read
# from train_label_list (train split). The split is left unchanged here;
# confirm which one is intended.

import os
import shutil
import subprocess  # no longer used by this cell; kept in case later cells rely on it

# Number of images to extract
NUM = 10

# Output directory and the label file written inside it
NEW_DATASET = f"assets/Flickr{NUM}/"
LABEL_FILE = NEW_DATASET + "label.txt"

try:
    # exist_ok=False on purpose: an already extracted subset is left untouched.
    os.makedirs(NEW_DATASET, mode=0o777, exist_ok=False)
except FileExistsError as e:
    print(e)
else:
    selected = set()  # image paths already copied into the subset
    missing = set()   # image paths referenced by the labels but absent on disk

    # Explicit UTF-8: the captions are Chinese text and the platform default
    # encoding may not be able to represent them.
    with open(LABEL_FILE, 'w', encoding='utf-8') as file:
        for line in train_label_list:
            # Expected line format: "<image id>\t<caption>", e.g.
            # 1315402173	一个小男孩在检查一片南瓜地，他的手推车里已经有一些南瓜了。
            line_sp = line.split("\t")
            if len(line_sp) != 2:
                continue

            img = f"{flickr_data_path}{line_sp[0]}.jpg"
            if img not in selected:
                # Subset already full (also covers NUM <= 0), or the image is
                # known to be missing: drop the caption.
                if len(selected) >= NUM or img in missing:
                    continue
                if not os.path.isfile(img):
                    missing.add(img)
                    print(f"==> skipped (image not found): \t{img}")
                    continue

                # shutil.copy instead of a shell "cp": no shell quoting issues,
                # portable, and a failed copy raises instead of being ignored.
                shutil.copy(img, NEW_DATASET)
                selected.add(img)
                print(f"==> {len(selected)}: \t{img}")

            # Keep every caption of a selected image (the last image used to
            # get only its first caption) and guarantee one record per line.
            file.write(line if line.endswith("\n") else line + "\n")

IMG_LIST = glob.glob(f"{NEW_DATASET}*.jpg")
print(f"==> {len(IMG_LIST)}张图")

==> 1: 	datasets/Flicker30k_Dataset/1312954382.jpg
==> 2: 	datasets/Flicker30k_Dataset/1313693129.jpg
==> 3: 	datasets/Flicker30k_Dataset/1313869424.jpg
==> 4: 	datasets/Flicker30k_Dataset/1313961775.jpg
==> 5: 	datasets/Flicker30k_Dataset/1313987366.jpg
==> 6: 	datasets/Flicker30k_Dataset/1314231418.jpg
==> 7: 	datasets/Flicker30k_Dataset/1315116409.jpg
==> 8: 	datasets/Flicker30k_Dataset/131624221.jpg
==> 9: 	datasets/Flicker30k_Dataset/1316247213.jpg
==> 10: 	datasets/Flicker30k_Dataset/131632409.jpg
==> 10张图


## 特征向量批量提取

In [4]:
import glob
import numpy as np
from tqdm import tqdm
from PIL import Image

from utils.altclip import AltCLIP
from utils.cnclip import CNCLIP

# Instantiate both CLIP variants, each on its own CUDA device.
altclip = AltCLIP("/mnt/data/CLIP/models/AltCLIP", "cuda:0")
cnclip = CNCLIP("/mnt/data/CLIP/models/chinese-clip-vit-large-patch14", "cuda:1")

print(f"==> 开始提取：{NEW_DATASET}")
IMG_LIST = glob.glob(f"{NEW_DATASET}*.jpg")
print(f"==> {len(IMG_LIST)}张图")

# Persist the image paths, one per line. Mode "w" overwrites any earlier file,
# and no newline is written after the last path.
image_list_file = f"assets/Flickr{NUM}_image_list.txt"
image_list_text = '\n'.join(IMG_LIST)
with open(image_list_file, 'w') as list_out:
    list_out.write(image_list_text)
print(f"==> 完成写入：{image_list_file}")

# 批处理特征提取
def img_feat_ext(IMG_LIST, clip_type):
    """Extract image embeddings for a list of image files.

    Args:
        IMG_LIST: iterable of image file paths.
        clip_type: "altclip" or "cnclip"; selects which module-level model
            object (`altclip` / `cnclip`) is called.

    Returns:
        A NumPy array with 768 columns holding the embeddings of all images
        stacked along axis 0, in input order. An empty IMG_LIST yields an
        empty float32 array of shape (0, 768).

    Raises:
        ValueError: if clip_type is not one of the supported names. Previously
            this silently produced an empty array.
    """
    # Resolve the model lazily so only the requested global has to exist.
    if clip_type == "altclip":
        model = altclip
    elif clip_type == "cnclip":
        model = cnclip
    else:
        raise ValueError(
            f"unsupported clip_type {clip_type!r}; expected 'altclip' or 'cnclip'"
        )

    # Seed with an empty (0, 768) float32 block: it keeps the empty-input
    # result shape and makes concatenate reject embeddings of another width.
    chunks = [np.empty((0, 768), dtype='float32')]

    for img_path in tqdm(IMG_LIST):
        # Context manager closes the underlying file handle after inference.
        with Image.open(img_path) as img:
            # assumes model(img, txt=None)[0] is a 2-D (k, 768) array-like
            # -- TODO confirm against utils.altclip / utils.cnclip
            chunks.append(model(img, txt=None)[0])

    # One concatenation at the end instead of np.append per image, which
    # re-copied the whole accumulated array on every iteration.
    img_768_emb_list = np.concatenate(chunks, axis=0)

    print(img_768_emb_list.shape)
    return img_768_emb_list

# Run every model over the image list and store each feature matrix as .npy
CLIP_IDS = ["altclip", "cnclip"]
for clip_id in CLIP_IDS:
    feature_file = f"assets/Flickr{NUM}_{clip_id}.npy"
    clip_features = img_feat_ext(IMG_LIST, clip_type=clip_id)
    np.save(feature_file, clip_features)
    print(f"==> 完成写入：{feature_file}")

# Drop the model references, then ask torch to release its cached GPU memory.
del altclip, cnclip
import torch
torch.cuda.empty_cache()

`text_config_dict` is provided which will be used to initialize `AltCLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
2023-09-07 15:25:47.356682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-07 15:25:47.507451: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-07 15:25:48.065613: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No

==> 开始提取：assets/Flickr10/
==> 10张图
==> 完成写入：assets/Flickr10_image_list.txt


100%|██████████| 10/10 [00:01<00:00,  5.85it/s]


(10, 768)
==> 完成写入：assets/Flickr10_altclip.npy


100%|██████████| 10/10 [00:00<00:00, 11.70it/s]


(10, 768)
==> 完成写入：assets/Flickr10_cnclip.npy
