# Roberta-base 转换为 Hugging Face 版

In [None]:
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

# NOTE: this checkpoint comes from open_clip training with a Chinese pretrained text model.
taiyi_path = './project/open_clip_new/src/logs/2022_11_18-21_15_16-model_ViT-L-14-lr_0.0001-b_640-j_16-p_amp/checkpoints/epoch_4.pt'
# num_labels=512 sets the output width of the classification head
# (presumably the CLIP embedding size -- verify against the training config).
bertconfig = BertConfig.from_pretrained("hfl/chinese-roberta-wwm-ext", num_labels=512)
my_transformer = BertForSequenceClassification.from_pretrained("hfl/chinese-roberta-wwm-ext", config=bertconfig)
mytokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")

# NOTE: the checkpoint keys have to be renamed before they can be loaded.
# map_location='cpu' keeps torch.load from trying to restore tensors onto the
# CUDA device they were saved from, which fails on hosts that lack that device;
# the model in this cell is never moved to a GPU.
state_dict_of_bert = torch.load(taiyi_path, map_location='cpu')['state_dict']
# Keep only the text-tower entries and strip their 'module.transformer.' prefix.
bert_weights = {
    k.replace('module.transformer.', ''): v
    for k, v in state_dict_of_bert.items()
    if 'module.transformer' in k
}
my_transformer.load_state_dict(bert_weights)

In [None]:
# Write the model weights and the tokenizer vocabulary into the same folder,
# so that the folder can then be uploaded to the Hugging Face Hub.
save_dir = './CLIP-roberta'
my_transformer.save_pretrained(save_dir)
mytokenizer.save_pretrained(save_dir)

In [None]:
# Add up the element counts of all parameter tensors and print the sum in millions.
total = sum(p.nelement() for p in my_transformer.parameters())
print("Number of parameter: %.2fM" % (total / 1e6))

# Roberta-large 版

In [None]:
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

# NOTE: this checkpoint comes from training with a Chinese pretrained text model.
taiyi_path = './open_clip/src/logs/2022_07_18-18_39_51-model_ViT-L-14-lr_1e-05-b_224-j_8-p_amp/checkpoints/epoch_7.pt'
# num_labels=768 sets the output width of the classification head
# (presumably the CLIP embedding size -- verify against the training config).
bertconfig = BertConfig.from_pretrained("hfl/chinese-roberta-wwm-ext-large", num_labels=768)
my_transformer = BertForSequenceClassification.from_pretrained("hfl/chinese-roberta-wwm-ext-large", config=bertconfig)
mytokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")

# map_location='cpu' keeps torch.load from trying to restore tensors onto the
# CUDA device they were saved from, which fails on hosts that lack that device;
# the model in this cell is never moved to a GPU.
state_dict_of_bert = torch.load(taiyi_path, map_location='cpu')['state_dict']
# Keep only the text-tower entries and strip their 'module.transformer.' prefix.
bert_weights = {
    k.replace('module.transformer.', ''): v
    for k, v in state_dict_of_bert.items()
    if 'module.transformer' in k
}
my_transformer.load_state_dict(bert_weights)

In [None]:
# Write the model weights and the tokenizer vocabulary into the same folder,
# so that the folder can then be uploaded to the Hugging Face Hub.
save_dir = './Taiyi-CLIP-Roberta-large-326M-Chinese'
my_transformer.save_pretrained(save_dir)
mytokenizer.save_pretrained(save_dir)

In [None]:
# Add up the element counts of all parameter tensors and print the sum in millions.
total = sum(p.nelement() for p in my_transformer.parameters())
print("Number of parameter: %.2fM" % (total / 1e6))


In [None]:
from PIL import Image
import requests
import clip
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import CLIPProcessor, CLIPModel
import numpy as np

query_texts = ["一只猫", "一只狗",'两只猫', '两只老虎','一只老虎']  # Input texts; replace with anything you like.
# Load the Taiyi Chinese text encoder.
text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese")
text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval()
# NOTE(review): only input_ids are kept, so the attention mask produced by
# padding=True is never passed to the encoder -- confirm this matches how the
# text encoder was trained before changing it.
text = text_tokenizer(query_texts, return_tensors='pt', padding=True)['input_ids']

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # Any image URL can be used here.
# Load the CLIP image encoder.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# Bound the download with a timeout and fail on an HTTP error status, rather
# than hanging forever or handing an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = processor(images=Image.open(response.raw), return_tensors="pt")

with torch.no_grad():
    image_features = clip_model.get_image_features(**image)
    # The classification head's logits are used as the text embedding.
    text_features = text_encoder(text).logits
    # L2-normalise both sets of features.
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    # Cosine similarity; logit_scale is the scaling factor taken from the CLIP model.
    logit_scale = clip_model.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()
    # Softmax over the query texts for each image.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print(np.around(probs, 3))


# ViT-H，维度对应的 Roberta-Large

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# NOTE: load from a local path.
local_path = './scripts_t2i/open_clip_new/src/logs/2022_09_16-23_03_14-model_ViT-H-14-lr_5e-05-b_256-j_32-p_amp/checkpoints/epoch_21.pt'
text_encoder = BertModel.from_pretrained("hfl/chinese-roberta-wwm-ext-large").cuda().eval()
# map_location='cpu' keeps torch.load from trying to restore the whole
# checkpoint onto the CUDA device it was saved from, which fails when that
# device index does not exist here; load_state_dict then copies the selected
# tensors into the model's own parameters.
state_dict_of_bert = torch.load(local_path, map_location='cpu')['state_dict']
# Keep only the text-tower entries and strip their 'module.transformer.' prefix.
bert_weights = {
    k.replace('module.transformer.', ''): v
    for k, v in state_dict_of_bert.items()
    if 'module.transformer' in k
}
text_encoder.load_state_dict(bert_weights)
# Take the tokenizer from the same "-large" repository as the encoder, so that
# the saved model and tokenizer come from one source.
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")


In [None]:
# Write the encoder weights and the tokenizer vocabulary into the same folder,
# so that the folder can then be uploaded to the Hugging Face Hub.
save_dir = './fengshen/Taiyi-CLIP-Roberta-326M-ViT-H-Chinese'
text_encoder.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# ViT-L --- Roberta-base

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# NOTE: load from a local path.
local_path = './project/open_clip_new/src/logs/2022_11_18-21_15_16-model_ViT-L-14-lr_0.0001-b_640-j_16-p_amp/checkpoints/epoch_4.pt'
text_encoder = BertModel.from_pretrained("hfl/chinese-roberta-wwm-ext").cuda().eval()
# map_location='cpu' keeps torch.load from trying to restore the whole
# checkpoint onto the CUDA device it was saved from, which fails when that
# device index does not exist here; load_state_dict then copies the selected
# tensors into the model's own parameters.
state_dict_of_bert = torch.load(local_path, map_location='cpu')['state_dict']
# Keep only the text-tower entries and strip their 'module.transformer.' prefix.
bert_weights = {
    k.replace('module.transformer.', ''): v
    for k, v in state_dict_of_bert.items()
    if 'module.transformer' in k
}
text_encoder.load_state_dict(bert_weights)
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")


In [None]:
# Write the encoder weights and the tokenizer vocabulary into the same folder,
# so that the folder can then be uploaded to the Hugging Face Hub.
save_dir = './project/temp_weights/vit-l-roberta-base'
text_encoder.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Add up the element counts of all parameter tensors and print the sum in millions.
total = sum(p.nelement() for p in text_encoder.parameters())
print("Number of parameter: %.2fM" % (total / 1e6))