In [3]:
import os
import sys

from train_watermark_model import TransformModel
from transformers import AutoTokenizer, BertModel, BertTokenizer
import bias
import numpy as np
import torch
import json
import Levenshtein
from tqdm import tqdm

def levenshtein_distance(list1, list2):
    """Return the edit distance between two sequences.

    Thin wrapper that delegates to ``Levenshtein.distance``.

    :param list1: first sequence
    :param list2: second sequence
    :return: the value returned by ``Levenshtein.distance(list1, list2)``
    """
    edit_distance = Levenshtein.distance(list1, list2)
    return edit_distance

def compare_list(list1, list2):
    """Count the positions at which two sequences hold equal elements.

    Elements are compared pairwise by index. When the sequences differ in
    length, only the overlapping prefix (the shorter length) is compared, so
    a ``list2`` that is shorter than ``list1`` does not raise ``IndexError``.

    :param list1: first sequence
    :param list2: second sequence
    :return: number of indices ``i`` for which ``list1[i] == list2[i]``
    """
    # zip stops at the shorter input, which bounds the comparison safely.
    return sum(1 for left, right in zip(list1, list2) if left == right)

def get_greenlist_ids(text):
    """Derive the green-list token ids for ``text``.

    The text is embedded with ``bias.get_embedding``, passed through
    ``transform_model``, rescaled by ``bias.scale_vector``, tiled up to the
    vocabulary size of ``tokenizer`` and negated. Every index whose negated
    value is strictly positive is returned.

    :param text: input forwarded to ``bias.get_embedding``
        (presumably a raw string -- TODO confirm against ``bias``)
    :return: list of integer indices in ``range(len(tokenizer))``
    """
    device = torch.device('cuda')
    context_embedding = bias.get_embedding(device, embedding_model, embedding_tokenizer, text)
    with torch.no_grad():
        transformed = transform_model(context_embedding)
        # Element 0 along the first dimension, moved to host memory as NumPy.
        output = transformed.cpu()[0].detach().numpy()
    scaled = bias.scale_vector(output)

    # Repeat the scaled vector until it covers the whole vocabulary, then trim.
    # NOTE(review): 300 is hard-coded here; it equals the output_dim given to
    # TransformModel in this file -- keep the two in sync.
    vocab_size = len(tokenizer)
    repeats = (vocab_size // 300) + 1
    tiled = np.tile(scaled, repeats)[:vocab_size]

    # The sign is flipped before thresholding, so the selected positions are
    # those whose scaled value is strictly negative.
    negated = torch.from_numpy(-tiled)
    positive_positions = torch.nonzero(negated > 0)
    return positive_positions.view(-1).tolist()

# Cosine similarity between the first-token embeddings of two texts.
def cosine_similarity(x, y):
    '''
    Encode two texts with ``embedding_model`` and return the cosine similarity
    of their first-token (position 0) hidden states.

    :param x: first text (raw string; it is tokenized inside this function)
    :param y: second text (raw string; it is tokenized inside this function)
    :return: tensor of cosine similarities, one entry per batch row
        (a single entry for one pair of texts)
    '''
    device = torch.device('cuda')
    # Tokenize with the tokenizer that belongs to embedding_model. Ids produced
    # by a tokenizer with a different vocabulary would silently index the wrong
    # rows of the model's embedding table.
    x = embedding_tokenizer.encode(x, return_tensors="pt").to(device)
    y = embedding_tokenizer.encode(y, return_tensors="pt").to(device)
    with torch.no_grad():
        # [0] is the last hidden state; [:, 0, :] keeps the first token only.
        output_x = embedding_model(x)[0][:, 0, :]
        output_y = embedding_model(y)[0][:, 0, :]
    dot_product = torch.sum(output_x * output_y, dim=-1)
    norm_x = torch.norm(output_x, p=2, dim=-1)
    norm_y = torch.norm(output_y, p=2, dim=-1)
    return dot_product / (norm_x * norm_y)

def vector_distance(x, y):
    """Return the L2 norm of the element-wise difference ``x - y``.

    :param x: first tensor
    :param y: second tensor
    :return: scalar tensor holding the 2-norm of ``x - y``
    """
    difference = x - y
    return torch.norm(difference, p=2)

def show_compare(x, y):
    """Compare two texts by similarity and by embedding distance.

    Both texts are embedded with ``bias.get_embedding``; the similarity comes
    from ``cosine_similarity(x, y)`` and the distance from ``vector_distance``
    applied to the two embeddings.

    :param x: first text
    :param y: second text
    :return: tuple ``(sim, distance)``
    """
    device = torch.device('cuda')
    embedding_x, embedding_y = [
        bias.get_embedding(device, embedding_model, embedding_tokenizer, text)
        for text in (x, y)
    ]
    sim = cosine_similarity(x, y)
    return sim, vector_distance(embedding_x, embedding_y)


# ---- Load models and tokenizers ---------------------------------------------
# NOTE: every path below is relative, so it is resolved against the kernel's
# current working directory.
embedding_tokenizer = AutoTokenizer.from_pretrained('Mark/models/bert-large-uncased')
embedding_model = BertModel.from_pretrained('Mark/models/bert-large-uncased').to(torch.device('cuda'))
transform_model = TransformModel(input_dim=1024, output_dim=300)
transform_path = "Mark/SRTWM/models/transform_model-bert_large-DISTANCE2.pth"
# NOTE(review): torch.load unpickles the checkpoint -- only load trusted files.
transform_model.load_state_dict(torch.load(transform_path))
transform_model = transform_model.to(torch.device('cuda'))
# Second tokenizer; get_greenlist_ids sizes the green list with len(tokenizer).
tokenizer = BertTokenizer.from_pretrained('Mark/models/bert-base-cased')
print('===transform_model完毕===')

# ---- Load the mapping (optional file) ----------------------------------------
mapping_file = "Mark/SRTWM/utils1/mappings/300-bert-large-uncased.json"
# Always bind `mapping` so later code sees None instead of a NameError when the
# file is absent, and only announce completion when it was actually loaded.
mapping = None
if os.path.exists(mapping_file):
    with open(mapping_file, 'r') as f:
        mapping = json.load(f)
    print('===mapping完毕===')
else:
    print(f'mapping file not found, mapping=None: {mapping_file}')

# ---- Load the rewrite dataset ------------------------------------------------
dataset_path = 'Mark/SRTWM/result/REPEAT/rewrite/bert_large-no_context-dropout-distance2-random-0.75-800.json'
# Context manager closes the handle deterministically (was json.load(open(...))).
with open(dataset_path) as f:
    dataset = json.load(f)
original_text = dataset[0]['original_text']
rewrite_text = dataset[0]['rewrite_text']

# ---- Inspect the first sample ------------------------------------------------
original_embedding = bias.get_embedding(torch.device('cuda'), embedding_model, embedding_tokenizer, original_text)
rewrite_embedding = bias.get_embedding(torch.device('cuda'), embedding_model, embedding_tokenizer, rewrite_text)

original_list = get_greenlist_ids(original_text)
rewrite_list = get_greenlist_ids(rewrite_text)

print(type(vector_distance(original_embedding, rewrite_embedding)))

# ---- Collect embedding distances for highly similar pairs --------------------
# Keep the distance only when the original/rewrite similarity exceeds 0.9.
distance_list = []
for sample in tqdm(dataset):
    sim, distance = show_compare(sample['original_text'], sample['rewrite_text'])
    if sim > 0.9:
        distance_list.append(distance)

  from .autonotebook import tqdm as notebook_tqdm


OSError: Incorrect path_or_model_id: 'Mark/models/bert-large-uncased'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Number of collected distances that are strictly greater than 5,
# reported as "<count>/<total>".
count = sum(1 for i in distance_list if i > 5)

print(f'{count}/{len(distance_list)}')