In [None]:
from transformers import BertTokenizer,BertModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from DataUtil import *
from my_model_p2 import *
import torch
from tqdm import tqdm
from torch import optim
from torch.utils.data import Dataset, DataLoader

def save_data_to_pkl(data, filename):
    """Serialize ``data`` to ``filename`` with pickle and print a confirmation.

    Args:
        data: Any picklable Python object.
        filename: Destination path; the file is opened in ``'wb'`` mode, so an
            existing file is overwritten.
    """
    # Imported locally so this helper does not depend on `pickle` leaking into
    # the notebook namespace through one of the star imports of this cell.
    import pickle

    with open(filename, 'wb') as file:
        pickle.dump(data, file)
    print(f"Data saved to {filename}")

def load_data_from_pkl(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a file previously written with ``pickle.dump``.

    Returns:
        The deserialized Python object.

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing; only
        use this helper on files from a trusted source.
    """
    # Imported locally so this helper does not depend on `pickle` leaking into
    # the notebook namespace through one of the star imports of this cell.
    import pickle

    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reader rooted at the local "AFQMC" directory; get_data() is unpacked into
# four values (presumably train/dev sentence pairs and their scores — confirm
# against DataUtil).
afqmc_benchmarkReader= AFQMCBenchmarkReader(data_path="AFQMC")
train_sentence_pairs, train_scores, dev_sentence_pairs, dev_scores = afqmc_benchmarkReader.get_data()


# Location passed to from_pretrained() for both the tokenizer and the BERT model.
bert_file_path = "bert-chinese"
# NOTE(review): not referenced in the visible cells; presumably the width of
# the external word vectors — confirm or remove.
external_embed_dim = 200

tokenizer = BertTokenizer.from_pretrained(bert_file_path)
# The pretrained encoder is moved to `device` right after loading.
bert_model = BertModel.from_pretrained(bert_file_path).to(device)

# Object loaded from a pickle file via load_data_from_pkl(); pickle files must
# come from a trusted source because unpickling can execute arbitrary code.
sentences_dict = load_data_from_pkl('AFQMC/AFQMC_sentence_dict.pkl')

In [None]:
# Wrap the train/dev splits in dataset objects and batch them.
# NOTE(review): the `lcqmc_` prefix is kept because later cells reference these
# names, but the first cell fills the splits from the AFQMC reader — consider
# renaming everywhere in one pass.
lcqmc_train_Dataset = STSDataset(train_sentence_pairs, train_scores, tokenizer, sentences_dict)
lcqmc_dev_Dataset = STSTestDataset(dev_sentence_pairs, dev_scores, tokenizer, sentences_dict)

# Only the training data is shuffled; the dev order stays fixed.
lcqmc_train_loader = DataLoader(lcqmc_train_Dataset, batch_size=64, shuffle=True)
lcqmc_dev_loader = DataLoader(lcqmc_dev_Dataset, batch_size=64, shuffle=False)


print("Model loading...")
myModel = SimilarityModel(bert_model).to(device)  # make sure the model lives on the selected device
# Optional: resume from a saved checkpoint.
# myModel.load_state_dict(torch.load('/data/szj/sts/transformers/SZJ_Model/STSModel.pth'))
# `torch.nn` is referenced explicitly so this cell does not rely on `nn`
# being leaked into the namespace by a star import.
criterion = torch.nn.BCELoss()

In [None]:
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5):
    """Train ``model`` and report dev accuracy after every epoch.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, sentence1_emb, sentence2_emb)``.
        train_loader: Iterable of ``(inputs, scores, sentence1_emb,
            sentence2_emb)`` batches, where ``inputs`` is a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        test_loader: Loader handed to ``evaluate_model_p`` after each epoch.
        criterion: Loss callable invoked as ``criterion(outputs, scores)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.

    Relies on the notebook-level ``device`` and ``evaluate_model_p``.
    """
    for epoch in range(epochs):
        # evaluate_model_p() switches the model to eval mode, so training mode
        # has to be restored at the start of every epoch.
        model.train()
        total_loss = 0
        # tqdm wraps the loader to show a progress bar.
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            # squeeze(1) drops a size-1 second dimension from the tokenizer tensors, if present.
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            # Targets are cast to float and given a trailing axis before the loss call.
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) keeps the mean defined for an empty loader.
        print(f"Epoch {epoch+1}, Loss: {total_loss / max(len(train_loader), 1)}")

        # Evaluate the model that was passed in, not the notebook-global `myModel`.
        evaluate_model_p(model, test_loader)

# Evaluation function
def evaluate_model(model, data_loader, criterion):
    """Compute the mean per-batch loss of ``model`` over ``data_loader``.

    Batches are prepared exactly as in ``train_model`` (float targets with a
    trailing axis, float embeddings, positional model arguments) so that
    ``criterion`` receives the same kind of inputs as during training.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, sentence1_emb, sentence2_emb)``.
        data_loader: Iterable of ``(inputs, scores, sentence1_emb,
            sentence2_emb)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, scores)``.

    Returns:
        The summed batch losses divided by the number of batches
        (``0.0`` for an empty loader).

    Relies on the notebook-level ``device``.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            # Same target preparation as train_model(): float dtype plus a
            # trailing axis. The raw labels would not match what the loss was
            # given during training.
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            # Positional call, identical to the one used in train_model().
            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            total_loss += loss.item()
    # max(..., 1) keeps the mean defined for an empty loader.
    test_loss = total_loss / max(len(data_loader), 1)
    print(f"Test Loss: {test_loss}")
    return test_loss

def evaluate_model_p(model, data_loader, threshold=0.8):
    """Print the classification accuracy of ``model`` over ``data_loader``.

    An output strictly greater than ``threshold`` counts as label 1, anything
    else as label 0; the prediction is correct when it equals the integer
    value of the reference score.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, sentence1_emb, sentence2_emb)``.
        data_loader: Iterable of ``(inputs, scores, sentence1_emb,
            sentence2_emb)`` batches.
        threshold: Decision cut-off applied to the model outputs.

    Returns:
        None. The ``correct/total`` count and the accuracy are printed, and the
        ``"correct/total"`` string is appended to the notebook-level
        ``accuracy_list``, which must exist before this function is called.

    Relies on the notebook-level ``device``.
    """
    model.eval()
    correct_num = 0
    total_num = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            scores = scores.to(device)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            # One tensor comparison per batch instead of a Python loop that
            # pulls every element off the device individually. The outputs are
            # widened to float64 so the comparison against the Python float
            # threshold gives the same result as comparing float(output).
            predictions = (outputs.reshape(-1).double() > threshold).long()
            labels = scores.reshape(-1).long()
            correct_num += (predictions == labels).sum().item()
            total_num += predictions.numel()
    # An empty loader yields 0/0; report 0.0 instead of dividing by zero.
    accuracy = correct_num / total_num if total_num else 0.0
    print(str(correct_num)+'/'+str(total_num))
    print('Accuracy: '+str(accuracy))
    accuracy_list.append(str(correct_num)+'/'+str(total_num))

# Training the model
# evaluate_model_p() appends one "correct/total" string to this list per run.
accuracy_list = []
print("Training...")
optimizer = optim.Adam(myModel.parameters(), lr=1e-5)
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader, criterion, optimizer, epochs=5)

# Evaluating the model
# evaluate_model_p() prints the accuracy itself and has no return statement,
# so its result is no longer bound to `dev_loss` and printed as "Dev Loss: None".
# NOTE: train_model() already evaluates after its last epoch, so this adds a
# second entry for the same weights to accuracy_list.
evaluate_model_p(myModel, lcqmc_dev_loader)

In [None]:
# Continue training the same model for five more epochs.
# NOTE: a new Adam instance is created here, so the optimizer state from the
# previous run is not carried over.
optimizer = optim.Adam(myModel.parameters(), lr=1e-5)
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader, criterion, optimizer, epochs=5)

# Evaluating the model
# evaluate_model_p() prints the accuracy itself and has no return statement,
# so its result is no longer bound to `dev_loss` and printed as "Dev Loss: None".
evaluate_model_p(myModel, lcqmc_dev_loader)

In [None]:
def evaluate_model_p(model, data_loader, threshold=0.3):
    """Print the classification accuracy of ``model`` over ``data_loader``.

    This cell redefines the ``evaluate_model_p`` declared earlier in the
    notebook with a lower default cut-off. Once it has run, every later call,
    including the per-epoch evaluation inside ``train_model``, uses this
    version.

    An output strictly greater than ``threshold`` counts as label 1, anything
    else as label 0; the prediction is correct when it equals the integer
    value of the reference score.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask, sentence1_emb, sentence2_emb)``.
        data_loader: Iterable of ``(inputs, scores, sentence1_emb,
            sentence2_emb)`` batches.
        threshold: Decision cut-off applied to the model outputs.

    Returns:
        None. The ``correct/total`` count and the accuracy are printed, and the
        ``"correct/total"`` string is appended to the notebook-level
        ``accuracy_list``, which must exist before this function is called.

    Relies on the notebook-level ``device``.
    """
    model.eval()
    correct_num = 0
    total_num = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            scores = scores.to(device)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            # One tensor comparison per batch instead of a Python loop that
            # pulls every element off the device individually. The outputs are
            # widened to float64 so the comparison against the Python float
            # threshold gives the same result as comparing float(output).
            predictions = (outputs.reshape(-1).double() > threshold).long()
            labels = scores.reshape(-1).long()
            correct_num += (predictions == labels).sum().item()
            total_num += predictions.numel()
    # An empty loader yields 0/0; report 0.0 instead of dividing by zero.
    accuracy = correct_num / total_num if total_num else 0.0
    print(str(correct_num)+'/'+str(total_num))
    print('Accuracy: '+str(accuracy))
    accuracy_list.append(str(correct_num)+'/'+str(total_num))

evaluate_model_p(myModel, lcqmc_dev_loader)

In [None]:
# NOTE(review): bare literal whose only effect is being echoed as the cell
# output; looks like a leftover scratch value — confirm and remove.
0.5 

In [None]:
from BCEmbedding import EmbeddingModel
# NOTE(review): absolute, machine-specific model path — consider deriving it
# from a configurable base directory.
embedding_model = EmbeddingModel(model_name_or_path="/data/coding/Short-Text-Similarity/bce-embedding")
# Smoke test: encode a single-word list so the result can be inspected.
emb = embedding_model.encode(["手机"])

In [None]:
# Echo `emb` as the cell output for visual inspection.
emb

In [None]:
import pickle
def save_data_to_pkl(data, filename):
    """Pickle ``data`` into ``filename`` and report the destination on stdout.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(data, sink)
    print(f"Data saved to {filename}")

def load_data_from_pkl(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can run arbitrary code, so only trusted files should be loaded.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

# Load the four partial AFQMC BCE-embedding dictionaries.
# NOTE(review): absolute, machine-specific paths; the pickle files must come
# from a trusted source because unpickling can execute arbitrary code.
sentences_dict1 = load_data_from_pkl('/data/coding/Short-Text-Similarity/AFQMC_sentence_bce_dict_1.pkl')
sentences_dict2 = load_data_from_pkl('/data/coding/Short-Text-Similarity/AFQMC_sentence_bce_dict_2.pkl')
sentences_dict3 = load_data_from_pkl('/data/coding/Short-Text-Similarity/AFQMC_sentence_bce_dict_3.pkl')
sentences_dict4 = load_data_from_pkl('/data/coding/Short-Text-Similarity/AFQMC_sentence_bce_dict_4.pkl')
# Merge the parts; when a key occurs in several parts, the later part wins.
# NOTE(review): this rebinds `sentences_dict`, which the first cell also assigns.
sentences_dict = {**sentences_dict1, **sentences_dict2, **sentences_dict3, **sentences_dict4}
# Only the size of the first part is printed here.
print(len(sentences_dict1))



In [None]:
# Persist `sentences_dict` under a relative path (resolved against the current
# working directory).
save_data_to_pkl(sentences_dict,"AFQMC_sentence_bce_dict.pkl")

In [None]:
# Sizes of the four partial dictionaries followed by the merged one, one per line.
print(
    *map(len, (sentences_dict1, sentences_dict2, sentences_dict3,
               sentences_dict4, sentences_dict)),
    sep='\n',
)

In [None]:
from DataUtil import *
# Reader rooted at the local 'LCQMC' directory; get_data() is unpacked into six
# values (presumably train/dev/test sentence pairs and their scores — confirm
# against DataUtil).
LCQMCReader = LCQMCBenchmarkReader('LCQMC')
# NOTE(review): this rebinds the train_/dev_ names that the first cell assigns
# from the AFQMC reader.
train_sentence_pairs, train_scores, dev_sentence_pairs, dev_scores, test_sentence_pairs, test_scores = LCQMCReader.get_data()

# Earlier variant that used LCQMC_-prefixed names:
# sentence_list = LCQMC_train_sentence_pairs+LCQMC_dev_sentence_pairs+LCQMC_test_sentence_pairs
# Concatenate the three splits into a single sequence of sentence pairs.
sentence_list = train_sentence_pairs  + dev_sentence_pairs + test_sentence_pairs

In [None]:
from tqdm import tqdm
from BCEmbedding import EmbeddingModel
import numpy as np
from scipy.sparse import csr_matrix
import re
# NOTE(review): absolute, machine-specific model path — consider deriving it
# from a configurable base directory.
embedding_model = EmbeddingModel(model_name_or_path="/data/coding/Short-Text-Similarity/bce-embedding")
word_vector_length = 768
# Every sentence is truncated or zero-padded to exactly n word vectors.
n = 50


def _build_sentence_matrix(sentence, encoder, max_words, vector_length):
    """Return a ``max_words x vector_length`` CSR matrix of word embeddings.

    ``sentence`` is segmented with ``jieba.cut(..., cut_all=True)``; each of
    the first ``max_words`` segments is encoded on its own, and the remaining
    rows are zero vectors.

    NOTE(review): `jieba` is not imported in this cell; presumably it arrives
    through `from DataUtil import *` — confirm.
    """
    words = list(jieba.cut(sentence, cut_all=True))
    word_vectors = []
    # Segments past max_words would be dropped anyway, so they are not encoded.
    for word in words[:max_words]:
        try:
            word_vectors.append(encoder.encode([word])[0])
        except Exception:
            # Best effort: a segment the encoder rejects becomes a zero vector.
            # `Exception` (not a bare except) lets Ctrl-C stop this long loop.
            word_vectors.append(np.zeros(vector_length))
    # Zero-pad short sentences up to max_words rows.
    word_vectors.extend([np.zeros(vector_length)] * (max_words - len(word_vectors)))
    return csr_matrix(np.array(word_vectors))


sentence_dict = {}
for sentence1, sentence2 in tqdm(sentence_list):
    for sentence in (sentence1, sentence2):
        key = remove_punctuation(sentence)
        # A sentence that recurs across pairs maps to the same key, so it is
        # encoded only the first time it is seen.
        if key not in sentence_dict:
            # The matrix is built from the collected word vectors. Previously
            # they went into a separate list, so every stored matrix was all
            # zeros.
            sentence_dict[key] = _build_sentence_matrix(
                key, embedding_model, n, word_vector_length)

In [None]:
# Persist `sentence_dict` under a relative path (resolved against the current
# working directory).
save_data_to_pkl(sentence_dict, 'LCQMC_sentence_bce_dict.pkl')