In [None]:
from transformers import BertTokenizer,BertModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from DataUtil import *
from my_model_p import *
import torch
from tqdm import tqdm
from torch import optim
from torch.utils.data import Dataset, DataLoader

def save_data_to_pkl(data, filename):
    """Pickle ``data`` into the file at ``filename`` and print a confirmation.

    Args:
        data: Any picklable Python object.
        filename: Destination path; the file is opened in binary write mode,
            so an existing file at that path is overwritten.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle)
    print(f"Data saved to {filename}")

def load_data_from_pkl(filename):
    """Read and return the object pickled in the file at ``filename``.

    Args:
        filename: Path of a pickle file; opened in binary read mode.

    Returns:
        The deserialized Python object.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files from a
    # trusted source.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reader over the "OPPO-xiaobu" data directory; get_data() yields the train and
# dev sentence pairs together with their scores.
afqmc_benchmarkReader= AFQMCBenchmarkReader(data_path="OPPO-xiaobu")
train_sentence_pairs, train_scores, dev_sentence_pairs, dev_scores = afqmc_benchmarkReader.get_data()


# Directory (or model id) of the pretrained Chinese BERT checkpoint.
bert_file_path = "bert-chinese"
# Presumably the width of the external word embeddings — TODO confirm.
external_embed_dim = 200

# Tokenizer and encoder are loaded from the same checkpoint; the encoder is
# moved to the selected device.
tokenizer = BertTokenizer.from_pretrained(bert_file_path)
bert_model = BertModel.from_pretrained(bert_file_path).to(device)

# Precomputed sentence dictionary passed to the datasets below.
# NOTE(review): this is unpickled from disk — only load files you trust.
sentences_dict = load_data_from_pkl('OPPO-xiaobu/OPPOxiaobu_sentence_dict.pkl')

In [None]:
# Wrap the train/dev sentence pairs and scores in dataset objects that also
# receive the tokenizer and the precomputed sentence dictionary.
lcqmc_train_Dataset = STSDataset(train_sentence_pairs, train_scores, tokenizer, sentences_dict)
lcqmc_dev_Dataset = STSTestDataset(dev_sentence_pairs, dev_scores, tokenizer, sentences_dict)

# Shuffle only the training data; the dev loader keeps a fixed order.
lcqmc_train_loader = DataLoader(lcqmc_train_Dataset, batch_size=64, shuffle=True)
lcqmc_dev_loader = DataLoader(lcqmc_dev_Dataset, batch_size=64, shuffle=False)



print("Model loading...")
# Use the configured embedding width instead of repeating the literal 200, and
# move the model to the selected device.
myModel = SimilarityModel(bert_model, external_embed_dim).to(device)
# myModel.load_state_dict(torch.load('/data/szj/sts/transformers/SZJ_Model/STSModel.pth'))
# BCELoss expects probabilities in [0, 1]; this assumes SimilarityModel ends in
# a sigmoid — TODO confirm.
criterion = nn.BCELoss()

In [None]:
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` epochs, evaluating after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            sentence1_words_embeddings, sentence2_words_embeddings)``.
        train_loader: Iterable of ``(inputs, scores, sentence1_words_embeddings,
            sentence2_words_embeddings)`` batches, where ``inputs`` is a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        test_loader: Loader handed to ``evaluate_model_p`` after each epoch.
        criterion: Loss callable applied as ``criterion(outputs, scores)``.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of passes over ``train_loader``.

    Prints the mean training loss per epoch; returns nothing.
    """
    for epoch in range(epochs):
        # evaluate_model_p leaves the model in eval mode, so training mode has
        # to be re-enabled at the start of every epoch.
        model.train()
        total_loss = 0
        # tqdm wraps the loader to show a progress bar.
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            # squeeze(1) drops dim 1 when it has size 1 — presumably a singleton
            # dimension added per sample by the dataset; TODO confirm.
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            # Float targets with a trailing dimension, matching the model output.
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

        # Evaluate the model that was actually trained here, not whatever the
        # global ``myModel`` happens to reference.
        evaluate_model_p(model, test_loader)

# Evaluation function
def evaluate_model(model, data_loader, criterion):
    """Compute the mean per-batch loss of ``model`` over ``data_loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        data_loader: Iterable of ``(inputs, scores, sentence1_words_embeddings,
            sentence2_words_embeddings)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, scores)``.

    Returns:
        The summed batch losses divided by ``len(data_loader)``; the value is
        also printed.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            # squeeze(1) drops dim 1 when it has size 1 — presumably a singleton
            # dimension added per sample by the dataset; TODO confirm.
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            # Prepare targets and embeddings exactly as the training loop does:
            # float dtype, and a trailing dimension on the targets so their
            # shape matches the model output.
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            sentence1_words_embeddings=sentence1_words_embeddings,
                            sentence2_words_embeddings=sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            total_loss += loss.item()
    test_loss = total_loss / len(data_loader)
    print(f"Test Loss: {test_loss}")
    return test_loss

def evaluate_model_p(model, data_loader, threshold=0.5):
    """Report loss and accuracy on ``data_loader`` and track the best model.

    Reads the module-level ``criterion`` and ``device``, appends the mean loss
    to the module-level ``loss_list``, and updates the module-level
    ``best_loss`` / ``best_model`` when the loss does not exceed the best so far.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        data_loader: Iterable of ``(inputs, scores, sentence1_words_embeddings,
            sentence2_words_embeddings)`` batches.
        threshold: An output strictly greater than this counts as class 1.
    """
    import copy  # local import: the notebook's import cell does not provide it

    global best_loss
    global best_model

    model.eval()
    ture_num = 0
    total_num = 0
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            # squeeze(1) drops dim 1 when it has size 1 — presumably a singleton
            # dimension added per sample by the dataset; TODO confirm.
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            total_loss += loss.item()

            # Vectorized accuracy: one device sync per batch instead of one per
            # sample. The comparison runs in float64 so it matches comparing a
            # Python float against the threshold.
            predictions = (outputs.reshape(-1).double() > threshold).long()
            labels = scores.reshape(-1).long()
            ture_num += (predictions == labels).sum().item()
            total_num += predictions.numel()
    test_loss = total_loss / len(data_loader)
    print(f"Test Loss: {test_loss}")
    accuracy = ture_num / total_num
    print('Accuracy: '+str(accuracy))
    loss_list.append(test_loss)
    if test_loss <= best_loss:
        best_loss = test_loss
        # Store an independent snapshot; keeping a reference to ``model`` would
        # let later training steps silently overwrite the "best" weights.
        best_model = copy.deepcopy(model)
    print("best:"+str(best_loss))

# Training the model
print("Training...")
best_loss = 1000.0
loss_list = []
# Initialise the best-model slot so restoring weights below cannot hit an
# undefined name if no evaluation ever improves on best_loss (e.g. NaN loss).
best_model = myModel

# Stage 1: highest learning rate.
optimizer = optim.Adam(myModel.parameters(), lr=2e-5)
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader,criterion, optimizer, epochs=5)
# Restore the best weights in place so ``myModel`` stays the same object and
# does not become an alias of the stored best model.
myModel.load_state_dict(best_model.state_dict())
print('-------------')
# Stage 2: lower learning rate, starting from the best weights so far.
optimizer = optim.Adam(myModel.parameters(), lr=1e-5)
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader,criterion, optimizer, epochs=5)
myModel.load_state_dict(best_model.state_dict())
# Stage 3: lowest learning rate.
optimizer = optim.Adam(myModel.parameters(), lr=1e-6)
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader,criterion, optimizer, epochs=5)


In [None]:
# Train for 10 more epochs, reusing the module-level ``optimizer`` as it was
# last assigned.
train_model(myModel, lcqmc_train_loader, lcqmc_dev_loader,criterion, optimizer, epochs=10)

In [None]:
def evaluate_model_p(model, data_loader, threshold=0.6):
    """Report loss and accuracy on ``data_loader`` and track the best model.

    Running this cell rebinds the name ``evaluate_model_p``; this version
    classifies with a default decision threshold of 0.6.

    Reads the module-level ``criterion`` and ``device``, appends the mean loss
    to the module-level ``loss_list``, and updates the module-level
    ``best_loss`` / ``best_model`` when the loss does not exceed the best so far.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        data_loader: Iterable of ``(inputs, scores, sentence1_words_embeddings,
            sentence2_words_embeddings)`` batches.
        threshold: An output strictly greater than this counts as class 1.
    """
    import copy  # local import: the notebook's import cell does not provide it

    global best_loss
    global best_model

    model.eval()
    ture_num = 0
    total_num = 0
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            inputs, scores, sentence1_words_embeddings, sentence2_words_embeddings = batch
            # squeeze(1) drops dim 1 when it has size 1 — presumably a singleton
            # dimension added per sample by the dataset; TODO confirm.
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            scores = scores.float().to(device).unsqueeze(1)
            sentence1_words_embeddings = sentence1_words_embeddings.float().to(device)
            sentence2_words_embeddings = sentence2_words_embeddings.float().to(device)

            outputs = model(input_ids, attention_mask,
                            sentence1_words_embeddings,
                            sentence2_words_embeddings)
            loss = criterion(outputs, scores)
            total_loss += loss.item()

            # Vectorized accuracy: one device sync per batch instead of one per
            # sample. The comparison runs in float64 so it matches comparing a
            # Python float against the threshold.
            predictions = (outputs.reshape(-1).double() > threshold).long()
            labels = scores.reshape(-1).long()
            ture_num += (predictions == labels).sum().item()
            total_num += predictions.numel()
    test_loss = total_loss / len(data_loader)
    print(f"Test Loss: {test_loss}")
    accuracy = ture_num / total_num
    print('Accuracy: '+str(accuracy))
    loss_list.append(test_loss)
    if test_loss <= best_loss:
        best_loss = test_loss
        # Store an independent snapshot; keeping a reference to ``model`` would
        # let later training steps silently overwrite the "best" weights.
        best_model = copy.deepcopy(model)
    print("best:"+str(best_loss))
# Evaluate the current model on the dev loader with the evaluate_model_p
# defined in this cell.
evaluate_model_p(myModel,lcqmc_dev_loader)