# BERT微调

### 导包

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
BERT_PATH='./BERT_CCPoem_v1/BERT_CCPoem_v1/'
from sklearn.metrics import roc_auc_score, f1_score

In [17]:
import gc
def report_gpu():
   print(torch.cuda.list_gpu_processes())
   gc.collect()
   torch.cuda.empty_cache()
import gc
gc.collect()
torch.cuda.empty_cache()

## 创建数据集

In [18]:
df=pd.read_csv('./BERT_CCPoem_v1/dataprocess/data.csv')

In [19]:
df.head()

Unnamed: 0,translation,choices,answer
0,诗人啊，你竟像在遥远的地方站立船头。,行人初上木兰舟|骚人遥驻木兰舟|有人独上木兰舟|行人迢递木兰舟,1
1,他的双眼眼瞳碧绿而有光，头发金黄而弯曲，两鬓呈红色。,绿玉觜攒鸡脑破，玄金爪擘兔心开。|翅金肉白顶红麻，项糁毛青腿少瑕。|头似珊瑚项班红，翅如金箔...,3
2,清晨还是西北风。,清晨西北转|河岳西来转|凌晨从北固|西北转银潢,0
3,柴烟中红星乱闪。,流星紫入烟|红光生紫烟|乱荷红带紫|红星乱紫烟,3
4,在他们身边痛哭的只有尚未省事的儿郎。,狂叫唯童儿|狂呼造化儿|儿童趁欲狂|学叫笑儿娱,0


In [20]:
x=df.translation.to_numpy().reshape(-1,1)
choices=np.array(df['choices'].str.split('|').tolist())
true_label=df.answer.to_numpy().reshape(-1,1)
x.shape,choices.shape,true_label.shape

((21778, 1), (21778, 4), (21778, 1))

In [21]:
X = np.concatenate((x,choices ), axis=1) #沿着水平方向进行合并

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, true_label,test_size=0.3, random_state=10)
X_train.shape,X_test.shape,y_train.shape,y_test.shape


((15244, 5), (6534, 5), (15244, 1), (6534, 1))

## 微调策略
由于数据集的特殊性，一开始我们选择将数据集处理为原文：译文的形式，然后使用BERT进行微调。但是发现这样的微调方式并不好，因为最终模型的训练目标是二者相似，那么相似性可以划分为一个二分类问题，可以利用交叉熵的损失函数去进行训练，但是由于原文：译文的形式，导致训练集中的数据全是正例，模型在训练过程中携带了bias，无法进行训练。

在前者的基础上，我们将数据集划分为了 原文：选项（共4个，包含正确的译文）：正确的译文的下标的形式，这样通过计算原文与四个选项的相似度，即可在模型的输出外面再加上一个全连接层，输出的维度是分类数（4），这样损失函数依旧可以选用交叉熵，这样模型也学习了足够多的负例样本模型效果好

In [49]:
class BertForMultiClassClassificationDataset(torch.utils.data.Dataset):
    #定义一个数据集的抽象类
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        #获取长度
        return len(self.texts)

    def __getitem__(self, idx):
        #获取不同位置的元素
        text = self.texts[idx].tolist()
        label = self.labels[idx]
        #使用tokenizer.encode_plus对文本进行分词和编码
        tokens = self.tokenizer.batch_encode_plus(
            #在batch情况下，需要使用该函数防止维度有问题
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        # print(tokens['input_ids'])
        inputs = tokens['input_ids'].squeeze()
        attention_mask= tokens['attention_mask'].squeeze() #移除尺寸为1的维度
        labels = torch.tensor(label, dtype=torch.long)
        return inputs, attention_mask, labels

In [58]:
class BertForMultiClassClassification(nn.Module):
    def __init__(self, num_classes):
        super(BertForMultiClassClassification, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(0.01)  # 添加一个Dropout层
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)  # 全连接层，将BERT输出映射到分类标签

    def forward(self, input_ids,attention_mask):
        # input_ids: 输入的令牌序列
        # attention_mask: 输入的注意力掩码，用于指定哪些令牌参与注意力计算
        input_ids_split = torch.split(input_ids, split_size_or_sections=1, dim=1)
        attention_mask_split = torch.split(attention_mask, split_size_or_sections=1, dim=1)
        outputs = []
        for i in range(len(input_ids_split)):
            output= self.bert(input_ids_split[i].squeeze(), attention_mask=attention_mask_split[i].squeeze())
            pooled_output = output['pooler_output']

            pooled_output = self.dropout(pooled_output)# 使用Dropout层防止过拟合,一定的概率随机将某些神经元的输出设置为零，以防止过拟合。
            outputs.append( pooled_output)

        outputs = torch.stack(outputs, dim=1) #将五个维度的张量合在一起
        translation=outputs[:,0,:].unsqueeze(1)
        # 使用余弦相似度译文与四个选项的相似度
        similarity_scores =F.softmax( nn.functional.cosine_similarity(outputs[:,1:,:], translation,dim=2))
        
        

        return similarity_scores

### 超参数

In [70]:
# 定义一些超参数
max_length = 128 #最大长度
batch_size = 32 #批大小
num_epochs = 3 #迭代次数
learning_rate = 1e-8 #学习率
num_classes = 4  #分类数
max_batch=16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [61]:
tokenizer=BertTokenizer.from_pretrained(BERT_PATH)

In [62]:
# 生成相应数据集
train_dataset = BertForMultiClassClassificationDataset(X_train, y_train, tokenizer, max_length)
test_dataset = BertForMultiClassClassificationDataset(X_test, y_test, tokenizer, max_length)
# 加载相应数据集和模型
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

In [71]:
# 实例化模型
model = BertForMultiClassClassification(num_classes) #在models.py中定义的模型，其中使用了bert模型
model.to(device)


criterion = nn.CrossEntropyLoss() #交叉熵损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #使用adamw优化器

### 训练

In [74]:
model.train() #设置模型为训练模式
all_predictions = []
all_labels = []
for epoch in range(num_epochs):
        train_total_correct = 0
        train_total_samples = 0
        
        for input_ids, attention_mask, labels in train_dataloader:
                #对每个批次的训练数据进行前向传播、计算损失、反向传播和参数更新

                ## 将tensor移动到GPU上
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device).squeeze()
                # print(labels)
                # print(input_ids.shape,attention_mask.shape,labels.shape)

                # 优化器进行初始化
                optimizer.zero_grad()

                outputs = model(input_ids, attention_mask)
                # print(outputs.shape)
                _,predicted = torch.max(outputs,1)
                # print(predicted)
                loss = criterion(outputs, labels) #计算损失函数
                train_total_correct +=torch.sum(predicted == labels).item()
                train_total_samples += labels.size(0)
                loss.backward()#反向传播
                optimizer.step() #参数更新

        train_accuracy = train_total_correct / train_total_samples

        model.eval()

        with torch.no_grad():
                #不计算梯度，对test集的元素进行验证
                test_total_correct = 0
                test_total_samples = 0

                for test_input_ids, test_attention_mask, test_labels in test_dataloader:
                        test_input_ids = test_input_ids.to(device)
                        test_attention_mask = test_attention_mask.to(device)
                        test_labels = test_labels.to(device).squeeze()

                        test_outputs = model(test_input_ids, test_attention_mask)
                        _, test_predicted_labels = torch.max(test_outputs, dim=1)

                        test_total_correct += torch.sum(test_predicted_labels ==
                                                        test_labels).item()
                        test_total_samples += test_labels.size(0)
                        all_predictions.extend(test_predicted_labels.cpu().numpy())
                        all_labels.extend(test_labels.cpu().numpy())

        test_accuracy = test_total_correct / test_total_samples
        # auc = roc_auc_score(all_labels, all_predictions,multi_class='ovo')

        # 计算F1 Score
        f1 = f1_score(all_labels, all_predictions, average='weighted')  # 可以选择适当的average参数

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Training Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}  F1 Score: {f1:.4f}')


  similarity_scores =F.softmax( nn.functional.cosine_similarity(outputs[:,1:,:], translation,dim=2))


Epoch 1/3, Loss: 1.3846672773361206, Training Accuracy: 0.4541, Test Accuracy: 0.6336  F1 Score: 0.6336


  similarity_scores =F.softmax( nn.functional.cosine_similarity(outputs[:,1:,:], translation,dim=2))


Epoch 2/3, Loss: 1.38505220413208, Training Accuracy: 0.6517, Test Accuracy: 0.6356  F1 Score: 0.6346


  similarity_scores =F.softmax( nn.functional.cosine_similarity(outputs[:,1:,:], translation,dim=2))


Epoch 3/3, Loss: 1.3835978507995605, Training Accuracy: 0.6521, Test Accuracy: 0.6364  F1 Score: 0.6352


### 保存模型

In [75]:
# torch.save(model.state_dict(), 'bert_poem.bin') #保存模型参数至文件中
torch.save(model.bert.state_dict(), 'bert_model_params.pth')