In [185]:
import nltk
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch.nn.functional as F   # 激励函数的库
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import math

In [186]:
# Global configuration
n_epochs = 10  # number of training epochs
batch_size = 1  # number of samples the DataLoaders draw per batch

# Shared helpers
# Sentence encoder used by sentence_embedding() to embed clause text.
bert = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
# Placeholder; sentence_embedding() overwrites this with the column count of
# the tensors it builds, and the model is later constructed with that value.
embedding_dim=0

In [187]:
def label_translate(label):
    """Convert a sequence of 'yes'/'no' labels into one-hot rows.

    'no' maps to ``[1.0, 0.0]`` and 'yes' maps to ``[0.0, 1.0]``, so the result
    has exactly one row per input label, in input order.

    Args:
        label: iterable of label values, each expected to be 'yes' or 'no'.

    Returns:
        A float tensor of shape ``(len(label), 2)``; ``(0, 2)`` for empty input.

    Raises:
        ValueError: if a label is neither 'yes' nor 'no'. Skipping such a
            label would leave the result with fewer rows than there are
            clauses, silently shifting every later label onto the wrong clause.
    """
    rows = []
    for position, item in enumerate(label):
        if item == 'no':
            rows.append([1.0, 0.0])
        elif item == 'yes':
            rows.append([0.0, 1.0])
        else:
            raise ValueError(
                "unknown label %r at position %d; expected 'yes' or 'no'"
                % (item, position))
    if not rows:
        # torch.tensor([]) would be 1-D; keep the two-column layout instead.
        return torch.empty((0, 2))
    return torch.tensor(rows)

def sentence_embedding(sent,rel_pos):
    """Embed the clauses of one sentence and append their relative positions.

    ``sent`` is passed to the module-level ``bert`` encoder (encode batch size
    1) and the result is wrapped in a tensor. ``rel_pos`` is converted to a
    tensor, given a trailing axis, and concatenated along dimension 1 as one
    extra column.

    Side effect: the module-level ``embedding_dim`` is set to the number of
    columns of the returned tensor.

    Returns:
        The concatenated tensor (one row per clause — assumes ``sent`` and
        ``rel_pos`` are equally long 1-D sequences; TODO confirm).
    """
    global embedding_dim
    clause_vectors = torch.tensor(bert.encode(sent, batch_size=1))
    # unsqueeze(1) turns the positions into a single column.
    position_column = torch.tensor(rel_pos).unsqueeze(1)
    combined = torch.cat((clause_vectors, position_column), 1)
    embedding_dim = combined.shape[1]
    return combined


In [188]:
class ECEDataset_sent(Dataset):
    """Dataset that serves one sentence (all of its clauses) per item.

    The CSV at ``data_file`` is read with pandas and grouped by the
    ``sent_num`` column. For every group, the ``text`` and ``rel_pos`` columns
    are embedded with ``sentence_embedding`` and the ``label`` column is
    converted with ``label_translate``; both results are cached in memory at
    construction time.
    """

    def __init__(self, data_file, transform=None, target_transform=None):
        # Optional callables applied in __getitem__ to the clause tensor and
        # to the label tensor respectively.
        self.transform = transform
        self.target_transform = target_transform

        self.data = pd.read_csv(data_file)
        self.sent_data = self.data.groupby('sent_num')

        self.clauses = []
        self.labels = []
        for _, group in self.sent_data:
            texts = group["text"].values
            tags = group["label"].values
            positions = group["rel_pos"].values
            self.clauses.append(sentence_embedding(texts, positions))
            self.labels.append(label_translate(tags))
        self.length = len(self.clauses)

    def __len__(self):
        """Return the number of sentences cached by the constructor."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(clause, label)`` for sentence ``idx`` after optional transforms."""
        clause = self.clauses[idx]
        label = self.labels[idx]
        clause = self.transform(clause) if self.transform else clause
        label = self.target_transform(label) if self.target_transform else label
        return clause, label

# Build the sentence-level dataset.
# The CSV location can be overridden with the ECE_DATA_FILE environment
# variable so the notebook also runs outside the original author's machine;
# the default is the original path.
DATA_FILE = os.environ.get(
    "ECE_DATA_FILE",
    "C:\\Users\\27545\\VSCode\\Jupyter\\NLP\\ECE\\clause_keywords.csv")
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42

data = ECEDataset_sent(DATA_FILE)

# Split into training and test sets (90% / 10% of the sentences).
train_data,test_data=train_test_split(data,train_size=0.9,random_state=SPLIT_SEED)
print(train_data[0],"\n",test_data[0])

# Create the loaders. Only the training loader shuffles; the evaluation
# metrics are sums over the test set, so its order does not matter.
train_loader=torch.utils.data.DataLoader(train_data, batch_size = batch_size, num_workers = 1,shuffle=True)
test_loader=torch.utils.data.DataLoader(test_data, batch_size = batch_size, num_workers = 1,shuffle=False)

(tensor([[-2.9092e-01,  5.2768e-01,  5.5117e-02,  ...,  1.0992e+00,
         -3.3987e-01, -2.4000e+01],
        [-2.0327e-01,  6.5947e-01,  3.0245e-01,  ...,  5.2761e-01,
         -1.4165e-02, -2.3000e+01],
        [ 2.0353e-02,  5.2573e-01,  3.3532e-01,  ...,  5.9646e-01,
         -1.5202e-01, -2.2000e+01],
        ...,
        [ 2.3827e-01,  3.9158e-01,  6.9815e-01,  ...,  3.5373e-01,
         -1.8468e-01,  4.0000e+00],
        [-2.3928e-01,  1.4739e-01,  1.7036e+00,  ...,  2.4247e-01,
         -1.1574e-01,  5.0000e+00],
        [-9.7419e-02,  3.1230e-01,  3.4986e-01,  ...,  2.2160e-01,
         -3.9245e-01,  6.0000e+00]]), tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
      

In [189]:
# Multilayer perceptron
class MLP(nn.Module):
    """Three linear layers (embedding_dim -> 256 -> 32 -> 2) with ReLU in between.

    The output is passed through a softmax over dimension 1, so each row of a
    2-D input yields two values that sum to one.
    """

    def __init__(self,embedding_dim=768):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, 256)
        self.fc2 = nn.Linear(256, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self,din):
        """Map input ``din`` to per-row class probabilities."""
        hidden = torch.relu(self.fc1(din))
        hidden = torch.relu(self.fc2(hidden))
        scores = self.fc3(hidden)
        # Softmax over the two class scores; the larger one is the prediction.
        return torch.softmax(scores, dim=1)

In [190]:
class dot_attention(nn.Module):
    """Dot-product attention over batched query/key/value tensors."""

    def __init__(self, attention_dropout=0.0):
        super(dot_attention, self).__init__()
        # Registered for interface compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(attention_dropout)
        # Normalises the scores over the key axis (dimension 2).
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """Compute attention weights and the weighted sum of ``v``.

        :param q: queries, batched-matrix-multiplied with ``k`` transposed on
            its last two axes (3-D tensors, as required by ``torch.bmm``).
        :param k: keys.
        :param v: values, combined using the attention weights.
        :param scale: optional factor multiplied into the raw scores; ``None``
            or ``0`` leaves the scores unscaled.
        :param attn_mask: optional boolean tensor broadcastable to the score
            tensor; positions that are ``True`` are excluded from attention.
            A row that is masked entirely yields NaN weights.
        :return: ``(context, attention)`` — the weighted values and the
            softmax-normalised weights.
        """
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            attention = attention * scale
        # Compare against None explicitly: the truth value of a multi-element
        # tensor is ambiguous and raises a RuntimeError.
        if attn_mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            attention = attention.masked_fill(attn_mask, float("-inf"))
        attention = self.softmax(attention)
        context = torch.bmm(attention, v)
        return context, attention

In [191]:
# Multi-head attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention with a residual connection and layer normalisation.

    With ``num_heads == 1`` this computes the same result as a single
    dot-product attention over the projected inputs.
    """

    def __init__(self, model_dim=768, num_heads=2, dropout=0.0):
        super(MultiHeadAttention, self).__init__()

        if num_heads < 1 or model_dim < num_heads:
            raise ValueError(
                "num_heads must be between 1 and model_dim, got num_heads=%r, model_dim=%r"
                % (num_heads, model_dim))

        self.dim_per_head = model_dim//num_heads   # width of each head
        self.num_heads = num_heads
        # Total projected width; smaller than model_dim when model_dim is not
        # divisible by num_heads.
        inner_dim = self.dim_per_head * num_heads
        self.linear_k = nn.Linear(model_dim, inner_dim)
        self.linear_v = nn.Linear(model_dim, inner_dim)
        self.linear_q = nn.Linear(model_dim, inner_dim)

        self.dot_product_attention = dot_attention(dropout)

        # Maps the concatenated heads back to model_dim so the residual sum is
        # valid even when inner_dim != model_dim.
        self.linear_final = nn.Linear(inner_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, heads*dim) into (batch*heads, seq, dim)."""
        heads = projected.reshape(batch_size, -1, self.num_heads, self.dim_per_head)
        # Move the head axis in front of the sequence axis before merging it
        # with the batch axis, so each head keeps its own full sequence.
        return heads.transpose(1, 2).reshape(
            batch_size * self.num_heads, -1, self.dim_per_head)

    def forward(self, key, value, query, attn_mask=None):
        """Attend from ``query`` over ``key``/``value``.

        :param key: tensor whose first axis is the batch and last axis is model_dim.
        :param value: tensor laid out like ``key``.
        :param query: tensor with the same batch size and last axis as ``key``;
            also used as the residual input.
        :param attn_mask: optional boolean mask with one entry per batch item
            on its first axis; it is repeated once per head.
        :return: ``(output, attention)`` where ``output`` has the shape of
            ``query`` and ``attention`` has ``batch * num_heads`` on its first
            axis (heads of the same batch item are adjacent).
        """
        # Residual connection
        residual = query

        dim_per_head = self.dim_per_head
        num_heads = self.num_heads
        batch_size = key.size(0)

        # Linear projections, then split into heads.
        key = self._split_heads(self.linear_k(key), batch_size)
        value = self._split_heads(self.linear_v(value), batch_size)
        query = self._split_heads(self.linear_q(query), batch_size)

        # Explicit None check: the truth value of a tensor mask is ambiguous.
        if attn_mask is not None:
            # repeat_interleave matches the batch-major head ordering above.
            attn_mask = attn_mask.repeat_interleave(num_heads, dim=0)

        # Scaled dot-product attention; the scale is 1/sqrt(per-head width).
        scale = dim_per_head ** -0.5
        context, attention = self.dot_product_attention(query, key, value, scale, attn_mask)

        # Concatenate the heads back into (batch, seq, heads*dim).
        context = context.reshape(batch_size, num_heads, -1, dim_per_head)
        context = context.transpose(1, 2).reshape(batch_size, -1, dim_per_head * num_heads)

        # Output projection
        output = self.linear_final(context)

        # Dropout
        output = self.dropout(output)

        # Residual sum followed by layer normalisation.
        output = self.layer_norm(residual + output)

        return output, attention

# BERT + stacked multi-head attention model
class Attention_MLP(nn.Module):
    """Five chained MultiHeadAttention layers followed by a 2-way linear classifier.

    The attention weights of the last layer rescale the key vectors; the
    rescaled vectors are classified by ``FC`` and normalised with a softmax
    over dimension 1.
    """

    def __init__(self,model_dim=768, num_heads=1, dropout=0.0):
        super().__init__()
        self.attention1 = MultiHeadAttention(model_dim, num_heads, dropout)
        self.attention2 = MultiHeadAttention(model_dim, num_heads, dropout)
        self.attention3 = MultiHeadAttention(model_dim, num_heads, dropout)
        self.attention4 = MultiHeadAttention(model_dim, num_heads, dropout)
        self.attention5 = MultiHeadAttention(model_dim, num_heads, dropout)

        self.FC = torch.nn.Linear(model_dim, 2)

    def forward(self,query,key,value):
        """Return the softmax-normalised 2-way scores for the rows of ``key``.

        Every layer attends over the same ``key``/``value``; the output of one
        layer is the query of the next.
        """
        stack = (self.attention1, self.attention2, self.attention3,
                 self.attention4, self.attention5)
        hidden = query
        weights = None
        for layer in stack:
            hidden, weights = layer.forward(key, value, hidden)

        # Broadcast the last layer's weights over the feature axis of key.
        weighted = (key.transpose(1, 2) * weights).transpose(1, 2)

        # NOTE(review): squeeze(0) followed by softmax(dim=1) appears to assume
        # a leading axis of size 1 (batch size 1, one head) — confirm.
        return F.softmax(self.FC(weighted.squeeze(0)), dim=1)

In [192]:
# Train the network
def train():
    """Train the module-level ``model`` for ``n_epochs`` epochs on ``train_loader``.

    Uses a class-weighted cross-entropy loss (weights 1 and 15) and SGD with
    learning rate 0.01. Batches whose labels contain no ``[0.0, 1.0]`` row are
    skipped. After every epoch the accumulated loss is printed and ``test()``
    is called.

    NOTE(review): the query clause is picked from the position of the first
    ``[0.0, 1.0]`` label row, i.e. from the target itself — confirm this is
    intended.
    NOTE(review): the model is never switched between train and eval mode, so
    its dropout layers behave the same in ``train()`` and ``test()`` — confirm.
    """
    # Loss and optimiser. The weights are created as floats directly on
    # `device`, so this also works when CUDA is unavailable.
    lossfunc = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 15.0], device=device))
    optimizer = torch.optim.SGD(params = model.parameters(), lr = 0.01)
    # Training loop
    for epoch in range(n_epochs):
        print('Epoch:  {}  \t'.format(epoch+1))
        train_loss = 0.0
        for data,target in tqdm(train_loader):
            keys=data
            target = target.squeeze(0)
            label_rows = target.numpy().tolist()
            if [0.0,1.0] not in label_rows:
                continue
            # Index of the first clause labelled [0.0, 1.0]; it becomes the query.
            pos = label_rows.index([0.0,1.0])
            querys=keys[:,pos,:].unsqueeze(0)
            querys,keys,target=querys.to(device),keys.to(device),target.to(device)  # move to device
            optimizer.zero_grad()   # clear gradients left from the previous step
            output = model(querys,keys,keys)    # predictions
            loss = lossfunc(output,target)  # loss between predictions and labels
            loss.backward()         # back-propagate
            optimizer.step()        # apply the parameter update
            # Loss scaled by (batch size * rows per sample) of this batch.
            train_loss += loss.item()*(len(data)*len(data[0]))
        # Divided by the dataset size, which includes skipped samples.
        train_loss = train_loss / len(train_loader.dataset)
        print('Training Loss: {:.6f}'.format( train_loss))
        # Evaluate after every pass over the training data.
        test()

# Evaluate the network on the test set
def test():
    """Evaluate the module-level ``model`` on ``test_loader``.

    A clause is predicted positive unless its first output score is strictly
    greater than its second. Batches whose labels contain no ``[0.0, 1.0]``
    row are skipped.

    NOTE(review): as in training, the query clause is chosen from the label
    position, so evaluation depends on the gold labels — confirm this is
    intended.

    Returns:
        ``(precision, recall, F)`` as percentages. A metric whose denominator
        is zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    TP,FN,FP,TN=0,0,0,0
    with torch.no_grad():  # no gradients are needed for evaluation
        for data, target in tqdm(test_loader):
            keys=data
            target = target.squeeze(0)
            label_rows = target.numpy().tolist()
            if [0.0, 1.0] not in label_rows:
                continue
            pos = label_rows.index([0.0,1.0])
            querys=keys[:,pos,:].unsqueeze(0)
            querys, keys, target = querys.to(device), keys.to(
                device), target.to(device)  # move to device
            output = model(querys, keys, keys)  # predictions

            # Same rule as a per-row comparison: class 0 only on a strict win.
            pred_pos = ~(output[:, 0] > output[:, 1])
            gold_pos = target[:, 1] == 1.0
            # A row counts as negative only if it is not already positive.
            gold_neg = (target[:, 0] == 1.0) & ~gold_pos
            TP += int((pred_pos & gold_pos).sum())
            FN += int((~pred_pos & gold_pos).sum())
            FP += int((pred_pos & gold_neg).sum())
            TN += int((~pred_pos & gold_neg).sum())

    # Guard every division: no predicted positives, no gold positives, or a
    # zero precision/recall would otherwise raise ZeroDivisionError.
    P = TP/(TP+FP) if (TP+FP) else 0.0
    R = TP/(TP+FN) if (TP+FN) else 0.0
    # Named F1 so the local does not shadow the torch.nn.functional alias F.
    F1 = 2/(1/P+1/R) if (P and R) else 0.0
    print('Pricision, Recall and F of the network on the test clause: %f %%, %f %%, %f %%' % (
        100.0 * P,100.0*R,100.0*F1))
    return 100.0 * P,100.0*R,100.0*F1

# Pick the device, build the attention model on it, and start training.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# NOTE(review): dropout=1 is forwarded to nn.Dropout inside every
# MultiHeadAttention layer; a rate of 1 drops every element of the projected
# attention output while the module is in training mode — confirm this is
# intended.
model = Attention_MLP(model_dim=embedding_dim, num_heads=1, dropout=1)
model.to(device)
train()

cuda:0
Epoch:  1  	


100%|██████████| 1895/1895 [00:16<00:00, 114.34it/s]


Training Loss: 17.494541


100%|██████████| 211/211 [00:03<00:00, 66.04it/s] 


Pricision, Recall and F of the network on the test clause: 23.634053 %, 86.915888 %, 37.162837 %
Epoch:  2  	


100%|██████████| 1895/1895 [00:16<00:00, 113.76it/s]


Training Loss: 12.863670


100%|██████████| 211/211 [00:03<00:00, 60.52it/s] 


Pricision, Recall and F of the network on the test clause: 45.291480 %, 94.392523 %, 61.212121 %
Epoch:  3  	


100%|██████████| 1895/1895 [00:16<00:00, 115.42it/s]


Training Loss: 11.153429


100%|██████████| 211/211 [00:03<00:00, 67.16it/s] 


Pricision, Recall and F of the network on the test clause: 55.645161 %, 96.728972 %, 70.648464 %
Epoch:  4  	


100%|██████████| 1895/1895 [00:16<00:00, 116.01it/s]


Training Loss: 10.563442


100%|██████████| 211/211 [00:03<00:00, 64.25it/s] 


Pricision, Recall and F of the network on the test clause: 66.883117 %, 96.261682 %, 78.927203 %
Epoch:  5  	


100%|██████████| 1895/1895 [00:14<00:00, 127.73it/s]


Training Loss: 10.310930


100%|██████████| 211/211 [00:02<00:00, 71.32it/s] 


Pricision, Recall and F of the network on the test clause: 66.559486 %, 96.728972 %, 78.857143 %
Epoch:  6  	


100%|██████████| 1895/1895 [00:15<00:00, 126.18it/s]


Training Loss: 10.155454


100%|██████████| 211/211 [00:02<00:00, 73.02it/s] 


Pricision, Recall and F of the network on the test clause: 69.696970 %, 96.728972 %, 81.017613 %
Epoch:  7  	


100%|██████████| 1895/1895 [00:13<00:00, 135.87it/s]


Training Loss: 10.024655


100%|██████████| 211/211 [00:02<00:00, 77.73it/s] 


Pricision, Recall and F of the network on the test clause: 72.887324 %, 96.728972 %, 83.132530 %
Epoch:  8  	


100%|██████████| 1895/1895 [00:14<00:00, 132.30it/s]


Training Loss: 10.001854


100%|██████████| 211/211 [00:02<00:00, 76.71it/s] 


Pricision, Recall and F of the network on the test clause: 72.125436 %, 96.728972 %, 82.634731 %
Epoch:  9  	


100%|██████████| 1895/1895 [00:14<00:00, 133.24it/s]


Training Loss: 9.914693


100%|██████████| 211/211 [00:02<00:00, 78.51it/s] 


Pricision, Recall and F of the network on the test clause: 75.090253 %, 97.196262 %, 84.725051 %
Epoch:  10  	


100%|██████████| 1895/1895 [00:14<00:00, 133.05it/s]


Training Loss: 9.916469


100%|██████████| 211/211 [00:02<00:00, 74.99it/s] 

Pricision, Recall and F of the network on the test clause: 73.309609 %, 96.261682 %, 83.232323 %



