<a href="https://colab.research.google.com/github/Rayars/ECE/blob/main/BERT_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers
!pip install sklearn

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 7.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 53.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 62.1 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch.nn.functional as F   # 激励函数的库
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random 
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# 定义全局变量
n_epochs = 10     # epoch 的数目
batch_size = 20  # 决定每次读取多少样本

class ECEDataset_sent(Dataset): #每次读取一个句子
    def __init__(self, data_file, transform=None, target_transform=None):
        self.data = pd.read_csv(data_file)
        self.sent_data=self.data.groupby('sent_num')
        self.clauses,self.labels=[],[]
        for i in self.sent_data:
          self.clauses.append(i[1])
          self.labels.append(i[1]['label'])
        self.length=len(self.clauses)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.length

    def __getitem__(self, idx): 
        clause=list(self.clauses[idx]['text'].values)
        clause += ["" for i in range(75-len(clause))]
        label=list(self.labels[idx].values)
        label += ["no" for i in range(75-len(label))]


        if self.transform:
            clause = self.transform(clause)
        if self.target_transform:
            label = self.target_transform(label)
        return clause, label

# 定义训练和数据集，以句子为单位
data=ECEDataset_sent("/content/sample_data/clause_keywords.csv")

#划分训练集和测试集
train_data,test_data=train_test_split(data,train_size=0.8)
# print(train_data[0],"\n",test_data[0])
# print(len(train_data))
# max=0
# for i in range(len(train_data)):
#   if len(train_data[i][0])>max:
#     max=len(train_data[i][0])
# for i in range(len(test_data)):
#   if len(test_data[i][0])>max:
#     max=len(test_data[i][0])  
# print(max)


# 创建加载器
train_loader=torch.utils.data.DataLoader(train_data, batch_size = batch_size, num_workers = 2,shuffle=True,drop_last=True)
test_loader=torch.utils.data.DataLoader(test_data, batch_size = batch_size, num_workers = 2,shuffle=True,drop_last=True)

In [4]:
# 感知机网络
class MLP(nn.Module):   
    def __init__(self):
        super(MLP,self).__init__()   
        
        self.fc1 = torch.nn.Linear(768,256)  
        self.fc2 = torch.nn.Linear(256,128)  
        self.fc3 = torch.nn.Linear(128,64)   
        self.fc4 = torch.nn.Linear(64,32)   
        self.fc5 = torch.nn.Linear(32,8)
        self.fc6 = torch.nn.Linear(8,2)
        

        
    def forward(self,din):
        # 前向传播， 输入值：din, 返回值 dout
        dout = F.relu(self.fc1(din))   # 使用 relu 激活函数
        dout = F.relu(self.fc2(dout))
        dout = F.relu(self.fc3(dout))
        dout = F.relu(self.fc4(dout))
        dout = F.relu(self.fc5(dout))
        dout = F.softmax(self.fc6(dout), dim=1)  # 输出层使用 softmax 激活函数
        # 2个数字实际上是2个类别，输出是概率分布，最后选取概率最大的作为预测值输出
        return dout

In [5]:
class dot_attention(nn.Module):
    """ 点积注意力机制"""

    def __init__(self, attention_dropout=0.0):
        super(dot_attention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """
        前向传播
        :param q:
        :param k:
        :param v:
        :param scale:
        :param attn_mask:
        :return: 上下文张量和attention张量。
        """
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            attention = attention * scale        # 是否设置缩放
        if attn_mask:
            attention = attention.masked_fill(attn_mask, -np.inf)     # 给需要mask的地方设置一个负无穷。
        # 计算softmax
        attention = self.softmax(attention)
        # 添加dropout
        attention = self.dropout(attention)
        # 和v做点积。
        context = torch.bmm(attention, v)
        return context, attention

In [6]:
#多头自注意力机制 
class MultiHeadAttention(nn.Module):
    """ 多头自注意力"""
    def __init__(self, model_dim=768, num_heads=2, dropout=0.0):
        super(MultiHeadAttention, self).__init__()

        self.dim_per_head = model_dim//num_heads   # 每个头的维度
        self.num_heads = num_heads
        self.linear_k = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_v = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_q = nn.Linear(model_dim, self.dim_per_head * num_heads)

        self.dot_product_attention = dot_attention(dropout)

        self.linear_final = nn.Linear(model_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)         # LayerNorm 归一化。

    def forward(self, key, value, query, attn_mask=None):
        # 残差连接
        residual = query

        dim_per_head = self.dim_per_head
        num_heads = self.num_heads
        batch_size = key.size(0)

        # 线性映射。
        key = self.linear_k(key)
        value = self.linear_v(value)
        query = self.linear_q(query)

        # 按照头进行分割
        key = key.view(batch_size * num_heads, -1, dim_per_head)
        value = value.view(batch_size * num_heads, -1, dim_per_head)
        query = query.view(batch_size * num_heads, -1, dim_per_head)

        if attn_mask:
            attn_mask = attn_mask.repeat(num_heads, 1, 1)

        # 缩放点击注意力机制
        scale = (key.size(-1) // num_heads) ** -0.5
        context, attention = self.dot_product_attention(query, key, value, scale, attn_mask)

        # 进行头合并 concat heads
        context = context.view(batch_size, -1, dim_per_head * num_heads)

        # 进行线性映射
        output = self.linear_final(context)

        # dropout
        output = self.dropout(output)

        # 添加残差层和正则化层。
        output = self.layer_norm(residual + output)

        return output, attention
#BERT_MultiHeadSelfAttention模型
class Attention_MLP(nn.Module):

    def __init__(self,model_dim=768, num_heads=1, dropout=0.0):
        super().__init__()
        self.attention=MultiHeadAttention(model_dim=768, num_heads=1, dropout=0.0)
        self.MLP=MLP()

    def forward(self,query,key,value):
        output,attention=self.attention.forward(key,query,value)
        result=[]
        for sent in output:
          sent_r=[]
          for clause in sent:
            sent_r.append(self.MLP(clause.unsqueeze(0)).squeeze(0))
          result.append(torch.stack(sent_r))     
        return torch.stack(result)

In [7]:
#常用函数
def label_translate(target):
  target=list(target)
  for i in range(len(target)):
    if target[i]=='yes':
      target[i]=[1.0,0.0]
    else:
      target[i]=[0.0,1.0]
  target=torch.tensor(target)
  return target

def sentence_embedding(sent):#对整个句子做嵌入，返回子句向量的list
  sentence=[]
  bert=SentenceTransformer('bert-base-nli-mean-tokens')
  for c in sent:
    v=torch.tensor(bert.encode(c))
    sentence.append(v)
  sentence=torch.stack(sentence)
  return sentence

def positoin_embedding(clause):
  pass

In [None]:
# 训练神经网络
def train():
    #定义损失函数和优化器
    lossfunc = nn.CrossEntropyLoss()
    lossfunc.cuda()
    optimizer = torch.optim.SGD(params = model.parameters(), lr = 0.01)
    # 开始训练
    for epoch in range(n_epochs):
        print('Epoch:  {}  \t'.format(epoch+1))
        train_loss = 0.0
        for data,target in tqdm(train_loader):
            tran_data,tran_target=[],[]
            for i in range(batch_size):#经过这个双重循环可以将dataloader取出的数据转置
              sentence=[]
              label=[]
              for d in data:
                sentence.append(d[i])
              for l in target:
                label.append(l[i])
              tran_data.append(sentence)
              tran_target.append(label)
            data,target=tran_data,tran_target
            keys=data
            for i in range(batch_size):   #做batch*50大小的嵌入
              keys[i]=sentence_embedding(keys[i])
            keys=torch.stack(keys,0)
            querys=keys
            newtarget=[]
            for t in target:
              newtarget.append(label_translate(t))
            target=torch.stack(newtarget,0)
            querys,keys,target=querys.to(device),keys.to(device),target.to(device) #送入cuda
            optimizer.zero_grad()   # 清空上一步的残余更新参数值
            output = model(querys,keys,keys)    # 得到预测值           
            loss = lossfunc(output,target)  # 计算两者的误差
            loss.backward()         # 误差反向传播, 计算参数更新值
            optimizer.step()        # 将参数更新值施加到 net 的 parameters 上
            train_loss += loss.item()*(len(data)*len(data[0]))
        train_loss = train_loss / len(train_loader.dataset)
        print('Training Loss: {:.6f}'.format( train_loss))
        # 每遍历一遍数据集，测试一下准确率
        test()

# 在数据集上测试神经网络
def test():
    TP,FN,FP,TN=0,0,0,0
    P,R,F = 0,0,0
    with torch.no_grad():  # 训练集中不需要反向传播
        for data,target in tqdm(test_loader):
            tran_data,tran_target=[],[]
            for i in range(batch_size):#经过这个双重循环可以将dataloader取出的数据转置
              sentence=[]
              label=[]
              for d in data:
                sentence.append(d[i])
              for l in target:
                label.append(l[i])
              tran_data.append(sentence)
              tran_target.append(label)
            data,target=tran_data,tran_target
            keys=data
            for i in range(batch_size):   #做batch*50大小的嵌入
              keys[i]=sentence_embedding(keys[i])
            keys=torch.stack(keys,0)
            querys=keys
            newtarget=[]
            for t in target:
              newtarget.append(label_translate(t))
            target=torch.stack(newtarget,0)
            querys,keys,target=querys.to(device),keys.to(device),target.to(device) #送入cuda
            outputs = model(querys,keys,keys) 
            for sent in range(batch_size):
              for clause in range(len(outputs[0])):
                if data[sent][clause][0]>data[sent][clause][1]:
                  predicted=0
                else:
                  predicted=1
                if predicted == 1 and target[sent][clause][0] == 1.0 :
                  TP+=1
                elif predicted == 0 and target[sent][clause][0] == 1.0:
                  FN+=1
                elif predicted == 1 and target[sent][clause][1] == 1.0:
                  FP+=1
                elif predicted == 0 and target[sent][clause][1] == 1.0:
                  TN+=1
        P=TP/(TP+FP)
        R=TP/(TP+FN)
        F=2/(1/P+1/R)
    print('Pricision, Recall and F of the network on the test clause: %f %%, %f %%, %f %%' % (
        100.0 * P,100.0*R,100.0*F))
    return 100.0 * P,100.0*R,100.0*F

# 声明感知器网络
model = Attention_MLP(768,1,1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)
model.to(device)
train()

Epoch:  1  	


  0%|          | 0/84 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

100%|██████████| 84/84 [1:08:43<00:00, 49.09s/it]


Training Loss: 12114.087578


100%|██████████| 21/21 [16:48<00:00, 48.03s/it]


Pricision, Recall and F of the network on the test clause: 7.148134 %, 89.583333 %, 13.239822 %
Epoch:  2  	


 48%|████▊     | 40/84 [32:33<35:55, 48.99s/it]