# 通过Transformer实现文本分类
作者：[YinHang2515](https://github.com/YinHang2515)

日期：2020年11月20日

本示例教程演示如何使用Transformer模型在IMDB数据集上完成文本分类的任务。

IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集，共有25000条文本数据作为训练集，25000条文本数据作为测试集。 该数据集的官方地址为： http://ai.stanford.edu/~amaas/data/sentiment/

## 环境设置

In [16]:
import paddle as pd
import paddle.nn as nn
import numpy as np

## 处理数据集
首先通过paddle内置的dataset完成数据集的导入，并构建字典和相应的reader

然后通过padding的方式对同一个batch中长度不一致的数据进行补齐

In [17]:
print("Loading IMDB word dict....")
word_dict = pd.dataset.imdb.word_dict()

train_reader = pd.dataset.imdb.train(word_dict)
test_reader = pd.dataset.imdb.test(word_dict)

Loading IMDB word dict....


In [18]:
# 添加<pad>
word_dict['<pad>'] = len(word_dict)

for k in list(word_dict)[:5]:
    print("{}:{}".format(k.decode('ASCII'), word_dict[k]))

print("...")

for k in list(word_dict)[-5:]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))

print("totally {} words".format(len(word_dict)))


the:0
and:1
a:2
of:3
to:4
...
virtual:5143
warriors:5144
widely:5145
<unk>:5146
<pad>:5147
totally 5148 words


## 参数设置

In [19]:
vocab_size = len(word_dict)  
maxlen = 200  
seq_len = 200
batch_size = 32
epochs = 2
pad_id = word_dict['<pad>']
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_dim = 32  # Hidden layer size in feed forward network inside transformer

classes = ['negative', 'positive']

In [20]:
#用padding的方式对齐数据
def create_padded_dataset(reader):
    padded_sents = []
    labels = []
    for batch_id, data in enumerate(reader):
        sent, label = data
        padded_sent = sent[:seq_len] + [pad_id] * (seq_len - len(sent))
        padded_sents.append(padded_sent)
        labels.append(label)
    return np.array(padded_sents), np.expand_dims(np.array(labels), axis=1)

train_sents, train_labels = create_padded_dataset(train_reader())
test_sents, test_labels = create_padded_dataset(test_reader())

print(train_sents.shape)
print(train_labels.shape)
print(test_sents.shape)
print(test_labels.shape)

(25000, 200)
(25000, 1)
(25000, 200)
(25000, 1)


## 用Dataset 与 DataLoader 加载

In [21]:

class IMDBDataset(pd.io.Dataset):
    def __init__(self, sents, labels):

        self.sents = sents
        self.labels = labels
    
    def __getitem__(self, index):

        data = self.sents[index]
        label = self.labels[index]

        return data, label

    def __len__(self):
        
        return len(self.sents)
    
train_dataset = IMDBDataset(train_sents, train_labels)
test_dataset = IMDBDataset(test_sents, test_labels)

train_loader = pd.io.DataLoader(train_dataset, places=pd.CPUPlace(), return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = pd.io.DataLoader(test_dataset, places=pd.CPUPlace(), return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)

## 定义多头自注意力机制 (Multi-head Self Attention)

In [22]:
class MultiHeadSelfAttention(nn.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = nn.Linear(embed_dim, embed_dim)
        self.key_dense = nn.Linear(embed_dim, embed_dim)
        self.value_dense = nn.Linear(embed_dim, embed_dim)
        self.combine_heads = nn.Linear(embed_dim, embed_dim)

    def attention(self, query, key, value):
        score = pd.matmul(query, key, transpose_y=True)
        dim_key = pd.cast(pd.shape(key)[-1], 'float32')
        scaled_score = score / pd.sqrt(dim_key)
        weights = nn.functional.softmax(scaled_score, axis=-1)
        output = pd.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = pd.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return pd.transpose(x, perm=[0, 2, 1, 3])

    def forward(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = pd.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = pd.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = pd.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output


## 定义点式前馈网络（Point wise feed forward network）

点式前馈网络由两层全联接层组成，两层之间有一个 ReLU 激活函数。

这个网络不会改变向量的大小，只是做了一步提取特征的工作

In [23]:
class PointWiseFeedForwardNetwork(nn.Layer):
    def __init__(self, embed_dim, feed_dim):
        super(PointWiseFeedForwardNetwork, self).__init__()
        self.linear1 = pd.fluid.dygraph.Linear(embed_dim, feed_dim, act='relu')
        self.linear2 = nn.Linear(feed_dim, embed_dim)

    def forward(self, x):
        out = self.linear1(x)
        out = self.linear2(out)
        return out

## 定义嵌入层

In [24]:
class TokenAndPositionEmbedding(nn.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        maxlen = pd.shape(x)[-1]
        positions = pd.arange(start=0, end=maxlen, step=1, dtype='int64')
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


## 定义Transformer模型

In [25]:
class TransformerBlock(nn.Layer):
    def __init__(self, embed_dim, num_heads, feed_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(embed_dim, feed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim, epsilon=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, epsilon=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, inputs):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

## 组建网络

In [26]:
class MyNet(nn.Layer):
    def __init__(self):
        super(MyNet, self).__init__()
        self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.trs = TransformerBlock(embed_dim, num_heads, feed_dim)
        self.drop1 = nn.Dropout(0.1)
        self.relu = pd.fluid.dygraph.Linear(feed_dim, 20, act='relu')
        self.drop2 = nn.Dropout(0.1)
        self.soft = pd.fluid.dygraph.Linear(20, 2, act='softmax')

    def forward(self, x):
        x = self.emb(x)
        x = self.trs(x)
        x = pd.mean(x, axis=1)
        x = self.drop1(x)
        x = self.relu(x)
        x = self.drop2(x)
        x = self.soft(x)
        return x

## 训练模型

In [27]:
#使用高层API进行训练
model = pd.Model(MyNet()) # 用 Model封装 MyNet

# 模型配置
model.prepare(optimizer=pd.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
              loss=nn.CrossEntropyLoss())

# 模型训练
model.fit(train_loader,
          test_loader,
          epochs=epochs,
          batch_size=batch_size,
          verbose=1)

Epoch 1/2
Eval begin...
Eval samples: 24992
Epoch 2/2
Eval begin...
Eval samples: 24992


可以看到经过两轮的迭代训练，可以达到85%左右的准确率，当然你也可以通过调整参数、更改优化方式等等来进一步提升性能。