# 通过Transformer实现文本分类
作者：[YinHang2515](https://github.com/YinHang2515)

日期：2020年11月20日

本示例教程演示如何使用Transformer模型在IMDB数据集上完成文本分类的任务。

IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集，共有25000条文本数据作为训练集，25000条文本数据作为测试集。 该数据集的官方地址为： http://ai.stanford.edu/~amaas/data/sentiment/

## 环境设置

In [18]:
import paddle
import paddle.nn as nn
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import numpy as np

## 处理数据集
首先通过paddle内置的dataset完成数据集的导入，并构建字典和相应的reader

然后通过padding的方式对同一个batch中长度不一致的数据进行补齐

In [19]:
print("Loading IMDB word dict....")
word_dict = paddle.dataset.imdb.word_dict()

train_reader = paddle.dataset.imdb.train(word_dict)
test_reader = paddle.dataset.imdb.test(word_dict)

Loading IMDB word dict....


In [20]:
# 添加<pad>
word_dict['<pad>'] = len(word_dict)

for k in list(word_dict)[:5]:
    print("{}:{}".format(k.decode('ASCII'), word_dict[k]))

print("...")

for k in list(word_dict)[-5:]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))

print("totally {} words".format(len(word_dict)))


the:0
and:1
a:2
of:3
to:4
...
virtual:5143
warriors:5144
widely:5145
<unk>:5146
<pad>:5147
totally 5148 words


## 参数设置

In [21]:
vocab_size = len(word_dict)  
maxlen = 200  
seq_len = 200
batch_size = 32
epochs = 2
pad_id = word_dict['<pad>']
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

classes = ['negative', 'positive']

In [23]:
#用padding的方式对齐数据
def create_padded_dataset(reader):
    padded_sents = []
    labels = []
    for batch_id, data in enumerate(reader):
        sent, label = data
        padded_sent = sent[:seq_len] + [pad_id] * (seq_len - len(sent))
        padded_sents.append(padded_sent)
        labels.append(label)
    return np.array(padded_sents), np.expand_dims(np.array(labels), axis=1)

train_sents, train_labels = create_padded_dataset(train_reader())
test_sents, test_labels = create_padded_dataset(test_reader())

print(train_sents.shape)
print(train_labels.shape)
print(test_sents.shape)
print(test_labels.shape)

for sent in train_sents[:3]:
    print(ids_to_str(sent))


(25000, 200)
(25000, 1)
(25000, 200)
(25000, 1)
<unk> has much in common with the third man another <unk> film set among the <unk> of <unk> europe like <unk> there is much inventive camera work there is an innocent american who gets emotionally involved with a woman he doesnt really understand and whose <unk> is all the more striking in contrast with the <unk> br but id have to say that the third man has a more <unk> storyline <unk> is a bit disjointed in this respect perhaps this is <unk> it is presented as a <unk> and making it too coherent would spoil the effect br br this movie is <unk> <unk> in more than one sense one never sees the sun shine grim but intriguing and frightening <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

## 定义多头自注意力机制 (Multi-head Self Attention)

In [25]:
class MultiHeadSelfAttention(nn.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = dg.Linear(embed_dim, embed_dim)
        self.key_dense = dg.Linear(embed_dim, embed_dim)
        self.value_dense = dg.Linear(embed_dim, embed_dim)
        self.combine_heads = dg.Linear(embed_dim, embed_dim)

    def attention(self, query, key, value):
        score = layers.matmul(query, key, transpose_y=True)
        dim_key = layers.cast(layers.shape(key)[-1], 'float32')
        scaled_score = score / layers.sqrt(dim_key)
        weights = layers.softmax(scaled_score, axis=-1)
        output = layers.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = layers.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return layers.transpose(x, perm=[0, 2, 1, 3])

    def forward(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = layers.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = layers.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = layers.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output


## 定义点式前馈网络（Point wise feed forward network）

点式前馈网络由两层全联接层组成，两层之间有一个 ReLU 激活函数。

这个网络不会改变向量的大小，只是做了一步提取特征的工作

In [26]:
class PointWiseFeedForwardNetwork(nn.Layer):
    def __init__(self, embed_dim, ff_dim):
        super(PointWiseFeedForwardNetwork, self).__init__()
        self.linear1 = dg.Linear(embed_dim, ff_dim, act='relu')
        self.linear2 = dg.Linear(ff_dim, embed_dim)

    def forward(self, x):
        out = self.linear1(x)
        out = self.linear2(out)
        return out

## 定义嵌入层

In [28]:
class TokenAndPositionEmbedding(nn.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = dg.Embedding(size=[vocab_size, embed_dim])
        self.pos_emb = dg.Embedding(size=[maxlen, embed_dim])

    def forward(self, x):
        maxlen = layers.shape(x)[-1]
        positions = layers.range(start=0, end=maxlen, step=1, dtype='int64')
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


## 定义Transformer模型

In [27]:
class TransformerBlock(nn.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = PointWiseFeedForwardNetwork(embed_dim, ff_dim)
        self.layernorm1 = dg.LayerNorm(embed_dim, epsilon=1e-6)
        self.layernorm2 = dg.LayerNorm(embed_dim, epsilon=1e-6)
        self.dropout1 = dg.Dropout(rate)
        self.dropout2 = dg.Dropout(rate)

    def forward(self, inputs):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

## 组建网络

In [29]:
class MyNet(paddle.nn.Layer):
    def __init__(self):
        super(MyNet, self).__init__()
        self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.trs = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.drop1 = dg.Dropout(0.1)
        self.relu = dg.Linear(ff_dim, 20, act='relu')
        self.drop2 = dg.Dropout(0.1)
        self.soft = dg.Linear(20, 2, act='softmax')

    def forward(self, x):
        x = self.emb(x)
        x = self.trs(x)
        x = layers.reduce_mean(x, dim=1)
        x = self.drop1(x)
        x = self.relu(x)
        x = self.drop2(x)
        x = self.soft(x)
        return x
# class MyNet(paddle.nn.Layer):
#     def __init__(self):
#         super(MyNet, self).__init__()
#         self.emb = paddle.nn.Embedding(vocab_size, embed_dim)
#         self.fc = paddle.nn.Linear(in_features=embed_dim, out_features=2)
#         self.dropout = paddle.nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.emb(x)
#         x = layers.reduce_mean(x, dim=1)
#         x = self.dropout(x)
#         x = self.fc(x)
#         return x


## 训练模型

In [31]:
def train(model):
    model.train()

    opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())

    for epoch in range(epochs):
        # shuffle data
        perm = np.random.permutation(len(train_sents))
        train_sents_shuffled = train_sents[perm]
        train_labels_shuffled = train_labels[perm]

        for batch_id in range(len(train_sents_shuffled) // batch_size):
            x_data = train_sents_shuffled[(batch_id * batch_size):((batch_id+1)*batch_size)]
            y_data = train_labels_shuffled[(batch_id * batch_size):((batch_id+1)*batch_size)]

            sent = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)

            logits = model(sent)
            loss = paddle.nn.functional.softmax_with_cross_entropy(logits, label)

            avg_loss = paddle.mean(loss)
            if batch_id % 500 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, avg_loss.numpy()))
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

        # evaluate model after one epoch
        model.eval()
        accuracies = []
        losses = []
        for batch_id in range(len(test_sents) // batch_size):
            x_data = test_sents[(batch_id * batch_size):((batch_id+1)*batch_size)]
            y_data = test_labels[(batch_id * batch_size):((batch_id+1)*batch_size)]

            sent = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)

            logits = model(sent)
            loss = paddle.nn.functional.softmax_with_cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)

            accuracies.append(acc.numpy())
            losses.append(loss.numpy())

        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
        print("[validation] accuracy/loss: {}/{}".format(avg_acc, avg_loss))

        model.train()

model = MyNet()
train(model)


epoch: 0, batch_id: 0, loss is: [0.71156496]
epoch: 0, batch_id: 500, loss is: [0.48003972]
[validation] accuracy/loss: 0.8522727489471436/0.4543215036392212
epoch: 1, batch_id: 0, loss is: [0.52676773]
epoch: 1, batch_id: 500, loss is: [0.42795134]
[validation] accuracy/loss: 0.8535931706428528/0.45140576362609863


可以看到经过两轮的迭代训练，可以达到85%左右的准确率，当然你也可以通过调整参数、更改优化方式等等来进一步提升性能。