# 通过Transformer实现文本分类
作者：[YinHang2515](https://github.com/YinHang2515)

日期：2020年11月20日

本示例教程演示如何使用Transformer模型在IMDB数据集上完成文本分类的任务。

IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集，共有25000条文本数据作为训练集，25000条文本数据作为测试集。 该数据集的官方地址为： http://ai.stanford.edu/~amaas/data/sentiment/

## 环境设置

In [3]:
import paddle as pd
import paddle.nn as nn
import paddle.nn.functional as func
import numpy as np

## 处理数据集
首先通过paddle内置的 `paddle.dataset.imdb` 完成数据集的导入，并构建词典（`word_dict`）和相应的训练/测试reader。

然后通过截断与padding的方式将所有文本统一为固定长度（`seq_len`）：超过该长度的文本会被截断，不足的用 `<pad>` 补齐，以便组成batch。

In [4]:
# Load the IMDB vocabulary (word -> integer id) through Paddle's bundled
# dataset helpers.
print("Loading IMDB word dict....")
word_dict = pd.dataset.imdb.word_dict()

# Reader creators for both splits; calling one yields (sentence, label) pairs.
train_reader, test_reader = (
    pd.dataset.imdb.train(word_dict),
    pd.dataset.imdb.test(word_dict),
)

Loading IMDB word dict....


In [5]:
# Register the padding token under the next free id.
# setdefault keeps the id stable if this cell is executed more than once;
# a plain assignment would move it to len(word_dict) on a re-run, one past
# the end of the vocabulary.
word_dict.setdefault('<pad>', len(word_dict))


def _as_text(token):
    """Return a vocabulary key as ``str``, decoding ASCII ``bytes`` keys."""
    return token if isinstance(token, str) else token.decode('ASCII')


# Preview the first and the last five vocabulary entries.
for k in list(word_dict)[:5]:
    print("{}:{}".format(_as_text(k), word_dict[k]))

print("...")

for k in list(word_dict)[-5:]:
    print("{}:{}".format(_as_text(k), word_dict[k]))

print("totally {} words".format(len(word_dict)))


the:0
and:1
a:2
of:3
to:4
...
virtual:5143
warriors:5144
widely:5145
<unk>:5146
<pad>:5147
totally 5148 words


## 参数设置

In [6]:
# Vocabulary-derived values.
vocab_size = len(word_dict)  # number of entries in the vocabulary
pad_id = word_dict['<pad>']  # id used to fill short reviews

# Sequence handling.
maxlen = 200  # size of the position-embedding table
seq_len = 200  # every review is cut or padded to this many tokens

# Training schedule.
batch_size = 128
epochs = 2

# Transformer sizes.
embed_dim = 32  # embedding size for each token
num_heads = 2  # number of attention heads
feed_dim = 32  # hidden size of the feed-forward network inside the transformer

# Human-readable class names, indexed by label id.
classes = ["positive", "negative"]

In [7]:
# Align every sample to a common length by truncating and padding.
def create_padded_dataset(reader, max_len=None, pad_value=None):
    """Collect a reader's samples into fixed-length integer arrays.

    Args:
        reader: Iterable yielding ``(sentence, label)`` pairs, where
            ``sentence`` is a sequence of token ids.
        max_len: Length every sentence is truncated or padded to.
            Defaults to the module-level ``seq_len``.
        pad_value: Token id appended to short sentences. Defaults to the
            module-level ``pad_id``.

    Returns:
        A tuple ``(sents, labels)`` of int64 arrays: ``sents`` has one row
        of ``max_len`` ids per sample, ``labels`` has shape
        ``(num_samples, 1)``.
    """
    if max_len is None:
        max_len = seq_len
    if pad_value is None:
        pad_value = pad_id

    padded_sents = []
    labels = []
    for sent, label in reader:
        clipped = list(sent[:max_len])
        # A non-positive multiplier yields an empty list, so sentences that
        # already reach max_len get no padding.
        padded_sents.append(clipped + [pad_value] * (max_len - len(clipped)))
        labels.append(label)

    # int64 is set explicitly so the arrays do not depend on the platform's
    # default integer width.
    return (np.array(padded_sents, dtype='int64'),
            np.expand_dims(np.array(labels, dtype='int64'), axis=1))

# Turn both splits into padded arrays.
train_sents, train_labels = create_padded_dataset(train_reader())
test_sents, test_labels = create_padded_dataset(test_reader())

# Report the shape of each array as a sanity check.
for split_array in (train_sents, train_labels, test_sents, test_labels):
    print(split_array.shape)

(25000, 200)
(25000, 1)
(25000, 200)
(25000, 1)


## 用Dataset 与 DataLoader 加载

In [8]:

class IMDBDataset(pd.io.Dataset):
    """Dataset that pairs each stored sentence with its label by index."""

    def __init__(self, sents, labels):
        # sents and labels are indexed in parallel by __getitem__.
        self.sents, self.labels = sents, labels

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair stored at ``index``."""
        return self.sents[index], self.labels[index]

    def __len__(self):
        """Return the number of stored sentences."""
        return len(self.sents)
    
train_dataset = IMDBDataset(train_sents, train_labels)
test_dataset = IMDBDataset(test_sents, test_labels)

# Training batches are shuffled.
train_loader = pd.io.DataLoader(train_dataset, places=pd.CPUPlace(), return_list=True,
                                shuffle=True, batch_size=batch_size, drop_last=True)

# The test loader must NOT shuffle: the prediction cell matches
# model.predict output to test_sents/test_labels by position, so the loader
# has to keep the dataset order.
# drop_last=True is kept from the original, so the trailing incomplete batch
# is still excluded from evaluation and prediction.
test_loader = pd.io.DataLoader(test_dataset, places=pd.CPUPlace(), return_list=True,
                               shuffle=False, batch_size=batch_size, drop_last=True)

## 定义嵌入层

In [11]:
class TokenAndPositionEmbedding(nn.Layer):
    """Add a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        """Embed token ids ``x`` and add one position vector per step.

        The number of positions is read from the last axis of ``x``.
        """
        seq_length = pd.shape(x)[-1]
        position_ids = pd.arange(start=0, end=seq_length, step=1, dtype='int64')
        # The position embeddings have no batch axis and rely on broadcasting
        # -- assumes x is (batch, seq); TODO confirm.
        return self.token_emb(x) + self.pos_emb(position_ids)


## 组建网络

In [13]:
class MyNet(nn.Layer):
    """Transformer-encoder text classifier producing two class scores.

    Pipeline: token + position embedding, one ``TransformerEncoderLayer``
    (``normalize_before=True``), mean over axis 1, then a small two-layer
    head with dropout. Sizes come from the module-level ``maxlen``,
    ``vocab_size``, ``embed_dim``, ``num_heads`` and ``feed_dim``.
    """

    def __init__(self):
        super(MyNet, self).__init__()
        self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.trs = nn.TransformerEncoderLayer(embed_dim, num_heads, feed_dim, normalize_before=True)
        self.drop1 = nn.Dropout(0.1)
        # The encoder layer's output width is its model width (embed_dim),
        # not the width of its inner feed-forward layer (feed_dim).
        self.linear1 = nn.Linear(embed_dim, 20)
        self.drop2 = nn.Dropout(0.1)
        self.linear2 = nn.Linear(20, 2)

    def forward(self, x):
        """Return unnormalised class scores (logits) for token ids ``x``.

        No softmax is applied here: ``nn.CrossEntropyLoss`` applies softmax
        itself, so a second one in the network would distort the loss. The
        larger logit still marks the predicted class.
        """
        x = self.emb(x)
        x = self.trs(x)
        # Average over axis 1 -- presumably the sequence axis; TODO confirm.
        x = pd.mean(x, axis=1)
        x = self.drop1(x)
        x = self.linear1(x)
        x = func.relu(x)
        x = self.drop2(x)
        x = self.linear2(x)
        return x

## 训练模型

In [14]:
# Train through the high-level API: wrap the network in paddle.Model.
model = pd.Model(MyNet())

# Configure the optimizer and the loss.
adam = pd.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
model.prepare(optimizer=adam, loss=nn.CrossEntropyLoss())

# Fit on the training loader, evaluating on the test loader.
model.fit(
    train_data=train_loader,
    eval_data=test_loader,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

The loss value printed in the log is the current step, and the metric is the average value of previous step.
Epoch 1/2


  "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
  return (isinstance(seq, collections.Sequence) and


Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960
Epoch 2/2
Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960


## 模型验证

In [15]:
# Run model.predict over the test loader.
# NOTE(review): test_loader was built with its own batch_size, so the
# batch_size=64 passed here is presumably ignored -- confirm against the
# paddle.Model.predict documentation.
result = model.predict(test_loader, batch_size=64)

Predict begin...
Predict samples: 24960


In [16]:
# Print a few randomly chosen test reviews with their true and predicted labels.
# NOTE(review): indexing test_sents/test_labels by prediction position assumes
# model.predict saw the samples in dataset order (i.e. an unshuffled loader).

# Flatten every predicted batch into one 1-D array holding two scores per
# sample, in a single concatenation instead of repeated np.append copies.
label_pred = np.concatenate([np.asarray(batch).reshape(-1) for batch in result[0]])

# Build the id -> word lookup once; keys of word_dict may be bytes or str.
id_to_word = {
    int(token_id): (token if isinstance(token, str) else token.decode('ASCII'))
    for token, token_id in word_dict.items()
}

for i in range(10):
    # Two scores per sample, so half the flat length is the sample count.
    idx = np.random.randint(len(label_pred) // 2)
    # Rebuild the review text, skipping padding and unknown-word markers.
    words = "".join(
        id_to_word[int(k)] + " "
        for k in test_sents[idx]
        if id_to_word[int(k)] not in ('<pad>', '<unk>')
    )
    print("#" + str(idx) + " " + words)
    print("Lable: " + classes[test_labels[idx][0]])
    # The class with the strictly larger score wins; ties fall to classes[1].
    predicted = classes[0] if label_pred[2 * idx] > label_pred[2 * idx + 1] else classes[1]
    print("Predict: " + predicted)

#5345 in beautiful water colors at the cliff is definitely a sight to vaguely the trailer witch i didnt find that impressive i was surprised at how beautiful and detailed it was this film just over me with its br at the center is a young boy that comes into contact with a sea creature and its their relationship that carries the movie is a master both at creating memorable imagery and showing young ones in a believable way with their little br there are a few parts that didnt sit well with me it would be an to say that the music during a particular scene ride of the its a shame because such a precious film as this cant afford to take and it hurt a otherwise truly great scene the and its never interested me either but i guess it served more as a background then anything br anyway great film the boat trip scenario with all its imagery and stood out me thinks pure and magical and yes is more intimate then computer animated im really 
Lable: positive
Predict: positive
#19983 what garbage is