# 10.7 文本情感分类：使用循环神经网络

In [226]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext
# from torchtext.vocab import vocab
# import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
sys.path.append("..")
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(torch.__version__, device)
# print(torch.__version__)

1.9.0 cpu


In [208]:
counter = collections.Counter(["a", "a", "b", "b", "b"])
# counter, help(counter)

In [212]:
# help(counter.items())

In [218]:
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
# sorted_by_freq_tuples

In [219]:
ordered_dict = collections.OrderedDict(sorted_by_freq_tuples)

In [225]:
ordered_dict
# help(ordered_dict)

OrderedDict([('b', 3), ('a', 2)])

In [228]:
v1 = torchtext.vocab.vocab(ordered_dict, min_freq=1)

In [232]:
v1["a"], v1["b"]

(1, 0)

In [236]:
help(v1.insert_token)

Help on method insert_token in module torchtext.vocab:

insert_token(token: str, index: int) -> None method of torchtext.vocab.Vocab instance
    Args:
        token: The token used to lookup the corresponding index.
        index: The index corresponding to the associated token.
    Raises:
        RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.



In [234]:
# v1["out of vocab"]

In [237]:
"a" in v1

True

In [235]:
unk_token = '<unk>'

In [238]:
if unk_token not in v1: 
    v1.insert_token(unk_token, 0)

In [240]:
v1.get_itos(), v1.get_stoi()

(['<unk>', 'b', 'a'], {'a': 2, 'b': 1, '<unk>': 0})

In [241]:
#make default index same as index of unk_token
v1.set_default_index(v1[unk_token])

In [242]:
v1["out of vocab"]

0

## 10.7.1 文本情感分类数据
### 10.7.1.1 读取数据

In [78]:
datasets_root_path = "/Users/lusong/Datasets/"

In [79]:
os.listdir(datasets_root_path)

['.DS_Store', 'FashionMNIST', 'Dive-into-DL-PyTorch']

In [82]:
DATA_ROOT = os.path.join(datasets_root_path, 'Dive-into-DL-PyTorch', "data")

In [83]:
os.listdir(DATA_ROOT)

['rainier.jpg',
 'ptb',
 'aclImdb',
 'kaggle_house',
 'fr-en-small.txt',
 'aclImdb_v1.tar.gz',
 'glove',
 'autumn_oak.jpg',
 'airfoil_self_noise.dat',
 'jaychou_lyrics.txt.zip']

In [84]:
fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
    print("从压缩包解压...")
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

In [85]:
from tqdm import tqdm
def read_imdb(folder='train', data_root=os.path.join(DATA_ROOT, "aclImdb")):  # 本函数已保存在d2lzh_pytorch包中方便以后使用
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

data_root = os.path.join(DATA_ROOT, "aclImdb")
train_data, test_data = read_imdb('train', data_root), read_imdb('test', data_root)

100%|██████████| 12500/12500 [00:04<00:00, 2750.87it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2742.36it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2745.81it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2535.98it/s]


In [86]:
type(train_data), type(test_data), len(train_data), len(test_data)

(list, list, 25000, 25000)

In [68]:
# train_data[0], test_data[0]

### 10.7.1.2 预处理数据

In [243]:
def get_tokenized_imdb(data):  # 本函数已保存在d2lzh_pytorch包中方便以后使用
    """
    data: list of [string, label]
    """
    def tokenizer(text):
        return [tok for tok in text.lower().split(' ')]
    return [tokenizer(review) for review, _ in data]

In [244]:
tokenized_data = get_tokenized_imdb(train_data)
# tokenized_data[0]
len(tokenized_data)

25000

In [245]:
counter = collections.Counter([tk for st in tokenized_data for tk in st])

In [246]:
# counter
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)

In [247]:
# sorted_by_freq_tuples

In [248]:
ordered_dict = collections.OrderedDict(sorted_by_freq_tuples)

In [249]:
list(ordered_dict.items())[-1]

('fans)!!!', 1)

In [250]:
v1 = torchtext.vocab.vocab(ordered_dict, min_freq=5)
# v1 = Vocab.vocab(counter, min_freq=5)

In [252]:
# dir(v1)

In [253]:
v1["the"]

0

In [267]:
def get_vocab_imdb(data):  # 本函数已保存在d2lzh_pytorch包中方便以后使用
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    ordered_dict = collections.OrderedDict(sorted_by_freq_tuples)
    return torchtext.vocab.vocab(ordered_dict, min_freq=3)

# build vocab based on training data

vocab_imdb = get_vocab_imdb(train_data)
'# words in vocab_imdb:', len(vocab_imdb)

('# words in vocab_imdb:', 69426)

In [268]:
unk_token = '<unk>'
vocab_imdb.insert_token(unk_token, 0)

In [269]:
vocab_imdb.set_default_index(vocab_imdb[unk_token])

In [271]:
vocab_imdb[unk_token], vocab_imdb["the"], vocab_imdb["a"]

(0, 1, 2)

In [272]:
def preprocess_imdb(data, vocab):  # 本函数已保存在d2lzh_torch包中方便以后使用
    max_l = 500  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [273]:
processed_train_data = preprocess_imdb(train_data, vocab_imdb)

In [274]:
processed_test_data = preprocess_imdb(test_data, vocab_imdb)

In [277]:
processed_train_data[0].shape, processed_test_data[0].shape

(torch.Size([25000, 500]), torch.Size([25000, 500]))

In [278]:
processed_train_data[1].shape, processed_test_data[1].shape

(torch.Size([25000]), torch.Size([25000]))

In [291]:
# processed_train_data[0][0]
sum(processed_train_data[1]), processed_train_data[1].shape[0]

(tensor(12500), 25000)

### 10.7.1.3 创建数据迭代器

In [305]:
# help(Data.TensorDataset)

In [293]:
batch_size = 64
train_set = Data.TensorDataset(*processed_train_data)
test_set = Data.TensorDataset(*processed_test_data)

train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [300]:
# train_set[0]

In [301]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

## 10.7.2 使用循环神经网络的模型

In [335]:
# help(nn.LSTM)
embed_size, num_hiddens, num_layers = 100, 100, 2

In [336]:
rnn = nn.LSTM(embed_size, num_hiddens, num_layers, bidirectional=True)
rnn

LSTM(100, 100, num_layers=2, bidirectional=True)

In [345]:
inputs = torch.tensor(list(train_iter)[0][0])

  inputs = torch.tensor(list(train_iter)[0][0])


In [346]:
inputs.shape

torch.Size([64, 500])

In [348]:
inputs.permute(1, 0).shape

torch.Size([500, 64])

In [349]:
embedding = nn.Embedding(len(vocab_imdb), embed_size)
embedding.weight.data.shape

torch.Size([69427, 100])

In [350]:
embedding(inputs[0]).shape

torch.Size([500, 100])

In [351]:
outputs, _ = rnn(embedding(inputs.permute(1, 0)))

In [352]:
outputs.shape

torch.Size([500, 64, 200])

In [354]:
outputs[0].shape, outputs[-1].shape

(torch.Size([64, 200]), torch.Size([64, 200]))

In [362]:
encoding = torch.cat((outputs[0], outputs[-1]), 1)
encoding.shape

torch.Size([64, 400])

In [364]:
decoder = nn.Linear(4*num_hiddens, 2)
decoder

Linear(in_features=400, out_features=2, bias=True)

In [367]:
outs = decoder(encoding)
outs.shape, outs[0]

(torch.Size([64, 2]), tensor([-0.0249,  0.0905], grad_fn=<SelectBackward>))

In [368]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)

        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
#             batch_first=True,
            bidirectional=True
        )

        self.decoder = nn.Linear(4*num_hiddens, 2)  # 初始时间步和最终时间步的隐藏状态作为全连接层输入

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.embedding(inputs.permute(1, 0))
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings)  # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs

In [387]:
net = BiRNN(vocab_imdb, embed_size, num_hiddens, num_layers)

In [388]:
net

BiRNN(
  (embedding): Embedding(69427, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=2, bias=True)
)

### 10.7.2.1 加载预训练的词向量

In [372]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))

In [373]:
# dir(glove_vocab)

In [374]:
glove_vocab.vectors.shape

torch.Size([400000, 100])

In [375]:
glove_vocab.vectors[0].shape[0]

100

In [389]:
def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

In [390]:
len(vocab_imdb.get_itos())

69427

In [391]:
net.embedding.weight.data.shape

torch.Size([69427, 100])

In [392]:
net.embedding.weight.data.copy_(
    load_pretrained_embedding(
        vocab_imdb.get_itos(), 
        glove_vocab
    )
)
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它

There are 36691 oov words.


### 10.7.2.2 训练并评价模型

In [None]:
lr, num_epochs = 0.01, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cpu


In [14]:
# 本函数已保存在d2lzh包中方便以后使用
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

In [15]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [16]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'