In [2]:
import collections
from collections import defaultdict
import os
import random
import tarfile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import preprocessing
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import sys
import time
import os
sys.path.append("..")
import d2lzh_tensorflow2 as d2l
print(tf.test.gpu_device_name())
DATA_ROOT = "../../data"

/device:GPU:0


In [3]:
import torchtext.vocab as Vocab

In [4]:
import tensorflow as tf
AUTO = tf.data.experimental.AUTOTUNE
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [5]:
print(tf.__version__)

2.1.0


## 10.7.1. 文本情感分类数据

### 10.7.1.1. 读取数据

In [6]:
#数据放入data目录下，代码解压速度较慢，如果不想用代码解压，也可直接手动解压，跳过这一步
import os
path = os.getcwd()#返回当前进程的工作目录
a_path = os.path.abspath(os.path.join(path, "../../data/aclImdb_v1.tar.gz"))
with tarfile.open(a_path, 'r') as f:
    f.extractall(DATA_ROOT)

In [7]:
# 本函数已保存在d2lzh_tensorflow2包中方便以后使用
def read_imdb(folder='train', data_root="../../data/aclImdb/"):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in (os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

In [8]:
train_data, test_data = read_imdb('train'), read_imdb('test')

In [9]:
#每条数据中包含一个字符串评论和一个标签
train_data[0]

['no, this hilariously horrible 70\'s made-for-tv horror clinker isn\'t about a deadly demonically possessed dessert cake. still, this exceptionally awful, yet undeniably amusing and thus enjoyable cathode ray refuse reaches a breathtaking apex of absolute, unremitting silliness and atrociousness that\'s quite tasty in a so-execrable-it\'s-downright-awesome sort of way. richard crenna, looking haggard and possibly inebriated, and yvette mimieux, who acts as if she never got over the brutal rape she endured in "jackson county jail," sluggishly portray a disgustingly nice and respectable suburbanite couple whose quaint, dull, sleepy small town existence gets ripped asunder when the cute german shepard they take in as the family pet turns out to be some ancient lethal evil spirit. pretty soon mimieux and her two repellently cutesy kids kim richards and ike eisenmann (the psychic alien moppets from the disney "witch mountain" pictures) are worshiping a crude crayon drawing of the nasty, ug

### 10.7.1.2. 预处理数据

我们需要对每条评论做分词，从而得到分好词的评论。这里定义的get_tokenized_imdb函数使用最简单的方法：基于空格进行分词。

In [10]:
# 本函数已保存在d2lzh_tensorflow2包中方便以后使用
def get_tokenized_imdb(data):
    """
    data: list of [string, label]
    返回: 每一条评论的单词所组成的列表
    """
    def tokenizer(text):
        #基于空格进行分词，并都转换为小写
        return [tok.lower() for tok in text.split(' ')]

    return [tokenizer(review) for review, _ in data]

In [11]:
text = get_tokenized_imdb(train_data)

In [12]:
len(text)

25000

In [13]:
# Counter对列表中的单词进行计数，并返回一个字典
counter = collections.Counter([tk for st in text for tk in st])
# vocab是一个字典，键表示单词，值表示单词出现的频率
vocab = {w: freq for w, freq in counter.most_common() if freq > 5}

现在，我们可以根据分好词的训练数据集来创建词典了。我们在这里过滤掉了出现次数少于5的词.

In [14]:
# 本函数已保存在d2lzh_tensorflow2包中方便以后使用
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    #counter已经创建了一个词典,统计了每个词出现的频率
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)
    #text值保存counter中出现频率大于等于五的词
#     text = {w: freq for w, freq in counter.most_common() if freq >= 5}
#     vocab ={index:word for word,index in enumerate(text.keys())}
#     return vocab
    #return Vocab.Vocab(counter, min_freq=5)


vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)

('# words in vocab:', 46152)

In [15]:
vocab

<torchtext.vocab.Vocab at 0x1fc83d81708>

因为每条评论长度不一致所以不能直接组合成小批量，我们定义preprocess_imdb函数对每条评论进行分词，并通过词典转换成词索引，然后通过截断或者补“<pad>”（padding）符号来将每条评论长度固定成500。

In [16]:
#在此处使用tensorflow2的填充函数进行填充
def preprocess_imdb(data, vocab):  # 本函数已保存在d2lzh_tensorflow2包中方便以后使用
    max_l = 500

    # 将每条评论通过截断或者补0，使得长度变成500
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
    
    #tokenized_data为一个二维的列表,里面有我们分好的词
    tokenized_data = get_tokenized_imdb(data)
    
     #将每个词转换为词索引并进行截断或补0
    features = tf.Variable([pad([vocab[word] for word in words] ) for words in tokenized_data])
    labels = tf.Variable([score for _, score in data])
    return features, labels

In [17]:
data, label = preprocess_imdb(train_data, vocab)

In [18]:
data

<tf.Variable 'Variable:0' shape=(25000, 500) dtype=int32, numpy=
array([[1378,   10, 5002, ...,    0,    0,    0],
       [   2, 3047,  160, ...,    0,    0,    0],
       [  94,    0,  620, ...,    0,    0,    0],
       ...,
       [  86,  347,   57, ...,    0,    0,    0],
       [   9,  199,   11, ...,    0,    0,    0],
       [  69,   64,  381, ...,    0,    0,    0]])>

10.7.1.3. 创建数据迭代器
现在，我们创建数据迭代器。每次迭代将返回一个小批量的数据。

In [19]:
batch_size = 64
train_set = (tf.data.Dataset.from_tensor_slices(
    ((preprocess_imdb(train_data, vocab))))
    .repeat()
    .shuffle(2048)
    .batch(batch_size)
    .prefetch(AUTO))
test_set = (tf.data.Dataset.from_tensor_slices(
    ((preprocess_imdb(test_data, vocab))))
    .shuffle(2048)
    .batch(batch_size)
    .prefetch(AUTO))

In [20]:
for X, y in train_set:
    print('X', X.shape, 'y', y.shape)
    print(X,y)
    break
'#batches:', data.shape[0]//batch_size

X (64, 500) y (64,)
tf.Tensor(
[[   9   86  118 ...    0    0    0]
 [  10   14    3 ...    0    0    0]
 [  10 3717   39 ...    0    0    0]
 ...
 [   9  308   45 ...    0    0    0]
 [  10   20    7 ...    0    0    0]
 [ 481    0  270 ...    0    0    0]], shape=(64, 500), dtype=int32) tf.Tensor(
[0 0 0 1 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 0
 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 1], shape=(64,), dtype=int32)


('#batches:', 390)

# 10.7.2. 使用循环神经网络的模型
在这个模型中，每个词先通过嵌入层得到特征向量。然后，我们使用双向循环神经网络对特征序列进一步编码得到序列信息。最后，我们将编码的序列信息通过全连接层变换为输出。具体来说，我们可以将双向长短期记忆在最初时间步和最终时间步的隐藏状态连结，作为特征序列的表征传递给输出层分类。在下面实现的BiRNN类中，Embedding实例即嵌入层，LSTM实例即为序列编码的隐藏层，Dense实例即生成分类结果的输出层。

In [24]:
#因为tensorflow并没有像pytorch，mxnet关于glove接口的api，所以必须要重写一个

def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
    `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct 
    `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
        index_to_word_dict = dict()
        word_to_embedding = dict()
    else:
        word_to_embedding_dict = dict()

    
    with open(glove_filename, 'r',encoding='utf-8') as glove_file:
        for (i, line) in enumerate(glove_file):
            
            split = line.split(' ')
            
            word = split[0]
            
            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation]
            )
            
            if with_indexes:
                word_to_index_dict[word] = i
                index_to_word_dict[i] = word
                word_to_embedding[word] = representation
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    _WORD_NOT_FOUND = [0.0]* len(representation)  # Empty representation for unknown words.
    if with_indexes:
        _LAST_INDEX = i + 1
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array,index_to_word_dict,word_to_embedding
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict

In [25]:
word_to_index, index_to_embedding, index_to_word,word_to_embedding = load_embedding_from_disks("C:/Users/HP/dive into d2l/code/chapter10_natural-language-processing/embeddings/ GloVe.6B/glove.6B.50d.txt", with_indexes=True)

In [26]:
# 本函数已保存在d2lzh_tensorflow包中方便以后使用
def get_weights(vocab, word_to_embedding,embedding_dim,word_to_index,index_to_embedding):
    """从预训练好的vocab中提取出words对应的词向量"""
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
#     embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for index, word in enumerate(vocab.itos):
        if word in word_to_embedding.keys():
            embedding_matrix[index] = index_to_embedding[index]
    return embedding_matrix
embedding_matrix = get_weights(vocab,word_to_embedding,50,word_to_index,index_to_embedding)
# net.embedding.set_weights([embedding_matrix])
# net.trainable = False

In [69]:
embed_size, num_hiddens, max_len = 50, 100, 500
num_epochs = 5

In [68]:
model = tf.keras.Sequential([
    layers.Embedding(len(vocab), embed_size,weights=[embedding_matrix],input_length=500),
    layers.Bidirectional(layers.LSTM(num_hiddens)),
    tf.keras.layers.Dense(2,activation='softmax')
])

model.layers[0].trainable = False
model.summary()

In [70]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 50)           2307600   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               120800    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 402       
Total params: 2,428,802
Trainable params: 121,202
Non-trainable params: 2,307,600
_________________________________________________________________


In [71]:
model.compile(tf.keras.optimizers.Adam(0.01),
            loss='sparse_categorical_crossentropy',
            metrics=['sparse_categorical_accuracy'])

In [72]:
model.fit(
    train_set,
    steps_per_epoch=data.shape[0]//batch_size,
    validation_data= test_set,
    epochs=5
    )

Train for 390 steps, validate for 391 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1fc057bf708>

创建一个含一个隐藏层的双向循环神经网络

然后，我们将用这些词向量作为评论中每个词的特征向量。注意，预训练词向量的维度需要与创建的模型中的嵌入层输出大小embed_size一致。此外，在训练中我们不再更新这些词向量。因为tensorflow并没有封装预训练词向量，所以我们要重新实现

In [86]:
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    sentence = tf.Variable([vocab.stoi[word] for word in sentence])
    print(sentence)
    print(tf.reshape(sentence,[1,-1]))
    label = np.argmax(net(tf.reshape(sentence,[1,-1])), axis=1)
    return 'positive' if np.array(label) == 1 else 'negative'

In [87]:
predict_sentiment(model, vocab, ['this', 'movie', 'is', 'so', 'great'])

<tf.Variable 'Variable:0' shape=(5,) dtype=int32, numpy=array([10, 20,  7, 38, 88])>
tf.Tensor([[10 20  7 38 88]], shape=(1, 5), dtype=int32)


'positive'

In [83]:
predict_sentiment(model, vocab, ['this', 'movie', 'is', 'so', 'bad'])

<tf.Variable 'Variable:0' shape=(5,) dtype=int32, numpy=array([10, 20,  7, 38, 97])>


'negative'