In [None]:
import tensorflow as tf
from tensorflow import keras

tf.__version__

'1.14.0'

download the dataset
---
在load_data的时候，好像是不能指定我们读取的文件的路径，只能在将下载好的文件放在.keras/dataset/下，这样才不会出错

In [11]:
imdb=keras.datasets.imdb
(train_data,train_labels),(test_data,test_labels)=imdb.load_data(num_words=15000)

Explore the data
---

In [14]:
# 先看看train_data 是长的什么样子,为一个标量
print(train_data[0])
print(train_labels[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


In [13]:
# 查看训练集的内容，查看训练数据以及标签
print("Traing entries:{},labels:{}".format(len(train_data),len(train_labels)))

Traing entries:25000,labels:25000


In [15]:
# 查看语句的长度，看长度是否是一致的,如果不一致，那么后期就是需要进行处理，将他们padding成同等长度的句子
len(train_data[0]),len(test_data[0])

(218, 68)

In [19]:
# 在上面的内容，我们查看可word2index的内容，但是我们也可以查看原文的内容，这需要我们将要进行index2word操作
word2index=imdb.get_word_index()

# 获取到了word2index的内容，那么我们需要在这个字典的头部插入4个内容，然后将相关key对应的value值向后移
word2index={k:(v+3) for k,v in word2index.items()}
word2index["<PAD>"]=0
word2index["<START>"]=1
word2index["<UNK>"]=2
word2index["<UNUSED>"]=3
len(word2index)  # 字典的长度

# 将word2index 逆转，使其变成index2word即{123：“hello”}的形式
index2word=dict([(value,key) for (key,value) in word2index.items()])

# 将一个列表转换为文字
def decode_review(text):
    # Python 字典 dict.get(key, default=None)函数返回指定键的值，如果值不在字典中返回默认值。
    return " ".join([index2word.get(i,'?') for i in text])

In [20]:
decode_review(train_data[0])

"<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be p

做到上面这一步，我们可以是对这个数据集有了大致的认识了，了解了这个数据集是怎么使用的，后面的内容主要是为了情感分析做准备
---

情感分类
---

In [24]:
# 在上面的内容中，我们看到，每个句子的长度都是不相等的，这在文本处理中是行不通的，我们需要将这些句子padding成同等长度的句子，
# 在这之前，为了不丢失信息，我们可以先查看train_data,test_data句子中最长的句子长度是多少
def get_length(datas):
    max_length=0
    for data in datas:
        if len(data)>max_length:
            max_length=len(data)
    return max_length
print("max length in trainset:",get_length(train_data))
print("max length in testset:",get_length(test_data))

max length in trainset: 2494
max length in testset: 2315


In [26]:
# 对于文本的处理来说，文本的长度尽量不要太长，因为后面如果采用RNNs的话，可能会遇到梯度消失和梯度爆炸的问题，所以我们需要进行一定的截断
train_data=keras.preprocessing.sequence.pad_sequences(train_data,value=word2index["<PAD>"],padding='post',maxlen=256)
test_data=keras.preprocessing.sequence.pad_sequences(test_data,value=word2index["<PAD>"],padding='post',maxlen=256)

In [27]:
# 现在来查看长度,查看那边的长度是否都是已经变成了256
print(get_length(train_data))
print(get_length(test_data))

256
256


构建模型
---
神经网络通过堆叠层创建而成，这需要做出两个架构方面的主要决策：

1）要在模型中使用多少个层？  
2）要针对每个层使用多少个隐藏单元？

In [28]:
vocab_size=15000
# 序列化建图
model=keras.Sequential()
# embedding层，初始化内容为（vocab_size,embedding_dim）,下面是每个词都会被编辑为16维的向量,
# 之后的维度变为(batch_size, sequence_len, embedding_dim)
model.add(keras.layers.Embedding(vocab_size,16))
# 平均池化
model.add(keras.layers.GlobalAveragePooling1D())
# 全连接层,该长度固定的输出向量会传入一个全连接 (Dense) 层（包含 16 个隐藏单元）。
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
# dropout层
model.add(keras.layers.Dropout(0.5))
# 应用 sigmoid 激活函数后，结果是介于 0 到 1 之间的浮点值，表示概率或置信水平。
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()

W0620 21:25:37.776062  1020 deprecation.py:506] From J:\python\env\nlp\lib\site-packages\tensorflow\python\keras\initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0620 21:25:37.835028  1020 deprecation.py:506] From J:\python\env\nlp\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          240000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 240,289
Trainable params: 240,289
Non-trainable params: 0
_________________________________________________________________


定义损失函数与优化器
----

In [30]:
model.compile(optimizer=tf.train.AdamOptimizer(),loss='binary_crossentropy',metrics=['accuracy'])

W0620 21:30:27.418706  1020 deprecation.py:323] From J:\python\env\nlp\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [31]:
# 创建验证集
x_val = train_data[:10000]
partial_x_train = train_data[10000:]

y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

In [32]:
# 模型训练
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


评估模型
--
我们来看看模型的表现如何。模型会返回两个值：损失（表示误差的数字，越低越好）和准确率。

In [33]:
results=model.evaluate(test_data,test_labels)
print(results)

[0.3215745053386688, 0.87444]


创建准确率和损失随时间变化的图
---
`model.fit()` 返回一个 History 对象，该对象包含一个字典，其中包括训练期间发生的所有情况：这个字典是如下形式的字典：  
dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])

In [35]:
history_dict=history.history
history_dict.keys()

dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])

In [37]:
# 将字典中的元素取出来
import matplotlib.pyplot as plt

acc=history_dict["acc"]
loss=history_dict["loss"]
val_loss=history_dict["val_loss"]
val_acc=history_dict["val_acc"]

# 确定画的点数【横坐标】
epoch=range(1,len(acc)+1)
# 画出训练集的loss
plt.plot(epoch,loss,'bo',label="Training loss")
# 画出验证集的loss
plt.plot(epoch,val_loss,'r',label="Validation loss")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [1]:
plt.clf()   # clear figure
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']

plt.plot(epoch, acc, 'bo', label='Training acc')
plt.plot(epoch, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

NameError: name 'plt' is not defined