In [32]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. 
# This directory will be recovered automatically after resetting environment. 
#!ls /home/aistudio/data

In [33]:
# 查看工作区文件，该目录下除data目录外的变更将会持久保存。请及时清理不必要的文件，避免加载过慢。
# View personal work directory. 
# All changes, except /data, under this directory will be kept even after reset. 
# Please clean unnecessary files in time to speed up environment loading. 
#!ls /home/aistudio

In [34]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, 
# you need to use the persistence path as the following: 
#!mkdir /home/aistudio/external-libraries
#!pip install beautifulsoup4 -t /home/aistudio/external-libraries

In [35]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可: 
# Also add the following code, 
# so that every time the environment (kernel) starts, 
# just run the following code: 
import sys 
sys.path.append('/home/aistudio/external-libraries')

In [36]:
# 导入必要的包
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
import numpy as np
import os

In [37]:
!mkdir -p /home/aistudio/.cache/paddle/dataset/imdb/
!cp /home/aistudio/data/data69/aclImdb_v1.tar.gz /home/aistudio/.cache/paddle/dataset/imdb/

In [38]:
# 获取数据字典
print("加载数据字典中...")
word_dict = imdb.word_dict()
# 获取数据字典长度
dict_dim = len(word_dict)
print('完成')

加载数据字典中...
完成


In [39]:
# 获取训练和预测数据
print("加载训练数据中...")
train_reader = paddle.batch(paddle.reader.shuffle(imdb.train(word_dict),
                                                  512),
                            batch_size=128)
print("加载测试数据中...")
test_reader = paddle.batch(imdb.test(word_dict), 
                           batch_size=128)
print('完成')

加载训练数据中...
加载测试数据中...
完成


In [40]:
# # 定义长短期记忆网络
# def lstm_net(ipt, input_dim):
#     # 以数据的IDs作为输入
#     emb = fluid.layers.embedding(input=ipt, size=[input_dim, 128], is_sparse=True)
#     # 第一个全连接层
#     fc1 = fluid.layers.fc(input=emb, size=128)
#     # 进行一个长短期记忆操作
#     lstm1, _ = fluid.layers.dynamic_lstm(input=fc1, #返回：隐藏状态（hidden state），LSTM的神经元状
#                                          size=128) #size=4*hidden_size
#     # 第一个最大序列池操作
#     fc2 = fluid.layers.sequence_pool(input=fc1, pool_type='max')
#     # 第二个最大序列池操作
#     lstm2 = fluid.layers.sequence_pool(input=lstm1, pool_type='max')
#     # 以softmax作为全连接的输出层，大小为2,也就是正负面
#     out = fluid.layers.fc(input=[fc2, lstm2], size=2, act='softmax')
#     return out

In [41]:
# 双向LSTM + 注意力 + Dropout + 学习率调度
def lstm_net(ipt, input_dim):
    emb = fluid.layers.embedding(input=ipt, size=[input_dim, 300], is_sparse=True)
    fc1 = fluid.layers.fc(input=emb, size=128, act="relu")
    
    # 进行一个长短期记忆操作
    lstm1, _ = fluid.layers.dynamic_lstm(input=fc1, #返回：隐藏状态（hidden state），LSTM的神经元状
                                         size=128) #size=4*hidden_size
    
    # Attention
    attention = fluid.layers.fc(input=lstm1, size=128, act="tanh")
    attention_weight = fluid.layers.fc(input=attention, size=1, act="softmax")
    scaled_attention = fluid.layers.elementwise_mul(lstm1, attention_weight, axis=0)
    lstm_out = fluid.layers.sequence_pool(input=scaled_attention, pool_type="sum")
    `
    # Dropout
    lstm_out = fluid.layers.dropout(lstm_out, dropout_prob=0.5)
    
    # 输出层
    out = fluid.layers.fc(input=lstm_out, size=2, act="softmax")
    return out


In [42]:
import paddle
paddle.enable_static()
# 定义输入数据， lod_level不为0指定输入数据为序列数据
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# 获取长短期记忆网络
model = lstm_net(words, dict_dim)

In [43]:
# 获取损失函数和准确率
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)

In [44]:
# 获取预测程序
test_program = fluid.default_main_program().clone(for_test=True)

In [45]:
# 定义优化方法
# optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)
learning_rate = fluid.layers.exponential_decay(
    learning_rate=0.002, decay_steps=1000, decay_rate=0.96, staircase=True
)
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=learning_rate)

opt = optimizer.minimize(avg_cost)

In [46]:
# 定义使用CPU还是GPU，使用CPU时use_cuda = False,使用GPU时use_cuda = True
use_cuda = True
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# 进行参数初始化
exe.run(fluid.default_startup_program())

[]

In [47]:
# 定义输入数据的维度
# 定义数据数据的维度，数据的顺序是一条句子数据对应一个标签
feeder = fluid.DataFeeder(place=place, feed_list=[words, label])

In [48]:
# 开始训练
for pass_id in range(10):
    # 进行训练
    train_cost = 10
    for batch_id, data in enumerate(train_reader()):              #遍历train_reader迭代器
        train_cost = exe.run(program=fluid.default_main_program(),#运行主程序
                             feed=feeder.feed(data),              #喂入一个batch的数据
                             fetch_list=[avg_cost])               #fetch均方误差

        if batch_id % 40 == 0:                 #每40次batch打印一次训练、进行一次测试
            print('Pass:%d, Batch:%d, Cost:%0.5f' % (pass_id, batch_id, train_cost[0]))
    # 进行测试
    test_costs = []   #测试的损失值
    test_accs = []    #测试的准确率
    for batch_id, data in enumerate(test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                            feed=feeder.feed(data),
                                             fetch_list=[avg_cost, acc])
        test_costs.append(test_cost[0])
        test_accs.append(test_acc[0])
    # 计算平均预测损失在和准确率
    test_cost = (sum(test_costs) / len(test_costs))
    test_acc = (sum(test_accs) / len(test_accs))
    print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, test_cost, test_acc))
#保存模型
model_save_dir = "/home/aistudio/work/emotionclassify.inference.model"
# 如果保存路径不存在就创建
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
print ('save models to %s' % (model_save_dir))
fluid.io.save_inference_model(model_save_dir, #保存推理model的路径
                                  ['words'],      #推理（inference）需要 feed 的数据
                                  [model],         #保存推理（inference）结果的 Variables
                                  exe)            #exe 保存 inference mo

Pass:0, Batch:0, Cost:0.59956
Pass:0, Batch:40, Cost:0.00001
Pass:0, Batch:80, Cost:0.00012
Pass:0, Batch:120, Cost:0.00107
Pass:0, Batch:160, Cost:0.00134
Test:0, Cost:14.30677, ACC:0.50175
Pass:1, Batch:0, Cost:26.24308
Pass:1, Batch:40, Cost:0.00011
Pass:1, Batch:80, Cost:0.00004
Pass:1, Batch:120, Cost:0.00704
Pass:1, Batch:160, Cost:0.00062
Test:1, Cost:16.69327, ACC:0.50175
Pass:2, Batch:0, Cost:32.13813
Pass:2, Batch:40, Cost:0.00000
Pass:2, Batch:80, Cost:0.00000
Pass:2, Batch:120, Cost:0.01846
Pass:2, Batch:160, Cost:0.00504
Test:2, Cost:9.63199, ACC:0.50175
Pass:3, Batch:0, Cost:17.38382
Pass:3, Batch:40, Cost:0.00997
Pass:3, Batch:80, Cost:0.00572
Pass:3, Batch:120, Cost:0.00087
Pass:3, Batch:160, Cost:0.00006
Test:3, Cost:14.16478, ACC:0.50175
Pass:4, Batch:0, Cost:24.05401
Pass:4, Batch:40, Cost:0.06184
Pass:4, Batch:80, Cost:0.03280
Pass:4, Batch:120, Cost:0.02416
Pass:4, Batch:160, Cost:0.00976
Test:4, Cost:6.69925, ACC:0.50175
Pass:5, Batch:0, Cost:11.96461
Pass:5, Batc

['fc_10.tmp_2']

In [49]:
# 定义预测数据
reviews_str = ['read the book forget the movie', 'this is a great movie', 'this is very bad']
# 把每个句子拆成一个个单词
reviews = [c.split() for c in reviews_str]

In [50]:
# 获取结束符号的标签
UNK = word_dict['<unk>']
# 获取每句话对应的标签
lod = []
for c in reviews:
    # 需要把单词进行字符串编码转换
    lod.append([word_dict.get(words.encode('utf-8'), UNK) for words in c])

In [51]:
# 获取每句话的单词数量
base_shape = [[len(c) for c in lod]]

In [52]:
# 生成预测数据
tensor_words = fluid.create_lod_tensor(lod, base_shape, place)

In [53]:
infer_exe = fluid.Executor(place)    #创建推测用的executor
inference_scope = fluid.core.Scope() #Scope指定作用域

In [54]:
with fluid.scope_guard(inference_scope):#修改全局/默认作用域（scope）, 运行时中的所有变量都将分配给新的scope。
    #从指定目录中加载 推理model(inference model)
    [inference_program,                                            #推理的program
     feed_target_names,                                            #str列表，包含需要在推理program中提供数据的变量名称
     fetch_targets] = fluid.io.load_inference_model(model_save_dir,#fetch_targets: 推断结果，model_save_dir:模型训练路径 
                                                        infer_exe) #infer_exe: 运行 inference model的 executor
    results = infer_exe.run(inference_program,                                 #运行预测程序
                            feed={feed_target_names[0]: tensor_words},#喂入要预测的x值
                            fetch_list=fetch_targets)                           #得到推测结果 
    # 打印每句话的正负面概率
    for i, r in enumerate(results[0]):
        print("\'%s\'的预测结果为：正面概率为：%0.5f，负面概率为：%0.5f" % (reviews_str[i], r[0], r[1]))

'read the book forget the movie'的预测结果为：正面概率为：0.47315，负面概率为：0.52685
'this is a great movie'的预测结果为：正面概率为：0.47846，负面概率为：0.52154
'this is very bad'的预测结果为：正面概率为：0.47827，负面概率为：0.52173


请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 