In [1]:
import tensorflow as tf
from tensorflow import logging
from tensorflow import gfile
import os
import numpy as np
import math


logging.set_verbosity(logging.INFO)

定义超参数列表

In [2]:
def get_default_params():
    return tf.contrib.training.HParams(
        num_embedding_size=16,
        num_timesteps=50,
        num_lstm_layers=2,
        num_lstm_nodes=[20, 28],
        num_fc_nodes=19,
        batch_size=100,
        clip_lstm_grads=1.0,
        learning_rate=0.001,
        num_word_threshould=10,
    )


hps = get_default_params()
print(hps.num_lstm_nodes)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

[20, 28]


In [3]:
# input file
seg_train_file = './cnews/cnews.train.seg.txt'
seg_test_file = './cnews/cnews.test.seg.txt'
seg_val_file = './cnews/cnews.val.seg.txt'

In [4]:
# 词表文件
vocab_file = './cnews/cnews.vocab.txt'


In [5]:
# 类别文件
category_file = './cnews/cnews.category.txt'

In [6]:
output_folder = './cnews/run_text_rnn'
if not gfile.Exists(output_folder):
    gfile.MakeDirs(output_folder)


In [7]:
class Vocab:
    def __init__(self, filename, num_word_threshould):
        self._word_to_id = {}
        self._num_word_threshould = num_word_threshould  #阈值，少于这个数就不要了
        self._unk_id = -1
        self._read_file(filename)

    def _read_file(self, filename):
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshould:
                continue
            idx = len(self._word_to_id)
            if word == '<UNK>':
                self._unk_id = idx
            self._word_to_id[word] = idx

    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk_id)

    def sentence_to_id(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids

    def size(self):
        return len(self._word_to_id)

    @property #装饰器，将函数变成一个属性可以调用
    def get_unk_id(self):
        return self._unk_id


In [8]:
vocab = Vocab(vocab_file, hps.num_word_threshould)
vocab_size = vocab.size()
logging.info('vocab size：%d' % vocab_size) #打印词表的长度

INFO:tensorflow:vocab size：77325


测试一下

In [9]:
test_str = '的 在 你好 是'
logging.info('sentence2id: %s' % vocab.sentence_to_id(test_str))

INFO:tensorflow:sentence2id: [2, 4, 9902, 7]


查找类别的编号

In [10]:
class CategoryDict:
    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx

    def category_to_id(self, category_name):
        if not category_name in self._category_to_id:
            raise Exception('%s is not in category list' % category_name)
        return self._category_to_id[category_name]

    def size(self):
        return len(self._category_to_id)


In [11]:
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()

logging.info(num_classes)
logging.info(category_vocab.category_to_id('家居'))

INFO:tensorflow:10
INFO:tensorflow:2


In [12]:
class TextDataSet:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # 矩阵 [batch_size , num_timesteps]
        self._inputs = []
        # vector
        self._outputs = []
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self, filename):
        logging.info('Loading data from %s' % filename)
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            id_label = self._category_vocab.category_to_id(label)
            id_words = self._vocab.sentence_to_id(content)
            # id_words 20 60    _num_timesteps:50
            id_words = id_words[0: self._num_timesteps]
            padding_num = self._num_timesteps - len(id_words)
            id_words = id_words + [self._vocab.get_unk_id for i in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)
        # 列表变成 矩阵形式
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()

    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._indicator = 0
            self._random_shuffle()
            end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            raise Exception('batch size : %d is too large!'% batch_size)
        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs


In [13]:
train_dataset = TextDataSet(seg_train_file, vocab, category_vocab, hps.num_timesteps)
test_dataset = TextDataSet(seg_test_file, vocab, category_vocab, hps.num_timesteps)
val_dataset = TextDataSet(seg_val_file, vocab, category_vocab, hps.num_timesteps)

print(train_dataset.next_batch(2))
print(test_dataset.next_batch(2))
print(val_dataset.next_batch(2))

INFO:tensorflow:Loading data from ./cnews/cnews.train.seg.txt
INFO:tensorflow:Loading data from ./cnews/cnews.test.seg.txt
INFO:tensorflow:Loading data from ./cnews/cnews.val.seg.txt
(array([[  468,    11, 58788,  1690, 26736,  1309,  3327,  4663,  1471,
        15707,     2,  7263,     1, 27873,     2, 48082,     1,  8547,
        42867,     2, 26468,  8355,     1,  1901, 13393,     2, 10116,
            1,  3484,  5679,     2,  5379,     1,   453,  9072, 58788,
        21857,     2,  2201,     3, 17897,  5485,     1,   639,   166,
          893,  1004,     2,  1505,     1],
       [  463,   779,  3335,   952,    30,    24,   880,  2218,    44,
         6007,    77,     0, 28604,  1632, 13135,  1879,   737,   386,
            2,   265,   245,     1,   311,   181,   350,   463,   779,
         3335,   952,   649,  1137,    29, 77308,     1,  3220,  2532,
            6,  2124,  2013,  1274,   747,     1,    40,  2035,   262,
           28,    22,   603,   880,  1696]]), array([5, 9]))
(

构造模型，embedding层后添加LSTM，再添加全连接层，模型使用了dropout正则

In [14]:
def create_model(hps, vocab_size, num_classes):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size, num_timesteps))
    outputs = tf.placeholder(dtype=tf.int32, shape=(batch_size, ))

    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob', shape=[]) #正则化的超参数
    # shape = [2, 2]  val: [[0, 0], [0, 0]]
    # shape = [2]  val: [0, 0]
    # shape = [1] val: [0]
    # shape = []  val: 0 标量
    global_step = tf.Variable(initial_value=tf.zeros(shape=[], dtype=tf.int64), name='global_step', trainable=False)

    # embedding 嵌入 编码  机器翻译 nlp
    embedding_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)
    with tf.variable_scope(name_or_scope='embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(name='embeddings', shape=[vocab_size, hps.num_embedding_size], dtype=tf.float32, trainable=True)
        # embed_inputs shape : 三维矩阵 shape [batch_size, num_timesteps, num_embedding_size]
        embed_inputs = tf.nn.embedding_lookup(params=embeddings, ids=inputs)
        print('embed_inputs shape:', embed_inputs.shape)


    # lstm
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_initializer = tf.random_uniform_initializer(minval=-scale, maxval=scale)
    with tf.variable_scope(name_or_scope='lstm_nn', initializer=lstm_initializer):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hps.num_lstm_nodes[i], state_is_tuple=True, name='lstm_cell')
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=keep_prob)
            cells.append(cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
        initial_state = cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        # rnn_outputs shape : [batch_size, num_timesteps, hps.num_lstm_nodes[-1]]
        rnn_outputs, middle_hidden_state = tf.nn.dynamic_rnn(cell=cell, inputs=embed_inputs, initial_state=initial_state)
        print('rnn_outputs shape:', rnn_outputs.shape)
        # last shape [batch_size, hps.num_lstm_nodes[-1]]
        last = rnn_outputs[:, -1, :]
        print('last shape: ', last.shape)

    # fc全连接层
    fc_initializer = tf.uniform_unit_scaling_initializer(factor=1.0, dtype=tf.float32)
    with tf.variable_scope(name_or_scope='fc', initializer=fc_initializer):
        fc1 = tf.layers.dense(inputs=last, units=hps.num_fc_nodes, activation=tf.nn.relu, name='fc1')
        fc1_dropout = tf.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout, num_classes, name='fc2')

    # metrics (loss accuracy)
    with tf.name_scope(name='metrics'):
        # sparse_ 用这个：强大一些
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=outputs, logits=logits)
        loss = tf.reduce_mean(softmax_loss)
        # argmax [0, 2, 1, 5, 8, 2]  ---> 4
        y_pred = tf.argmax(tf.nn.softmax(logits), axis=1, output_type=tf.int32)
        correct_pred = tf.equal(y_pred, outputs)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))


    # train_op 训练层
    with tf.name_scope(name='train_op'):
        # train_op = tf.train.AdamOptimizer(hps.learning_rate).minimize(loss, global_step=global_step)
        train_vars = tf.trainable_variables()
        for var in train_vars:
            logging.info('train variable name : %s' % var.name)
        grads, _ = tf.clip_by_global_norm(t_list=tf.gradients(loss, train_vars), clip_norm=hps.clip_lstm_grads)
        # 普通梯度下降 BGD SGD MBGD  动量（有速度） 冲量
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(grads_and_vars=zip(grads, train_vars), global_step=global_step, name='train_op')

    return (
        (inputs, outputs, keep_prob),
        (loss, accuracy),
        (train_op, global_step)
    )

In [15]:
placeholders, metrics, others = create_model(hps, vocab_size, num_classes)
inputs, outputs, keep_prob = placeholders
loss, accuracy = metrics
train_op, global_step = others

embed_inputs shape: (100, 50, 16)
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
rnn_outputs shape: (100, 50, 28)
last shape:  (100, 28)
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` 

In [16]:
# 执行训练模型代码
init_op = tf.global_variables_initializer()
train_keep_prob = 0.8
test_keep_prob = 1.0

num_train_steps = 10000 #训练一万次

In [17]:
with tf.Session() as sess:
    sess.run(init_op)

    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(hps.batch_size)
        outputs_val = sess.run(fetches=[loss, accuracy, train_op, global_step], feed_dict={
            inputs: batch_inputs,
            outputs: batch_labels,
            keep_prob: train_keep_prob
        })

        loss_val, accuracy_val, _, global_step_val = outputs_val

        if global_step_val % 20 == 0:
            logging.info('Step: %5d, loss: %3.3f, accuracy: %3.5f' % (global_step_val, loss_val, accuracy_val))
        if global_step_val % 200 == 0:
            # 每隔200步 就去测试一下
            # test_dataset.next_batch()
            pass

INFO:tensorflow:Step:     0, loss: 2.302, accuracy: 0.10000
INFO:tensorflow:Step:    20, loss: 2.302, accuracy: 0.08000
INFO:tensorflow:Step:    40, loss: 2.257, accuracy: 0.16000
INFO:tensorflow:Step:    60, loss: 2.295, accuracy: 0.13000
INFO:tensorflow:Step:    80, loss: 2.272, accuracy: 0.15000
INFO:tensorflow:Step:   100, loss: 2.247, accuracy: 0.11000
INFO:tensorflow:Step:   120, loss: 2.160, accuracy: 0.14000
INFO:tensorflow:Step:   140, loss: 2.087, accuracy: 0.23000
INFO:tensorflow:Step:   160, loss: 2.011, accuracy: 0.25000
INFO:tensorflow:Step:   180, loss: 1.980, accuracy: 0.27000
INFO:tensorflow:Step:   200, loss: 1.826, accuracy: 0.43000
INFO:tensorflow:Step:   220, loss: 1.743, accuracy: 0.33000
INFO:tensorflow:Step:   240, loss: 1.656, accuracy: 0.40000
INFO:tensorflow:Step:   260, loss: 1.730, accuracy: 0.36000
INFO:tensorflow:Step:   280, loss: 1.703, accuracy: 0.32000
INFO:tensorflow:Step:   300, loss: 1.718, accuracy: 0.33000
INFO:tensorflow:Step:   320, loss: 1.649

INFO:tensorflow:Step:  2740, loss: 0.652, accuracy: 0.84000
INFO:tensorflow:Step:  2760, loss: 0.590, accuracy: 0.82000
INFO:tensorflow:Step:  2780, loss: 0.470, accuracy: 0.81000
INFO:tensorflow:Step:  2800, loss: 0.350, accuracy: 0.89000
INFO:tensorflow:Step:  2820, loss: 0.478, accuracy: 0.87000
INFO:tensorflow:Step:  2840, loss: 0.503, accuracy: 0.87000
INFO:tensorflow:Step:  2860, loss: 0.465, accuracy: 0.86000
INFO:tensorflow:Step:  2880, loss: 0.245, accuracy: 0.92000
INFO:tensorflow:Step:  2900, loss: 0.578, accuracy: 0.81000
INFO:tensorflow:Step:  2920, loss: 0.504, accuracy: 0.85000
INFO:tensorflow:Step:  2940, loss: 0.370, accuracy: 0.89000
INFO:tensorflow:Step:  2960, loss: 0.439, accuracy: 0.85000
INFO:tensorflow:Step:  2980, loss: 0.449, accuracy: 0.85000
INFO:tensorflow:Step:  3000, loss: 0.446, accuracy: 0.87000
INFO:tensorflow:Step:  3020, loss: 0.534, accuracy: 0.83000
INFO:tensorflow:Step:  3040, loss: 0.406, accuracy: 0.89000
INFO:tensorflow:Step:  3060, loss: 0.428

INFO:tensorflow:Step:  5480, loss: 0.071, accuracy: 0.98000
INFO:tensorflow:Step:  5500, loss: 0.201, accuracy: 0.93000
INFO:tensorflow:Step:  5520, loss: 0.071, accuracy: 0.98000
INFO:tensorflow:Step:  5540, loss: 0.157, accuracy: 0.94000
INFO:tensorflow:Step:  5560, loss: 0.136, accuracy: 0.96000
INFO:tensorflow:Step:  5580, loss: 0.193, accuracy: 0.94000
INFO:tensorflow:Step:  5600, loss: 0.130, accuracy: 0.96000
INFO:tensorflow:Step:  5620, loss: 0.246, accuracy: 0.90000
INFO:tensorflow:Step:  5640, loss: 0.147, accuracy: 0.95000
INFO:tensorflow:Step:  5660, loss: 0.147, accuracy: 0.96000
INFO:tensorflow:Step:  5680, loss: 0.121, accuracy: 0.98000
INFO:tensorflow:Step:  5700, loss: 0.068, accuracy: 0.97000
INFO:tensorflow:Step:  5720, loss: 0.067, accuracy: 0.97000
INFO:tensorflow:Step:  5740, loss: 0.139, accuracy: 0.95000
INFO:tensorflow:Step:  5760, loss: 0.149, accuracy: 0.94000
INFO:tensorflow:Step:  5780, loss: 0.238, accuracy: 0.94000
INFO:tensorflow:Step:  5800, loss: 0.032

INFO:tensorflow:Step:  8220, loss: 0.079, accuracy: 0.96000
INFO:tensorflow:Step:  8240, loss: 0.016, accuracy: 1.00000
INFO:tensorflow:Step:  8260, loss: 0.055, accuracy: 0.98000
INFO:tensorflow:Step:  8280, loss: 0.096, accuracy: 0.97000
INFO:tensorflow:Step:  8300, loss: 0.098, accuracy: 0.98000
INFO:tensorflow:Step:  8320, loss: 0.089, accuracy: 0.99000
INFO:tensorflow:Step:  8340, loss: 0.111, accuracy: 0.95000
INFO:tensorflow:Step:  8360, loss: 0.039, accuracy: 0.99000
INFO:tensorflow:Step:  8380, loss: 0.196, accuracy: 0.95000
INFO:tensorflow:Step:  8400, loss: 0.105, accuracy: 0.95000
INFO:tensorflow:Step:  8420, loss: 0.081, accuracy: 0.97000
INFO:tensorflow:Step:  8440, loss: 0.077, accuracy: 0.97000
INFO:tensorflow:Step:  8460, loss: 0.083, accuracy: 0.97000
INFO:tensorflow:Step:  8480, loss: 0.134, accuracy: 0.96000
INFO:tensorflow:Step:  8500, loss: 0.043, accuracy: 1.00000
INFO:tensorflow:Step:  8520, loss: 0.036, accuracy: 0.98000
INFO:tensorflow:Step:  8540, loss: 0.019