In [1]:
import tensorflow as tf
import os
import sys
import numpy as np
import math
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tensorflow.contrib.rnn import GRUCell

tf.logging.set_verbosity(tf.logging.INFO)



In [2]:
## 下面高参数 10K 次 train: 99.7%, Valid: 92.7%, Test: 93.2%
def get_default_params():
    return tf.contrib.training.HParams(
        # num_embedding_size = 16,
        num_embedding_size = 32,
        # 一个句子取前50个分词
        # num_timesteps = 50,
        num_timesteps = 200,
        # num_lstm_nodes = [32, 32],
        num_lstm_nodes = [64, 64],
        num_lstm_layers = 2,
        # num_fc_nodes = 32,
        num_fc_nodes = 64,
        batch_size = 100,
        clip_lstm_grads = 1.0,
        learning_rate = 0.001,
        num_word_threshold = 10,
    )
hps = get_default_params()

seg_train_file = '.\deep_learn\sohu_seg_train_file.txt'
seg_test_file = '.\deep_learn\sohu_seg_test_file.txt'

vocab_file =  '.\deep_learn\sohu_vocab.txt'
category_file = '.\deep_learn\sohu_category.txt'
output_file = '.\deep_learn\sohu_run_text_run'
vocab_padding_file = '.\deep_learn\sohu_padding_vocab.txt'

if not os.path.exists(output_file):
    os.mkdir(output_file)

In [3]:
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """
    Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector.

    The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
     for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
    Variables notation is also inherited from the article
    
    Args:
        inputs: The Attention inputs.
            Matches outputs of RNN/Bi-RNN layer (not final state):
                In case of RNN, this must be RNN outputs `Tensor`:
                    If time_major == False (default), this must be a tensor of shape:
                        `[batch_size, max_time, cell.output_size]`.
                    If time_major == True, this must be a tensor of shape:
                        `[max_time, batch_size, cell.output_size]`.
                In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and
                the backward RNN outputs `Tensor`.
                    If time_major == False (default),
                        outputs_fw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_bw.output_size]`.
                    If time_major == True,
                        outputs_fw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_bw.output_size]`.
        attention_size: Linear size of the Attention weights.
        time_major: The shape format of the `inputs` Tensors.
            If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
            If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
            Using `time_major = True` is a bit more efficient because it avoids
            transposes at the beginning and end of the RNN calculation.  However,
            most TensorFlow data is batch-major, so by default this function
            accepts input and emits output in batch-major form.
        return_alphas: Whether to return attention coefficients variable along with layer's output.
            Used for visualization purpose.
    Returns:
        The Attention output `Tensor`.
        In case of RNN, this will be a `Tensor` shaped:
            `[batch_size, cell.output_size]`.
        In case of Bidirectional RNN, this will be a `Tensor` shaped:
            `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
    """

    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas


# api sentence_to_id的实现
class Vocab:
    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._padding = 1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)
    
    def _read_dict(self, filename):
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            # 第一行为<UNK> 用于test出现train没有出现的词
            # 第二行为<PADDING> 用于过短进行padding
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._word_to_id)
            if word == '<UNK>':
                self._unk = idx
            self._word_to_id[word] = idx
    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk)
    
    @property
    def unk(self):
        return self._unk
    
    @property
    def padding(self):
        return self._padding
    
    def size(self):
        return len(self._word_to_id)
    
    def sentence_to_id(self, sentence):
        # 分词后的句子用空格来隔开每一个词语,这里每一句话里的分词用id来表示
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids
# keras中imdb中使用0作padding, 未出现过的词用2代替
class CategoryDict:
    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx
            
    def size(self):
        return len(self._category_to_id)
    
    def category_to_id(self, category):
        if not category in self._category_to_id:
            print(self._category_to_id)
            raise Exception("{} is not in our category".format(category))
            
        return self._category_to_id[category]
          
vocab = Vocab(vocab_padding_file, hps.num_word_threshold)
vocab_size = vocab.size()
print(vocab.word_to_id('<UNK>')) # 0
print(vocab.word_to_id('<PADDING>')) # 1
print('unk', vocab.unk)
print('padding', vocab.padding)
tf.logging.info('vocab_size: {}'.format(vocab_size))
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('category_size: {}'.format(num_classes))
test_str = '女人'
tf.logging.info('id:{}'.format(category_vocab.category_to_id(test_str)))



0
1
unk 0
padding 1
INFO:tensorflow:vocab_size: 85429
INFO:tensorflow:category_size: 12
INFO:tensorflow:id:11


In [4]:
# 
class TextDataSet:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        
        self._inputs = []
        self._outputs = []
        
        self._indicator = 0
        self._parse_file(filename)
        
    def _parse_file(self, filename):
        tf.logging.info('Loading data from {}'.format(filename))
        lines = 0
        import re
        if re.findall('train', filename):
            with open(filename, 'r') as f:
                lines = f.readlines()
        elif re.findall('test', filename):
            with open(filename, 'r', encoding='utf-8-sig') as f:
                lines = f.readlines()  
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            id_label = self._category_vocab.category_to_id(label)
            id_words = self._vocab.sentence_to_id(content)
            
            id_words = id_words[0: self._num_timesteps]
            padding_num = self._num_timesteps - len(id_words)
            
            id_words = id_words + [
                self._vocab.padding for i in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)
        
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        
    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]
    
    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Exception("batch size: {} is too large".format(batch_size))
        
        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_output = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_output

train_dataset = TextDataSet(seg_train_file, vocab, category_vocab, hps.num_timesteps)
test_dataset = TextDataSet(seg_test_file, vocab, category_vocab, hps.num_timesteps)

INFO:tensorflow:Loading data from .\deep_learn\sohu_seg_train_file.txt
INFO:tensorflow:Loading data from .\deep_learn\sohu_seg_test_file.txt


In [5]:
print(train_dataset.next_batch(5))
# 因为next_batch == 2 ,输出为
# 只选择句子前50个分词, 输出为2*50的矩阵,第一个为句子分词的索引id, 第二个为分类的索引id





(array([[    8,   326,     3,  5006,   120,  4673, 18872,   816,     8,
         6155, 71263,    52,    29,     0,  9178,    51,    27,   110,
          129,    35,   110,   168,   181,    63,    79,    72,    26,
            2,   456, 27886,  1687,   931,    63,    69,    54,    60,
          129,    54,    69,   330,   129,   126,    72,   156,   126,
           35,    79,    72,     2,   122,  4690,     3,    82,   148,
          239,     3, 25267, 22801,     6, 14604,  1240,     2,   678,
          585, 38310,     3,  3088,  1108,     7,    18,   948,    29,
            0,  9178,  1090,     3, 17612, 59812,     4,  4623,  3170,
           44,   326,     2,  2638, 24308, 10143,   191,     8,   924,
        13340, 36985,   209,    52,   835,     3,   326,  8051,    51,
            2,   727,     3,    11,  3170,  2213,   351,    64,   326,
          881,    36,   337,  6808,     4,    49,   461,   125,    48,
          670,    61,    50,     2,   697,     2,    11,    52,  1318,
     

In [6]:
def create_model(hps, vocab_size, num_classes):
    # 取一个句子的前50个分词, num_classes为固定的50个分词
    num_timesteps = hps.num_timesteps
    # 训练批次大小
    batch_size = hps.batch_size
    # 输入为[批次的大小,50]
    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    # 输出为[批次的大小,]
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # dropout的使用
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # 保存训练到哪一步
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name='global_step', trainable=False)
    
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')

    # 随机化embedding 编码
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope(
        'embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size+2, hps.num_embedding_size],
            tf.float32)
        print('embeddings', embeddings)# (85430, 32)
        # 把输入的分词中的id -> embedding编码形式
        # ex [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs) # (100, 200, 32)
        print('embed_inputs', embed_inputs)
    # (Bi-)RNN layser
    
    
    
    
    rnn_outputs, _ = bi_rnn(GRUCell(64), GRUCell(64),
                            inputs=embed_inputs, sequence_length=seq_len_ph, dtype=tf.float32)
    # [batch_size, cell_fw.output_size + cell_bw.output_size]
    print('rnn_outputs', rnn_outputs) # shape=(100, 200, 64)
    # (100, 200, 64)
    # last = rnn_outputs[:, -1, :]
# Attention layer
    with tf.name_scope('Attention_layer'):
        attention_output, alphas = attention(rnn_outputs, 50, return_alphas=True)
    drop = tf.nn.dropout(attention_output, keep_prob)
    print(drop.shape)# (100, 128)
    
        # tf.summary.histogram('alphas', alphas)
    # 网络initializer的一种方法
    """scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    # 构建lstm
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            # 循环初始化lstm
            cell = tf.contrib.rnn.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple = True
            )
            # 使用dropout方法
            cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                output_keep_prob = keep_prob
            )
            cells.append(cell)
        # 合并两个cell
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        # 初始化cell内的值
        initial_state = cell.zero_state(batch_size, tf.float32)
        # run_outputs: [batch_size, num_timesteps, lstm_outpus[-1]]
        run_outputs, _ = tf.nn.dynamic_rnn(
            cell, embed_inputs, initial_state=initial_state
        )
        print(run_outputs)
        last = run_outputs[:, -1, :]"""
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    # lstm连接到全连接层
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(drop,
                              hps.num_fc_nodes,
                              activation=tf.nn.relu,
                              name='fc1')
        fc1_dropout = tf.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout,
                                 num_classes,
                                 name='fc2')
        
        # W = tf.Variable(tf.truncated_normal([150 * 2, 1], stddev=0.1))
        # b = tf.Variable(tf.constant(0., shape=[1]))
        # y_hat = tf.nn.xw_plus_b(tf.cast(drop, dtype=tf.int32), W, b)
        # y_hat = tf.squeeze(y_hat)
        # tf.summary
    # 计算损失函数
    
    with tf.name_scope('metrics'):
        
        sofmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels= outputs
        )
        loss = tf.reduce_mean(sofmax_loss)
        # optimzer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)
        
        y_pred = tf.arg_max(tf.nn.softmax(logits=logits),
                            1,
                            output_type= tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        # accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), outputs), tf.float32))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    # 构建train_op
    
    with tf.name_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info('variable name: {}'.format(var.name))
        # 限制训练时的梯度大小,使得不会出现梯度爆炸
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads
        )
        # 梯度应用到变量中去
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step= global_step
        )
    
    # return ((inputs, outputs, keep_prob),
    return ((inputs, outputs, keep_prob, seq_len_ph),
            (loss, accuracy),
            # (optimzer)
            (train_op, global_step)
            )

placeholders, metrics, others = create_model(
    hps, vocab_size, num_classes
)
# inputs, outputs, keep_prod = placeholders
inputs, outputs, keep_prod, seq_len_ph = placeholders
loss, accuracy = metrics
train_op, global_step = others
# optimzer = others

Instructions for updating:
Colocations handled automatically by placer.
embeddings <tf.Variable 'embedding/embedding:0' shape=(85431, 32) dtype=float32_ref>
embed_inputs Tensor("embedding/embedding_lookup/Identity:0", shape=(100, 200, 32), dtype=float32)
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
rnn_outputs (<tf.Tensor 'bidirectional_rnn/fw/fw/transpose_1:0' shape=(100, 200, 64) dtype=float32>, <tf.Tensor 'ReverseSequence:0' shape=(100, 200, 64) dtype=float32>)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
(100, 128)
Instructions for updating:
Use tf.initializers.varianc

In [7]:
# train:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8
test_keep_prob_value = 1.0

test_steps = 100
num_train_steps = 10000

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(
            hps.batch_size
        )
        
        seq_len = np.array([list(x).index(1)+1 if 1 in x else hps.num_timesteps for x in batch_inputs])
        outputs_val = sess.run([loss, accuracy, train_op, global_step],
        
        # outputs_val = sess.run([loss, accuracy, optimzer],
                               feed_dict = {
                                inputs: batch_inputs,
                                outputs: batch_labels,
                                   keep_prod: train_keep_prob_value,
                                   seq_len_ph: seq_len
                                   
                               })
        loss_val, accuracy_val, _, global_step_val = outputs_val
        # loss_val, accuracy_val, optimzer = outputs_val
        if (i+1) % 20 == 0:
            tf.logging.info("Train Step: {}, loss: {}, accuracy: {}".format(i, loss_val, accuracy_val))

        if (i+1) % 100 == 0:
            all_test_acc_cal = []
            for j in range(test_steps):
                test_inputs, test_labels = test_dataset.next_batch(hps.batch_size)
                seq_len = np.array([list(x).index(1)+1 if 1 in x else hps.num_timesteps for x in test_inputs])
                test_val = sess.run([loss, accuracy, train_op, global_step],
                # test_val = sess.run([loss, accuracy, optimzer],
                                    feed_dict= {
                                        inputs: test_inputs,
                                        outputs: test_labels,
                                        seq_len_ph: seq_len,
                                        keep_prod: test_keep_prob_value,
                                    })
                test_loss_val, test_accuarcy_val, _, test_step_val = test_val
                # test_loss_val, test_accuarcy_val, optimzer = test_val
                all_test_acc_cal.append(test_accuarcy_val)
            test_acc = np.mean(all_test_acc_cal)
            tf.logging.info("Test Step: {}, loss: {}, accuracy: {}".format(i, test_loss_val, test_acc))

INFO:tensorflow:Train Step: 19, loss: 2.459465742111206, accuracy: 0.20999999344348907
INFO:tensorflow:Train Step: 39, loss: 2.413097620010376, accuracy: 0.15000000596046448
INFO:tensorflow:Train Step: 59, loss: 2.3422775268554688, accuracy: 0.18000000715255737
INFO:tensorflow:Train Step: 79, loss: 2.146698236465454, accuracy: 0.23999999463558197
INFO:tensorflow:Train Step: 99, loss: 2.185227870941162, accuracy: 0.1899999976158142
INFO:tensorflow:Test Step: 99, loss: 1.7761415243148804, accuracy: 0.3418000042438507
INFO:tensorflow:Train Step: 119, loss: 1.8352121114730835, accuracy: 0.3799999952316284
INFO:tensorflow:Train Step: 139, loss: 1.4725143909454346, accuracy: 0.46000000834465027
INFO:tensorflow:Train Step: 159, loss: 1.5246915817260742, accuracy: 0.47999998927116394
INFO:tensorflow:Train Step: 179, loss: 1.5380370616912842, accuracy: 0.47999998927116394
INFO:tensorflow:Train Step: 199, loss: 1.592097282409668, accuracy: 0.49000000953674316
INFO:tensorflow:Test Step: 199, loss