In [1]:
import tensorflow as tf
import os
import sys
import numpy as np
import math

tf.logging.set_verbosity(tf.logging.INFO)



In [2]:
## 下面高参数 10K 次 train: 99.7%, Valid: 92.7%, Test: 93.2%
def get_default_params():
    return tf.contrib.training.HParams(
        num_embedding_size = 32,
        # 一个句子取前50个分词
        # num_timesteps = 50,
        num_timesteps = 200,
        # num_lstm_nodes = [32, 32],
        num_lstm_nodes = [64, 64],
        num_lstm_layers = 2,
        # num_fc_nodes = 32,
        num_fc_nodes = 64,
        batch_size = 100,
        clip_lstm_grads = 1.0,
        learning_rate = 0.001,
        num_word_threshold = 10,
    )
hps = get_default_params()

seg_train_file = '.\deep_learn\sohu_seg_train_file.txt'
seg_test_file = '.\deep_learn\sohu_seg_test_file.txt'

vocab_file =  '.\deep_learn\sohu_vocab.txt'
category_file = '.\deep_learn\sohu_category.txt'
output_file = '.\deep_learn\sohu_run_text_run'

if not os.path.exists(output_file):
    os.mkdir(output_file)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
# api sentence_to_id的实现
class Vocab:
    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)
    
    def _read_dict(self, filename):
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._word_to_id)
            if word == '<UNK>':
                self._unk = idx
            self._word_to_id[word] = idx
    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk)
    
    @property
    def unk(self):
        return self._unk
    
    def size(self):
        return len(self._word_to_id)
    
    def sentence_to_id(self, sentence):
        # 分词后的句子用空格来隔开每一个词语,这里每一句话里的分词用id来表示
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids

class CategoryDict:
    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx
            
    def size(self):
        return len(self._category_to_id)
    
    def category_to_id(self, category):
        if not category in self._category_to_id:
            # print(self._category_to_id)
            raise Exception("{} is not in our category".format(category))
            
        return self._category_to_id[category]
          
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()

tf.logging.info('vocab_size: {}'.format(vocab_size))
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('category_size: {}'.format(num_classes))
test_str = '女人'
tf.logging.info('id:{}'.format(category_vocab.category_to_id(test_str)))

INFO:tensorflow:vocab_size: 85428
INFO:tensorflow:category_size: 12
INFO:tensorflow:id:11


In [4]:
# 
class TextDataSet:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        
        self._inputs = []
        self._outputs = []
        
        self._indicator = 0
        self._parse_file(filename)
        
    def _parse_file(self, filename):
        tf.logging.info('Loading data from {}'.format(filename))
        lines = 0
        import re
        if re.findall('train', filename):
            with open(filename, 'r') as f:
                lines = f.readlines()
        elif re.findall('test', filename):
            with open(filename, 'r', encoding='utf-8-sig') as f:
                lines = f.readlines()  
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            id_label = self._category_vocab.category_to_id(label)
            id_words = self._vocab.sentence_to_id(content)
            
            id_words = id_words[0: self._num_timesteps]
            padding_num = self._num_timesteps - len(id_words)
            id_words = id_words + [
                self._vocab.unk for i in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)

        
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        
    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]
    
    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Exception("batch size: {} is too large".format(batch_size))
        
        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_output = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_output

train_dataset = TextDataSet(seg_train_file, vocab, category_vocab, hps.num_timesteps)
test_dataset = TextDataSet(seg_test_file, vocab, category_vocab, hps.num_timesteps)

print(train_dataset.next_batch(3))
# print(test_dataset.next_batch(2))
# 因为next_batch == 2 ,输出为
'''(array([[ 3879,  3048,     0,  1302,  6449,  9225, 32020, 75465,    26,
          938,    25, 42449,  1743,  5589,  1071, 28916,   875,     7,
         1071,  9306, 13826, 20919,   190,     1,  5614,  7024,   359,
        68957,   559,  9255, 11563,   172,  2431,   627,  7521,    51,
         5280, 15051,    50,    94,     0,     0,    65,   938, 42449,
           90,  2603,  2234,     8,  3420],
       [  192,   116,   310, 21315,  4187,  7290,  2223,  5918,    11,
        33789,    12,   922,    13,  6187,  1061,     7, 37987,  4187,
            2,   192,   116,   310,   880, 25294,     1,   104,   675,
         9024,     2,  1643,    10,    11,  8970,    12,     3,   968,
            1,     7,    11,  6890,   639,    13,   310,    42,     9,
            4,     4,    18,  3163,  3163]]), array([ 9, 10]))
'''
# 只选择句子前50个分词, 输出为2*50的矩阵,第一个为句子分词的索引id, 第二个为分类的索引id


INFO:tensorflow:Loading data from .\deep_learn\sohu_seg_train_file.txt
INFO:tensorflow:Loading data from .\deep_learn\sohu_seg_test_file.txt
(array([[  809,    39, 11753,   258,     0,  1437,  3073,  3067,     9,
           39,     4,  3090,   922,    13,   272,  1347,  1797,    13,
          209,    53,    78,   414,    64,    34,   105,   180,   272,
         3541,   111,   146,    17,    46,     6,     6,    76,  2225,
            9,   344,    20,    15,   202,    26, 31842,   146,     9,
            4,   344,    20,    15,   202,    25,     1,     9,     4,
            4,    18,   809,  2092,   141,  1800,  4546, 10592,  3090,
         7723,    11, 12717, 28594,    12,  3067,     3,   519,    23,
            4,   430,     2, 10439,  7806,     9,    39,     4, 13272,
        10592,  3090,     3, 11753,     7,   241,    35,   171,  6886,
          258,  4093,   679,     8,    72,  1860,    16,     9,   113,
           46,     2,  1762, 20723,     1,  7806,     2,  1946,    21,
      

'(array([[ 3879,  3048,     0,  1302,  6449,  9225, 32020, 75465,    26,\n          938,    25, 42449,  1743,  5589,  1071, 28916,   875,     7,\n         1071,  9306, 13826, 20919,   190,     1,  5614,  7024,   359,\n        68957,   559,  9255, 11563,   172,  2431,   627,  7521,    51,\n         5280, 15051,    50,    94,     0,     0,    65,   938, 42449,\n           90,  2603,  2234,     8,  3420],\n       [  192,   116,   310, 21315,  4187,  7290,  2223,  5918,    11,\n        33789,    12,   922,    13,  6187,  1061,     7, 37987,  4187,\n            2,   192,   116,   310,   880, 25294,     1,   104,   675,\n         9024,     2,  1643,    10,    11,  8970,    12,     3,   968,\n            1,     7,    11,  6890,   639,    13,   310,    42,     9,\n            4,     4,    18,  3163,  3163]]), array([ 9, 10]))\n'

In [5]:
def create_model(hps, vocab_size, num_classes):
    # 取一个句子的前50个分词, num_classes为固定的50个分词
    num_timesteps = hps.num_timesteps
    # 训练批次大小
    batch_size = hps.batch_size
    # 输入为[批次的大小,50]
    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    # 输出为[批次的大小,]
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # dropout的使用
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # 保存训练到哪一步
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name='global_step', trainable=False)
    # 随机化embedding 编码
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope(
        'embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32)
        print('embeddings', embeddings) # (85428, 32)
        # 把输入的分词中的id -> embedding编码形式
        # ex [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)
        print('embed_inputs', embed_inputs) # (100, 600, 32)
    # 网络initializer的一种方法
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    # 构建lstm
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            # 循环初始化lstm
            cell = tf.contrib.rnn.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple = True
            )
            # 使用dropout方法
            cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                output_keep_prob = keep_prob
            )
            cells.append(cell)
        # 合并两个cell
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        # 初始化cell内的值
        initial_state = cell.zero_state(batch_size, tf.float32)
        # run_outputs: [batch_size, num_timesteps, lstm_outpus[-1]]
        run_outputs, _ = tf.nn.dynamic_rnn(
            cell, embed_inputs, initial_state=initial_state
        )
        print('run_outputs------', run_outputs) # (100, 600, 64)
        last = run_outputs[:, -1, :] # (100, 64)
        print('last------------', last)
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    # lstm连接到全连接层
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(last, # (100, 64)
                              hps.num_fc_nodes, # 64
                              activation=tf.nn.relu,
                              name='fc1')
        # 使用dropout方法
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        print('fc1_dropout', fc1_dropout)
        logits = tf.layers.dense(fc1_dropout,
                                 num_classes,
                                 name='fc2')
        print('logits', logits)
    # 计算损失函数
    with tf.name_scope('metrics'):
        sofmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels= outputs
        )
        loss = tf.reduce_mean(sofmax_loss)
        y_pred = tf.arg_max(tf.nn.softmax(logits=logits),
                            1,
                            output_type= tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    # 构建train_op
    with tf.name_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info('variable name: {}'.format(var.name))
        # 限制训练时的梯度大小,使得不会出现梯度爆炸
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads
        )
        # 梯度应用到变量中去
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step= global_step
        )
    
    return ((inputs, outputs, keep_prob),
            (loss, accuracy),
            (train_op, global_step))

placeholders, metrics, others = create_model(
    hps, vocab_size, num_classes
)
inputs, outputs, keep_prod = placeholders
loss, accuracy = metrics
train_op, global_step = others

Instructions for updating:
Colocations handled automatically by placer.
embeddings <tf.Variable 'embedding/embedding:0' shape=(85428, 32) dtype=float32_ref>
embed_inputs Tensor("embedding/embedding_lookup/Identity:0", shape=(100, 200, 32), dtype=float32)
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
run_outputs------ Tensor("lstm_nn/rnn/transpose_1:0", shape=(100, 200, 64), dtype=float32)
last------------ Tensor("lstm_nn/strided_slice:0", shape=(100, 64), dtype=float32)
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=unifo

In [6]:
# train:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8
test_keep_prob_value = 1.0

test_steps = 100
num_train_steps = 1000

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(
            hps.batch_size
        )
        outputs_val = sess.run([loss, accuracy, train_op, global_step],
                               feed_dict = {
                                inputs: batch_inputs,
                                outputs: batch_labels,
                                   keep_prod: train_keep_prob_value,
                               })
        loss_val, accuracy_val, _, global_step_val = outputs_val
        if (i+1) % 20 == 0:
            tf.logging.info("Train Step: {}, loss: {}, accuracy: {}".format(global_step_val, loss_val, accuracy_val))

        if (i+1) % 100 == 0:
            all_test_acc_cal = []
            for j in range(test_steps):
                test_inputs, test_labels = test_dataset.next_batch(hps.batch_size)
                test_val = sess.run([loss, accuracy, train_op, global_step],
                                    feed_dict= {
                                        inputs: test_inputs,
                                        outputs: test_labels,
                                        keep_prod: test_keep_prob_value,
                                    })
                test_loss_val, test_accuarcy_val, _, test_step_val = test_val
                all_test_acc_cal.append(test_accuarcy_val)
            test_acc = np.mean(all_test_acc_cal)
            tf.logging.info("Test Step: {}, loss: {}, accuracy: {}".format(global_step_val, test_loss_val, test_acc))

INFO:tensorflow:Train Step: 20, loss: 2.436840057373047, accuracy: 0.11999999731779099
INFO:tensorflow:Train Step: 40, loss: 2.459986686706543, accuracy: 0.09000000357627869
INFO:tensorflow:Train Step: 60, loss: 2.456941604614258, accuracy: 0.07999999821186066
INFO:tensorflow:Train Step: 80, loss: 2.477553367614746, accuracy: 0.07999999821186066
INFO:tensorflow:Train Step: 100, loss: 2.4217422008514404, accuracy: 0.1599999964237213
INFO:tensorflow:Test Step: 100, loss: 2.1643269062042236, accuracy: 0.14980000257492065
INFO:tensorflow:Train Step: 220, loss: 2.393247127532959, accuracy: 0.15000000596046448
INFO:tensorflow:Train Step: 240, loss: 2.236553907394409, accuracy: 0.10999999940395355
INFO:tensorflow:Train Step: 260, loss: 2.2571945190429688, accuracy: 0.17000000178813934
INFO:tensorflow:Train Step: 280, loss: 2.291293144226074, accuracy: 0.14000000059604645
INFO:tensorflow:Train Step: 300, loss: 2.164581537246704, accuracy: 0.20000000298023224
INFO:tensorflow:Test Step: 300, los