In [1]:
import tensorflow as tf
import os
import numpy as np
import math

tf.logging.set_verbosity(tf.logging.INFO)



In [2]:
def get_default_params():
    return tf.contrib.training.HParams(
        num_embedding_size = 32,
        num_timesteps = 120,
        num_fc_nodes = 64,
        num_filters=256,
        num_kernel_size=3,
        batch_size = 100,
        # lstm梯度
        clip_lstm_grads = 1.0,
        learning_rate = 0.001,
        # 词频的限定大小
        num_word_threshold = 10,
    )

hps = get_default_params()

train_file = r'.\deep_learn\jd_deep_learn\train_data.tsv'
test_file = r'.\deep_learn\jd_deep_learn\test_data.tsv'

seg_train_file = r'.\deep_learn\jd_deep_learn\seg_train_data.txt'
seg_test_file = r'.\deep_learn\jd_deep_learn\seg_test_data.txt'

vocab_file = r'.\deep_learn\jd_deep_learn\jd_vocab.txt'
category_file = r'.\deep_learn\jd_deep_learn\jd_category.txt'
output_dir = r'.\deep_learn\jd_deep_learn\jd_cnn_runout'

if not os.path.exists(output_dir):
    os.mkdir(output_dir)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
class Vocab:
    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)
        
    def _read_dict(self, filename):
        with open(filename, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._word_to_id)
            if word == '<UNK>':
                self._unk = idx
            self._word_to_id[word] = idx
    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk)
    
    @property
    def unk(self):
        return self._unk
    
    def size(self):
        return len(self._word_to_id)
    
    def sentence_to_id(self, sentence):
        
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids

class CategoryDict:
    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r',encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx
    def size(self):
        print('category_to_id dict:{}'.format(self._category_to_id))
        return len(self._category_to_id)
    
    def category_to_id(self, category):
        if not category in self._category_to_id:
            raise Exception("{} is not in our category".format(category))
        
        return self._category_to_id[category]
    
          
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()

tf.logging.info('vocab_size: {}'.format(vocab_size))
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('category_size: {}'.format(num_classes))
test_str = '5'
tf.logging.info('id:{}'.format(category_vocab.category_to_id(test_str)))
# print(vocab.word_to_id())

INFO:tensorflow:vocab_size: 4761
category_to_id dict:{'5': 0, '1': 1, '2': 2, '3': 3}
INFO:tensorflow:category_size: 4
INFO:tensorflow:id:0


In [4]:
class TextDataSet:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        self._inputs = []
        self._outputs = []
        
        self._indicator = 0
        self._parse_file(filename)
        
    def _parse_file(self, filename):
        tf.logging.info('Loading data from {}'.format(filename))
        import csv
        csv_reader = csv.reader(open(filename, encoding='utf-8'))
        for row in csv_reader:
            label, content = row[0].replace('\n', '').split('\t')
            id_label = self._category_vocab.category_to_id(label) # 标签转number
            # print(id_label)
            id_words = self._vocab.sentence_to_id(content) # 文字转number
            # print(id_words)
            
            id_words = id_words[0: self._num_timesteps] # 过长截断
            padding_num = self._num_timesteps - len(id_words) # 过短padding
            id_words = id_words + [self._vocab.unk for _ in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)
            
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        
    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        # tf.logging.info('_random_shuffle number: {}'.format(p))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]
        
    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Exception('batch size: {} is too large'.format(batch_size))
        
        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs
train_data = TextDataSet(seg_train_file, vocab, category_vocab, hps.num_timesteps)
test_data = TextDataSet(seg_test_file, vocab, category_vocab, hps.num_timesteps)

print(train_data.next_batch(3))
print(test_data.next_batch(3))
        

INFO:tensorflow:Loading data from .\deep_learn\jd_deep_learn\seg_train_data.txt
INFO:tensorflow:Loading data from .\deep_learn\jd_deep_learn\seg_test_data.txt
(array([[   5,  754,   19,   23,    1,  249,  147, 1056,    1,  787,   30,
         147,   96,  185,  393,  249,   10,  417,  110,    2,    1,   13,
          18,   58,    1, 1279,    8,   14,    2,    6, 1004,   29,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  509,   89, 3391,

In [5]:
y_pred = 0
def create_model(hps, vocab_size, num_classes):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size
    
    # inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    inputs = tf.placeholder(tf.int32, (None, num_timesteps))
    # outputs = tf.placeholder(tf.int32, (batch_size, ))
    outputs = tf.placeholder(tf.int32, (None, ))
    
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name='global_step', trainable=False
    )
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope(
        'embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32
        )
        """
        简单的讲就是根据inputs中的id，寻找embedding中的对应元素。
        比如，input_ids=[1,3,5]，则找出embedding中下标为1,3,5的向量组成一个矩阵返回。
        """
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)
        
    scale = 1.0 / math.sqrt(hps.num_embedding_size+hps.num_filters) / 3.0
    cnn_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('cnn', initializer=cnn_init):
        conv1 = tf.layers.conv1d(embed_inputs,
                                 hps.num_filters,
                                 hps.num_kernel_size,
                                 activation=tf.nn.relu,
                                 )
        global_maxpooling = tf.reduce_max(conv1, axis=[1])
        
    """with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            # 循环初始化lstm
            cell = tf.contrib.rnn.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple = True
            )
            # 使用dropout方法
            cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                output_keep_prob = keep_prob
            )
            cells.append(cell)
        # 合并两个cell
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        # 初始化cell内的值
        initial_state = cell.zero_state(batch_size, tf.float32)
        
        run_outputs, _ = tf.nn.dynamic_rnn(
            cell, embed_inputs, initial_state=initial_state
        )
        last = run_outputs[:, -1, :]"""
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(global_maxpooling,
                              hps.num_fc_nodes,
                              activation=tf.nn.relu,
                              name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout,
                                 num_classes,
                                 name='fc2')
    with tf.name_scope('metrics'):
        sofmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=outputs
        )
        loss = tf.reduce_mean(sofmax_loss)
        y_pred = tf.arg_max(tf.nn.softmax(logits=logits),
                            1,
                            output_type=tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
    with tf.name_scope('train_op'):
        train_op = tf.train.AdamOptimizer(hps.learning_rate).minimize(loss, global_step=global_step)
        
    return ((inputs, outputs, keep_prob),
            (loss, accuracy),
            (train_op, global_step))
# 调用函数
placeholders, metrics, others = create_model(
    hps, vocab_size, num_classes
)
inputs, outputs, keep_prod = placeholders
loss, accuracy = metrics
train_op, global_step = others



model_dir = os.path.join(output_dir, 'model')
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
saver = tf.train.Saver()
model_name = 'ckp-1000'
model_path = os.path.join(model_dir, model_name)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use `tf.math.argmax` instead
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [6]:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8
test_keep_prob_value = 1.0

test_steps = 100
num_train_steps = 10000
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_data.next_batch(hps.batch_size)
        outputs_val = sess.run([loss, accuracy, train_op, global_step],
                               feed_dict={
                                   inputs: batch_inputs,
                                   outputs: batch_labels,
                                   keep_prod:train_keep_prob_value,
                                   })
        loss_val, accuracy_val, _, global_step_val = outputs_val
        if (i+1) % 100 == 0:
            tf.logging.info("Train Step: {}, loss: {}, accuracy: {}".format(global_step_val, loss_val, accuracy_val))
    
        if (i+1) % 1000 == 0:
            all_test_acc_cal = []
            for j in range(test_steps):
                test_inputs, test_labels = test_data.next_batch(hps.batch_size)
                test_val = sess.run([loss, accuracy, train_op, global_step],
                                    feed_dict= {
                                        inputs: test_inputs,
                                        outputs: test_labels,
                                        keep_prod: test_keep_prob_value,
                                    })
                test_loss_val, test_accuarcy_val, _, test_step_val = test_val
                all_test_acc_cal.append(test_accuarcy_val)
            test_acc = np.mean(all_test_acc_cal)
            saver.save(sess, os.path.join(model_dir, 'ckp-{}'.format(i+1)))
            tf.logging.info("Test Step: {}, loss: {}, accuracy: {}".format(global_step_val, test_loss_val, test_acc))
    # save saved_model
    # path = r'.\deep_learn\jd_deep_learn\jd_lstm_runout\model\jd_comment_cnn_serving'
    # builder =
    builder = tf.saved_model.builder.SavedModelBuilder(r'.\deep_learn\jd_deep_learn\jd_cnn_runout\jd_comment_cnn_serving3')
    input = {
        'inputs': tf.saved_model.utils.build_tensor_info(inputs), 
            'keep_prob': tf.saved_model.utils.build_tensor_info(keep_prod)
             }
    sigs = {}
    output = {'outputs': tf.saved_model.utils.build_tensor_info(sess.graph.get_tensor_by_name('metrics/ArgMax:0'))}
    # sigs[tf.saved_model.tag_constants.]
    signature = tf.saved_model.signature_def_utils.build_signature_def(input, output, 
                                                                       method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    builder.add_meta_graph_and_variables(sess, tags=[tf.saved_model.tag_constants.SERVING],
                                         signature_def_map={'jd_comment_cnn_predict': signature})
    builder.save()
#     SignatureDef = sm.signature_def_utils.build_signature_def(
#                             inputs={'input_1': X_TensorInfo, 'input_2': scale_TensorInfo},
#                             outputs={'output': y_TensorInfo},
#                             method_name='what'
# )

INFO:tensorflow:Train Step: 100, loss: 1.0961178541183472, accuracy: 0.47999998927116394
INFO:tensorflow:Train Step: 200, loss: 1.1528830528259277, accuracy: 0.4099999964237213
INFO:tensorflow:Train Step: 300, loss: 1.1192028522491455, accuracy: 0.38999998569488525
INFO:tensorflow:Train Step: 400, loss: 0.8872318863868713, accuracy: 0.6000000238418579
INFO:tensorflow:Train Step: 500, loss: 0.8053421974182129, accuracy: 0.6299999952316284
INFO:tensorflow:Train Step: 600, loss: 0.9460598826408386, accuracy: 0.5799999833106995
INFO:tensorflow:Train Step: 700, loss: 0.8905470371246338, accuracy: 0.6299999952316284
INFO:tensorflow:Train Step: 800, loss: 0.9654304385185242, accuracy: 0.5699999928474426
INFO:tensorflow:Train Step: 900, loss: 0.8554862141609192, accuracy: 0.5899999737739563
INFO:tensorflow:Train Step: 1000, loss: 1.1096627712249756, accuracy: 0.49000000953674316
INFO:tensorflow:Test Step: 1000, loss: 0.7573379278182983, accuracy: 0.6291000247001648
INFO:tensorflow:Train Step: 