In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

参考资料

http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
https://github.com/yoonkim/CNN_sentence


这里定义text cnn的网络模型  
    __init__是其中的初始化函数  
        包含三个输入：输入特征data和目标类别target（由调用方传入），以及在__init__中创建的tf.placeholder——dropout的保留概率dropout_keep_prob  
        模型的超参数：  
            类别数量num_classes，句子序列的最大长度max_sequence_len，词典大小vocab_size  
            词嵌入的大小embedding_size，  
            卷积层的卷积核的大小列表filter_sizes，各个大小卷积的数量num_filters  
            l2正则化参数l2_reg_lambda，学习率_learning_rate  
        模型定义的一些可以用到的量：  
            输出分值_scores，预测类别_prediction，  
            l2正则化项损失l2_loss，模型整体损失_loss，  
            精度_accuracy，误差_error  
            优化算法_optimize  


    embedding层的大小为 [vocabulary_size, embedding_size]  
    如果卷积核的大小列表为[3, 4, 5], 那么总共的卷积核的数量为3 * num_filters
    dropout的概率定义成一个网络的输入是因为，只需要训练的时候允许，当预测的时候禁止就好

In [None]:
class TextModel(object):
    """Lazily built text-classification CNN.

    The constructor only records inputs and hyper-parameters; the graph ops
    are created on first access of the corresponding properties.
    """

    def __init__(self, data, target,
                 max_sequence_len, vocab_size, embedding_size,
                 filter_sizes, num_filters,
                 num_classes=2, embedding_init=None,
                 l2_reg_lambda=0.0, learning_rate=1e-3):
        """Record the model inputs and hyper-parameters.

        Args:
            data: input tensor, later fed to the embedding lookup.
            target: label tensor, later used by the loss and the metrics.
            max_sequence_len: sequence length used to size the max-pool window.
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            filter_sizes: iterable of convolution filter heights.
            num_filters: number of filters per filter size.
            num_classes: number of output classes.
            embedding_init: accepted for interface compatibility; this
                constructor neither stores nor uses it.
            l2_reg_lambda: L2 regularisation coefficient.
            learning_rate: learning rate handed to the optimizer.
        """
        # Graph inputs. Only the dropout keep-probability placeholder is
        # created here; data and target are supplied by the caller.
        self.data, self.target = data, target
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Hyper-parameters.
        self._num_classes = num_classes
        self._max_sequence_len = max_sequence_len
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._filter_sizes = filter_sizes
        self._num_filters = num_filters
        self._l2_reg_lambda = l2_reg_lambda
        self._learning_rate = learning_rate

        # Lazily built ops; None means "not built yet".
        self._scores = self._prediction = None
        self.l2_loss = None
        self._loss = self._accuracy = None
        self._optimize = self._error = None

get_model_variables函数返回所定义的网络的量  
从而可以在外边定义优化步骤，定义训练过程等，将模型定义与训练等定制化项目分开  
    返回包括：预测，损失函数（用于评估/优化等），优化函数（用于执行训练过程），精度（用于评估）  


In [None]:
    def get_model_variables(self):
        """
        get_model_variables
        """
        return self.prediction, self.loss, self.optimize, self.accuracy

对于类的方法，装饰器一样起作用。Python内置的@property装饰器就是负责把一个方法变成属性调用的。注意到这个神奇的@property，我们在对实例属性操作的时候，就知道该属性很可能不是直接暴露的，而是通过getter和setter方法来实现的。
还可以定义只读属性，只定义getter方法，不定义setter方法就是一个只读属性：  
class Student(object):
    """Example of a read/write property (birth) and a read-only one (age)."""

    @property
    def birth(self):
        """Getter: return the stored birth year."""
        return self._birth

    @birth.setter
    def birth(self, value):
        """Setter: store the birth year."""
        self._birth = value

    @property
    def age(self):
        """Read-only property (no setter): 2014 minus the stored birth year."""
        birth_year = self._birth
        return 2014 - birth_year

这里利用python的@property装饰器功能实现网络的属性获取，好处是隐藏了细节，而且通过在内部判断是否定义过某个属性，可以实现一次定义，也就是下次获取的时候只取上次定义的，而不需要重新运行graph生成  

scores属性返回网络计算后的分值
这里定义了经典的cnn做句子分类的论文Convolutional Neural Networks for Sentence Classification (EMNLP 2014) 中定义的网络，其原始实现见：https://github.com/yoonkim/CNN_sentence
该函数同时定义了预测类别self._prediction，l2损失self.l2_loss  

In [None]:
    @property
    def scores(self):
        """
        score
        """
        if self._scores is None:
            # embedding_layer
            with tf.device('/cpu:0'), tf.name_scope("embedding"):
                self.W = tf.Variable(
                            tf.random_uniform([self._vocab_size, self._embedding_size], -1.0, 1.0),
                            name="W")
                self.embedded = tf.nn.embedding_lookup(self.W, self.data)
                self.embedded_expanded = tf.expand_dims(self.embedded, -1)
            # create a convolution + maxpool layer for each filter size
            pooled_outputs = []
            for i, filter_size in enumerate(self._filter_sizes):
                with tf.name_scope("conv-maxpool-{0}".format(filter_size)):
                    # convolution layer
                    filter_shape = [filter_size, self._embedding_size, 1, self._num_filters]
                    print("conv-maxpool-{0}.filter_size {1}".format(filter_size, filter_shape))
                    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                    b = tf.Variable(tf.constant(0.1, shape=[self._num_filters]), name="b")
                    conv = tf.nn.conv2d(
                            self.embedded_expanded,
                            W,
                            strides=[1, 1, 1, 1],
                            padding="VALID",
                            name="conv")
                    # apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                    # maxpooling over the outputs
                    pooled = tf.nn.max_pool(
                                h,
                                ksize=[1, self._max_sequence_len - filter_size + 1, 1, 1],
                                strides=[1, 1, 1, 1],
                                padding="VALID",
                                name="pool")
                    pooled_outputs.append(pooled)
            # combine all the pooled features
            num_filters_total = self._num_filters * len(self._filter_sizes)
            print("num_filters_total {0}".format(num_filters_total))
            self.h_pool = tf.concat(pooled_outputs, 3)
            self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
            # add dropout
            with tf.name_scope("dropout"):
                self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
            # final (unnormalized) scores and predictions
            with tf.name_scope("cnn_output"):
                W = tf.get_variable(
                        "W",
                        shape=[num_filters_total, self._num_classes],
                        initializer=tf.contrib.layers.xavier_initializer())
                b = tf.Variable(tf.constant(0.1, shape=[self._num_classes]), name="b")
                # may be get some loss here?
                self.l2_loss = tf.nn.l2_loss(W) + tf.nn.l2_loss(b)
                self._scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
                # define prediction
                self._prediction = tf.argmax(self._scores, 1, name="predictions")
        # return scores
        return self._scores

网络构成粗看起来如下：
![Screen-Shot-2015-11-06-at-8.03.47-AM-1024x413.png](attachment:Screen-Shot-2015-11-06-at-8.03.47-AM-1024x413.png)
    第一层将词表示成低维向量  
    第二层对词嵌入向量做不同大小的卷积核的卷积操作，一般用3，4，5三个不同大小的核  
    第三层将卷积层输出做max-pool，输出一个长的特征向量  
    增加dropout正则化  
    最后用softmax做结果分类  

![Screen-Shot-2015-11-06-at-12.05.40-PM-1024x937.png](attachment:Screen-Shot-2015-11-06-at-12.05.40-PM-1024x937.png)

tf.device("/cpu:0") forces an operation to be executed on the CPU. By default TensorFlow will try to put the operation on the GPU if one is available, but the embedding implementation doesn’t currently have GPU support and throws an error if placed on the GPU.  

tf.name_scope creates a new Name Scope with the name “embedding”. The scope adds all operations into a top-level node called “embedding” so that you get a nice hierarchy when visualizing your network in TensorBoard.  

tf.nn.embedding_lookup creates the actual embedding operation. The result of the embedding operation is a 3-dimensional tensor of shape [None, sequence_length, embedding_size].  

TensorFlow’s convolutional conv2d operation expects a 4-dimensional tensor with dimensions corresponding to batch, height, width and channel (the default NHWC layout). The result of our embedding doesn’t contain the channel dimension, so we add it manually, leaving us with a layer of shape [None, sequence_length, embedding_size, 1].  

Because each convolution produces tensors of different shapes we need to iterate through them, create a layer for each of them, and then merge the results into one big feature vector.  

Each filter slides over the whole embedding, but varies in how many words it covers. "VALID" padding means that we slide the filter over our sentence without padding the edges, performing a narrow convolution that gives us an output of shape [batch_size, sequence_length - filter_size + 1, 1, num_filters].   

Performing max-pooling over the output of a specific filter size leaves us with a tensor of shape [batch_size, 1, 1, num_filters]. This is essentially a feature vector, where the last dimension corresponds to our features.  

Once we have all the pooled output tensors from each filter size we combine them into one long feature vector of shape [batch_size, num_filters_total]. Using -1 in tf.reshape tells TensorFlow to flatten the dimension when possible.  

The idea behind dropout is simple. A dropout layer stochastically “disables” a fraction of its neurons. This prevents neurons from co-adapting and forces them to learn individually useful features. The fraction of neurons we keep enabled is defined by the dropout_keep_prob input to our network. We set this to something like 0.5 during training, and to 1 (disable dropout) during evaluation.  

tf.nn.xw_plus_b is a convenience wrapper that computes matmul(x, W) + b, i.e. xW + b.  

![Screen-Shot-2015-12-10-at-10.13.50-AM1-1024x525.png](attachment:Screen-Shot-2015-12-10-at-10.13.50-AM1-1024x525.png)

这里没有加入预训练词向量初始化  
有文章说l2正则化作用不大  
https://arxiv.org/abs/1510.03820  
A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification  
原始论文有两个输入通道，一个是静态词向量（不更新），一个是非静态词向量（随着训练更新）

![Screen-Shot-2015-12-10-at-10.22.29-AM-1024x523.png](attachment:Screen-Shot-2015-12-10-at-10.22.29-AM-1024x523.png)

prediction定义了预测类别输出  

In [None]:
    @property
    def prediction(self):
        """
        prediction
        """
        if self._prediction is None:
            with tf.name_scope("cnn_output"):
                self._prediction = tf.argmax(self.scores, 1, name="predictions")
        return self._prediction

loss定义了网络的损失函数

In [None]:
    @property
    def loss(self):
        """
        loss
        """
        if self._loss is None:
            # calculate loss
            with tf.name_scope("cnn_loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.target)
                self._loss = tf.reduce_mean(losses) + self._l2_reg_lambda * self._l2_reg_lambda
        return self._loss

accuracy定义了精度计算  

In [None]:
    @property
    def accuracy(self):
        """
        accuracy
        """
        if self._accuracy is None:
            # calculate accuracy
            with tf.name_scope("cnn_accuracy"):
                correct_predictions = tf.equal(self.prediction, tf.argmax(self.target, 1))
                self._accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="cnn_accuracy")
        return self._accuracy

error定义了误差  

In [None]:
    @property
    def error(self):
        """
        error
        """
        if self._error is None:
            # calculate error
            with tf.name_scope("cnn_error"):
                correct_predictions = tf.not_equal(self.prediction, tf.argmax(self.target, 1))
                self._error = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="cnn_error")
        return self._error

In [None]:
optimize定义了优化过程  

In [None]:
    @property
    def optimize(self):
        """
        optimize
        """
        if self._optimize is None:
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            """
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(self._loss)
            self._optimize = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            """
            optimizer = tf.train.AdamOptimizer(learning_rate=self._learning_rate)
            #optimizer = tf.train.GradientDescentOptimizer(learning_rate=self._learning_rate)
            self._optimize = optimizer.minimize(self.loss, global_step=self.global_step)
        return self._optimize

![Screen-Shot-2015-12-10-at-10.25.46-AM-1024x558.png](attachment:Screen-Shot-2015-12-10-at-10.25.46-AM-1024x558.png)

In TensorFlow, a Session is the environment you are executing graph operations in, and it contains state about Variables and queues. Each session operates on a single graph. If you don’t explicitly use a session when creating variables and operations you are using the current default session created by TensorFlow. You can change the default session by executing commands within a session.as_default() block (see below).  

A Graph contains operations and tensors. You can use multiple graphs in your program, but most programs only need a single graph. You can use the same graph in multiple sessions, but not multiple graphs in one session. TensorFlow always creates a default graph, but you may also create a graph manually and set it as the new default, like we do below. Explicitly creating sessions and graphs ensures that resources are released properly when you no longer need them.  