In [1]:
import pandas as pd

data_path = './Tomatoes/'

data = pd.read_csv("%s%s" % (data_path, 'train.tsv'), sep = '\t')
test_data = pd.read_csv("%s%s" % (data_path, 'test.tsv'), sep = '\t')

import re
def clean(_str):
    return " ".join(re.findall("[0-9a-zA-Z]*", _str)).strip()
def split(_str):
    return _str.split()

data['Phrase'] = data['Phrase'].apply(clean)
test_data['Phrase'] = data['Phrase'].apply(clean)

def _len(_str):
    return len(_str.split())
data['phracelen'] = data['Phrase'].apply(_len)
data['phracelen'].describe()

count    156060.00000
mean          6.89463
std           6.57485
min           0.00000
25%           2.00000
50%           4.00000
75%           9.00000
max          48.00000
Name: phracelen, dtype: float64

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, train_size = 0.8, random_state = 22)

for train_index, dev_index in split.split(data, data[['Sentiment']]):
    dev_data = data.loc[dev_index]
    train_data = data.loc[train_index]
train_data.shape, dev_data.shape

((124848, 5), (31212, 5))

In [3]:
import gensim
import numpy as np



def get_w2v(splited_corpus, w2v_size, min_count):
    '''
    func: 获取word2vec模型
    param: splited_corpus
        type: pd.Series
        detail: 应当为训练集中所有语料
    param: w2v_size
        type: int
        detail: w2v向量维度
    return: w2v_model
        type: gensim.models.Word2Vec
        detail: 训练的模型只可以使用其transform接口
    '''
    sentences = [x.split() for x in splited_corpus]
    model = gensim.models.Word2Vec(sentences, min_count=min_count, size=w2v_size)
    return model

def get_w2v_key_vev(w2v_model):
    vecs = []
    words = []
    for word in w2v_model.wv.vocab:
        vecs.append(w2v_model[word])
        words.append(word)
    return words, vecs

def get_x_index(x, words):
    res = []
    for inst in x:
        res.append(np.array([words.index(word) for word in inst.split() if word in words]))
    return res

def max_len(list_2d):
    maxlen = 0
    for arr in list_2d:
        if(len(arr) > maxlen):
            maxlen = len(arr)
    return maxlen

def mean_len(list_2d):
    mean_len = 0
    for arr in list_2d:
        mean_len += len(arr)
    return int(mean_len / len(list_2d))

def ceil2(num):
    res = 2
    while res < num:
        res *= 2
    return res

def padding(data2d, max_len, pad_val):
    res = []
    for index, seq in enumerate(data2d):
        if(len(seq) < max_len):
            res.append(np.concatenate([seq, np.full([max_len - len(seq)], pad_val)]))
        else:
            res.append(seq[:max_len])
    return res

def concat_list_h(list1, list2):
    res = []
    for i, ele in enumerate(list1):
        res.append(np.concatenate([ele, list2[i]]))
    return res

from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
train_y = np.array(list(train_data['Sentiment'])).reshape(-1, 1)
train_y = oh_enc.fit_transform(train_y).toarray()
dev_y = np.array(list(dev_data['Sentiment'])).reshape(-1, 1)
dev_y = oh_enc.fit_transform(dev_y).toarray()
corpus = list(train_data['Phrase'])
max_seq_len = ceil2(max_len(corpus))
mean_seq_len = mean_len(corpus)
print(max_seq_len, mean_seq_len)
max_seq_len = 16
w2v_model = get_w2v(corpus, 300, min_count = 1)
words, embedding_matrix = get_w2v_key_vev(w2v_model)
embedding_matrix.append([0 for i in range(len(embedding_matrix[0]))])

train_x = get_x_index(list(train_data['Phrase']), words)
train_x = padding(train_x, max_seq_len, len(embedding_matrix) - 1)

dev_x = get_x_index(list(dev_data['Phrase']), words)
dev_x = padding(dev_x, max_seq_len, len(embedding_matrix) - 1)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


512 45




In [47]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"

import tensorflow as tf
import numpy as np
from datetime import datetime

class DNNClassifier:
    def __init__(self, neurons, input_dim, n_class, embedding_matrix, seq_len, learning_rate = 0.1, 
                 batch_size = 128, n_epochs = 500, hiden_act_fn = tf.sigmoid, output_act_fn = None, 
                 sum_root_dir = "tf_logs", embedding_drop_out_prob = 0.5, use_BN = False, rate_decay_steps = 100000,
                 rate_decay_rate = 0.99, use_L2 = False, regulation_rate = 0.0001):
        self.input_dim = input_dim
        self.n_class = n_class
        self.neurons = neurons
        self.learning_rate = learning_rate
        self.seq_len = seq_len
        self.log_dir = self.log_dir(sum_root_dir)
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.hiden_act_fn = hiden_act_fn
        self.output_act_fn = output_act_fn
        self.hiden_layer_num = len(neurons)
        self.embedding_drop_out_prob = embedding_drop_out_prob
        self.embedding_matrix = embedding_matrix
        self.use_BN = use_BN
        self.rate_decay_steps = rate_decay_steps
        self.rate_decay_rate = rate_decay_rate
        self.use_L2 = use_L2
        self.regulation_rate = regulation_rate
        self.graph = tf.Graph()
        
    def log_dir(self, root_logdir):
        now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        log_dir = "{}/run-{}/".format(root_logdir, now)
        return log_dir
    
    def neuron_layer(self, x, neuron_num, activate_fn, scope, is_trainning, is_use_BN, regularizer = None):
        with tf.name_scope(scope):
            n_input = int(x.shape[1])
            weight = tf.Variable(tf.random_uniform([n_input, neuron_num], -1, 1), name = "weight")
            bias = tf.Variable(tf.random_uniform([neuron_num], -1, 1), name = "bias")
            output = tf.matmul(x, weight) + bias
            if(is_use_BN):
                scale = tf.Variable(tf.ones([neuron_num]), 
                                       name = "scale")
                offset = tf.Variable(tf.zeros([neuron_num]),
                                    name = "offset")
                untrain_mean = tf.Variable(tf.zeros([neuron_num]), trainable = False,
                                  name = "untrainable_mean")
                untrain_variance = tf.Variable(tf.ones([neuron_num]), trainable = False,
                                      name = "untrainable_var")
                def train_bn():
                    mean, variance = tf.nn.moments(output, [0])
                    return tf.nn.batch_normalization(output, mean, variance,
                                                    offset, scale, 
                                                     variance_epsilon = 0.01,
                                                     name = "bn_output_train")
                def inference_bn():
                    return tf.nn.batch_normalization(output, untrain_mean, 
                                                     untrain_variance,
                                                    offset, scale,
                                                     variance_epsilon = 0.01,
                                                     name = "bn_output_train")
                output = tf.cond(is_trainning, train_bn, inference_bn)
            if(activate_fn):
                output = activate_fn(output)
            else:
                output = output
            if(regularizer):
                return weight, bias, output, regularizer(weight) + regularizer(bias)
            else:
                return weight, bias, output, None
    
    def build(self):
        with self.graph.as_default(), tf.name_scope("DNN"):
            self.input_x = tf.placeholder(dtype = tf.int32, shape = [None, self.seq_len], name = "input_x")
            self.input_y = tf.placeholder(dtype = tf.int32, shape = [None, self.n_class], name = "input_y")
            self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name="emb_dropout_keep_prob")
            self.global_step = tf.Variable(0, trainable = False)
            self.learning_rate_ = tf.train.exponential_decay(
                self.learning_rate,
                self.global_step,
                self.rate_decay_steps,
                self.rate_decay_rate
            )
            if(self.use_L2):
                self.regularizer = tf.contrib.layers.l2_regularizer(self.regulation_rate)
            else:
                self.regularizer = None
            with tf.device('/cpu:0'), tf.name_scope("embedding"):
                self.embedding_matrix_ = tf.constant(self.embedding_matrix, name = "embedding_matrix", dtype = tf.float32)
                self.embedded = tf.nn.embedding_lookup(self.embedding_matrix_, self.input_x)
                self.embedded = tf.reduce_mean(self.embedded, axis = 1)
            with tf.name_scope("emb_bn"):
                emb_scale = tf.Variable(tf.ones([self.embedded.shape[1]]), 
                                       name = "scale")
                emb_offset = tf.Variable(tf.zeros([self.embedded.shape[1]]),
                                    name = "offset")
                emb_mean, emb_variance = tf.nn.moments(self.embedded, [0])
                self.bn_embedded = tf.nn.batch_normalization(self.embedded, emb_mean, emb_variance,
                                                emb_offset, emb_scale, 
                                                 variance_epsilon = 0.01,
                                                 name = "bn_embedded")
            with tf.name_scope("emb_dropout"):
                #使用初始BN后不使用dropout
                #self.droped_embedded = tf.nn.dropout(self.embedded, self.emb_dropout_keep_prob)
                #self.droped_embedded = tf.nn.dropout(self.bn_embedded, self.emb_dropout_keep_prob)
                self.droped_embedded = self.bn_embedded
            with tf.name_scope("global"):
                self.is_trainning = tf.placeholder(tf.bool)
            '''
            隐藏层与输出层的建立与连接
            '''
            self.neurons.append(self.n_class)
            self.layer_weights = []
            self.layer_biases = []
            self.layer_outputs = []
            for layer_index in range(len(self.neurons)):
                if(0 == layer_index):
                    layer_input = self.droped_embedded
                else:
                    layer_input = self.layer_outputs[-1]
                if(layer_index == len(self.neurons) - 1):
                    act_fn = self.output_act_fn
                else:
                    act_fn = self.hiden_act_fn
                weight, bias, output, regulation_ = self.neuron_layer(layer_input, self.neurons[layer_index],
                                                        act_fn, scope = "hiden_layer_" + str(layer_index),
                                                        is_trainning = self.is_trainning, is_use_BN = self.use_BN,
                                                        regularizer = self.regularizer)
                self.layer_weights.append(weight)
                self.layer_biases.append(bias)
                self.layer_outputs.append(output)
                if(self.use_L2):
                    if(layer_index == 0):
                        self.regulation = regulation_
                    else:
                        self.regulation += regulation_
            
            self.prediction = self.layer_outputs[-1]
            with tf.name_scope("loss"):
                self.cross_e = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                    logits = self.prediction, labels = self.input_y))
                if(self.use_L2):
                     self.cross_e += self.regulation
            with tf.name_scope("target"):
                self.correct = tf.equal(tf.argmax(self.prediction, 1), tf.arg_max(
                    self.input_y, 1), name = 'correct')
                self.acc = tf.reduce_mean(tf.cast(self.correct, tf.float32), name = 'acc')
            #with tf.name_scope("saver"):
                #self.saver = tf.train.Saver()
            with tf.name_scope("summary"):
                self.crosse_summary = tf.summary.scalar("cross_e", self.cross_e)
                self.acc_summary = tf.summary.scalar("acc", self.acc)
                self.weights_mean = []
                self.weights_mean_summary = []
                for layer_index in range(len(self.neurons)):
                    self.weights_mean.append(
                        tf.reduce_mean(tf.sqrt(tf.square(self.layer_weights[layer_index]))))
                    self.weights_mean_summary.append(
                        tf.summary.scalar("weight_magnitude_" + str(layer_index), 
                                          self.weights_mean[-1]))
                self.filewriter = tf.summary.FileWriter(self.log_dir, tf.get_default_graph())
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate = 
                                                         self.learning_rate_)
            self.train_step = self.optimizer.minimize(self.cross_e, global_step = self.global_step)
                
    def fit(self, x, y, dev_x = None, dev_y = None, test_x = None):
        self.x = x
        self.y = y
        self.dev_x = dev_x
        self.dev_y = dev_y
        self.test_x = test_x
            
    def predict(self, x):
        return self.sess.run(self.prediction, feed_dict = {
                        self.input_x : self.x_dev,
                        self.is_trainning : False
                    })
    
    def fetch_batch(self, X, Y, batch_index, batch_size):
        return X[batch_index * batch_size : batch_index * batch_size + batch_size], Y[batch_index * batch_size : batch_index * batch_size + batch_size]
    
    def train(self):
        print('tf log dir : ', self.log_dir)
        n_batches = int(np.ceil(len(self.x) / self.batch_size))
        batch_size = self.batch_size
        dev_feed_dict = {
            self.input_x : self.dev_x,
            self.input_y : self.dev_y,
            self.emb_dropout_keep_prob : 1.0,
            self.is_trainning : False
        }
        train_feed_dict = {
            self.input_x : self.x[:10000],
            self.input_y : self.y[:10000],
            self.emb_dropout_keep_prob : 1.0,
            self.is_trainning : False
        }
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(graph = self.graph, config = config) as self.sess:
            tf.global_variables_initializer().run()
            for epoch in range(self.n_epochs):
                for batch_index in range(n_batches):
                    batch_x = self.x[batch_size * batch_index : (batch_index + 1) * batch_size]
                    batch_y = self.y[batch_size * batch_index : (batch_index + 1) * batch_size]
                    feed_dict = {
                        self.input_x : batch_x,
                        self.input_y : batch_y,
                        self.emb_dropout_keep_prob : self.embedding_drop_out_prob,
                        self.is_trainning : True
                    }
                    self.sess.run(self.train_step, feed_dict = feed_dict)
                    step = epoch * n_batches + batch_index
                if(epoch % 5 == 0):
                    for layer_index in range(len(self.neurons)):
                        print('weight_' + str(layer_index),
                            self.weights_mean[layer_index].eval(feed_dict = train_feed_dict))
                    print('train_cross_e', self.cross_e.eval(feed_dict = train_feed_dict))
                    print('dev acc', self.acc.eval(feed_dict = dev_feed_dict))
                    #self.saver.save(sess, "DNN.ckpt")
                    crosse_summary_str = self.crosse_summary.eval(feed_dict = train_feed_dict)
                    acc_summary_str = self.acc_summary.eval(feed_dict = dev_feed_dict)
                    self.filewriter.add_summary(crosse_summary_str, step)
                    self.filewriter.add_summary(acc_summary_str, step)
                    for layer_index in range(len(self.neurons)):
                        weight_mean_str = self.weights_mean_summary[layer_index].eval(feed_dict = train_feed_dict)
                        self.filewriter.add_summary(weight_mean_str, step)
            self.filewriter.close()           
               
m = DNNClassifier([250, 200, 100, 80, 20], len(embedding_matrix[0]), 5, np.array(embedding_matrix), max_seq_len,
                  learning_rate = 0.8, n_epochs= 200, hiden_act_fn = tf.sigmoid,
                 output_act_fn = None, rate_decay_steps = 1000000, rate_decay_rate = 0.99, use_L2 = True)
m.build()

m.fit(train_x, train_y, dev_x, dev_y)
m.train()
                
        


tf log dir :  tf_logs/run-20191117031328/
weight_0 0.46370396
weight_1 0.4607172
weight_2 0.4630477
weight_3 0.46335676
weight_4 0.45703107
weight_5 0.43493444
train_cross_e 3.3992834
dev acc 0.53123796
weight_0 0.3144973
weight_1 0.31349128
weight_2 0.31724837
weight_3 0.31998953
weight_4 0.3286518
weight_5 0.40843377
train_cross_e 2.1654773
dev acc 0.54575163
weight_0 0.21536334
weight_1 0.21674477
weight_2 0.22193377
weight_3 0.22577414
weight_4 0.24666291
weight_5 0.3930574
train_cross_e 1.6125463
dev acc 0.5512303
weight_0 0.15119484
weight_1 0.1536245
weight_2 0.15876354
weight_3 0.16343723
weight_4 0.19857872
weight_5 0.40946534
train_cross_e 1.365466
dev acc 0.55145454
weight_0 0.111814216
weight_1 0.11285579
weight_2 0.116992846
weight_3 0.12372551
weight_4 0.17320837
weight_5 0.4227554
train_cross_e 1.2604624
dev acc 0.5512623
weight_0 0.09025104
weight_1 0.08761307
weight_2 0.09001388
weight_3 0.09953391
weight_4 0.16110593
weight_5 0.4289851
train_cross_e 1.2059938
dev acc 