In [1]:
import pandas as pd

data_path = './Tomatoes/'

data = pd.read_csv("%s%s" % (data_path, 'train.tsv'), sep = '\t')
test_data = pd.read_csv("%s%s" % (data_path, 'test.tsv'), sep = '\t')

import re
def clean(_str):
    return " ".join(re.findall("[0-9a-zA-Z]*", _str)).strip()
def split(_str):
    return _str.split()

data['Phrase'] = data['Phrase'].apply(clean)
test_data['Phrase'] = data['Phrase'].apply(clean)

def _len(_str):
    return len(_str.split())
data['phracelen'] = data['Phrase'].apply(_len)
data['phracelen'].describe()

count    156060.00000
mean          6.89463
std           6.57485
min           0.00000
25%           2.00000
50%           4.00000
75%           9.00000
max          48.00000
Name: phracelen, dtype: float64

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, train_size = 0.8, random_state = 22)

for train_index, dev_index in split.split(data, data[['Sentiment']]):
    dev_data = data.loc[dev_index]
    train_data = data.loc[train_index]
train_data.shape, dev_data.shape

((124848, 5), (31212, 5))

In [3]:
import gensim
import numpy as np



def get_w2v(splited_corpus, w2v_size, min_count):
    '''
    func: 获取word2vec模型
    param: splited_corpus
        type: pd.Series
        detail: 应当为训练集中所有语料
    param: w2v_size
        type: int
        detail: w2v向量维度
    return: w2v_model
        type: gensim.models.Word2Vec
        detail: 训练的模型只可以使用其transform接口
    '''
    sentences = [x.split() for x in splited_corpus]
    model = gensim.models.Word2Vec(sentences, min_count=min_count, size=w2v_size)
    return model

def get_w2v_key_vev(w2v_model):
    vecs = []
    words = []
    for word in w2v_model.wv.vocab:
        vecs.append(w2v_model[word])
        words.append(word)
    return words, vecs

def get_x_index(x, words):
    res = []
    for inst in x:
        res.append(np.array([words.index(word) for word in inst.split() if word in words]))
    return res

def max_len(list_2d):
    maxlen = 0
    for arr in list_2d:
        if(len(arr) > maxlen):
            maxlen = len(arr)
    return maxlen

def mean_len(list_2d):
    mean_len = 0
    for arr in list_2d:
        mean_len += len(arr)
    return int(mean_len / len(list_2d))

def ceil2(num):
    res = 2
    while res < num:
        res *= 2
    return res

def padding(data2d, max_len, pad_val):
    res = []
    for index, seq in enumerate(data2d):
        if(len(seq) < max_len):
            res.append(np.concatenate([seq, np.full([max_len - len(seq)], pad_val)]))
        else:
            res.append(seq[:max_len])
    return res

def concat_list_h(list1, list2):
    res = []
    for i, ele in enumerate(list1):
        res.append(np.concatenate([ele, list2[i]]))
    return res

from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
train_y = np.array(list(train_data['Sentiment'])).reshape(-1, 1)
train_y = oh_enc.fit_transform(train_y).toarray()
dev_y = np.array(list(dev_data['Sentiment'])).reshape(-1, 1)
dev_y = oh_enc.fit_transform(dev_y).toarray()
corpus = list(train_data['Phrase'])
max_seq_len = ceil2(max_len(corpus))
mean_seq_len = mean_len(corpus)
print(max_seq_len, mean_seq_len)
max_seq_len = 16
w2v_model = get_w2v(corpus, 300, min_count = 1)
words, embedding_matrix = get_w2v_key_vev(w2v_model)
embedding_matrix.append([0 for i in range(len(embedding_matrix[0]))])

train_x = get_x_index(list(train_data['Phrase']), words)
train_x = padding(train_x, max_seq_len, len(embedding_matrix) - 1)

dev_x = get_x_index(list(dev_data['Phrase']), words)
dev_x = padding(dev_x, max_seq_len, len(embedding_matrix) - 1)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


512 45




In [6]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2"

import tensorflow as tf
from keras.preprocessing import sequence
from keras.layers import Embedding
from datetime import datetime

class CNNLSTM:
    def __init__(self, seq_len, num_classes, batch_seqs_num, embedding_matrix, embedding_size, filter_sizes,
                num_filters, conv_activate_fn = tf.nn.relu, fcl_activate_fn = tf.sigmoid, learning_rate = 0.01,
                n_epochs = 100, filtered_dims = 128, pooled_dims = 100, num_lstm_cells = 1, lstm_hiden_size = 32,
                cnn_drop_out_prob = 0.5, lstm_drop_out_prob = 0.5, sum_root_dir = "tf_logs"):
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.batch_seqs_num = batch_seqs_num
        self.embedding_matrix = embedding_matrix
        self.embedding_size = embedding_size
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.conv_activate_fn = conv_activate_fn
        self.fcl_activate_fn = fcl_activate_fn
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.filtered_dims = filtered_dims
        self.pooled_dims = pooled_dims
        if(filtered_dims > embedding_size):
            print('filtered_dims should be less than embedding_size')
        self.num_lstm_cells = num_lstm_cells
        self.lstm_hiden_size = lstm_hiden_size
        self.cnn_drop_out_prob = cnn_drop_out_prob
        self.lstm_drop_out_prob = lstm_drop_out_prob
        self.log_dir = self.log_dir(sum_root_dir)
        self.graph = tf.Graph()
    
    def log_dir(self, root_logdir):
        now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        log_dir = "{}/run-{}/".format(root_logdir, now)
        return log_dir
    
    def build(self):
        with tf.name_scope("cnn_lstm"), self.graph.as_default():
            self.input_x = tf.placeholder(dtype = tf.int32, shape = [None, self.seq_len], name = "input_x")
            self.input_y = tf.placeholder(dtype = tf.int32, shape = [None, self.num_classes], name = "input_y")
            self.cnn_dropout_keep_prob = tf.placeholder(tf.float32, name="cnn_dropout_keep_prob")
            self.lstm_dropout_keep_prob = tf.placeholder(tf.float32, name="lstm_dropout_keep_prob")
            self.global_step = tf.Variable(0, trainable = False)

            with tf.device('/cpu:0'), tf.name_scope("embedding"):
                self.embedding_matrix = tf.constant(self.embedding_matrix, name = "embedding_matrix", dtype = tf.float32)
                self.embedded = tf.expand_dims(tf.nn.embedding_lookup(self.embedding_matrix, self.input_x), -1,
                                               name = "embedded_output")
            self.pooled_outputs = []
            filter_size = self.filter_sizes[0]
            with tf.name_scope("conv_maxpool"):
                filter_shape = [filter_size, self.embedding_size - self.filtered_dims + 1, 1, self.num_filters]
                filter_weight = tf.Variable(tf.truncated_normal(filter_shape, -1, 1), name = "filter_weight")
                filter_bias = tf.Variable(tf.truncated_normal([self.num_filters], -1, 1))
                conv_output = self.conv_activate_fn(tf.nn.bias_add(tf.nn.conv2d(
                    self.embedded,
                    filter_weight,
                    strides = [1, 1, 1, 1],
                    padding = 'VALID',
                ), filter_bias, name = "conv_output"), name = "act_conv_output")

                pooled_output = tf.nn.max_pool(conv_output,
                                              ksize = [1, 1, self.filtered_dims - self.pooled_dims + 1, 1],
                                              strides = [1, 1, 1, 1],
                                              padding = "VALID",
                                              name = "pooled_output")
                self.reduced_pooled_output = tf.reshape(pooled_output, [-1, pooled_output.shape[1],
                                                                            (self.pooled_dims) * self.num_filters],
                                                       name = 'reduced_pooled_output')
            with tf.name_scope("cnn_dropout"):
                self.droped_pooled_output = tf.nn.dropout(self.reduced_pooled_output, self.cnn_dropout_keep_prob)
            with tf.name_scope("lstm"):
                self.lstm_cells = [tf.nn.rnn_cell.BasicLSTMCell(self.lstm_hiden_size, name = "%s%d" % ('lstmcell_', i)) 
                                  for i in range(self.num_lstm_cells)]
                self.cells = tf.nn.rnn_cell.MultiRNNCell(self.lstm_cells)
                self.initial_state = self.cells.zero_state(self.batch_seqs_num, tf.float32)
                self.lstm_output, self.lstm_state = tf.nn.dynamic_rnn(self.cells, self.droped_pooled_output, 
                                                                     dtype = tf.float32)
                self.lstm_last_output = self.lstm_state[-1].h
            with tf.name_scope("lstm_dropout"):
                self.droped_lstm_output = tf.nn.dropout(self.lstm_last_output, self.lstm_dropout_keep_prob)
            with tf.name_scope("full_connect"):
                self.fcl_output = tf.contrib.layers.fully_connected(self.droped_lstm_output, 
                                                              self.num_classes,
                                                              self.fcl_activate_fn)
            self.prediction = self.fcl_output
            with tf.name_scope("loss"):
                self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                    logits = self.prediction, labels = self.input_y,  
                ))
            with tf.name_scope("target"):
                self.train_correct = tf.equal(tf.arg_max(self.prediction, 1), tf.arg_max(self.input_y, 1), name = "correct")
                self.train_acc = tf.reduce_mean(tf.cast(self.train_correct, tf.float32), name = "acc")
            with tf.name_scope("summary"):
                self.loss_sum = tf.summary.scalar("loss", self.loss)
                self.acc_sum = tf.summary.scalar("acc", self.train_acc)
                self.sum = tf.summary.merge_all()
                self.filewriter = tf.summary.FileWriter(self.log_dir, tf.get_default_graph())
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_step = self.optimizer.minimize(self.loss, global_step = self.global_step)
            
    
    def fit(self, x, y, dev_x = None, dev_y = None, test_x = None):
        self.x = x
        self.y = y
        self.dev_x = dev_x
        self.dev_y = dev_y
        self.test_x = test_x
    
    def train(self):
        print('tf log dir : ', self.log_dir)
        n_batches = int(np.ceil(len(self.x) / self.batch_seqs_num))
        batch_size = self.batch_seqs_num
        dev_feed_dict = {
            self.input_x : self.dev_x,
            self.input_y : self.dev_y,
            self.cnn_dropout_keep_prob : 1.0,
            self.lstm_dropout_keep_prob : 1.0
        }
        train_feed_dict = {
            self.input_x : self.x[:10000],
            self.input_y : self.y[:10000],
            self.cnn_dropout_keep_prob : 1.0,
            self.lstm_dropout_keep_prob : 1.0
        }
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(config = config, graph = self.graph) as self.sess:
            tf.global_variables_initializer().run()
            for epoch in range(self.n_epochs):
                for batch_index in range(n_batches):
                    batch_x = self.x[batch_size * batch_index : (batch_index + 1) * batch_size]
                    batch_y = self.y[batch_size * batch_index : (batch_index + 1) * batch_size]
                    feed_dict = {
                        self.input_x : batch_x,
                        self.input_y : batch_y,
                        self.cnn_dropout_keep_prob : self.cnn_drop_out_prob,
                        self.lstm_dropout_keep_prob : self.lstm_drop_out_prob
                    }
                    self.sess.run(self.train_step, feed_dict = feed_dict)
                    step = epoch * n_batches + batch_index
                print('train epoch %d / %d Done' % (epoch, self.n_epochs))
                if(epoch % 5 == 0):
                    print('dev acc', self.train_acc.eval(feed_dict = dev_feed_dict))
                    print('train loss', self.loss.eval(feed_dict = train_feed_dict))
                    dev_acc_str = self.acc_sum.eval(feed_dict = dev_feed_dict)
                    train_loss_str = self.loss_sum.eval(feed_dict = train_feed_dict)
                    self.filewriter.add_summary(dev_acc_str, step)
                    self.filewriter.add_summary(train_loss_str, step)

m = CNNLSTM(max_seq_len, 5, 32, np.array(embedding_matrix), len(embedding_matrix[0]), [3], 8, learning_rate = 0.0001,
            n_epochs = 1000, num_lstm_cells = 2)
m.build()
m.fit(train_x, train_y, dev_x, dev_y)
m.train()

tf log dir :  tf_logs/run-20191117000924/
train epoch 0 / 1000 Done
dev acc 0.5099321
train loss 1.3566505
train epoch 1 / 1000 Done
train epoch 2 / 1000 Done
train epoch 3 / 1000 Done
train epoch 4 / 1000 Done
train epoch 5 / 1000 Done
dev acc 0.5099962
train loss 1.3502307
train epoch 6 / 1000 Done
train epoch 7 / 1000 Done
train epoch 8 / 1000 Done
train epoch 9 / 1000 Done
train epoch 10 / 1000 Done
dev acc 0.51287967
train loss 1.3463413
train epoch 11 / 1000 Done
train epoch 12 / 1000 Done
train epoch 13 / 1000 Done
train epoch 14 / 1000 Done
train epoch 15 / 1000 Done
dev acc 0.5144816
train loss 1.3405045
train epoch 16 / 1000 Done
train epoch 17 / 1000 Done
train epoch 18 / 1000 Done
train epoch 19 / 1000 Done
train epoch 20 / 1000 Done
dev acc 0.5184865
train loss 1.335283
train epoch 21 / 1000 Done
train epoch 22 / 1000 Done
train epoch 23 / 1000 Done
train epoch 24 / 1000 Done
train epoch 25 / 1000 Done
dev acc 0.5270409
train loss 1.3273197
train epoch 26 / 1000 Done
train

dev acc 0.55805457
train loss 1.282232
train epoch 231 / 1000 Done
train epoch 232 / 1000 Done
train epoch 233 / 1000 Done
train epoch 234 / 1000 Done
train epoch 235 / 1000 Done
dev acc 0.5624439
train loss 1.2809722
train epoch 236 / 1000 Done
train epoch 237 / 1000 Done
train epoch 238 / 1000 Done
train epoch 239 / 1000 Done
train epoch 240 / 1000 Done
dev acc 0.559208
train loss 1.2775995
train epoch 241 / 1000 Done
train epoch 242 / 1000 Done
train epoch 243 / 1000 Done
train epoch 244 / 1000 Done
train epoch 245 / 1000 Done
dev acc 0.56330895
train loss 1.2788327
train epoch 246 / 1000 Done
train epoch 247 / 1000 Done
train epoch 248 / 1000 Done
train epoch 249 / 1000 Done
train epoch 250 / 1000 Done
dev acc 0.563341
train loss 1.2770001
train epoch 251 / 1000 Done
train epoch 252 / 1000 Done
train epoch 253 / 1000 Done
train epoch 254 / 1000 Done
train epoch 255 / 1000 Done
dev acc 0.56353325
train loss 1.2774004
train epoch 256 / 1000 Done
train epoch 257 / 1000 Done
train epoc

train epoch 460 / 1000 Done
dev acc 0.56715363
train loss 1.259878
train epoch 461 / 1000 Done
train epoch 462 / 1000 Done
train epoch 463 / 1000 Done
train epoch 464 / 1000 Done
train epoch 465 / 1000 Done
dev acc 0.5695566
train loss 1.2596384
train epoch 466 / 1000 Done
train epoch 467 / 1000 Done
train epoch 468 / 1000 Done
train epoch 469 / 1000 Done
train epoch 470 / 1000 Done
dev acc 0.56846726
train loss 1.2591655
train epoch 471 / 1000 Done
train epoch 472 / 1000 Done
train epoch 473 / 1000 Done
train epoch 474 / 1000 Done
train epoch 475 / 1000 Done
dev acc 0.56683326
train loss 1.2604071
train epoch 476 / 1000 Done
train epoch 477 / 1000 Done
train epoch 478 / 1000 Done
train epoch 479 / 1000 Done
train epoch 480 / 1000 Done
dev acc 0.5684032
train loss 1.2597601
train epoch 481 / 1000 Done
train epoch 482 / 1000 Done
train epoch 483 / 1000 Done
train epoch 484 / 1000 Done
train epoch 485 / 1000 Done
dev acc 0.56635267
train loss 1.2605032
train epoch 486 / 1000 Done
train e

train epoch 689 / 1000 Done
train epoch 690 / 1000 Done
dev acc 0.57067794
train loss 1.2511585
train epoch 691 / 1000 Done
train epoch 692 / 1000 Done
train epoch 693 / 1000 Done
train epoch 694 / 1000 Done
train epoch 695 / 1000 Done
dev acc 0.5721197
train loss 1.2502112
train epoch 696 / 1000 Done
train epoch 697 / 1000 Done
train epoch 698 / 1000 Done
train epoch 699 / 1000 Done
train epoch 700 / 1000 Done
dev acc 0.57071
train loss 1.2494948
train epoch 701 / 1000 Done
train epoch 702 / 1000 Done
train epoch 703 / 1000 Done
train epoch 704 / 1000 Done
train epoch 705 / 1000 Done
dev acc 0.571511
train loss 1.2482859
train epoch 706 / 1000 Done
train epoch 707 / 1000 Done
train epoch 708 / 1000 Done
train epoch 709 / 1000 Done
train epoch 710 / 1000 Done
dev acc 0.5711585
train loss 1.249132
train epoch 711 / 1000 Done
train epoch 712 / 1000 Done
train epoch 713 / 1000 Done
train epoch 714 / 1000 Done
train epoch 715 / 1000 Done
dev acc 0.5747789
train loss 1.249066
train epoch 71

train epoch 918 / 1000 Done
train epoch 919 / 1000 Done
train epoch 920 / 1000 Done
dev acc 0.57487506
train loss 1.2457948
train epoch 921 / 1000 Done
train epoch 922 / 1000 Done
train epoch 923 / 1000 Done
train epoch 924 / 1000 Done
train epoch 925 / 1000 Done
dev acc 0.5735935
train loss 1.2446873
train epoch 926 / 1000 Done
train epoch 927 / 1000 Done
train epoch 928 / 1000 Done
train epoch 929 / 1000 Done
train epoch 930 / 1000 Done
dev acc 0.5741702
train loss 1.2441809
train epoch 931 / 1000 Done
train epoch 932 / 1000 Done
train epoch 933 / 1000 Done


KeyboardInterrupt: 