In [None]:
import numpy as np
import tensorflow as tf

# Load the raw corpus: the review text and the file holding the
# corresponding sentiment labels, each as a single string.
with open('./data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('./data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

from string import punctuation
# Remove every ASCII punctuation character from the raw text.
strip_punctuation = str.maketrans('', '', punctuation)
all_text = reviews.translate(strip_punctuation)
print(all_text[:1000])

# Split the cleaned text on '\n' into a list of pieces.
reviews = all_text.split('\n')

# Re-join the reviews with spaces and break the corpus into a flat list
# of individual words.
all_text = ' '.join(reviews)
words = all_text.split()

In [None]:
from collections import Counter
# Word frequencies over the whole corpus.
count = Counter(words)

# Vocabulary sorted by descending frequency.
vocab = sorted(count, key=lambda word: count[word], reverse=True)

# Lookup table {word: integer id}. Ids start at 1, so 0 is never assigned
# to a real word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Encode every review as a list of integer ids; the outer list has one
# entry per element of `reviews`.
reviews_ints = [[vocab_to_int[word] for word in each.split()]
                for each in reviews]

In [None]:
# Encode the labels numerically: 'positive' -> 1, anything else -> 0
# (this includes any empty string produced by splitting on '\n').
label_names = labels.split('\n')
labels = np.array([int(name == 'positive') for name in label_names])

In [None]:
from collections import Counter
# Histogram of review lengths: {length in tokens: number of reviews}.
review_lens = Counter(map(len, reviews_ints))

zero_length_msg = "Zero-length reviews: {}".format(review_lens[0])
max_length_msg = "Maximum review length: {}".format(max(review_lens.keys()))
print(zero_length_msg)
print(max_length_msg)

In [None]:
# Drop reviews that ended up with zero tokens, keeping labels aligned with
# the surviving reviews.
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) > 0]
reviews_ints = [reviews_ints[i] for i in non_zero_idx]
# Keep `labels` a NumPy array: the train/test split later indexes it with
# index arrays, which a plain Python list does not support (TypeError).
labels = np.asarray(labels)[non_zero_idx]

# Every review is padded or truncated to exactly this many tokens.
seq_len = 200
from tensorflow.contrib.keras import preprocessing
# pad_sequences returns a (num_reviews, seq_len) integer matrix, so no
# pre-allocated buffer is needed. With its default arguments, short reviews
# are padded with 0 and long ones are truncated.
features = preprocessing.sequence.pad_sequences(reviews_ints, maxlen=seq_len)
features.shape

In [None]:
from sklearn.model_selection import ShuffleSplit
# One shuffled 80/20 train/test split with a fixed seed for reproducibility.
ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# ShuffleSplit only needs the number of samples, so split on the padded
# feature matrix. Building np.array(reviews_ints) from variable-length lists
# creates a ragged object array, which recent NumPy versions reject.
train_index, test_index = next(ss.split(features))

# Index through an ndarray view of the labels so that index arrays work even
# if `labels` is a plain Python list at this point.
label_array = np.asarray(labels)
train_x = features[train_index]
train_y = label_array[train_index]
test_x = features[test_index]
test_y = label_array[test_index]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nTrain_Y set: \t{}".format(train_y.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

In [None]:
# Hyperparameters for the model and the training loop.
# Number of units passed to each BasicLSTMCell.
lstm_size = 256
# Number of cells stacked inside the MultiRNNCell.
lstm_layers = 1
# Batch size used for the RNN zero state and for get_batches().
batch_size = 128
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001

In [None]:
# Number of distinct words in the vocabulary.
# NOTE: word ids run from 1 to n_words and padding contributes id 0, so any
# embedding table indexed by these ids needs n_words + 1 rows.
n_words = len(vocab_to_int)

# Start from an empty default graph so re-running this cell does not
# accumulate duplicate ops.
tf.reset_default_graph()

# Batch of padded reviews. The width follows `seq_len` so the placeholder
# cannot drift out of sync with the padded feature matrix.
X = tf.placeholder(tf.int32, [None, seq_len], name='inputs')
# One 0/1 sentiment label per review, as a column vector.
labels_ = tf.placeholder(tf.int32, [None, 1], name='labels')
# Dropout keep probability, fed at run time.
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [None]:
# Trainable embedding table, looked up with the integer-encoded reviews.
# Word ids start at 1 and padding uses id 0, hence len(vocab_to_int) + 1 rows.
embed_size = 300
embedding = tf.Variable(
    tf.random_uniform((len(vocab_to_int) + 1, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embedding, X)

# Stack `lstm_layers` dropout-wrapped LSTM cells. Each layer gets its own
# cell object: repeating one instance with `[cell] * n` makes all layers
# share the same cell, which fails as soon as lstm_layers > 1.
cell = tf.contrib.rnn.MultiRNNCell([
    tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(lstm_size),
        output_keep_prob=keep_prob)
    for _ in range(lstm_layers)
])

# All-zero initial state for a batch of `batch_size` sequences.
initial_state = cell.zero_state(batch_size, tf.float32)

# Unroll the network over the sequence. `outputs` holds the top layer's
# output at every time step and is consumed by the output layer that follows.
outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                         initial_state=initial_state)

In [None]:
# Take the maximum of the RNN outputs along axis 1 and feed it to a single
# sigmoid unit that produces the sentiment prediction.
# NOTE(review): `outputs` is expected to be the RNN output tensor, presumably
# shaped (batch, time, lstm_size) so that axis 1 is the time axis — confirm
# against the code that creates it.
max_pool = tf.reduce_max(outputs,reduction_indices=[1])
predictions = tf.contrib.layers.fully_connected(max_pool, 1, activation_fn=tf.sigmoid)
# Loss: mean squared error between the labels and the sigmoid outputs.
with tf.name_scope('cost'):
    cost = tf.losses.mean_squared_error(labels_, predictions)
# Record the loss as a scalar summary.
tf.summary.scalar('cost',cost)
# Training op: one Adam step that minimizes the cost.
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [None]:
# Accuracy: round the sigmoid outputs to 0/1, compare them element-wise with
# the integer labels, and average the matches over the batch.
with tf.name_scope('accuracy'):
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_), tf.float32))
# Record the accuracy as a scalar summary.
tf.summary.scalar('accuracy',accuracy)

In [None]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive (x_batch, y_batch) slices of exactly `batch_size` items.

    Only full batches are produced: trailing items that do not fill a
    complete batch are dropped. `x` and `y` are sliced with the same bounds,
    so they stay aligned.
    """
    full_batches = len(x) // batch_size
    usable = full_batches * batch_size
    x, y = x[:usable], y[:usable]
    for batch_no in range(full_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

# Merge every summary registered so far into a single op.
merged = tf.summary.merge_all()

# Directory that receives the TensorBoard event files.
# NOTE(review): this is a hard-coded absolute Windows path; consider making
# it configurable or relative to the notebook.
direc = 'C:\\Users\\1\\Desktop\\summary'

# The model is built on the default graph, so pass that graph explicitly;
# no variable named `graph` is defined earlier in the notebook.
graph = tf.get_default_graph()
train_writer = tf.summary.FileWriter(direc+'\\train', graph)
test_writer = tf.summary.FileWriter(direc+'\\test', graph)

In [None]:
import os

# Number of passes over the training set.
epochs = 6
checkpoint_path = "checkpoints/sentiment.ckpt"
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Global step counter across epochs; used as the summary step and to
    # decide when to log and validate.
    iteration = 1
    for e in range(epochs):

        for x, y in get_batches(train_x, train_y, batch_size):
            # Labels are fed as a column vector to match the placeholder.
            feed = {X: x,
                    labels_: y[:, None],
                    keep_prob: 0.6}
            loss, _, summary1 = sess.run([cost, optimizer, merged], feed_dict=feed)

            # Log the training loss every 5 iterations.
            if iteration % 5 == 0:
                train_writer.add_summary(summary1, iteration)
                print("Epoch: {}/{}".format(e+1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Evaluate on the held-out set every 25 iterations, with dropout
            # disabled (keep_prob = 1.0).
            if iteration % 25 == 0:
                val_acc = []
                summary2 = None
                # Distinct names so the training batch is not shadowed.
                for val_x, val_y in get_batches(test_x, test_y, batch_size):
                    feed = {X: val_x,
                            labels_: val_y[:, None],
                            keep_prob: 1.0}
                    batch_acc, summary2 = sess.run([accuracy, merged], feed_dict=feed)
                    val_acc.append(batch_acc)
                # Nothing to report if the test set is smaller than one batch.
                if summary2 is not None:
                    # The written summary is the one from the last test batch.
                    test_writer.add_summary(summary2, iteration)
                    print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1

    # Saver.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    saver.save(sess, checkpoint_path)

# Make sure buffered summaries reach disk.
train_writer.flush()
test_writer.flush()