/
components.py
122 lines (106 loc) · 5.9 KB
/
components.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os, re
import tensorflow as tf
import numpy as np
class BucketedDataIterator():
    """Serve mini-batches drawn from length-sorted buckets of a DataFrame.

    Based on R2RT's bucketed iterator
    (https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html).
    The frame is sorted by its 'length' column and cut into ``num_buckets``
    consecutive slices of equal size, so rows that end up in one batch have
    similar lengths. Rows left over after the even split are never served.
    """

    def __init__(self, df, num_buckets = 3):
        """Split ``df`` into buckets and shuffle each of them.

        Args:
            df: DataFrame with 'length', 'review' and 'label' columns.
            num_buckets: how many equally sized buckets to build.
        """
        ordered = df.sort_values('length').reset_index(drop=True)
        self.size = int(len(ordered) / num_buckets)
        self.dfs = [
            ordered.iloc[k * self.size: (k + 1) * self.size]
            for k in range(num_buckets)
        ]
        self.num_buckets = num_buckets
        # cursor[k] is the read position inside bucket k
        self.cursor = np.array([0] * num_buckets)
        self.shuffle()
        self.epochs = 0

    def shuffle(self):
        """Randomly permute the rows of every bucket and rewind its cursor."""
        for bucket_id in range(self.num_buckets):
            permuted = self.dfs[bucket_id].sample(frac=1)
            self.dfs[bucket_id] = permuted.reset_index(drop=True)
            self.cursor[bucket_id] = 0

    def next_batch(self, n):
        """Return up to ``n`` rows taken from one randomly chosen bucket.

        When any bucket could not supply ``n`` more rows, the epoch counter
        is incremented and all buckets are reshuffled first.

        Returns:
            (reviews, labels, lengths): the 'review' column as a numpy array,
            and the 'label' and 'length' columns as plain lists.
        """
        exhausted = self.cursor + n > self.size
        if exhausted.any():
            self.epochs += 1
            self.shuffle()
        bucket_id = np.random.randint(0, self.num_buckets)
        start = self.cursor[bucket_id]
        batch = self.dfs[bucket_id].iloc[start:start + n]
        self.cursor[bucket_id] += n
        reviews = np.asarray(batch['review'].tolist())
        labels = batch['label'].tolist()
        lengths = batch['length'].tolist()
        return reviews, labels, lengths
def get_sentence(vocabulary_inv, sen_index):
    """Turn a sequence of token ids into a space-separated sentence.

    Args:
        vocabulary_inv: id-to-word lookup, indexed with each id in turn.
        sen_index: iterable of token ids.

    Returns:
        The looked-up words joined by single spaces.
    """
    words = []
    for token_id in sen_index:
        words.append(vocabulary_inv[token_id])
    return ' '.join(words)
def sequence(rnn_inputs, hidden_size, seq_lens):
    """Run a bidirectional LSTM over ``rnn_inputs``.

    Args:
        rnn_inputs: input tensor handed to ``tf.nn.bidirectional_dynamic_rnn``.
        hidden_size: number of units of each of the two LSTM cells.
        seq_lens: per-example sequence lengths, passed as ``sequence_length``.

    Returns:
        The outputs element returned by ``tf.nn.bidirectional_dynamic_rnn``;
        the final state is only printed, not returned.
    """
    # The forward cell is built (and reported) before the backward one.
    forward_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
    print('build fw cell: %s' % (forward_cell,))
    backward_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
    print('build bw cell: %s' % (backward_cell,))

    outputs, last_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=forward_cell,
        cell_bw=backward_cell,
        inputs=rnn_inputs,
        sequence_length=seq_lens,
        dtype=tf.float32,
    )
    print('rnn outputs: %s' % (outputs,))
    print('final state: %s' % (last_state,))
    return outputs
def attention(atten_inputs, atten_size):
    """Pool a sequence of vectors into a single vector with learned attention.

    The mechanism follows Ilya Ivanov's implementation
    (https://github.com/ilivans/tf-rnn-attention).

    Args:
        atten_inputs: tensor whose dimensions 1 and 2 are statically known;
            they are read as the number of time steps and the feature size.
        atten_size: width of the hidden attention projection.

    Returns:
        (atten_outs, alphas): the inputs summed over axis 1 after weighting
        by the attention weights, and the weights shaped [-1, max_time].
    """
    print('attention inputs: %s' % (atten_inputs,))
    steps = int(atten_inputs.shape[1])
    print("max time length: %s" % (steps,))
    feature_size = int(atten_inputs.shape[2])
    print("combined hidden size: %s" % (feature_size,))

    # Trainable parameters: projection matrix, its bias, and the context vector.
    W_omega = tf.Variable(tf.random_normal([feature_size, atten_size], stddev=0.1, dtype=tf.float32))
    b_omega = tf.Variable(tf.random_normal([atten_size], stddev=0.1, dtype=tf.float32))
    u_omega = tf.Variable(tf.random_normal([atten_size], stddev=0.1, dtype=tf.float32))

    # Flatten batch and time so one matmul projects every time step.
    flat_inputs = tf.reshape(atten_inputs, [-1, feature_size])
    projected = tf.matmul(flat_inputs, W_omega)
    bias_row = tf.reshape(b_omega, [1, -1])
    hidden = tf.tanh(projected + bias_row)
    print("v: %s" % (hidden,))

    # u_omega is the summarizing question vector
    scores = tf.matmul(hidden, tf.reshape(u_omega, [-1, 1]))
    print("vu: %s" % (scores,))

    exp_scores = tf.reshape(tf.exp(scores), [-1, steps])
    print("exps: %s" % (exp_scores,))

    # Normalize per row; tf.reduce_sum with keepdims=True would probably
    # work here as well (translated from the original author's note).
    row_totals = tf.reduce_sum(exp_scores, 1)
    denominator = tf.reshape(row_totals, [-1, 1])
    weights = exp_scores / denominator
    print("alphas: %s" % (weights,))

    weights_3d = tf.reshape(weights, [-1, steps, 1])
    weighted_inputs = atten_inputs * weights_3d
    pooled = tf.reduce_sum(weighted_inputs, 1)
    print("atten outs: %s" % (pooled,))
    return pooled, weights
def visualize_sentence_format(sent):
    """Return ``sent`` with its 'STOP' padding tokens removed.

    Only whole whitespace-separated tokens equal to 'STOP' are dropped, so a
    real word that merely contains those letters (e.g. 'NONSTOP') is kept
    intact. Runs of whitespace are collapsed to single spaces.

    Args:
        sent: sentence string, possibly padded with 'STOP' tokens.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    return ' '.join(token for token in sent.split() if token != 'STOP')
def visualize(sess, inputs, revlens, max_rev_length, keep_probs, index2word, alphas_words, alphas_sents, x_test, y_test, y_predict, visual_sample_index):
    """Write an HTML page showing the attention weights of one test review.

    The sample at ``visual_sample_index`` is evaluated through ``sess`` and
    each non-padding sentence is written on its own line: a red swatch whose
    opacity is the sentence weight, followed by the words on a yellow
    background whose opacity is the word weight. Weights are scaled by their
    maximum before being used as opacity. The file is written to
    ``../visualization/sents_in_review_visualization_<index>.html``.

    Args:
        sess: object whose ``run(fetches, feed_dict=...)`` evaluates the
            fetches (presumably a ``tf.Session`` - confirm with the caller).
        inputs, revlens, keep_probs: feed keys; they are fed the sample,
            ``[max_rev_length]`` and ``[1.0, 1.0]`` respectively.
        max_rev_length: number of sentences read from the sample.
        index2word: id-to-word lookup handed to ``get_sentence``.
        alphas_words, alphas_sents, y_predict: fetches for the word weights,
            the sentence weights and the prediction.
        x_test, y_test: test inputs and labels, sliced at the sample index.
        visual_sample_index: index of the review to visualize.
    """
    # Local import: the file's import header does not provide ``html``.
    import html

    visual_dir = "../visualization"
    # Create the target directory so open() below does not fail on a fresh
    # checkout where it is missing.
    os.makedirs(visual_dir, exist_ok=True)
    sents_visual_file = os.path.join(visual_dir, "sents_in_review_visualization_{}.html".format(visual_sample_index))

    x_test_sample = x_test[visual_sample_index:visual_sample_index+1]
    y_test_sample = y_test[visual_sample_index:visual_sample_index+1]
    test_dict = {inputs: x_test_sample, revlens: [max_rev_length], keep_probs: [1.0, 1.0]}
    # A single run evaluates all three fetches on the same feed instead of
    # running the graph twice.
    alphas_words_test, alphas_sents_test, y_test_predict = sess.run(
        [alphas_words, alphas_sents, y_predict], feed_dict=test_dict)
    print("test sample is {}".format(y_test_sample[0]))
    print("test sample is predicted as {}".format(y_test_predict[0]))
    print(alphas_words_test.shape)

    # visualize a review
    sents = [get_sentence(index2word, x_test_sample[0][i]) for i in range(max_rev_length)]
    print("sents size is {}".format(len(sents)))
    sent_alphas = alphas_sents_test[0] / alphas_sents_test[0].max()

    with open(sents_visual_file, "w") as html_file:
        html_file.write('actual label: %f, predicted label: %f<br>' % (y_test_sample[0], y_test_predict[0]))
        # index_sent counts every sentence, skipped ones included, so it
        # stays aligned with the rows of alphas_words_test.
        for index_sent, (sent, alpha) in enumerate(zip(sents, sent_alphas)):
            # A sentence made of one repeated token is treated as padding.
            if len(set(sent.split(' '))) == 1:
                continue
            visual_sent = visualize_sentence_format(sent)
            # display each sent's importance by color
            html_file.write('<font style="background: rgba(255, 0, 0, %f)">     </font>' % (alpha))
            visual_words = visual_sent.split()
            if not visual_words:
                # Nothing left to show; avoid calling max() on an empty slice.
                html_file.write('<br>')
                continue
            visual_words_alphas = alphas_words_test[index_sent][:len(visual_words)]
            # for each sent, display its word importance by color
            for word, alpha_w in zip(visual_words, visual_words_alphas / visual_words_alphas.max()):
                # Escape the word so tokens containing <, > or & cannot
                # break the generated markup.
                html_file.write('<font style="background: rgba(255, 255, 0, %f)">%s </font>' % (alpha_w, html.escape(word)))
            html_file.write('<br>')
# Running this file as a script intentionally does nothing; the guard body is
# a bare placeholder.
if __name__ == '__main__':
    pass