import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime
import os
import sys
sys.path.append(os.path.abspath('.'))
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
from markov import get_bigram_probs
'''
Neural Network Bigram Model
Uses an NN architecture (one hidden layer) and the auto-encoder concept
to train a bigram model.
This version adds the numpy indexing trick, which speeds up the computation.
Note:
In this code the cost must be computed before the weight update, otherwise
the computed values blow up. The instructor gives the reason as follows:
We do a gradient descent step after cost
since the calculation of doutput will overwrite predictions
'''
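# Illustration of the indexing trick used below (example values only, added for clarity):
# if X = [3, 7, 2] and W1 has shape (V, D), then W1[X] has shape (3, D) and equals
# np.eye(V)[X].dot(W1), i.e. multiplying the one-hot input matrix by W1 without ever
# materializing the one-hot matrix.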
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)
# idx2word = {v: k for k, v in word2idx.items()}
V = len(word2idx)
D = 100
print('Vocab size:', V)
## First set of weights: randomly initialized
W1 = np.random.randn(V, D) / np.sqrt(V)
W2 = np.random.randn(D, V) / np.sqrt(D)
# bias terms are omitted in this version
# we will also treat beginning of sentence and end of sentence as bigrams
# START -> first word
# last word -> END
start_idx = word2idx['START']
end_idx = word2idx['END']
def softmax(a):
    a = a - a.max()  # subtract the max to avoid numerical overflow
    expA = np.exp(a)
    return expA / expA.sum(axis=1, keepdims=True)
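# Usage example (illustration only): softmax(np.array([[1., 2., 3.]])) returns one row of
# probabilities that sums to 1; below it is applied row-wise to an N x V matrix of logits.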
epochs = 1
costs = []
learning_rate = 1e-2
t0 = datetime.now()
for epoch in range(epochs):
    random.shuffle(sentences)
    j = 0
    for sentence in sentences:
        # word2idx = {'START': 0, 'END': 1}
        train_sent = [start_idx] + sentence
        target_sent = sentence + [end_idx]  # train_sent and target_sent have the same length here
        n = len(sentence) + 1  # number of input/target pairs for this sentence
        ## This time we do not build the input/target matrices; we use the lists of indices directly (train_sent / target_sent)
        ## Note: conceptually both input and target are one-hot encoded matrices, which affects how the cost is written (compare with the next version)
        X = train_sent
        Y = target_sent
        # get output predictions===============================================================
        Z = np.tanh(W1[X])
        pY = softmax(Z.dot(W2))
        # cost function=========================================================================
        c_sent = -np.sum(np.log(pY[np.arange(n), Y])) / n
        costs.append(c_sent)
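        # pY[np.arange(n), Y] above picks out p(target word | input word) at each position,
        # so c_sent is the mean negative log-likelihood of the sentence.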
        # do a gradient descent step=============================================================
        # simplified form of (pY - Y)
        doutput = pY  # N x V
        doutput[np.arange(n), target_sent] -= 1  # replaces (pY - Y); the original Y is a one-hot encoding
        W2 = W2 - learning_rate*Z.T.dot(doutput)  # (D x N) (N x V)
        dhidden = doutput.dot(W2.T) * (1 - Z*Z)  # (N x V) (V x D) * (N x D)
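        # (1 - Z*Z) is the derivative of tanh, since d/dx tanh(x) = 1 - tanh(x)^2 and Z = tanh(W1[X]).
        # Note that dhidden is computed with the already-updated W2; strict backprop would use the
        # pre-update W2, but with a small learning rate the difference is negligible.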
        i = 0
        for idx in X:
            # dhidden has shape N x D; i runs over the N positions
            W1[idx] = W1[idx] - learning_rate * dhidden[i]
            i += 1
        ## a faster way to update W1
        # np.subtract.at(W1, X, learning_rate * dhidden)
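        # np.subtract.at accumulates the updates for repeated indices in X (a word that appears
        # more than once in the sentence), matching the loop above; a plain W1[X] -= ... would
        # only apply the last update for each repeated index.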
        if j % 1000 == 0:
            print("epoch:", epoch, " c_sent:", c_sent, "sentences trained so far: %i / %i" % (j, len(sentences)))
        j += 1
print("Elapsed time training:", datetime.now() - t0)
# plot smoothed losses to reduce variability
# not sure of the underlying principle yet, used as-is; the goal is to make the loss graph look smoother
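# (this is an exponentially weighted moving average with bias correction: dividing by
#  1 - decay**(t+1) removes the average's bias toward 0 in the early steps, the same
#  correction used in Adam)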
def smoothed_loss(x, decay=0.99):
    y = np.zeros(len(x))
    last = 0
    for t in range(len(x)):
        z = decay * last + (1 - decay) * x[t]
        y[t] = z / (1 - decay ** (t + 1))
        last = z
    return y
plt.plot(costs)
plt.plot(smoothed_loss(costs))
plt.show()
## Second set of weights: bigram_probs (used for comparison)
bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)
W_bigram = np.log(bigram_probs)
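# The NN's logits, tanh(W1).dot(W2), should look similar to the log bigram probabilities
# (up to an additive per-row constant), since the softmax of the logits approximates the
# bigram distribution.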
plt.subplot(1,2,1)
plt.title("Neural Network")
plt.imshow(np.tanh(W1).dot(W2))
plt.subplot(1,2,2)
plt.title("Bigram Probs")
plt.imshow(W_bigram)
plt.show()