## 3. Recurrent Neural Network

- feedforward 
- backpropation
- sequence 계열 데이터에 사용되는 모델

### data

### reference : cs231n RNN 참고

In [24]:
# import module
import numpy as np
import sys

In [25]:
# data
# reference : https://github.com/aisolab/CS20
sentences = [['I', 'feel', 'hungry'],
     ['tensorflow', 'is', 'very', 'difficult'],
     ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
     ['tensorflow', 'is', 'very', 'fast', 'changing']]
# 형태소
pos = [['pronoun', 'verb', 'adjective'],
     ['noun', 'verb', 'adverb', 'adjective'],
     ['noun', 'verb', 'determiner', 'noun', 'preposition', 'adjective', 'noun'],
     ['noun', 'verb', 'adverb', 'adjective', 'verb']]


In [26]:
# word dict
word_list = []
for elm in sentences:
    word_list += elm
word_list = list(set(word_list)) # unique한 word만 list
word_list.sort()
word_list = ['<pad>'] + word_list # '<pad>' 추가
word_to_ix = {word : idx for idx, word in enumerate(word_list)}  # word
ix_to_word={idx: word for idx, word in enumerate(word_list)}

In [27]:
# 확인하기
print(word_to_ix)
print(ix_to_word)

{'<pad>': 0, 'I': 1, 'a': 2, 'changing': 3, 'deep': 4, 'difficult': 5, 'fast': 6, 'feel': 7, 'for': 8, 'framework': 9, 'hungry': 10, 'is': 11, 'learning': 12, 'tensorflow': 13, 'very': 14}
{0: '<pad>', 1: 'I', 2: 'a', 3: 'changing', 4: 'deep', 5: 'difficult', 6: 'fast', 7: 'feel', 8: 'for', 9: 'framework', 10: 'hungry', 11: 'is', 12: 'learning', 13: 'tensorflow', 14: 'very'}


In [28]:
# size
data_size, vocab_size=len([ word for i in sentences for word in i]), len(word_to_ix)
data=[ word for i in sentences for word in i]

In [59]:
# hyperparameter
hidden_size=100 # size of hidden layer of neurons
seq_length=10 # number of steps to unroll the RNN
learning_rate=1e-1

# model parameter
Wxh = np.random.randn(hidden_size,vocab_size)*0.01 # input to hidden
Whh= np.random.randn(hidden_size,hidden_size)*0.01 # hidden to hidden
Why= np.random.randn(vocab_size,hidden_size)*0.01 # hiddent to output
bh =np.zeros((hidden_size,1))
by = np.zeros((vocab_size,1))

In [30]:
# sampling
def sample(h,seed_ix,n):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    return sample_ix
    """
    
    x=np.zeros((vocab_size,1)) # 가능한 정답지 vocab
    x[seed_ix]=1 # seed_ix에 해당하는 index만 1 나머지는 모두 0
    ixes=[]
    
    """ 
    xrange 타입은 수정이 불가한 순차적 접근 가능한 데이터 타입이다. 
    xrange 타입의 장점이라고 하면 지정한 데이터 크기에 상관없이 memory 할당량이 일정하다는 것이다.
    """
    for t in range(n):
        h=np.tanh(np.dot(Wxh,x)+np.dot(Whh,h)+bh) # next hidden state
        y=np.dot(Why,h)+by # y값 예측
        p=np.exp(y)/np.sum(np.exp(y)) # print(p) 해보기
        # np.random.choice : p를 지정하면 각 p에 속하는 확률로 sampling
        ix=np.random.choice(range(vocab_size),p=p.ravel()) # ravel : 다차원 배열을 1차원 배열로 평평하게 변환
        x=np.zeros((vocab_size,1))
        x[ix]=1
        ixes.append(ix)
        
    return ixes
        

In [61]:
# loss function : forward pass + backward pass
def lossFun(inputs, targets, hprev):
    """
    input : list of integer
    target : list of interger
    hprev :  Hx1 array of initial state
    return the loss, gradient on model parameters, and last hiddent state
    """
    xs, hs, ys, ps={},{},{},{}
    hs[-1]=np.copy(hprev)
    loss=0
    
    # forward pass
    print(len(inputs))
    for t in range(len(inputs)):
        xs[t]=np.zeros((vocab_size,1)) # encode one-hot
        xs[t][inputs[t]]=1
        hs[t]=np.tanh(np.dot(Wxh,xs[t])+np.dot(Whh,hs[t-1])+bh) # hidden state
        ys[t]=np.dot(Why,hs[t])+by # unnormalized log probabilities for next chars
        ps[t]=np.exp(ys[t])/np.sum(np.exp(ys[t])) # probabilites for next chars
        print(t) #18까지는 되는데
        loss+=-np.log(ps[t][targets[t],0]) # softmax (cross-entropy)
    # backward pass
    
    dWxh, dWhh, dWhy=np.zeros_like(Wxh),np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np. zeros_like(bh), np.zeros_like(by)
    dhnext=np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))): # 역순으로 진행한다
        dy=np.copy(ps[t])
        dy[targets[t]]-=1 # backpropagation into y
        dWhy+=np.dot(dy,hs[t].T)
        dby+=dy
        dh=np.dot(Why.T,dy)+dhnext
        dhraw=(1-hs[t]*hs[t])*dh
        dbh+=dhraw
        dWxh+=np.dot(dhraw,xs[t].T)
        dWhh+=np.dot(dhraw,hs[t-1].T)
        dhnext=np.dot(Whh.T,dhraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    return loss, dWxh, dWhh, dWhh, dbh,dby, hs[len(inputs)-1]
        
        

In [49]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0


In [73]:
while True:
    # input 준비하기
    if p+seq_length+1>=len(data) or n==0:
        hprev=np.zeros((hidden_size,1)) # Reset RNN hidden state
        p=0 # go from start of data
    inputs=[word_to_ix[word] for word in data[p:p+seq_length]]
    targets=[word_to_ix[word] for word in data[p+1:p+seq_length+1]]
    print(len(data))
    
    # sample from the model now and then
    if n%100==0:
        
        sample_ix=sample(hprev,inputs[0],200)
        txt=" ".join(ix_to_word[ix] for ix in sample_ix)
        print('----------\n %s \n ---------------'%txt)
        
    # forward pass
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0: print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        
    # parameter updata
    # perform parameter update with Adagrad
    
    Wxh-=learning_rate*dWxh
    Whh-=learning_rate*dWhh
    print(dWhy.shape) # 100,100 에러가 생기는 부분
    Why-=learning_rate*dWhy
    bh-=learning_rate*dbh
    by-=learning_rate*dby
#     for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
#                                 [dWxh, dWhh, dWhy, dbh, dby], 
#                                 [mWxh, mWhh, mWhy, mbh, mby]):
# #         mem += dparam* dparam
# #         param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
# #         #print(dparam.shape)
# #         #param-=learning_rate*dparam


    p+=seq_length
    n+=1
    

19
----------
 deep <pad> difficult fast a very changing changing a feel learning changing for I for deep feel is I is fast tensorflow for hungry fast tensorflow feel a very very deep learning for hungry <pad> framework very difficult <pad> changing fast difficult very framework I difficult <pad> hungry difficult deep hungry <pad> for <pad> a learning changing a fast learning I changing feel deep tensorflow fast very feel deep for learning I feel fast <pad> for is is is fast framework deep hungry learning fast deep changing feel is fast difficult framework very is changing tensorflow difficult difficult difficult changing framework very learning a feel difficult feel for <pad> <pad> hungry very hungry changing deep hungry changing for very feel a <pad> feel fast learning deep fast hungry fast deep very fast framework a deep is hungry is deep I difficult hungry I learning hungry is feel difficult for is deep deep tensorflow for hungry tensorflow learning framework <pad> very deep <pad> 

ValueError: operands could not be broadcast together with shapes (15,100) (100,100) (15,100) 