In [None]:
#you can get the training data from https://github.com/clab/dynet_tutorial_examples

from __future__ import division
import minpy.numpy as np
from minpy.context import set_context, gpu
from minpy.nn.model_builder import *
from minpy.nn.modules import *
import re
import random
import time

class Vocab(object):
    """Two-way mapping between words and integer ids.

    Attributes:
        w2i: dict mapping word -> id (a private copy of the mapping passed in).
        i2w: dict mapping id -> word (the inverse of ``w2i``).
    """

    def __init__(self, w2i):
        """Build the vocabulary from an existing word -> id mapping."""
        self.w2i = dict(w2i)
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        self.i2w = {i: w for w, i in w2i.items()}

    @classmethod
    def from_list(cls, words):
        """Create a vocabulary whose ids are the positions of ``words``.

        If a word occurs more than once, its last position wins.
        """
        w2i = {word: idx for idx, word in enumerate(words)}
        # Instantiate through cls so that subclasses get their own type back.
        return cls(w2i)

    @classmethod
    def from_file(cls, vocab_fname):
        """Create a vocabulary from a text file with one "word count" pair per line.

        Ids follow the order of the lines; the count column is ignored.
        Blank lines are skipped.

        Raises:
            ValueError: if a non-blank line does not have exactly two fields.
        """
        words = []
        with open(vocab_fname) as fh:
            for line in fh:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / whitespace-only lines
                word, _count = fields
                words.append(word)
        return cls.from_list(words)

    def size(self):
        """Return the number of distinct words."""
        return len(self.w2i)
    
def read_oracle(fname, vw, va):
    """Yield ``(sent, acts)`` id lists for every example in an oracle file.

    Each non-blank line has the form ``"<words> ||| <actions>"`` with
    whitespace-separated tokens on both sides. Words are mapped through
    ``vw.w2i`` and actions through ``va.w2i``. Both lists are returned
    reversed so that callers can consume them from the end with ``pop()``.

    Args:
        fname: path of the oracle file.
        vw: word vocabulary (anything exposing a ``w2i`` mapping).
        va: action vocabulary (anything exposing a ``w2i`` mapping).

    Raises:
        ValueError: if a non-blank line does not contain exactly one ``" ||| "``.
        KeyError: if a word or action is missing from its vocabulary.
    """
    with open(fname) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue  # skip blank lines instead of failing on the unpack
            # The separator is a fixed string, so a plain split is sufficient.
            ssent, sacts = line.split(' ||| ')
            sent = [vw.w2i[x] for x in ssent.split()]
            acts = [va.w2i[x] for x in sacts.split()]
            yield (sent[::-1], acts[::-1])
            
# Select GPU device 0 as minpy's global context.
# NOTE(review): presumably this fails on a machine without a GPU -- confirm.
set_context(gpu(0)) # set the global context with gpu

def log_softmax(x):
    """Return the row-wise log-softmax of ``x``.

    ``x`` should be (batch, prob); the result has the same shape and each
    row holds log-probabilities over axis 1.
    """
    # Subtract the per-row maximum before exponentiating.
    # np.max(...) is used because minpy doesn't support x.max().
    row_max = np.max(x, axis=1, keepdims=True)
    shifted = x - row_max
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return shifted - log_norm

In [None]:
# Parser transitions; each one's integer id is its position in this list.
acts = ['SHIFT', 'REDUCE_L', 'REDUCE_R']
vocab_acts = Vocab.from_list(acts)
# Pull the three ids out in one go (generator expression: no leaked loop name).
SHIFT, REDUCE_L, REDUCE_R = (vocab_acts.w2i[name] for name in acts)

# Word vocabulary, then the training and dev oracles, materialised as lists
# so they can be shuffled and iterated more than once.
vocab_words = Vocab.from_file('data/vocab.txt')
train = list(
    read_oracle('data/small-train.unk.txt', vocab_words, vocab_acts)
)
dev = list(
    read_oracle('data/small-dev.unk.txt', vocab_words, vocab_acts)
)

In [None]:
# Model hyper-parameters used by Stack_RNN.
# Width of a token embedding; also the second argument of the composition layer.
WORD_EMB_DIM = 64
# Not referenced anywhere in the visible code.
ACT_EMB_DIM = 32
# Size passed to both LSTMs and to the fully connected layers.
H_DIM = 64
class Stack_RNN(ModelBase):
    """Arc-standard shift-reduce dependency parser built from two LSTMs.

    One LSTM summarises the buffer (tokens still to be read), the other
    summarises the stack (partial trees). At each step the newest buffer and
    stack summaries are concatenated and scored to choose among SHIFT,
    REDUCE_L and REDUCE_R.

    NOTE(review): the layers fed with a concatenation (``_s2h_linear``,
    ``_comp_linear``) receive twice the width of a single representation.
    Whether the first FullyConnected argument is an input size that must be
    doubled cannot be determined from this file -- confirm against the minpy
    API before relying on these sizes.
    """

    def __init__(self, batch_size=64, vocab=None):
        """Create the parser's layers.

        Args:
            batch_size: leading dimension of the empty-buffer placeholder.
            vocab: word vocabulary; required despite the ``None`` default,
                because its ``size()`` determines the embedding table.
        """
        super(Stack_RNN, self).__init__()
        self.vocab = vocab
        self.WORD_DIM = vocab.size()
        self.batch_size = batch_size

        self._stack_lstm = LSTM(H_DIM, 'tanh')
        self._comp_linear = FullyConnected(H_DIM, WORD_EMB_DIM)
        self._buffer_lstm = LSTM(H_DIM, 'tanh')
        self._act_linear = FullyConnected(H_DIM, 3)
        # Fixed: was misspelled "_s2h_linaer" while forward() reads "_s2h_linear".
        self._s2h_linear = FullyConnected(H_DIM, H_DIM)
        # Fixed: was "sefl.WORD_DIM" (NameError).
        self._tok_emb = Embedding(self.WORD_DIM, WORD_EMB_DIM)

        # Stand-in for the buffer summary once the buffer is empty.
        self._buffer_head = Variable((self.batch_size, H_DIM))

    def _lstm_step(self, lstm, inputs, history):
        """Advance ``lstm`` by one step on ``inputs``.

        ``history`` is a list of ``(lstm_output, payload)`` entries. When it
        is empty the LSTM starts from ``None`` states; otherwise it continues
        from elements 0 and 1 of the newest entry's ``lstm_output``.
        """
        if len(history) == 0:
            return lstm(inputs, None, None)
        prev = history[-1][0]
        return lstm(inputs, prev[0], prev[1])

    def forward(self, tokens, oracle_actions=None):
        """Parse ``tokens`` and return the negated sum of chosen log-probabilities.

        Args:
            tokens: word ids in reverse sentence order (the last id is the
                first word), as produced by ``read_oracle``.
            oracle_actions: optional gold action ids, also reversed. When
                given they are followed step by step; when omitted the
                highest-scoring valid action is taken and every attachment is
                printed as ``head --> modifier``.

        Returns:
            ``-(sum of log-probabilities of the actions taken)``, counting
            only steps where more than one action was valid. ``0.0`` is
            returned when there is nothing to parse or no step offered a
            choice.
        """
        if oracle_actions is not None:
            oracle_actions = list(oracle_actions)  # private copy: it is consumed with pop()
        buffer_list = []
        stack_list = []
        losses = []

        # Fill the buffer; each entry is (lstm_output, (embedding, word)).
        for tok in tokens:
            # Wrapped in a list as per the original author's shape note.
            tok_emb = self._tok_emb([tok])
            state = self._lstm_step(self._buffer_lstm, tok_emb, buffer_list)
            buffer_list.append((state, (tok_emb, self.vocab.i2w[tok])))

        while not (len(stack_list) == 1 and len(buffer_list) == 0):
            valid_acts = []
            if len(buffer_list) > 0:
                valid_acts += [SHIFT]
            if len(stack_list) >= 2:
                valid_acts += [REDUCE_L, REDUCE_R]
            if len(valid_acts) == 0:
                return 0.0

            log_probs = None
            action = valid_acts[0]  # using SHIFT by default
            if len(valid_acts) > 1:
                # NOTE(review): _buffer_head is a Variable that is called here;
                # confirm that calling it yields its value.
                buffer_rep = buffer_list[-1][0][0] if len(buffer_list) > 0 else self._buffer_head()
                stack_rep = stack_list[-1][0][0]
                p_t = np.concatenate([buffer_rep, stack_rep], axis=1)
                h = np.tanh(self._s2h_linear(p_t))
                log_probs = log_softmax(self._act_linear(h))
                # log_probs is (1, 3): index row 0 first, then the action.
                for act in [SHIFT, REDUCE_L, REDUCE_R]:
                    if act not in valid_acts:
                        # NOTE(review): assumes log_probs[0] is a view so the
                        # penalty reaches log_probs -- confirm for minpy arrays.
                        log_probs[0][act] -= 99999.0

                if oracle_actions is None:
                    action = int(np.argmax(log_probs[0], axis=0)[0])
            if oracle_actions is not None:
                action = oracle_actions.pop()
            if log_probs is not None:
                losses.append(log_probs[0][action])

            if action == SHIFT:
                # Move the next word from the buffer onto the stack.
                tok_emb, tok = buffer_list.pop()[1]
                state = self._lstm_step(self._stack_lstm, tok_emb, stack_list)
                stack_list.append((state, (tok_emb, tok)))
            else:
                # Combine the two topmost stack items into one.
                right = stack_list.pop()[1]
                left = stack_list.pop()[1]
                head, modifier = (left, right) if action == REDUCE_R else (right, left)

                head_rep, head_tok = head
                mod_rep, mod_tok = modifier
                composed_rep = np.tanh(self._comp_linear(np.concatenate([head_rep, mod_rep], axis=1)))

                state = self._lstm_step(self._stack_lstm, composed_rep, stack_list)
                # Fixed: the composed item is labelled with its head word; the
                # original stored a stale `tok` left over from an earlier step.
                stack_list.append((state, (composed_rep, head_tok)))

                if oracle_actions is None:
                    print('{0} --> {1}'.format(head_tok, mod_tok))

        if oracle_actions is None:
            head = stack_list.pop()[1][1]
            print('ROOT --> {0}'.format(head))

        # minpy doesn't support np.sum(list), so add the terms one by one.
        # Fixed: this accumulation was commented out, leaving `total_loss`
        # undefined inside this method.
        total_loss = 0.
        for each_loss in losses:
            total_loss = total_loss + each_loss
        return -total_loss

In [None]:
# Build the parser and its optimiser, then train with periodic validation.
model = Stack_RNN(batch_size=1, vocab=vocab_words)
updater = Updater(model, update_rule='rmsprop', learning_rate=0.002)
instances_processed = 0
validation_losses = []

VALIDATE_EVERY = 1000      # run the dev set every N training instances
REPORT_EVERY = 100         # print the running training loss every N instances
MAX_EPOCH_FRACTION = 0.2   # stop once this fraction of an epoch has been seen

for epoch in range(1):
    random.shuffle(train)
    words = 0
    total_loss = 0.0
    st = time.time()
    for (s, a) in train:
        # Fraction of an epoch completed so far (true division via __future__).
        e = instances_processed / len(train)

        # periodically report validation loss
        if instances_processed % VALIDATE_EVERY == 0:
            dev_words = 0
            dev_loss = 0.0
            st = time.time()
            for (ds, da) in dev:
                loss = model.forward(ds, da)
                dev_words += len(ds)
                if loss is not None:
                    # Convert to a plain float, as the training branch does.
                    dev_loss += float(loss)
            # Guard against an empty dev set.
            dev_per_word = dev_loss / dev_words if dev_words > 0 else float('nan')
            print('[validation] time {} epoch {}: per-word loss: {}'.format(time.time()-st, e, dev_per_word))
            validation_losses.append(dev_loss)
            st = time.time()  # restart the timer so training time excludes validation

        # report training loss
        if instances_processed % REPORT_EVERY == 0 and words > 0:
            print('time {}: epoch {}: per-word loss: {}'.format(time.time()-st, e, total_loss / words))
            words = 0
            total_loss = 0.0
            st = time.time()
        if e > MAX_EPOCH_FRACTION:
            break

        # here we do training
        # The original note says the loss is None for 1-word sentences (it's
        # clear how to parse them), hence the guard below.
        grad_dict, loss = model.grad_and_loss(s, a)
        updater(grad_dict)
        words += len(s)
        instances_processed += 1
        if loss is not None:
            total_loss += float(loss)