# Chunker: baseline program

In [1]:
from default import *
import os

## Run the baseline solution on dev

In [2]:
# Build the tagger and decode the dev set. The meaning of the three
# constructor arguments is defined by LSTMTagger (not shown in this notebook);
# presumably: training data, model file prefix, model file suffix -- confirm.
train_path = os.path.join('data', 'train.txt.gz')
model_path = os.path.join('data', 'chunker')
chunker = LSTMTagger(train_path, model_path, '.tar')
decoder_output = chunker.decode('data/input/dev.txt')

100%|██████████| 1027/1027 [00:05<00:00, 202.11it/s]


## Evaluate the baseline output

In [3]:
import conlleval

# Flatten the per-sentence predictions into one token-level list of tags.
flat_output = [tag for sentence_tags in decoder_output for tag in sentence_tags]

# Read the reference file and flatten it the same way so both lists line up
# token by token.
true_seqs = []
with open(os.path.join('data','reference','dev.out')) as reference:
    for reference_sentence in conlleval.read_file(reference):
        true_seqs.extend(reference_sentence.split())

# Left as the last expression so the notebook displays the returned scores.
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 12035 phrases; correct: 9216.
accuracy:  86.66%; (non-O)
accuracy:  87.74%; precision:  76.58%; recall:  77.47%; FB1:  77.02
             ADJP: precision:  50.00%; recall:  19.03%; FB1:  27.56  86
             ADVP: precision:  66.10%; recall:  48.49%; FB1:  55.94  292
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  75.99%; recall:  80.57%; FB1:  78.21  6613
               PP: precision:  92.48%; recall:  85.62%; FB1:  88.92  2260
              PRT: precision:  60.47%; recall:  57.78%; FB1:  59.09  43
             SBAR: precision:  79.55%; recall:  44.30%; FB1:  56.91  132
               VP: precision:  66.46%; recall:  75.26%; FB1:  70.59  2609


(76.57665143331948, 77.47141896435777, 77.02143663031215)

# Initial function for V1,V2 and V3

In [4]:
def prepare_initial_char_seq(seq, dic):
    """One-hot encode the first character of every word in ``seq``.

    Args:
        seq: sequence of words; each word must be non-empty because its
            first character is read with ``word[0]``.
        dic: mapping from a character to its column index. Indices must be
            smaller than ``len(string.printable)``.

    Returns:
        Float tensor of shape ``(len(seq), len(string.printable))`` holding
        exactly one 1 per row, at the column of that word's first character.
    """
    width = len(string.printable)

    def one_hot(column):
        # A fresh zero vector per word with a single position switched on.
        vector = torch.zeros([width], dtype=torch.float)
        vector[column] = 1
        return vector

    return torch.stack([one_hot(dic[word[0]]) for word in seq], 0)


def prepare_last_char_seq(seq, dic):
    """One-hot encode the last character of every word in ``seq``.

    Args:
        seq: sequence of words; each word must be non-empty because its
            last character is read with ``word[-1]``.
        dic: mapping from a character to its column index. Indices must be
            smaller than ``len(string.printable)``.

    Returns:
        Float tensor of shape ``(len(seq), len(string.printable))`` holding
        exactly one 1 per row, at the column of that word's last character.
    """
    vectors = []
    for word in seq:
        one_hot = torch.zeros([len(string.printable)], dtype=torch.float)
        column = dic[word[-1]]
        one_hot[column] = 1
        vectors.append(one_hot)
    # Stack the per-word vectors row by row into a single 2-D tensor.
    return torch.stack(vectors, 0)

def prepare_mid_char_seq(seq, dic):
    """Encode the middle characters of every word as a bag-of-characters count.

    The "middle" of a word is everything except its first and last character
    (``word[1:-1]``). Words of length <= 2 have no middle and are represented
    by an all-zero vector.

    Args:
        seq: sequence of words.
        dic: mapping from a character to its column index. Indices must be
            smaller than ``len(string.printable)``.

    Returns:
        Float tensor of shape ``(len(seq), len(string.printable))`` where
        entry ``[i, dic[c]]`` is the number of times ``c`` occurs among the
        middle characters of word ``i``.

    Note:
        A model trained on features that kept only the last middle character
        sees different inputs here and should be retrained.
    """
    width = len(string.printable)
    idxs = []
    for w in seq:
        # Allocate the count vector once per word. Creating it inside the
        # character loop would discard the counts of all but the last
        # middle character.
        counts = torch.zeros([width], dtype=torch.float)
        # w[1:-1] is empty for words of length <= 2, leaving counts all-zero.
        for char in w[1:-1]:
            counts[dic[char]] += 1
        idxs.append(counts)
    return torch.stack(idxs, 0)

The prepare_initial_char_seq function takes the first character of each word and sets the corresponding index (the character's position in string.printable) to one, producing one vector V1 per word. It then uses torch.stack() to stack the V1 vectors of all words in the sentence.

The prepare_last_char_seq function uses the same logic as the prepare_initial_char_seq function, except that it encodes the last character of each word.

The prepare_mid_char_seq function takes the middle characters (everything except the first and last character) of each word and sets each corresponding index to the number of times that character occurs among them. If the word has no middle characters (length <= 2), we use a vector of 100 zeros to represent it.

Each of the functions above returns a 2D tensor of size sentence length times 100.

# Modified LSTMTaggerModel

In [5]:
class LSTMTaggerModel(nn.Module):
    """LSTM tagger whose input is a word embedding joined with character features.

    For every word the embedding vector is concatenated with an externally
    computed character feature vector. The combined sequence is fed through a
    unidirectional LSTM, and a linear layer maps each hidden state to tag
    scores.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, char_dim=300):
        """Create the layers.

        Args:
            embedding_dim: size of each word embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the word embedding table.
            tagset_size: number of output tags.
            char_dim: width of the per-word character feature vector passed
                to ``forward``. The default of 300 gives an LSTM input size
                of 428 when ``embedding_dim`` is 128.
        """
        # Seed before any layer is built so parameter initialisation is
        # reproducible from run to run.
        torch.manual_seed(1)
        super(LSTMTaggerModel, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Word-only LSTM. forward() does not use it; it is kept so that the
        # registered parameters (state_dict keys) and the order in which the
        # seeded RNG is consumed stay the same.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)
        # LSTM over [word embedding ; character features]. The input size is
        # derived from its two parts so other embedding sizes also work.
        self.lstm2 = nn.LSTM(embedding_dim + char_dim, hidden_dim, bidirectional=False)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence, char):
        """Compute per-word log-probabilities over tags.

        Args:
            sentence: 1-D tensor of word indices for one sentence.
            char: 2-D tensor with one row of character features per word,
                i.e. shape ``(len(sentence), char_dim)``.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding
            log-softmax scores for each word.
        """
        embeds = self.word_embeddings(sentence)
        # Transpose both to (features, words), join along the feature axis,
        # then transpose back to (words, embedding_dim + char_dim).
        embeds = torch.cat((embeds.t(), char.t()), 0).t()

        # The LSTM expects (seq_len, batch, features); the batch size is 1.
        lstm_out, _ = self.lstm2(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

# Modified argmax function

In [6]:
def argmax(self, seq):
        """Return the highest-scoring tag for every word in ``seq``.

        Args:
            seq: sequence of words forming one sentence.

        Returns:
            List of tag labels, one per word, looked up in ``self.ix_to_tag``.
        """
        with torch.no_grad():
            word_ids = prepare_sequence(seq, self.word_to_ix, self.unk)

            # Character features for the first, middle and last characters.
            first = prepare_initial_char_seq(seq, self.diction)
            middle = prepare_mid_char_seq(seq, self.diction)
            last = prepare_last_char_seq(seq, self.diction)

            # Transpose each to (features, words), join along the feature
            # axis, then transpose back to one row per word.
            char_in = torch.cat((first.t(), middle.t(), last.t()), 0).t()

            tag_scores = self.model(word_ids, char_in)
            best_tags = [
                self.ix_to_tag[int(tag_scores[position].argmax(dim=0))]
                for position in range(len(word_ids))
            ]
        return best_tags

After concatenating v1, v2 and v3, we get a char_in tensor of size sentence length times 300. The char_in tensor can then be concatenated with the word embeddings, which have size sentence length times 128.

# Modified train function

In [7]:
def train(self):
        """Train ``self.model`` and write a checkpoint after every epoch.

        Runs ``self.epochs`` passes over ``self.training_data``, updating the
        model after each sentence. The checkpoint of the final epoch is saved
        as ``self.modelfile + self.modelsuffix``; earlier epochs insert the
        epoch number between the two.
        """
        criterion = nn.NLLLoss()

        self.model.train()
        loss = float("inf")
        final_epoch = self.epochs - 1
        for epoch in range(self.epochs):
            for sentence, tags in tqdm.tqdm(self.training_data):
                # PyTorch accumulates gradients, so reset them for every
                # training instance.
                self.model.zero_grad()

                # Turn words and tags into tensors of indices.
                word_ids = prepare_sequence(sentence, self.word_to_ix, self.unk)
                targets = prepare_sequence(tags, self.tag_to_ix, self.unk)

                # Character features for the first, middle and last characters.
                first = prepare_initial_char_seq(sentence, self.diction)
                middle = prepare_mid_char_seq(sentence, self.diction)
                last = prepare_last_char_seq(sentence, self.diction)

                # Transpose each to (features, words), join along the feature
                # axis, then transpose back to one row per word.
                char_in = torch.cat((first.t(), middle.t(), last.t()), 0).t()

                # Forward pass.
                tag_scores = self.model(word_ids, char_in)

                # Loss, gradients, and parameter update.
                loss = criterion(tag_scores, targets)
                loss.backward()
                self.optimizer.step()

            # The last epoch is saved without an epoch number in the filename.
            epoch_str = '' if epoch == final_epoch else str(epoch)
            savefile = self.modelfile + epoch_str + self.modelsuffix
            print("saving model file: {}".format(savefile), file=sys.stderr)
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': loss,
                'unk': self.unk,
                'word_to_ix': self.word_to_ix,
                'tag_to_ix': self.tag_to_ix,
                'ix_to_tag': self.ix_to_tag,
            }
            torch.save(checkpoint, savefile)

## Analysis
The method we use follows option one on the homework page: combining the semi-character RNN with the phrasal chunker. Following the baseline description, we separate each word into three vectors: V1 stores the first character, V3 stores the last character, and V2 stores the remaining characters. When creating these three vectors, we use torch.stack() to stack the per-word vectors into the result for the whole sentence.
    
We transpose the three tensors that represent the word fractions of each sentence, then use torch.cat() to concatenate them into a 2D tensor of size 300 times sentence length. After that, we transpose that 2D tensor to get a new 2D tensor of size sentence length times 300.

In class LSTMTaggerModel, we replaced "self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)" with "self.lstm2 = nn.LSTM(428, hidden_dim, bidirectional=False)" because len(string.printable) is 100 and we concatenate three such vectors, which changes the input size from 128 to 428.

We use the same approach to concatenate the 2D tensor of character vectors (sentence length times 300) with the word embeddings, a 2D tensor of size sentence length times 128. In the end, we got Score(dev) = 77.0214.
