In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
DATADIR = 'data/'

In [44]:
def read_corpus(file):
    """
    returns:
        lines: [['hello', 'world'], ...]
        labels: [[!], [N], ...]
        vocab
    """
    with open(file, 'rt') as f:
        text = f.read()
    lines = text.split('\n\n')
    ret_lines = []
    labels = []
    vocab = set()
    for line in lines:
        if not line: 
            continue
        curr_line = []
        for token_label_str in line.split('\n'):
            if not token_label_str: 
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            labels.append(label)
            curr_line.append(token)
        ret_lines.append(curr_line)
    return ret_lines, labels, vocab

In [25]:
def encode_lines(lines, word2idx_map, window_size):
    """
    returns X: len(lines) x (2 * window_size + 1)
    """
    def encode_line(line, word2idx_map, window_size):
        num_repr = [] # numerical representation
        for word in line:
            num = word2idx_map.get(word, word2idx_map['UUUNKKK'])
            num_repr.append(num)
        # pad with start and end tokens
        start = [word2idx_map['<s>']] * window_size
        end = [word2idx_map['</s>']] * window_size
        padded = start + num_repr + end
        
        ret = []
        for i in range(window_size, len(padded) - window_size):
            windowed = padded[i - window_size : i + window_size + 1]
            ret.append(windowed)
            
        return ret
    
    res = []
    for line in lines:
        res.extend(encode_line(line, word2idx_map, window_size))
    return torch.tensor(res)

# Network

In [95]:
class FeedForwardTagger(nn.Module):
    
    def __init__(self, vocab_size, window_size, output_dim,
                 emb_dim=50, pretrained_emb=None, freeze=False):
        super(FeedForwardTagger, self).__init__()
        if pretrained_emb:
            self.emb = nn.Embedding.from_pretrained(pretrain_emb)
        else:
            self.emb = nn.Embedding(vocab_size, emb_dim)
        input_dim = (2 * window_size + 1) * emb_dim
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
    
    def forward(self, inputs):
        embeds = self.emb(inputs).squeeze()
        out = F.tanh(self.fc1(embeds))
        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [85]:
def train_util(model, X_train, Y_train, X_dev, Y_dev, n_epochs):
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1)
    losses = []
    # TODO train_accu, dev_accu
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        log_probs = model(X_train)
        loss = loss_func(log_probs, Y_train)
        loss.backward()
        optimizer.step()
        print(epoch, loss.item())
        losses.append(loss)
    return losses

# Load Data

In [45]:
train, train_labels, train_vocab = read_corpus(DATADIR + 'twpos-train.tsv')
dev, dev_labels, dev_vocab = read_corpus(DATADIR + 'twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab = read_corpus(DATADIR + 'twpos-devtest.tsv')

In [75]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(train_labels)))
Y_train = label_encoder.transform(train_labels)
Y_dev = label_encoder.transform(dev_labels)
Y_devtest = label_encoder.transform(devtest_labels)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_dev = torch.tensor(Y_dev, dtype=torch.long)
Y_devtest = torch.tensor(Y_devtest, dtype=torch.long)

In [55]:
vocab = train_vocab.copy()
vocab.update(dev_vocab)
vocab.update(devtest_vocab)

# 1. Baseline w/ Randomly Initialized Embeddings

In [11]:
# construct maps for randomly initialized embs
idx2word_rand = sorted(vocab)
idx2word_rand += ['<s>', '</s>', 'UUUNKKK']
word2idx_rand = {word: idx for idx, word in enumerate(idx2word_rand)}

## Encode Train, Dev, DevTest

In [29]:
# w = 0
X_train_w0 = encode_lines(train, word2idx_rand, window_size=0)
X_dev_w0 = encode_lines(dev, word2idx_rand, window_size=0)
X_devtest_w0 = encode_lines(devtest, word2idx_rand, window_size=0)

In [30]:
# w = 1
X_train_w3 = encode_lines(train, word2idx_rand, window_size=1)
X_dev_w3 = encode_lines(dev, word2idx_rand, window_size=1)
X_devtest_w3 = encode_lines(devtest, word2idx_rand, window_size=1)

## Train Model

### w = 0

In [96]:
model = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=0,
                          output_dim=len(all_labels))
train_util(model, X_train_w0, Y_train, X_dev_w0, Y_dev, n_epochs=10)

0 3.266125202178955
1 2.9613800048828125
2 2.7306466102600098
3 2.5591845512390137
4 2.421276807785034
5 2.310708522796631
6 2.221574544906616
7 2.148451328277588
8 2.0876646041870117
9 2.036442279815674


[tensor(3.2661, grad_fn=<NllLossBackward>),
 tensor(2.9614, grad_fn=<NllLossBackward>),
 tensor(2.7306, grad_fn=<NllLossBackward>),
 tensor(2.5592, grad_fn=<NllLossBackward>),
 tensor(2.4213, grad_fn=<NllLossBackward>),
 tensor(2.3107, grad_fn=<NllLossBackward>),
 tensor(2.2216, grad_fn=<NllLossBackward>),
 tensor(2.1485, grad_fn=<NllLossBackward>),
 tensor(2.0877, grad_fn=<NllLossBackward>),
 tensor(2.0364, grad_fn=<NllLossBackward>)]

In [None]:
# construct maps for pretrained word embs