In [97]:
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
DATADIR = 'data/'

In [44]:
def read_corpus(file):
    """
    returns:
        lines: [['hello', 'world'], ...]
        labels: [[!], [N], ...]
        vocab
    """
    with open(file, 'rt') as f:
        text = f.read()
    lines = text.split('\n\n')
    ret_lines = []
    labels = []
    vocab = set()
    for line in lines:
        if not line: 
            continue
        curr_line = []
        for token_label_str in line.split('\n'):
            if not token_label_str: 
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            labels.append(label)
            curr_line.append(token)
        ret_lines.append(curr_line)
    return ret_lines, labels, vocab

In [25]:
def encode_lines(lines, word2idx_map, window_size):
    """
    returns X: len(lines) x (2 * window_size + 1)
    """
    def encode_line(line, word2idx_map, window_size):
        num_repr = [] # numerical representation
        for word in line:
            num = word2idx_map.get(word, word2idx_map['UUUNKKK'])
            num_repr.append(num)
        # pad with start and end tokens
        start = [word2idx_map['<s>']] * window_size
        end = [word2idx_map['</s>']] * window_size
        padded = start + num_repr + end
        
        ret = []
        for i in range(window_size, len(padded) - window_size):
            windowed = padded[i - window_size : i + window_size + 1]
            ret.append(windowed)
            
        return ret
    
    res = []
    for line in lines:
        res.extend(encode_line(line, word2idx_map, window_size))
    return torch.tensor(res)

# Network

In [95]:
class FeedForwardTagger(nn.Module):
    
    def __init__(self, vocab_size, window_size, output_dim,
                 emb_dim=50, pretrained_emb=None, freeze=False):
        super(FeedForwardTagger, self).__init__()
        if pretrained_emb:
            self.emb = nn.Embedding.from_pretrained(pretrain_emb)
        else:
            self.emb = nn.Embedding(vocab_size, emb_dim)
        input_dim = (2 * window_size + 1) * emb_dim
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
    
    def forward(self, inputs):
        embeds = self.emb(inputs).squeeze()
        out = F.tanh(self.fc1(embeds))
        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [113]:
def train_util(model, X_train, Y_train, X_dev, Y_dev, n_epochs):
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=5)
    
    best_model = None
    losses = []
    train_accu_list, dev_accu_list = [], []
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        log_probs = model(X_train)
        loss = loss_func(log_probs, Y_train)
        loss.backward()
        optimizer.step()
        
        train_accu = accuracy_score(Y_train, 
                                    torch.argmax(log_probs, dim=1))
        # evaluate on dev
        dev_preds = torch.argmax(model(X_dev), dim=1)
        dev_accu = accuracy_score(Y_dev, dev_preds)
        
        print(epoch, loss.item(), train_accu, dev_accu)
        losses.append(loss)
        train_accu_list.append(train_accu)
        dev_accu_list.append(dev_accu)
    return losses, train_accu_list, dev_accu_list

# Load Data

In [45]:
train, train_labels, train_vocab = read_corpus(DATADIR + 'twpos-train.tsv')
dev, dev_labels, dev_vocab = read_corpus(DATADIR + 'twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab = read_corpus(DATADIR + 'twpos-devtest.tsv')

In [75]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(train_labels)))
Y_train = label_encoder.transform(train_labels)
Y_dev = label_encoder.transform(dev_labels)
Y_devtest = label_encoder.transform(devtest_labels)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_dev = torch.tensor(Y_dev, dtype=torch.long)
Y_devtest = torch.tensor(Y_devtest, dtype=torch.long)

In [55]:
vocab = train_vocab.copy()
vocab.update(dev_vocab)
vocab.update(devtest_vocab)

# 1. Baseline w/ Randomly Initialized Embeddings

In [11]:
# construct maps for randomly initialized embs
idx2word_rand = sorted(vocab)
idx2word_rand += ['<s>', '</s>', 'UUUNKKK']
word2idx_rand = {word: idx for idx, word in enumerate(idx2word_rand)}

## Encode Train, Dev, DevTest

In [29]:
# w = 0
X_train_w0 = encode_lines(train, word2idx_rand, window_size=0)
X_dev_w0 = encode_lines(dev, word2idx_rand, window_size=0)
X_devtest_w0 = encode_lines(devtest, word2idx_rand, window_size=0)

In [30]:
# w = 1
X_train_w3 = encode_lines(train, word2idx_rand, window_size=1)
X_dev_w3 = encode_lines(dev, word2idx_rand, window_size=1)
X_devtest_w3 = encode_lines(devtest, word2idx_rand, window_size=1)

## Train Model

### w = 0

In [114]:
model = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=0,
                          output_dim=len(all_labels))
train_util(model, X_train_w0, Y_train, X_dev_w0, Y_dev, n_epochs=80)

0 3.2219648361206055 0.0655575014594279 0.3437046255963493
1 2.2807838916778564 0.34740221833041446 0.4517734909769757
2 1.9778130054473877 0.45125510799766494 0.45384774942957895
3 1.8543860912322998 0.46468184471687096 0.4781165733250363
4 1.7962263822555542 0.48488032691185057 0.46069280232316945
5 1.8289300203323364 0.4687098657326328 0.4706492428956648
6 1.8158886432647705 0.4692936368943374 0.39742791951877204
7 2.0002593994140625 0.40758902510215994 0.4463804190002074
8 1.904037594795227 0.43887915936952715 0.46608587429993775
9 1.82635498046875 0.47046117921774666 0.5121344119477287
10 1.7450987100601196 0.5076474022183304 0.5086081725783033
11 1.6692945957183838 0.508990075890251 0.5193943165318399
12 1.6115907430648804 0.5213076474022184 0.5098527276498652
13 1.6431019306182861 0.512784588441331 0.527276498651732
14 1.6350911855697632 0.5267367192060712 0.536403235843186
15 1.5908668041229248 0.5405137186223 0.5420037336652147
16 1.5037978887557983 0.5425569176882662 0.548641

([tensor(3.2220, grad_fn=<NllLossBackward>),
  tensor(2.2808, grad_fn=<NllLossBackward>),
  tensor(1.9778, grad_fn=<NllLossBackward>),
  tensor(1.8544, grad_fn=<NllLossBackward>),
  tensor(1.7962, grad_fn=<NllLossBackward>),
  tensor(1.8289, grad_fn=<NllLossBackward>),
  tensor(1.8159, grad_fn=<NllLossBackward>),
  tensor(2.0003, grad_fn=<NllLossBackward>),
  tensor(1.9040, grad_fn=<NllLossBackward>),
  tensor(1.8264, grad_fn=<NllLossBackward>),
  tensor(1.7451, grad_fn=<NllLossBackward>),
  tensor(1.6693, grad_fn=<NllLossBackward>),
  tensor(1.6116, grad_fn=<NllLossBackward>),
  tensor(1.6431, grad_fn=<NllLossBackward>),
  tensor(1.6351, grad_fn=<NllLossBackward>),
  tensor(1.5909, grad_fn=<NllLossBackward>),
  tensor(1.5038, grad_fn=<NllLossBackward>),
  tensor(1.5018, grad_fn=<NllLossBackward>),
  tensor(1.4518, grad_fn=<NllLossBackward>),
  tensor(1.4666, grad_fn=<NllLossBackward>),
  tensor(1.4643, grad_fn=<NllLossBackward>),
  tensor(1.4154, grad_fn=<NllLossBackward>),
  tensor(1

In [None]:
# construct maps for pretrained word embs