In [4]:
import re
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [5]:
np.set_printoptions(precision=4)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7fe331513930>

In [6]:
DATADIR = 'data/'

In [7]:
def read_corpus(file):
    """
    returns:
        lines: [['hello', 'world'], ...]
        labels: [[!], [N], ...]
        vocab
    """
    with open(file, 'rt') as f:
        text = f.read()
    lines = text.split('\n\n')
    ret_lines = []
    labels = []
    vocab = set()
    for line in lines:
        if not line: 
            continue
        curr_line = []
        for token_label_str in line.split('\n'):
            if not token_label_str: 
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            labels.append(label)
            curr_line.append(token)
        ret_lines.append(curr_line)
    return ret_lines, labels, vocab

In [8]:
def encode_lines(lines, word2idx_map, window_size):
    """
    returns X: len(lines) x (2 * window_size + 1)
    """
    def encode_line(line, word2idx_map, window_size):
        num_repr = [] # numerical representation
        for word in line:
            num = word2idx_map.get(word, word2idx_map['UUUNKKK'])
            num_repr.append(num)
        # pad with start and end tokens
        start = [word2idx_map['<s>']] * window_size
        end = [word2idx_map['</s>']] * window_size
        padded = start + num_repr + end
        
        ret = []
        for i in range(window_size, len(padded) - window_size):
            windowed = padded[i - window_size : i + window_size + 1]
            ret.append(windowed)
            
        return ret
    
    res = []
    for line in lines:
        res.extend(encode_line(line, word2idx_map, window_size))
    return torch.tensor(res)

In [9]:
def construct_features(lines, features):
    """
    features: a set of characters to look for in each token
    also append to the end:
        (count features are normalized during training)
        - a count feature for digits
        - a count feature for the length of the entire token
    returns X: len(lines) x num_features
    """
    res = []
    for line in lines:
        for word in line:
            word_features = []
            char_set = set(word)
            for feature in features:
                val = 1 if feature in char_set else 0
                word_features.append(val)

            word_features.append(len(re.findall(r'\d', word))) # digit feature
            word_features.append(len(word)) # count feature
        
            res.append(word_features)

    return torch.tensor(res)

# Network

In [29]:
class FeedForwardTagger(nn.Module):
    
    def __init__(self, vocab_size, window_size, output_dim,
                 emb_dim=50, pretrained_emb=None, freeze=False,
                num_binary_features=0, num_count_features=0):
        """
        count features will be batch-normalized during forward
        """
        
        super(FeedForwardTagger, self).__init__()
        
        self.num_bin_feat = num_binary_features
        self.num_cnt_feat = num_count_features
        
        if pretrained_emb:
            self.emb = nn.Embedding.from_pretrained(pretrain_emb)
        else:
            self.emb = nn.Embedding(vocab_size, emb_dim)
            torch.nn.init.uniform_(self.emb.weight, -0.01, 0.01)
        
        input_dim = (2 * window_size + 1) * emb_dim
        input_dim += self.num_bin_feat + self.num_cnt_feat
        
        if self.num_cnt_feat != 0:
            self.batchnorm = nn.BatchNorm1d(self.num_cnt_feat)
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
    
    def forward(self, inputs):
        """
        input: [emb | binary features | count features]
        """
        num_feat = self.num_bin_feat + self.num_cnt_feat
        if num_feat != 0:
            to_embed = inputs[:, :-num_feat]
            bin_feats = inputs[:, -num_feat : -self.num_cnt_feat]
            cnt_feats = inputs[:, -self.num_cnt_feat:]
            
            # embed up to num_extra_features
            embeds = self.emb(to_embed).view((inputs.shape[0], -1))
            # normalize count features
            cnt_feats = self.batchnorm(cnt_feats.float())
            # concat emb w/ extra features
            x = torch.cat((embeds, bin_feats, cnt_feats), dim=1)
        else:
            x = embeds = self.emb(inputs).view((inputs.shape[0], -1))
            
        out = torch.tanh(self.fc1(x))
        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [11]:
def train_util(model, X_train, Y_train, X_dev, Y_dev, n_epochs, lr, 
              batch_size):
    """
    returns: best_model, losses, train_accu_list, dev_accu_list
    """
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    best_model = None
    best_dev_accu = 0
    losses = []
    train_accu_list, dev_accu_list = [], []
    for epoch in range(n_epochs):
        epoch_loss = 0
        
        for i in range(0, X_train.shape[0], batch_size):
            optimizer.zero_grad()
            log_probs = model(X_train[i : i + batch_size])
            loss = loss_func(log_probs, Y_train[i : i + batch_size])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        train_preds = torch.argmax(model(X_train), dim=1)
        train_accu = accuracy_score(Y_train, train_preds)
        # evaluate on dev
        dev_preds = torch.argmax(model(X_dev), dim=1)
        dev_accu = accuracy_score(Y_dev, dev_preds)
        
        # early stopping, save the model if it has improved on dev
        if dev_accu > best_dev_accu:
            best_dev_accu = dev_accu
            best_model = deepcopy(model)
        
        print('Epoch {}: train_loss {:.4f}, train_accu: {:.4f}, dev_accu: {:.4f}'\
              .format(epoch, epoch_loss, train_accu, dev_accu))
        losses.append(epoch_loss)
        train_accu_list.append(train_accu)
        dev_accu_list.append(dev_accu)
        
    loss_accu_df = pd.DataFrame({
        'epoch': range(n_epochs), 
        'loss': losses,
        'train_accu': train_accu_list,
        'dev_accu': dev_accu_list})
        
    return best_model, loss_accu_df

# Plotting Utils

In [12]:
def plot_loss_accu(loss_accu_df_list, window_list):
    """
    input: two lists of the same length, loss_accu_df, window
    """
    dfs = []
    for df, w in zip(loss_accu_df_list, window_list):
        df1 = df.melt(
            'epoch', value_vars=['loss']).assign(window=w, plot='loss')
        df2 = df.melt(
            'epoch', value_vars=['train_accu', 'dev_accu']).assign(window=w, plot='accu')
        dfs.extend([df1, df2])
    plot_df = pd.concat(dfs)

    g = sns.FacetGrid(data=plot_df, row='plot', col='window', 
                      hue='variable', sharey=False)
    g.map_dataframe(sns.lineplot, x='epoch', y='value')
    g.add_legend()

def plot_confusion_matrix(matrix, labels, title):
    plt.figure(figsize=(10, 6))
    ax = sns.heatmap(matrix, xticklabels=labels, yticklabels=labels, 
                     annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predictions')
    ax.set_ylabel('True labels')
    ax.set_title(title)

# Load Data

In [13]:
train, train_labels, train_vocab = read_corpus(DATADIR + 'twpos-train.tsv')
dev, dev_labels, dev_vocab = read_corpus(DATADIR + 'twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab = read_corpus(DATADIR + 'twpos-devtest.tsv')

In [14]:
all_labels = np.unique(train_labels)
all_labels_devtest = np.unique(devtest_labels) # devtest labels

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
Y_train = label_encoder.transform(train_labels)
Y_dev = label_encoder.transform(dev_labels)
Y_devtest = label_encoder.transform(devtest_labels)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_dev = torch.tensor(Y_dev, dtype=torch.long)
Y_devtest = torch.tensor(Y_devtest, dtype=torch.long)

In [15]:
# devtest labels
all_labels_devtest = np.unique(devtest_labels)

In [16]:
vocab = train_vocab.copy()
vocab.update(dev_vocab)
vocab.update(devtest_vocab)

# 1. Baseline w/ Randomly Initialized Embeddings

In [17]:
# construct maps for randomly initialized embs
idx2word_rand = sorted(vocab)
idx2word_rand += ['<s>', '</s>', 'UUUNKKK']
word2idx_rand = {word: idx for idx, word in enumerate(idx2word_rand)}

## Encode Train, Dev, DevTest

In [22]:
# w = 0
X_train_w0 = encode_lines(train, word2idx_rand, window_size=0)
X_dev_w0 = encode_lines(dev, word2idx_rand, window_size=0)
X_devtest_w0 = encode_lines(devtest, word2idx_rand, window_size=0)

In [23]:
# w = 1
X_train_w1 = encode_lines(train, word2idx_rand, window_size=1)
X_dev_w1 = encode_lines(dev, word2idx_rand, window_size=1)
X_devtest_w1 = encode_lines(devtest, word2idx_rand, window_size=1)

## Train Model

### w = 0

In [None]:
model_w0 = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=0,
                          output_dim=len(all_labels))
best_model_w0, df_w0 = train_util(model_w0, X_train_w0, Y_train, X_dev_w0, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=2000)

In [None]:
# evaluate on devtest
devtest_preds = torch.argmax(best_model_w0(X_devtest_w0), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w0 = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

### w = 1

In [None]:
model_w1 = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=1,
                          output_dim=len(all_labels))
best_model_w1, df_w1 = train_util(model_w1, X_train_w1, Y_train, X_dev_w1, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

In [None]:
devtest_preds = torch.argmax(best_model_w1(X_devtest_w1), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1 = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

## Plot losses and accuracy, and confusion matrix

In [None]:
plot_loss_accu([df_w0, df_w1], window_list=[0, 1])

In [None]:
plot_confusion_matrix(conf_matrix_w0, all_labels_devtest, 'w=0')

In [None]:
plot_confusion_matrix(conf_matrix_w0, all_labels_devtest, 'w=1')

# 2. Feature Engineering
In addition to the following binary features, I also added a count feature for digits and a count feature for the length of the entire token.

In [24]:
FEATURES = ['#', '%', "'", '/', ':', '’']

In [25]:
X_train_features = construct_features(train, FEATURES)
X_dev_features = construct_features(dev, FEATURES)
X_devtest_features = construct_features(devtest, FEATURES)

In [26]:
X_train_w0_feat = torch.cat((X_train_w0, X_train_features), dim=1)
X_dev_w0_feat = torch.cat((X_dev_w0, X_dev_features), dim=1)
X_devtest_w0_feat = torch.cat((X_devtest_w0, X_devtest_features), dim=1)

In [27]:
X_train_w1_feat = torch.cat((X_train_w1, X_train_features), dim=1)
X_dev_w1_feat = torch.cat((X_dev_w1, X_dev_features), dim=1)
X_devtest_w1_feat = torch.cat((X_devtest_w1, X_devtest_features), dim=1)

### W=0

In [30]:
model_w0_feat = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                                  window_size=0, 
                                  output_dim=len(all_labels),
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w0_feat, df_w0_feat = \
train_util(model_w0_feat, X_train_w0_feat, Y_train, X_dev_w0_feat, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

Epoch 0: train_loss 44.4099, train_accu: 0.2244, dev_accu: 0.2358
Epoch 1: train_loss 39.6456, train_accu: 0.3888, dev_accu: 0.3804
Epoch 2: train_loss 29.9395, train_accu: 0.5029, dev_accu: 0.5005
Epoch 3: train_loss 24.2968, train_accu: 0.5964, dev_accu: 0.5878
Epoch 4: train_loss 20.8838, train_accu: 0.6673, dev_accu: 0.6636
Epoch 5: train_loss 18.6234, train_accu: 0.6963, dev_accu: 0.6816
Epoch 6: train_loss 15.9197, train_accu: 0.7255, dev_accu: 0.7071
Epoch 7: train_loss 13.9421, train_accu: 0.7529, dev_accu: 0.7308
Epoch 8: train_loss 12.3033, train_accu: 0.8041, dev_accu: 0.7525
Epoch 9: train_loss 10.8349, train_accu: 0.8329, dev_accu: 0.7635
Epoch 10: train_loss 9.4517, train_accu: 0.8639, dev_accu: 0.7693
Epoch 11: train_loss 8.4453, train_accu: 0.8741, dev_accu: 0.7714
Epoch 12: train_loss 7.7078, train_accu: 0.8858, dev_accu: 0.7789
Epoch 13: train_loss 6.8678, train_accu: 0.8987, dev_accu: 0.7864
Epoch 14: train_loss 6.2967, train_accu: 0.9034, dev_accu: 0.7861
Epoch 15: 

In [31]:
devtest_preds = torch.argmax(best_model_w0_feat(X_devtest_w0_feat), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w0_feat = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8021


## w=1

In [32]:
model_w1_feat = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                                  window_size=1, 
                                  output_dim=len(all_labels),
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w1_feat, df_w1_feat = \
train_util(model_w1_feat, X_train_w1_feat, Y_train, X_dev_w1_feat, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

Epoch 0: train_loss 45.1374, train_accu: 0.1795, dev_accu: 0.1854
Epoch 1: train_loss 39.4513, train_accu: 0.2642, dev_accu: 0.2539
Epoch 2: train_loss 34.2350, train_accu: 0.3862, dev_accu: 0.3748
Epoch 3: train_loss 25.4433, train_accu: 0.6241, dev_accu: 0.6169
Epoch 4: train_loss 19.7850, train_accu: 0.6755, dev_accu: 0.6677
Epoch 5: train_loss 16.8015, train_accu: 0.7195, dev_accu: 0.7019
Epoch 6: train_loss 14.6386, train_accu: 0.7490, dev_accu: 0.7283
Epoch 7: train_loss 12.7468, train_accu: 0.7805, dev_accu: 0.7484
Epoch 8: train_loss 11.1426, train_accu: 0.8099, dev_accu: 0.7650
Epoch 9: train_loss 9.8024, train_accu: 0.8365, dev_accu: 0.7772
Epoch 10: train_loss 8.6743, train_accu: 0.8507, dev_accu: 0.7818
Epoch 11: train_loss 7.5694, train_accu: 0.8794, dev_accu: 0.7963
Epoch 12: train_loss 6.6847, train_accu: 0.8977, dev_accu: 0.8023
Epoch 13: train_loss 5.9266, train_accu: 0.9105, dev_accu: 0.8108
Epoch 14: train_loss 5.2365, train_accu: 0.9197, dev_accu: 0.8146
Epoch 15: t

In [33]:
devtest_preds = torch.argmax(best_model_w1_feat(X_devtest_w1_feat), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1_feat = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8441


# 3. Pretrained Embeddings

In [None]:
# construct maps for pretrained word embs

# 4. Architecture Engineering
## w = 2

In [None]:
# w = 2
X_train_w2 = encode_lines(train, word2idx_rand, window_size=2)
X_dev_w2 = encode_lines(dev, word2idx_rand, window_size=2)
X_devtest_w2 = encode_lines(devtest, word2idx_rand, window_size=2)