In [4]:
import re
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [5]:
np.set_printoptions(precision=4)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7fe331513930>

In [50]:
DATADIR = 'data/'
EMB_DIM = 50 # embedding dimension

In [7]:
def read_corpus(file):
    """
    returns:
        lines: [['hello', 'world'], ...]
        labels: [[!], [N], ...]
        vocab
    """
    with open(file, 'rt') as f:
        text = f.read()
    lines = text.split('\n\n')
    ret_lines = []
    labels = []
    vocab = set()
    for line in lines:
        if not line: 
            continue
        curr_line = []
        for token_label_str in line.split('\n'):
            if not token_label_str: 
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            labels.append(label)
            curr_line.append(token)
        ret_lines.append(curr_line)
    return ret_lines, labels, vocab

In [8]:
def encode_lines(lines, word2idx_map, window_size):
    """
    returns X: len(lines) x (2 * window_size + 1)
    """
    def encode_line(line, word2idx_map, window_size):
        num_repr = [] # numerical representation
        for word in line:
            num = word2idx_map.get(word, word2idx_map['UUUNKKK'])
            num_repr.append(num)
        # pad with start and end tokens
        start = [word2idx_map['<s>']] * window_size
        end = [word2idx_map['</s>']] * window_size
        padded = start + num_repr + end
        
        ret = []
        for i in range(window_size, len(padded) - window_size):
            windowed = padded[i - window_size : i + window_size + 1]
            ret.append(windowed)
            
        return ret
    
    res = []
    for line in lines:
        res.extend(encode_line(line, word2idx_map, window_size))
    return torch.tensor(res)

In [9]:
def construct_features(lines, features):
    """
    features: a set of characters to look for in each token
    also append to the end:
        (count features are normalized during training)
        - a count feature for digits
        - a count feature for the length of the entire token
    returns X: len(lines) x num_features
    """
    res = []
    for line in lines:
        for word in line:
            word_features = []
            char_set = set(word)
            for feature in features:
                val = 1 if feature in char_set else 0
                word_features.append(val)

            word_features.append(len(re.findall(r'\d', word))) # digit feature
            word_features.append(len(word)) # count feature
        
            res.append(word_features)

    return torch.tensor(res)

# Network

In [127]:
class FeedForwardTagger(nn.Module):
    
    def __init__(self, window_size, output_dim, emb_dim=EMB_DIM, 
                 vocab_size=None, pretrained_emb=None, freeze=False,
                num_binary_features=0, num_count_features=0):
        """
        vocab_size is None when using pretrained emb
        count features will be batch-normalized during forward
        """
        
        super(FeedForwardTagger, self).__init__()
        
        self.num_bin_feat = num_binary_features
        self.num_cnt_feat = num_count_features
        
        if pretrained_emb is None:
            self.emb = nn.Embedding(vocab_size, emb_dim)
            torch.nn.init.uniform_(self.emb.weight, -0.01, 0.01)
        else:
            self.emb = nn.Embedding.from_pretrained(pretrained_emb, freeze=freeze)
        
        input_dim = (2 * window_size + 1) * emb_dim
        input_dim += self.num_bin_feat + self.num_cnt_feat
        
        if self.num_cnt_feat != 0:
            self.batchnorm = nn.BatchNorm1d(self.num_cnt_feat)
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
    
    def forward(self, inputs):
        """
        input: [emb | binary features | count features]
        """
        num_feat = self.num_bin_feat + self.num_cnt_feat
        if num_feat != 0:
            to_embed = inputs[:, :-num_feat]
            bin_feats = inputs[:, -num_feat : -self.num_cnt_feat]
            cnt_feats = inputs[:, -self.num_cnt_feat:]
            
            # embed up to num_extra_features
            embeds = self.emb(to_embed).view((inputs.shape[0], -1))
            # normalize count features
            cnt_feats = self.batchnorm(cnt_feats.float())
            # concat emb w/ extra features
            x = torch.cat((embeds, bin_feats, cnt_feats), dim=1)
        else:
            x = self.emb(inputs).view((inputs.shape[0], -1))
            
        out = torch.tanh(self.fc1(x))
        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [11]:
def train_util(model, X_train, Y_train, X_dev, Y_dev, n_epochs, lr, 
              batch_size):
    """
    returns: best_model, losses, train_accu_list, dev_accu_list
    """
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    best_model = None
    best_dev_accu = 0
    losses = []
    train_accu_list, dev_accu_list = [], []
    for epoch in range(n_epochs):
        epoch_loss = 0
        
        for i in range(0, X_train.shape[0], batch_size):
            optimizer.zero_grad()
            log_probs = model(X_train[i : i + batch_size])
            loss = loss_func(log_probs, Y_train[i : i + batch_size])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        train_preds = torch.argmax(model(X_train), dim=1)
        train_accu = accuracy_score(Y_train, train_preds)
        # evaluate on dev
        dev_preds = torch.argmax(model(X_dev), dim=1)
        dev_accu = accuracy_score(Y_dev, dev_preds)
        
        # early stopping, save the model if it has improved on dev
        if dev_accu > best_dev_accu:
            best_dev_accu = dev_accu
            best_model = deepcopy(model)
        
        print('Epoch {}: train_loss {:.4f}, train_accu: {:.4f}, dev_accu: {:.4f}'\
              .format(epoch, epoch_loss, train_accu, dev_accu))
        losses.append(epoch_loss)
        train_accu_list.append(train_accu)
        dev_accu_list.append(dev_accu)
        
    loss_accu_df = pd.DataFrame({
        'epoch': range(n_epochs), 
        'loss': losses,
        'train_accu': train_accu_list,
        'dev_accu': dev_accu_list})
        
    return best_model, loss_accu_df

# Plotting Utils

In [12]:
def plot_loss_accu(loss_accu_df_list, window_list):
    """
    input: two lists of the same length, loss_accu_df, window
    """
    dfs = []
    for df, w in zip(loss_accu_df_list, window_list):
        df1 = df.melt(
            'epoch', value_vars=['loss']).assign(window=w, plot='loss')
        df2 = df.melt(
            'epoch', value_vars=['train_accu', 'dev_accu']).assign(window=w, plot='accu')
        dfs.extend([df1, df2])
    plot_df = pd.concat(dfs)

    g = sns.FacetGrid(data=plot_df, row='plot', col='window', 
                      hue='variable', sharey=False)
    g.map_dataframe(sns.lineplot, x='epoch', y='value')
    g.add_legend()

def plot_confusion_matrix(matrix, labels, title):
    plt.figure(figsize=(10, 6))
    ax = sns.heatmap(matrix, xticklabels=labels, yticklabels=labels, 
                     annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predictions')
    ax.set_ylabel('True labels')
    ax.set_title(title)

# Load Data

In [13]:
train, train_labels, train_vocab = read_corpus(DATADIR + 'twpos-train.tsv')
dev, dev_labels, dev_vocab = read_corpus(DATADIR + 'twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab = read_corpus(DATADIR + 'twpos-devtest.tsv')

In [14]:
all_labels = np.unique(train_labels)
all_labels_devtest = np.unique(devtest_labels) # devtest labels

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
Y_train = label_encoder.transform(train_labels)
Y_dev = label_encoder.transform(dev_labels)
Y_devtest = label_encoder.transform(devtest_labels)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_dev = torch.tensor(Y_dev, dtype=torch.long)
Y_devtest = torch.tensor(Y_devtest, dtype=torch.long)

In [15]:
# devtest labels
all_labels_devtest = np.unique(devtest_labels)

In [16]:
vocab = train_vocab.copy()
vocab.update(dev_vocab)
vocab.update(devtest_vocab)

# 1. Baseline w/ Randomly Initialized Embeddings

In [17]:
# construct maps for randomly initialized embs
idx2word_rand = sorted(vocab)
idx2word_rand += ['<s>', '</s>', 'UUUNKKK']
word2idx_rand = {word: idx for idx, word in enumerate(idx2word_rand)}

## Encode Train, Dev, DevTest

In [22]:
# w = 0
X_train_w0 = encode_lines(train, word2idx_rand, window_size=0)
X_dev_w0 = encode_lines(dev, word2idx_rand, window_size=0)
X_devtest_w0 = encode_lines(devtest, word2idx_rand, window_size=0)

In [23]:
# w = 1
X_train_w1 = encode_lines(train, word2idx_rand, window_size=1)
X_dev_w1 = encode_lines(dev, word2idx_rand, window_size=1)
X_devtest_w1 = encode_lines(devtest, word2idx_rand, window_size=1)

## Train Model

### w = 0

In [67]:
model_w0 = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=0,
                          output_dim=len(all_labels))
best_model_w0, df_w0 = train_util(model_w0, X_train_w0, Y_train, X_dev_w0, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=2000)

Epoch 0: train_loss 25.5599, train_accu: 0.1514, dev_accu: 0.1558
Epoch 1: train_loss 23.7730, train_accu: 0.2286, dev_accu: 0.2340
Epoch 2: train_loss 22.1405, train_accu: 0.2680, dev_accu: 0.2690
Epoch 3: train_loss 20.1998, train_accu: 0.3710, dev_accu: 0.3727
Epoch 4: train_loss 17.8418, train_accu: 0.4738, dev_accu: 0.4723
Epoch 5: train_loss 15.4302, train_accu: 0.4986, dev_accu: 0.4960
Epoch 6: train_loss 14.1085, train_accu: 0.5089, dev_accu: 0.5065
Epoch 7: train_loss 13.2731, train_accu: 0.5258, dev_accu: 0.5248
Epoch 8: train_loss 12.4983, train_accu: 0.5381, dev_accu: 0.5403
Epoch 9: train_loss 11.7002, train_accu: 0.6463, dev_accu: 0.6304
Epoch 10: train_loss 11.0439, train_accu: 0.5687, dev_accu: 0.5671
Epoch 11: train_loss 10.4354, train_accu: 0.6447, dev_accu: 0.6320
Epoch 12: train_loss 9.8166, train_accu: 0.7130, dev_accu: 0.6743
Epoch 13: train_loss 8.9609, train_accu: 0.6921, dev_accu: 0.6737
Epoch 14: train_loss 8.3989, train_accu: 0.7376, dev_accu: 0.6988
Epoch 15

In [None]:
# evaluate on devtest
devtest_preds = torch.argmax(best_model_w0(X_devtest_w0), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w0 = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

### w = 1

In [None]:
model_w1 = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=1,
                          output_dim=len(all_labels))
best_model_w1, df_w1 = train_util(model_w1, X_train_w1, Y_train, X_dev_w1, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

In [None]:
devtest_preds = torch.argmax(best_model_w1(X_devtest_w1), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1 = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

## Plot losses and accuracy, and confusion matrix

In [None]:
plot_loss_accu([df_w0, df_w1], window_list=[0, 1])

In [None]:
plot_confusion_matrix(conf_matrix_w0, all_labels_devtest, 'w=0')

In [None]:
plot_confusion_matrix(conf_matrix_w0, all_labels_devtest, 'w=1')

# 2. Feature Engineering
In addition to the following binary features, I also added a count feature for digits and a count feature for the length of the entire token.

In [24]:
FEATURES = ['#', '%', "'", '/', ':', '’']

In [25]:
X_train_features = construct_features(train, FEATURES)
X_dev_features = construct_features(dev, FEATURES)
X_devtest_features = construct_features(devtest, FEATURES)

In [26]:
X_train_w0_feat = torch.cat((X_train_w0, X_train_features), dim=1)
X_dev_w0_feat = torch.cat((X_dev_w0, X_dev_features), dim=1)
X_devtest_w0_feat = torch.cat((X_devtest_w0, X_devtest_features), dim=1)

In [27]:
X_train_w1_feat = torch.cat((X_train_w1, X_train_features), dim=1)
X_dev_w1_feat = torch.cat((X_dev_w1, X_dev_features), dim=1)
X_devtest_w1_feat = torch.cat((X_devtest_w1, X_devtest_features), dim=1)

### W=0

In [30]:
model_w0_feat = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                                  window_size=0, 
                                  output_dim=len(all_labels),
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w0_feat, df_w0_feat = \
train_util(model_w0_feat, X_train_w0_feat, Y_train, X_dev_w0_feat, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

Epoch 0: train_loss 44.4099, train_accu: 0.2244, dev_accu: 0.2358
Epoch 1: train_loss 39.6456, train_accu: 0.3888, dev_accu: 0.3804
Epoch 2: train_loss 29.9395, train_accu: 0.5029, dev_accu: 0.5005
Epoch 3: train_loss 24.2968, train_accu: 0.5964, dev_accu: 0.5878
Epoch 4: train_loss 20.8838, train_accu: 0.6673, dev_accu: 0.6636
Epoch 5: train_loss 18.6234, train_accu: 0.6963, dev_accu: 0.6816
Epoch 6: train_loss 15.9197, train_accu: 0.7255, dev_accu: 0.7071
Epoch 7: train_loss 13.9421, train_accu: 0.7529, dev_accu: 0.7308
Epoch 8: train_loss 12.3033, train_accu: 0.8041, dev_accu: 0.7525
Epoch 9: train_loss 10.8349, train_accu: 0.8329, dev_accu: 0.7635
Epoch 10: train_loss 9.4517, train_accu: 0.8639, dev_accu: 0.7693
Epoch 11: train_loss 8.4453, train_accu: 0.8741, dev_accu: 0.7714
Epoch 12: train_loss 7.7078, train_accu: 0.8858, dev_accu: 0.7789
Epoch 13: train_loss 6.8678, train_accu: 0.8987, dev_accu: 0.7864
Epoch 14: train_loss 6.2967, train_accu: 0.9034, dev_accu: 0.7861
Epoch 15: 

In [31]:
devtest_preds = torch.argmax(best_model_w0_feat(X_devtest_w0_feat), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w0_feat = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8021


## w=1

In [32]:
model_w1_feat = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                                  window_size=1, 
                                  output_dim=len(all_labels),
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w1_feat, df_w1_feat = \
train_util(model_w1_feat, X_train_w1_feat, Y_train, X_dev_w1_feat, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

Epoch 0: train_loss 45.1374, train_accu: 0.1795, dev_accu: 0.1854
Epoch 1: train_loss 39.4513, train_accu: 0.2642, dev_accu: 0.2539
Epoch 2: train_loss 34.2350, train_accu: 0.3862, dev_accu: 0.3748
Epoch 3: train_loss 25.4433, train_accu: 0.6241, dev_accu: 0.6169
Epoch 4: train_loss 19.7850, train_accu: 0.6755, dev_accu: 0.6677
Epoch 5: train_loss 16.8015, train_accu: 0.7195, dev_accu: 0.7019
Epoch 6: train_loss 14.6386, train_accu: 0.7490, dev_accu: 0.7283
Epoch 7: train_loss 12.7468, train_accu: 0.7805, dev_accu: 0.7484
Epoch 8: train_loss 11.1426, train_accu: 0.8099, dev_accu: 0.7650
Epoch 9: train_loss 9.8024, train_accu: 0.8365, dev_accu: 0.7772
Epoch 10: train_loss 8.6743, train_accu: 0.8507, dev_accu: 0.7818
Epoch 11: train_loss 7.5694, train_accu: 0.8794, dev_accu: 0.7963
Epoch 12: train_loss 6.6847, train_accu: 0.8977, dev_accu: 0.8023
Epoch 13: train_loss 5.9266, train_accu: 0.9105, dev_accu: 0.8108
Epoch 14: train_loss 5.2365, train_accu: 0.9197, dev_accu: 0.8146
Epoch 15: t

In [33]:
devtest_preds = torch.argmax(best_model_w1_feat(X_devtest_w1_feat), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1_feat = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8441


# 3. Pretrained Embeddings

In [170]:
twitter_vocab = []
twitter_emb = []
with open(DATADIR + 'twitter-embeddings.txt', 'rt') as f:
    for line in f:
        tokens = line.split(' ')
        word, emb = tokens[0], tokens[1:]
        emb = [float(elm) for elm in emb]
        twitter_vocab.append(word)
        twitter_emb.append(emb)

In [171]:
twitter_emb = torch.tensor(twitter_emb)
# for <s>, use the emb for </s>
idx2word_pretrained = twitter_vocab + ['<s>']
temp = twitter_emb[word2idx_pretrained['</s>']].view((1, -1))
# construct maps for pretrained word embs
twitter_emb = torch.cat((twitter_emb, temp))
word2idx_pretrained = {word: idx for idx, word in enumerate(idx2word_pretrained)}

## Encode Train, Dev, Devtest

In [172]:
# w = 0
X_train_w0_pre = encode_lines(train, word2idx_pretrained, window_size=0)
X_dev_w0_pre = encode_lines(dev, word2idx_pretrained, window_size=0)
X_devtest_w0_pre = encode_lines(devtest, word2idx_pretrained, window_size=0)

# w = 1
X_train_w1_pre = encode_lines(train, word2idx_pretrained, window_size=1)
X_dev_w1_pre = encode_lines(dev, word2idx_pretrained, window_size=1)
X_devtest_w1_pre = encode_lines(devtest, word2idx_pretrained, window_size=1)

## Train Model

### Fine-tuning emb

In [188]:
model_w0_tune = FeedForwardTagger(window_size=0, 
                                  output_dim=len(all_labels),
                                  pretrained_emb=twitter_emb)
best_model_w0_tune, df_w0_tune = \
train_util(model_w0_tune, X_train_w0_pre, Y_train, X_dev_w0_pre, Y_dev, 
                                  n_epochs=25, lr=0.5, batch_size=1000)

Epoch 0: train_loss 41.3375, train_accu: 0.6489, dev_accu: 0.6380
Epoch 1: train_loss 21.0796, train_accu: 0.7587, dev_accu: 0.7306
Epoch 2: train_loss 14.6463, train_accu: 0.8270, dev_accu: 0.7864
Epoch 3: train_loss 11.5945, train_accu: 0.8475, dev_accu: 0.7998
Epoch 4: train_loss 9.8695, train_accu: 0.8653, dev_accu: 0.8158
Epoch 5: train_loss 8.7981, train_accu: 0.8726, dev_accu: 0.8222
Epoch 6: train_loss 8.0842, train_accu: 0.8772, dev_accu: 0.8260
Epoch 7: train_loss 7.5828, train_accu: 0.8817, dev_accu: 0.8297
Epoch 8: train_loss 7.2167, train_accu: 0.8850, dev_accu: 0.8339
Epoch 9: train_loss 6.9412, train_accu: 0.8791, dev_accu: 0.8301
Epoch 10: train_loss 6.7282, train_accu: 0.8814, dev_accu: 0.8322
Epoch 11: train_loss 6.5592, train_accu: 0.8824, dev_accu: 0.8332
Epoch 12: train_loss 6.4219, train_accu: 0.8834, dev_accu: 0.8341
Epoch 13: train_loss 6.3080, train_accu: 0.8844, dev_accu: 0.8351
Epoch 14: train_loss 6.2117, train_accu: 0.8853, dev_accu: 0.8351
Epoch 15: train_

In [179]:
devtest_preds = torch.argmax(best_model_w0_tune(X_devtest_w0_pre), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w0_tune = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8362


In [187]:
model_w1_tune = FeedForwardTagger(window_size=1, 
                                  output_dim=len(all_labels),
                                  pretrained_emb=twitter_emb)
best_model_w1_tune, df_w1_tune = \
train_util(model_w1_tune, X_train_w1_pre, Y_train, X_dev_w1_pre, Y_dev, 
                                  n_epochs=25, lr=0.5, batch_size=1000)

Epoch 0: train_loss 43.0116, train_accu: 0.6402, dev_accu: 0.6246
Epoch 1: train_loss 22.6771, train_accu: 0.7425, dev_accu: 0.7156
Epoch 2: train_loss 15.2831, train_accu: 0.8259, dev_accu: 0.7868
Epoch 3: train_loss 11.5952, train_accu: 0.8640, dev_accu: 0.8183
Epoch 4: train_loss 9.4116, train_accu: 0.8842, dev_accu: 0.8299
Epoch 5: train_loss 8.0289, train_accu: 0.8972, dev_accu: 0.8405
Epoch 6: train_loss 7.0805, train_accu: 0.9078, dev_accu: 0.8492
Epoch 7: train_loss 6.3909, train_accu: 0.9172, dev_accu: 0.8579
Epoch 8: train_loss 5.8713, train_accu: 0.9218, dev_accu: 0.8627
Epoch 9: train_loss 5.4698, train_accu: 0.9265, dev_accu: 0.8654
Epoch 10: train_loss 5.1525, train_accu: 0.9302, dev_accu: 0.8672
Epoch 11: train_loss 4.8964, train_accu: 0.9326, dev_accu: 0.8708
Epoch 12: train_loss 4.6851, train_accu: 0.9351, dev_accu: 0.8722
Epoch 13: train_loss 4.5071, train_accu: 0.9372, dev_accu: 0.8731
Epoch 14: train_loss 4.3546, train_accu: 0.9386, dev_accu: 0.8747
Epoch 15: train_

In [184]:
devtest_preds = torch.argmax(best_model_w1_tune(X_devtest_w1_pre), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1_tune = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8866


### Freeze emb

In [186]:
model_w1_freeze = FeedForwardTagger(window_size=1, 
                                  output_dim=len(all_labels),
                                  pretrained_emb=twitter_emb, freeze=True)
best_model_w1_freeze, df_w1_freeze = \
train_util(model_w1_freeze, X_train_w1_pre, Y_train, X_dev_w1_pre, Y_dev, 
                                  n_epochs=25, lr=0.5, batch_size=1000)

Epoch 0: train_loss 43.4228, train_accu: 0.6442, dev_accu: 0.6322
Epoch 1: train_loss 22.9755, train_accu: 0.7451, dev_accu: 0.7185
Epoch 2: train_loss 15.4768, train_accu: 0.8243, dev_accu: 0.7810
Epoch 3: train_loss 11.8207, train_accu: 0.8623, dev_accu: 0.8162
Epoch 4: train_loss 9.6331, train_accu: 0.8804, dev_accu: 0.8293
Epoch 5: train_loss 8.2220, train_accu: 0.8945, dev_accu: 0.8392
Epoch 6: train_loss 7.2455, train_accu: 0.9048, dev_accu: 0.8459
Epoch 7: train_loss 6.5327, train_accu: 0.9132, dev_accu: 0.8533
Epoch 8: train_loss 5.9938, train_accu: 0.9194, dev_accu: 0.8594
Epoch 9: train_loss 5.5757, train_accu: 0.9248, dev_accu: 0.8635
Epoch 10: train_loss 5.2443, train_accu: 0.9291, dev_accu: 0.8664
Epoch 11: train_loss 4.9770, train_accu: 0.9319, dev_accu: 0.8699
Epoch 12: train_loss 4.7574, train_accu: 0.9344, dev_accu: 0.8699
Epoch 13: train_loss 4.5739, train_accu: 0.9361, dev_accu: 0.8714
Epoch 14: train_loss 4.4177, train_accu: 0.9384, dev_accu: 0.8743
Epoch 15: train_

In [189]:
devtest_preds = torch.argmax(best_model_w1_freeze(X_devtest_w1_pre), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1_freeze = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8832


### With features

In [194]:
X_train_w1_feat_pre = torch.cat((X_train_w1_pre, X_train_features), dim=1)
X_dev_w1_feat_pre = torch.cat((X_dev_w1_pre, X_dev_features), dim=1)
X_devtest_w1_feat_pre = torch.cat((X_devtest_w1_pre, X_devtest_features), dim=1)

In [196]:
model_w1_feat_tune = FeedForwardTagger(window_size=1, 
                                  output_dim=len(all_labels),
                                       pretrained_emb=twitter_emb,
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w1_feat_tune, df_w1_feat_tune = \
train_util(model_w1_feat_tune, X_train_w1_feat_pre, Y_train, X_dev_w1_feat_pre, Y_dev, 
                                  n_epochs=25, lr=0.5, batch_size=1000)

Epoch 0: train_loss 39.8018, train_accu: 0.6590, dev_accu: 0.6432
Epoch 1: train_loss 19.2448, train_accu: 0.7908, dev_accu: 0.7642
Epoch 2: train_loss 12.7845, train_accu: 0.8592, dev_accu: 0.8189
Epoch 3: train_loss 9.6095, train_accu: 0.8886, dev_accu: 0.8386
Epoch 4: train_loss 7.8084, train_accu: 0.9058, dev_accu: 0.8504
Epoch 5: train_loss 6.6785, train_accu: 0.9156, dev_accu: 0.8540
Epoch 6: train_loss 5.9093, train_accu: 0.9228, dev_accu: 0.8602
Epoch 7: train_loss 5.3505, train_accu: 0.9286, dev_accu: 0.8668
Epoch 8: train_loss 4.9259, train_accu: 0.9327, dev_accu: 0.8731
Epoch 9: train_loss 4.5934, train_accu: 0.9375, dev_accu: 0.8751
Epoch 10: train_loss 4.3263, train_accu: 0.9412, dev_accu: 0.8780
Epoch 11: train_loss 4.1067, train_accu: 0.9440, dev_accu: 0.8787
Epoch 12: train_loss 3.9223, train_accu: 0.9466, dev_accu: 0.8795
Epoch 13: train_loss 3.7645, train_accu: 0.9486, dev_accu: 0.8814
Epoch 14: train_loss 3.6274, train_accu: 0.9497, dev_accu: 0.8830
Epoch 15: train_l

In [198]:
devtest_preds = torch.argmax(best_model_w1_feat_tune(X_devtest_w1_feat_pre), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w1_feat_tune = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8935


# 4. Architecture Engineering

## w=2, random emb, no features baseline
Slightly better than w=0 or 1 baseline.

In [205]:
X_train_w2 = encode_lines(train, word2idx_rand, window_size=2)
X_dev_w2 = encode_lines(dev, word2idx_rand, window_size=2)
X_devtest_w2 = encode_lines(devtest, word2idx_rand, window_size=2)

In [206]:
model_w2 = FeedForwardTagger(vocab_size=len(word2idx_rand), 
                          window_size=2,
                          output_dim=len(all_labels))
best_model_w2, df_w2 = train_util(model_w2, X_train_w2, Y_train, X_dev_w2, Y_dev, 
                                  n_epochs=25, lr=2, batch_size=1000)

Epoch 0: train_loss 50.0884, train_accu: 0.1172, dev_accu: 0.1037
Epoch 1: train_loss 46.5405, train_accu: 0.2074, dev_accu: 0.1998
Epoch 2: train_loss 37.5295, train_accu: 0.4801, dev_accu: 0.4754
Epoch 3: train_loss 27.3074, train_accu: 0.6084, dev_accu: 0.5997
Epoch 4: train_loss 22.2459, train_accu: 0.6507, dev_accu: 0.6401
Epoch 5: train_loss 19.6511, train_accu: 0.6891, dev_accu: 0.6716
Epoch 6: train_loss 17.2149, train_accu: 0.7301, dev_accu: 0.7057
Epoch 7: train_loss 14.8958, train_accu: 0.7721, dev_accu: 0.7366
Epoch 8: train_loss 12.9335, train_accu: 0.8093, dev_accu: 0.7571
Epoch 9: train_loss 11.2575, train_accu: 0.8377, dev_accu: 0.7700
Epoch 10: train_loss 9.8069, train_accu: 0.8634, dev_accu: 0.7814
Epoch 11: train_loss 8.5472, train_accu: 0.8847, dev_accu: 0.7880
Epoch 12: train_loss 7.4676, train_accu: 0.9030, dev_accu: 0.7961
Epoch 13: train_loss 6.5649, train_accu: 0.9165, dev_accu: 0.8042
Epoch 14: train_loss 5.8176, train_accu: 0.9262, dev_accu: 0.8110
Epoch 15: 

In [207]:
devtest_preds = torch.argmax(best_model_w2(X_devtest_w2), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w2 = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8267


## w=2, using best config so far: pretrained emb, fine-tuning, features

Not too different from w=1, pretrained, fine-tuning, features

In [201]:
X_train_w2_pre = encode_lines(train, word2idx_pretrained, window_size=2)
X_dev_w2_pre = encode_lines(dev, word2idx_pretrained, window_size=2)
X_devtest_w2_pre = encode_lines(devtest, word2idx_pretrained, window_size=2)
X_train_w2_feat_pre = torch.cat((X_train_w2_pre, X_train_features), dim=1)
X_dev_w2_feat_pre = torch.cat((X_dev_w2_pre, X_dev_features), dim=1)
X_devtest_w2_feat_pre = torch.cat((X_devtest_w2_pre, X_devtest_features), dim=1)

In [202]:
model_w2_feat_tune = FeedForwardTagger(window_size=2, 
                                  output_dim=len(all_labels),
                                       pretrained_emb=twitter_emb,
                                 num_binary_features=len(FEATURES),
                                 num_count_features=2)
best_model_w2_feat_tune, df_w2_feat_tune = \
train_util(model_w2_feat_tune, X_train_w2_feat_pre, Y_train, X_dev_w2_feat_pre, Y_dev, 
                                  n_epochs=25, lr=0.5, batch_size=1000)

Epoch 0: train_loss 40.5641, train_accu: 0.6668, dev_accu: 0.6495
Epoch 1: train_loss 19.5058, train_accu: 0.7853, dev_accu: 0.7540
Epoch 2: train_loss 12.8262, train_accu: 0.8566, dev_accu: 0.8121
Epoch 3: train_loss 9.5552, train_accu: 0.8905, dev_accu: 0.8372
Epoch 4: train_loss 7.6819, train_accu: 0.9074, dev_accu: 0.8529
Epoch 5: train_loss 6.5106, train_accu: 0.9180, dev_accu: 0.8598
Epoch 6: train_loss 5.7169, train_accu: 0.9255, dev_accu: 0.8641
Epoch 7: train_loss 5.1424, train_accu: 0.9323, dev_accu: 0.8720
Epoch 8: train_loss 4.7072, train_accu: 0.9372, dev_accu: 0.8751
Epoch 9: train_loss 4.3673, train_accu: 0.9413, dev_accu: 0.8766
Epoch 10: train_loss 4.0948, train_accu: 0.9441, dev_accu: 0.8778
Epoch 11: train_loss 3.8709, train_accu: 0.9474, dev_accu: 0.8791
Epoch 12: train_loss 3.6828, train_accu: 0.9500, dev_accu: 0.8805
Epoch 13: train_loss 3.5217, train_accu: 0.9511, dev_accu: 0.8816
Epoch 14: train_loss 3.3814, train_accu: 0.9527, dev_accu: 0.8820
Epoch 15: train_l

In [203]:
devtest_preds = torch.argmax(best_model_w2_feat_tune(X_devtest_w2_feat_pre), dim=1)
devtest_accu = accuracy_score(Y_devtest, devtest_preds)
conf_matrix_w2_feat_tune = confusion_matrix(Y_devtest, devtest_preds)
print('devtest_accu: {:.4f}'.format(devtest_accu))

devtest_accu: 0.8916


# Plots and Analysis