In [115]:
import re
from copy import deepcopy

import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

In [116]:
np.set_printoptions(precision=4)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f5ebbf70eb0>

In [117]:
DATADIR = 'data/'

Use slightly different utility functions from feed-forward tagger

In [118]:
def read_corpus(file):
    """
    returns:
        lines: [['hello', 'world'], ...]
        labels: [[!, N], ...], a list of variable len tensors
        vocab, all_labels
    """
    with open(file, 'rt') as f:
        text = f.read()
    lines = text.split('\n\n')
    ret_lines = []
    ret_labels = []
    vocab = set()
    all_labels = set()
    for line in lines:
        if not line: 
            continue
        curr_line = []
        curr_line_labels = []
        for token_label_str in line.split('\n'):
            if not token_label_str: 
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            all_labels.add(label)
            curr_line_labels.append(label)
            curr_line.append(token)
            
        ret_labels.append(curr_line_labels)
        ret_lines.append(curr_line)
        
    return ret_lines, ret_labels, vocab, all_labels

In [119]:
def encode_lines(lines, word2idx_map):
    """
    returns a list of len(line) x 1 tensors
    a list of variable length tensor for pad_sequence
    """
    ret = []
    for line in lines:
        encoded_line = []
        for word in line:
            num = word2idx_map.get(word, word2idx_map['UUUNKKK'])
            encoded_line.append(num)
        ret.append(torch.tensor(encoded_line))
    return ret

In [120]:
def encode_labels(line_labels, encoder):
    """
    returns a list of len(line) x 1 tensors
    a list of variable length tensor for pad_sequence
    """
    ret = []
    for line_label in line_labels:
        encoded = encoder.transform(line_label)
        ret.append(torch.tensor(encoded, dtype=torch.long))
    return ret

In [121]:
class Seq2SeqTagger(nn.Module):
    def __init__(self, pretrained_embedding, output_dim, rnn_layer_func, 
                 bidirectional=False):
        """
        rnn_layer_func could be nn.RNN, nn.GRU, or nn.LSTM
        output_dim is the number of tag classes
        use 128 for hidden
        """
        super(Seq2SeqTagger, self).__init__()
    
        input_dim = pretrained_embedding.shape[1]
        self.hidden_dim = 128
        self.emb = nn.Embedding.from_pretrained(pretrained_embedding)
        
        self.rnn = rnn_layer_func(input_dim, hidden_size=self.hidden_dim, 
                                  bidirectional=bidirectional)
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        
    def forward(self, inputs):
        x = self.emb(inputs)
        hidden = torch.zeros(1, x.shape[1], self.hidden_dim)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        log_probs = F.log_softmax(out, dim=2)
        return log_probs

In [144]:
def train_util(model, X_train, Y_train, X_dev, Y_dev, n_epochs, lr, 
              batch_size):
    """
    returns: best_model, losses, train_accu_list, dev_accu_list
    """
    # for accuracy_score
    Y_train_1d = Y_train.flatten()
    Y_dev_1d = Y_dev.flatten()
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    best_model = None
    best_dev_accu = 0
    losses = []
    train_accu_list, dev_accu_list = [], []
    
    progress_bar = tqdm.trange(1, n_epochs + 1)
    for epoch in progress_bar:
        epoch_loss = 0
        
        for i in range(0, X_train.shape[0], batch_size):
            optimizer.zero_grad()
            log_probs = model(X_train[i : i + batch_size])
            # reshape log_probs and labels
            log_probs_2d = log_probs.view(
                log_probs.shape[0] * log_probs.shape[1], -1)
            targets = Y_train[i : i + batch_size]
            targets_1d = targets.view(targets.shape[0] * targets.shape[1]) 
           
            loss = loss_func(log_probs_2d, targets_1d)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        train_preds = torch.argmax(model(X_train), dim=2).flatten()
        train_accu = accuracy_score(Y_train_1d, train_preds)
        # evaluate on dev
        dev_preds = torch.argmax(model(X_dev), dim=2).flatten()
        dev_accu = accuracy_score(Y_dev_1d, dev_preds)
        
        # early stopping, save the model if it has improved on dev
        if dev_accu > best_dev_accu:
            best_dev_accu = dev_accu
            best_model = deepcopy(model)
        
        progress_bar.set_description(
            'Epoch {}: train_loss {:.4f}, train_accu: {:.4f}, dev_accu: {:.4f}'\
              .format(epoch, epoch_loss, train_accu, dev_accu))
        losses.append(epoch_loss)
        train_accu_list.append(train_accu)
        dev_accu_list.append(dev_accu)
        
    loss_accu_df = pd.DataFrame({
        'epoch': range(1, n_epochs + 1), 
        'loss': losses,
        'train_accu': train_accu_list,
        'dev_accu': dev_accu_list})
        
    return best_model, loss_accu_df

In [123]:
def plot_loss_accu(loss_accu_df_list, window_list):
    """
    input: two lists of the same length, loss_accu_df, window
    """
    dfs = []
    for df, w in zip(loss_accu_df_list, window_list):
        df1 = df.melt(
            'epoch', value_vars=['loss']).assign(window=w, plot='loss')
        df2 = df.melt(
            'epoch', value_vars=['train_accu', 'dev_accu']).assign(window=w, plot='accu')
        dfs.extend([df1, df2])
    plot_df = pd.concat(dfs)

    g = sns.FacetGrid(data=plot_df, row='plot', col='window', 
                      hue='variable', sharey=False)
    g.map_dataframe(sns.lineplot, x='epoch', y='value')
    g.add_legend()

def plot_confusion_matrix(matrix, labels, title):
    plt.figure(figsize=(10, 6))
    ax = sns.heatmap(matrix, xticklabels=labels, yticklabels=labels, 
                     annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predictions')
    ax.set_ylabel('True labels')
    ax.set_title(title)

# Load pretrained embedding 

In [124]:
twitter_vocab = []
twitter_emb = []
with open(DATADIR + 'twitter-embeddings.txt', 'rt') as f:
    for line in f:
        tokens = line.split(' ')
        word, emb = tokens[0], tokens[1:]
        emb = [float(elm) for elm in emb]
        twitter_vocab.append(word)
        twitter_emb.append(emb)

In [125]:
twitter_emb = torch.tensor(twitter_emb)
# for <s>, use the emb for </s>
idx2word_pretrained = twitter_vocab + ['<s>']
# construct maps for pretrained word embs
word2idx_pretrained = {word: idx for idx, word in enumerate(idx2word_pretrained)}
temp = twitter_emb[word2idx_pretrained['</s>']].view((1, -1))
twitter_emb = torch.cat((twitter_emb, temp))

# Load data

In [126]:
train, train_labels, train_vocab, all_labels = read_corpus(DATADIR + 'twpos-train.tsv')
dev, dev_labels, dev_vocab, _ = read_corpus(DATADIR + 'twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab, all_labels_devtest = read_corpus(DATADIR + 'twpos-devtest.tsv')

In [127]:
# need to zero-pad lines and labels
# add empty label before encoding all labels
all_labels.add('')
all_labels = np.array(list(all_labels))
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
label_encoder.classes_

array(['', '!', '#', '$', '&', ',', '@', 'A', 'D', 'E', 'G', 'L', 'M',
       'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z', '^', '~'],
      dtype='<U1')

# Encode and Zero-pad data

In [130]:
X_train = encode_lines(train, word2idx_pretrained)
X_dev = encode_lines(dev, word2idx_pretrained)
X_devtest = encode_lines(devtest, word2idx_pretrained)
Y_train = encode_labels(train_labels, label_encoder)
Y_dev = encode_labels(dev_labels, label_encoder)
Y_devtest = encode_labels(devtest_labels, label_encoder)

In [131]:
# pad train, dev, devtest to same input_dim
# (num_samples, input_dim)
X_padded = pad_sequence([*X_train, *X_dev, *X_devtest], batch_first=True)
Y_padded = pad_sequence([*Y_train, *Y_dev, *Y_devtest], batch_first=True)

In [132]:
X_padded.shape, len(X_train), len(X_dev), len(X_devtest)

(torch.Size([1827, 38]), 1173, 327, 327)

In [133]:
X_train_padded = X_padded[:len(X_train)]
X_dev_padded = X_padded[len(X_train) : len(X_train) + len(X_dev)]
X_devtest_padded = X_padded[-len(X_devtest):]

In [134]:
X_train_padded.shape, X_dev_padded.shape, X_devtest_padded.shape

(torch.Size([1173, 38]), torch.Size([327, 38]), torch.Size([327, 38]))

In [135]:
Y_train_padded = Y_padded[:len(Y_train)]
Y_dev_padded = Y_padded[len(Y_train) : len(Y_train) + len(Y_dev)]
Y_devtest_padded = Y_padded[-len(Y_devtest):]

# Train

In [None]:
model_rnn = Seq2SeqTagger(
    pretrained_embedding=twitter_emb, 
    output_dim=all_labels.size, rnn_layer_func=nn.RNN)
train_util(model_rnn, X_train_padded, Y_train_padded,
          X_dev_padded, Y_dev_padded, n_epochs=25, lr=0.5, batch_size=500)

Epoch 1: train_loss 8.0948, train_accu: 0.6158, dev_accu: 0.6121:   4%|▍         | 1/25 [00:00<00:20,  1.19it/s]