# Load Util Files

In [1]:
from copy import deepcopy
from tqdm.notebook import tqdm
import torch.optim as optim
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# %load dataset.py
import re
import pickle
import numpy as np
from collections import defaultdict

from nltk.tokenize import word_tokenize

import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Special tokens. <unk> is expected to come from the GloveVocabulary's pickled
# vocabulary; the other three are prepended by GloveVocabulary itself.
UNK = '<unk>'
PAD = '<pad>'
SOS = '<s>' # start of sentence
EOS = '</s>' # end of sentence

# number of training triplets per batch
BATCH_SIZE = 500

class GloveVocabulary:
    """
    Vocabulary backed by pickled GloVe data.

    Index layout: [<pad>, <s>, </s>] followed by the pickled GloVe vocabulary
    (which is expected to contain <unk>). `self.embedding` has one row per
    index; the special tokens get the mean of all GloVe vectors.
    """
    def __init__(self, glove_vocab_path, glove_emb_path):
        """
        glove_vocab_path: pickle file holding the list of GloVe tokens
        glove_emb_path: pickle file holding the matching embedding matrix
        """
        self.idx_to_str = [PAD, SOS, EOS] # <unk> is in GloVe
        self.start_idx = len(self.idx_to_str) # length at which real tokens starts
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load vocab/embedding pickles from a trusted source.
        with open(glove_vocab_path, 'rb') as f:
            glove_vocab = pickle.load(f)
        with open(glove_emb_path, 'rb') as f:
            glove_emb = pickle.load(f)
        self.idx_to_str += glove_vocab
        self.str_to_idx = {s: idx for idx, s in enumerate(self.idx_to_str)}

        # instead of random vectors, use the mean of all glove vectors for the
        # special tokens; one row per special token so the embedding stays
        # aligned with idx_to_str even if the special-token list changes
        glove_emb = torch.tensor(glove_emb)
        mean_vec = glove_emb.mean(dim=0, keepdim=True)
        self.embedding = torch.cat(
            [mean_vec.repeat(self.start_idx, 1), glove_emb], dim=0
        )

    def __len__(self):
        """Number of tokens, special tokens included."""
        return len(self.idx_to_str)

    @staticmethod
    def tokenize(line):
        """Lowercase the line and split it with nltk's word_tokenize."""
        # TODO: try different tokenizers
        return word_tokenize(line.lower())

    def numericalize(self, line):
        """
        Call this only after the vocab has been built

        Returns a torch.LongTensor: <s> index, one index per token
        (out-of-vocabulary tokens map to <unk>), then </s> index.
        """
        lookup = self.str_to_idx
        ids = [lookup[SOS]]
        ids.extend(
            lookup[token] if token in lookup else lookup[UNK]
            for token in self.tokenize(line)
        )
        ids.append(lookup[EOS])
        return torch.LongTensor(ids)

    def denumericalize(self, token_indices):
        """
        Invert numericalize, returns a string

        The first and last positions are dropped (<s> and </s> of an unpadded
        line). Decoding also stops at the first </s> or <pad>, so a padded
        line does not leak '</s>' into the result.
        """
        words = []
        for idx in token_indices[1 : -1]:
            token = self.idx_to_str[idx]
            # on a padded line the slice above removed a <pad>, not </s>,
            # so both markers have to end the sentence
            if token == PAD or token == EOS:
                break
            words.append(token)
        return ' '.join(words)

class TrainDataset(Dataset):
    """
    Paraphrase training set read from a two-column, tab-separated file.

    Each item is a triplet of numericalized lines: the first column, the
    second column (positive example) and a randomly perturbed copy of the
    first column (negative example).
    """
    def __init__(self, train_path, glove_vocab_path, glove_emb_path, num_transforms=3):
        """
        train_path: tab-separated file with exactly two columns per line
        glove_vocab_path, glove_emb_path: passed on to GloveVocabulary
        num_transforms: number of transforms to apply to the line to generate a negative sample
        """
        self.num_transforms = num_transforms
        self.first_column_lines = [] # lines in the first column
        self.second_column_lines = []
        with open(train_path, 'rt') as f:
            for line in f:
                line = line.rstrip('\n')
                # tolerate blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue
                # do minimal amount of preprocessing here, lowercasing is done in vocab
                first, second = line.split('\t')
                self.first_column_lines.append(first)
                self.second_column_lines.append(second)

        self.vocab = GloveVocabulary(glove_vocab_path, glove_emb_path)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.first_column_lines)

    def generate_negative_example(self, numericalized_line):
        """
        Return a copy of the line with up to `num_transforms` positions
        replaced by random vocabulary indices; the input is not modified.

        numericalized_line: torch.LongTensor

        Lines shorter than `num_transforms` get every position replaced
        instead of raising. Any position may be picked, including the
        first and last ones.
        """
        # TODO: insertion, deletion, permutation
        perturbed = numericalized_line.detach().clone()
        seq_len = len(numericalized_line)
        n_edits = min(self.num_transforms, seq_len)
        if n_edits <= 0:
            return perturbed
        # distinct positions in the line to perturb
        positions = np.random.choice(seq_len, n_edits, replace=False)
        # replacement ids are real tokens only: skip the special tokens at the
        # front and the last vocabulary entry (<unk>). randint avoids building
        # a vocabulary-sized array for every sample.
        replacements = np.random.randint(
            self.vocab.start_idx, len(self.vocab) - 1, size=n_edits
        )
        for pos, new_id in zip(positions, replacements):
            perturbed[int(pos)] = int(new_id)
        return perturbed

    def __getitem__(self, index):
        """
        Return a triplet of numericalized lines
        original, positive, and negative example
        (line, paraphrased line, non-paraphrasal line)
        """
        line = self.first_column_lines[index]
        positive_line = self.second_column_lines[index]
        # convert tokens to indices
        numericalized_line = self.vocab.numericalize(line)
        numericalized_positive = self.vocab.numericalize(positive_line)
        # generate a negative numericalized example
        numericalized_negative = self.generate_negative_example(numericalized_line)

        return numericalized_line, numericalized_positive, numericalized_negative

class PadCollate:
    """
    Collate function that pads every column of a batch of
    (line, positive, negative) triplets to a common length.
    """
    def __init__(self, pad_idx):
        """
        pad_idx: value used to fill the padded positions
        """
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """
        batch: list of triplets of 1-D tensors
        Returns three tensors of shape (max_len, batch_size), one per column.
        """
        def pad_column(col):
            # sequence-first layout (batch_first=False)
            column = [sample[col] for sample in batch]
            return pad_sequence(column, batch_first=False, padding_value=self.pad_idx)

        return pad_column(0), pad_column(1), pad_column(2)

def get_train_loader(train_path, glove_vocab_path, glove_emb_path):
    """
    Build the training DataLoader: a TrainDataset over the given files,
    batched by BATCH_SIZE in file order (no shuffling) and padded with the
    vocabulary's <pad> index.
    """
    train_set = TrainDataset(train_path, glove_vocab_path, glove_emb_path)
    collate = PadCollate(pad_idx=train_set.vocab.str_to_idx[PAD])
    return DataLoader(
        train_set,
        batch_size=BATCH_SIZE,
        shuffle=False,
        pin_memory=True,
        collate_fn=collate,
    )

def load_dev_devtest(vocab, path):
    """
    Read a tab-separated file with columns (first line, second line, label).

    calls vocab.numericalize
    returns first_lines, second_lines, labels
    Blank lines are skipped.
    """
    first_lines = []
    second_lines = []
    labels = []
    with open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            # a blank line (e.g. trailing newline at end of file) has no columns
            if not line:
                continue
            chunks = line.split('\t')
            first_lines.append(vocab.numericalize(chunks[0]))
            second_lines.append(vocab.numericalize(chunks[1]))
            labels.append(int(chunks[2]))
    return first_lines, second_lines, labels

def load_test(vocab, path):
    """
    Read a tab-separated file with columns (id, first line, second line).

    skip id column, calls vocab.numericalize
    returns first_lines, second_lines
    Blank lines are skipped.
    """
    first_lines, second_lines = [], []
    with open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            # a blank line (e.g. trailing newline at end of file) has no columns
            if not line:
                continue
            chunks = line.split('\t')
            first_lines.append(vocab.numericalize(chunks[1]))
            second_lines.append(vocab.numericalize(chunks[2]))
    return first_lines, second_lines


In [3]:
# %load model.py
import torch
import torch.nn as nn

# default embedding width when no pretrained embedding is supplied
EMB_DIM = 50

class RNNClassifier(nn.Module):
    """
    Sentence-pair classifier: both sentences are encoded with the same
    embedding + GRU, the two final GRU outputs are concatenated and passed
    through two fully connected layers ending in a sigmoid.
    """
    def __init__(self, pretrained_emb=None, freeze_emb=False, vocab_size=None, emb_dim=EMB_DIM):
        """
        pretrained_emb: optional (vocab_size, emb_dim) tensor to initialise the embedding
        freeze_emb: whether a pretrained embedding is kept fixed during training
        vocab_size must be not None if no pretrained_emb is given
        emb_dim: embedding width, only used when no pretrained_emb is given

        Raises ValueError if neither pretrained_emb nor vocab_size is given.
        """
        super().__init__()
        if pretrained_emb is None:
            if vocab_size is None:
                raise ValueError('vocab_size is required when pretrained_emb is not given')
            self.emb = nn.Embedding(vocab_size, emb_dim)
            torch.nn.init.uniform_(self.emb.weight, -0.01, 0.01)
        else:
            self.emb = nn.Embedding.from_pretrained(pretrained_emb, freeze=freeze_emb)

        rnn_input_dim = self.emb.weight.shape[1] # EMB_DIM
        rnn_output_dim = 128
        # TODO: bidirectional?
        self.rnn = nn.GRU(rnn_input_dim, rnn_output_dim, batch_first=False)

        # pass the concatenation of two RNN outputs to fully connected layers
        fc_input_dim = rnn_output_dim * 2
        fc_hidden_dim = 128
        self.fc1 = nn.Linear(fc_input_dim, fc_hidden_dim)
        self.fc2 = nn.Linear(fc_hidden_dim, 1) # output a scalar for class probability

    def forward(self, x1, x2):
        """
        x1: first sentence, x2: second sentence
        both LongTensors of shape (seq_len, batch_size)

        Returns the class probability, shape (batch_size, 1).
        """
        # Run each sentence through the GRU in one call, starting from a zero
        # hidden state. This replaces a per-token Python loop; only the output
        # at the last time step is used, as before.
        # NOTE: padded positions are fed to the GRU as well, so for a padded
        # sentence the last output is taken after the padding.
        out1, _ = self.rnn(self.emb(x1))
        # TODO: is it better to pass hidden=hidden, hidden=output, or hidden=None
        # can do truncate or pad
        out2, _ = self.rnn(self.emb(x2))
        # index the time axis explicitly instead of a bare squeeze(), which
        # would also drop the batch axis when batch_size == 1
        fc_input = torch.cat([out1[-1], out2[-1]], dim=-1)
        out = torch.tanh(self.fc1(fc_input))
        # use sigmoid with BCELoss
        out = torch.sigmoid(self.fc2(out))
        return out


In [4]:
def train(model, train_loader, dev_data, device, n_epochs=3):
    """
    Train `model` on (line, positive, negative) triplets with BCE loss,
    labelling (line, positive) as 1 and (line, negative) as 0.

    model: takes two (seq_len, batch_size) index tensors, returns probabilities
    train_loader: iterable of (line, positive, negative) tensor triplets
    dev_data: (first_lines, second_lines, labels), passed to this file's eval()
    device: torch device the batches are moved to
    n_epochs: number of passes over train_loader

    After every batch the model is evaluated on dev_data; whenever the dev
    accuracy improves, a copy of the model is kept and its state dict is
    written to 'output/best_model.pth'.

    Returns (best_model, loss_accu_df), where loss_accu_df has one row per
    batch with columns epoch, loss, train_accu, dev_accu.
    """
    # local imports: neither name is imported at the top of this file
    import os
    import pandas as pd

    loss_func = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    # record training stats
    best_model = None
    # start below any reachable accuracy so the first evaluation always
    # yields a best_model, even if its dev accuracy is 0
    best_dev_accu = float('-inf')
    epochs = []
    batch_losses = []
    batch_accu_list = []
    dev_accu_list = []
    model.train()
    for epoch in tqdm(range(1, n_epochs + 1)):
        # iterate the loader that was passed in, not a notebook global
        progress_bar = tqdm(train_loader)
        for batch_idx, (line, positive, negative) in enumerate(progress_bar):
            line = line.to(device)
            positive = positive.to(device)
            negative = negative.to(device)
            out_positive = model(line, positive)
            out_negative = model(line, negative)

            # positive and negative labels, sized from the actual outputs so
            # a final batch smaller than BATCH_SIZE also works
            ones = torch.ones_like(out_positive)
            zeros = torch.zeros_like(out_negative)

            optimizer.zero_grad()
            loss = loss_func(out_positive, ones)
            loss += loss_func(out_negative, zeros)
            loss.backward()
            optimizer.step()

            # batch accuracy
            probs = torch.cat([out_positive, out_negative]).detach().cpu().flatten().numpy()
            preds = (probs > 0.5).astype(int)
            labels_flat = np.concatenate([np.ones(out_positive.numel(), dtype=int),
                                          np.zeros(out_negative.numel(), dtype=int)])
            batch_accu = accuracy_score(labels_flat, preds)

            # evaluate on dev every batch
            # (eval here is the function defined in this file, not the builtin)
            model.eval()
            with torch.no_grad():
                dev_accu = eval(model, dev_data, device)
            model.train()

            progress_bar.set_description(
            'Epoch {} batch {}: batch_loss {:.4f}, batch_accu: {:.4f}, dev_accu: {:.4f}'\
              .format(epoch, batch_idx, loss.item(), batch_accu, dev_accu))
            # save model
            if dev_accu > best_dev_accu:
                best_dev_accu = dev_accu
                best_model = deepcopy(model)
                os.makedirs('output', exist_ok=True)
                torch.save(best_model.state_dict(), 'output/best_model.pth')

            # record stats; every column gets exactly one entry per batch
            epochs.append(epoch)
            batch_losses.append(loss.item())
            batch_accu_list.append(batch_accu)
            dev_accu_list.append(dev_accu)

    loss_accu_df = pd.DataFrame({
        'epoch': epochs,
        'loss': batch_losses,
        'train_accu': batch_accu_list,
        'dev_accu': dev_accu_list})

    return best_model, loss_accu_df

In [5]:
def eval(model, data, device):
    """
    Accuracy of `model` on labelled sentence pairs, one pair at a time.

    model: takes two (seq_len, 1) index tensors, returns a single probability
    data: (first_lines, second_lines, labels); lines are 1-D index tensors
    device: torch device the lines are moved to

    A pair is predicted positive when the probability exceeds 0.5.
    Gradients are disabled here, so this can be called outside a
    torch.no_grad() block. The caller is responsible for model.eval().

    NOTE: this function shadows the builtin eval() in this module.
    """
    first_lines, second_lines, labels = data
    preds = np.empty(len(first_lines), dtype=int)
    with torch.no_grad():
        for idx, (first, second) in enumerate(zip(first_lines, second_lines)):
            # unsqueeze in batch_size
            prob = model(first.to(device).unsqueeze(1),
                         second.to(device).unsqueeze(1))
            # .item() reads the single probability as a Python float
            preds[idx] = int(prob.item() > 0.5)
    accu = accuracy_score(labels, preds)
    return accu

In [6]:
# print NumPy floats with 4 digits of precision
np.set_printoptions(precision=4)
# seed the NumPy and PyTorch random number generators
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f0c4c0c7a70>

In [7]:
# use the first GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

# Load Train, Dev, DevTest

In [8]:
# training DataLoader built from the training TSV and the pickled GloVe
# vocabulary / embedding files
loader = get_train_loader('data/train.tsv', 'data/glove_vocab.pkl', 
                          'data/glove_emb.pkl')

In [9]:
# reuse the training set's vocabulary for the dev / devtest data
vocab = loader.dataset.vocab

In [10]:
# each is a (first_lines, second_lines, labels) tuple of numericalized pairs
dev = load_dev_devtest(vocab, 'data/dev+devtest/dev.tsv')
devtest = load_dev_devtest(vocab, 'data/dev+devtest/devtest.tsv')

# Build Model and Train

In [11]:
# initialise the embedding layer from the vocabulary's embedding matrix
# (not frozen, since freeze_emb defaults to False), then move the model to device
model = RNNClassifier(pretrained_emb=loader.dataset.vocab.embedding)
model = model.to(device)

## Train

In [None]:
# train for one epoch, evaluating on dev; df holds the recorded training stats
best_model, df = train(model, loader, dev, device, n_epochs=1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

In [None]:
# save the weights of the best model returned by train()
torch.save(best_model.state_dict(), 'data/best_model.pth')

In [None]:
# Rebuild the model and restore the saved weights.
loaded = RNNClassifier(pretrained_emb=loader.dataset.vocab.embedding)
# map_location lets a checkpoint saved on one device load on whatever
# `device` is in this session
loaded.load_state_dict(torch.load('data/best_model.pth', map_location=device))
# eval() moves its inputs to `device`, so the model has to live there too
loaded = loaded.to(device)
loaded.eval()

# Predict

In [None]:
# accuracy of the reloaded model on the devtest split
# (eval is the function defined in this notebook, not the builtin)
eval(loaded, devtest, device)

# Error Analysis