In [29]:
import spacy
import checklist
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.editor import Editor
from checklist.pred_wrapper import PredictorWrapper
import numpy as np
from pattern.en import sentiment
import torch
import torch.nn as nn
import torch.optim as optim

## Load the data

In [30]:
def read_iob2_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

train_data= read_iob2_file('./en_ewt-ud-train.iob2')
dev_data = read_iob2_file('./en_ewt-ud-dev.iob2')

print(train_data[0])
print(len(train_data))
for sentences in train_data:
    if 'Steve' in sentences[0] and 'Maviglio' in sentences[0]:
        print(sentences[0])
        print(sentences[1])

(['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'])
12543
['Davis', 'spokesman', 'Steve', 'Maviglio', 'said', 'the', 'governor', 'felt', '"', 'betrayed', '"', 'by', 'the', 'actions', 'of', 'Winter', '.']
['B-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O']


### Train LSTM baseline

In [3]:
# Hyperparameters
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 5
PAD = '<PAD>'

In [4]:
class Vocab():
    def __init__(self, pad_unk):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]


max_len = max([len(x[0]) for x in train_data ])

# Create vocabularies for both the tokens
# and the tags
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

for tokens, tags in train_data:
    for token in tokens:
        token_vocab.getIdx(token, True)
    for tag in tags:
        label_vocab.getIdx(tag, True)

NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab):
    feats = torch.zeros((len(inputData), max_len), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_len), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_len]):
            wordIdx = word_vocab.getIdx(word)
            feats[sentPos][wordPos] = wordIdx
        for labelPos, label in enumerate(sent[1][:max_len]):
            labelIdx = label_vocab.getIdx(label)
            labels[sentPos][labelPos] = labelIdx
    return feats, labels

train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)

# Changed it to only first 5 data points!

In [32]:
# convert to batches
num_batches = int(len(train_features)/BATCH_SIZE)
train_feats_batches = train_features[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)
train_labels_batches = train_labels[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)

In [33]:
# Our model consisting of word embeddings, a single bilstm layer, and an output labels
class LangID(nn.Module):
    def __init__(self, embed_dim, lstm_dim, vocab_dim):
        super(LangID, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, lstm_dim, bidirectional=False, batch_first=True)
        self.hidden_to_tag = nn.Linear(lstm_dim, NTAGS)
        self.lstm_dim = lstm_dim
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, inputs):
        # First encode the input into word representations and run the bilstm
        word_vectors = self.word_embeddings(inputs)
        bilstm_out, _ = self.bilstm(word_vectors)
        #  Now combine (concatenate) the last state of each layer
        # backward_out = bilstm_out[:,0,-self.lstm_dim:]
        # forward_out = bilstm_out[:,-1,:self.lstm_dim]
        # bilstm_out = torch.cat((forward_out, backward_out), 1)
        # And get the prediction
        y = self.hidden_to_tag(bilstm_out)
        # return y # softmax this in order to get probs, check out for axis, has to sum up to 1
        # print(torch.sum(m(y), dim=1))
        return y # Return logits before softmax for predictions
    
    def predict(self, inputs):
        # Disable updating the weights
        with torch.no_grad():
            data_feats, data_labels = data2feats(inputs, token_vocab, label_vocab)

            logits = self.forward(data_feats)
            probabilities = self.softmax(logits)
            # return torch.argmax(probabilities, axis=1)
            return torch.argmax(probabilities, 2), probabilities  # Return both probs and labels


# define the model
langid_model = LangID(DIM_EMBEDDING, LSTM_HIDDEN, NWORDS)
loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
optimizer = optim.Adam(langid_model.parameters(), lr=LEARNING_RATE)
print('model overview: ')
print(langid_model)
print()

model overview: 
LangID(
  (word_embeddings): Embedding(19674, 100)
  (bilstm): LSTM(100, 50, batch_first=True)
  (hidden_to_tag): Linear(in_features=50, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)



In [34]:
print('epoch   loss      Train acc.')
for epoch in range(EPOCHS):
    langid_model.train() 
    langid_model.zero_grad()

    # Loop over batches
    loss = 0
    match = 0
    total = 0
    for batchIdx in range(0, num_batches):
        output_scores = langid_model.forward(train_feats_batches[batchIdx])
        
        output_scores = output_scores.view(BATCH_SIZE * max_len, -1)
        flat_labels = train_labels_batches[batchIdx].view(BATCH_SIZE * max_len)
        batch_loss = loss_function(output_scores, flat_labels)

        predicted_labels = torch.argmax(output_scores, 1)
        predicted_labels = predicted_labels.view(BATCH_SIZE, max_len)

        # Run backward pass
        batch_loss.backward()
        optimizer.step()
        langid_model.zero_grad()
        loss += batch_loss.item()
        # Update the number of correct tags and total tags
        for gold_sent, pred_sent in zip(train_labels_batches[batchIdx], predicted_labels):
            for gold_label, pred_label in zip(gold_sent, pred_sent):
                if gold_label != 0:
                    total += 1
                    if gold_label == pred_label:
                        match+= 1
    print('{0: <8}{1: <10}{2}'.format(epoch, '{:.2f}'.format(loss/num_batches), '{:.4f}'.format(match / total)))

epoch   loss      Train acc.
0       279.29    0.9419
1       117.44    0.9669
2       61.18     0.9820
3       36.83     0.9899
4       24.54     0.9933


In [11]:
# dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)
# print(langid_model.predict(dev_data[0][))
# print(dev_labels[4])


In [35]:
# # Usage for Checklist with test.run
# nlp = spacy.load("en_core_web_sm")
# dataset = []
# new_sentences = []
# new_tagged_dataset = []
# for sentence in train_data:
#     dataset.append(" ".join(sentence[0]))
# pdataset = list(nlp.pipe(dataset))
# t_names = Perturb.perturb(pdataset, Perturb.change_names, n=2)

# # Tokenize
# for sentences in t_names.data:
#     for sentence in sentences:
#         new_sentences.append(sentence.split())
# # Assign NER tags to the generated data
# for index, new_sentence in enumerate(new_sentences):
#     for sentence in train_data:
#         if new_sentence == sentence[0]:
#             ner_tag = sentence[1]
#     if index % 3 != 0:
#         new_tagged_dataset.append((new_sentences[index],ner_tag))


# changed_names_feats, dev_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)

# # Use our model
# test = INV(**t_names)
# test.run(langid_model.predict)
# test.summary()

In [36]:
sentences = []
predictions = []

def run_eval(feats_batches, labels_batches):
    langid_model.eval()
    match = 0
    total = 0
    for sents, labels in zip(feats_batches, labels_batches):
        output_scores = langid_model.forward(sents)
        predicted_tags  = torch.argmax(output_scores, 2)
        for sentence in sents:
            sentenceWords = []
            for wordIndex in sentence:
                sentenceWords.append(token_vocab.getWord(wordIndex.item()))
            sentences.append(sentenceWords)
        for sentenceTags in predicted_tags:
                predictionTagOneSentence = []
                for tag in sentenceTags:
                    predictionTagOneSentence.append(label_vocab.idx2word[tag.item()])
                predictions.append(predictionTagOneSentence)
        for goldSent, predSent in zip(labels, predicted_tags):
            for goldLabel, predLabel in zip(goldSent, predSent):
                if goldLabel.item() != 0:
                    total += 1
                    if goldLabel.item() == predLabel.item():
                        match+= 1
    return(match/total)


dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)
# print(token_vocab.getWord(dev_feats[0][0].item()))
num_batches_dev = int(len(dev_feats)/BATCH_SIZE)

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches)

print(dev_data[1][0])
print(dev_data[1][1])
print('')
print(sentences[1])
print(predictions[1])
print('Accuracy for dev data: {:.4f}'.format(score))

['I', 'searched', 'all', 'over', 'the', 'internet', ',', 'but', 'I', 'could', 'not', 'find', 'one', 'place', 'in', 'Tampa', 'Bay', 'that', 'sells', 'morcillas', ',', 'also', 'known', 'as', 'blood', 'pudding', ',', 'black', 'pudding', 'and', 'blood', 'sausages', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['I', '<PAD>', 'all', 'over', 'the', 'internet', ',', 'but', 'I', 'could', 'not', 'find', 'one', 'place', 'in', 'Tampa', 'Bay', 'that', '<PAD>', '<PAD>', ',', 'also', 'known', 'as', 'blood', '<PAD>', ',', 'black', '<PAD>', 'and', 'blood', '<PAD>', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',

In [37]:
nlp = spacy.load("en_core_web_sm")
dataset = []
for sentence in dev_data:
    dataset.append(" ".join(sentence[0]))
pdataset = list(nlp.pipe(dataset))

## Change names

In [41]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_names = Perturb.perturb(pdataset, Perturb.change_names, n=2)
original_sentences = []
for sentences in t_names.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_names.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
with open('changed_names_data.iob2', 'w') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")


sentences = []
predictions = []
print(len(new_tagged_dataset))

changed_names_feats, dev_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_names = int(len(changed_names_feats)/BATCH_SIZE)

changed_names_feats_batches = changed_names_feats[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
changed_names_labels_batches = dev_labels[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
score = run_eval(changed_names_feats_batches, changed_names_labels_batches)

print(new_tagged_dataset[1][0])
print(new_tagged_dataset[1][1])
print('')
print(sentences[5])
print(predictions[5])
print('\033[32mAccuracy for changed names data: \033[0m {:.4f}'.format(score))

with open('predictions_changed_names_data.iob2', 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")
    

predictionsNames = read_iob2_file('./predictions_changed_names_data.iob2')

print(new_tagged_dataset[0][1])
print(predictionsNames[0][1])

# python3 span_f1.py changed_names_data.iob2 predictions_changed_names_data.iob2 <- run this in terminal to get span f1 score

302
['Which', 'of', 'these', 'do', 'you', 'like', ':', 'McDonald', 's', ',', 'Burger', 'King', ',', 'Taco', 'Bell', ',', 'Wendy', 's', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O']

['<PAD>', 'is', 'pregnant', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD

## Change location

In [42]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_location = Perturb.perturb(pdataset, Perturb.change_location, n=2)
original_sentences = []
for sentences in t_location.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_location.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
with open('changed_location_data.iob2', 'w') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")


sentences = []
predictions = []
print(len(new_tagged_dataset))

changed_location_feats, dev_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_location = int(len(changed_location_feats)/BATCH_SIZE)

changed_location_feats_batches = changed_location_feats[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
changed_location_labels_batches = dev_labels[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
score = run_eval(changed_location_feats_batches, changed_location_labels_batches)

print(new_tagged_dataset[1][0])
print(new_tagged_dataset[1][1])
print('')
print(sentences[5])
print(predictions[5])
print('\033[32mAccuracy for changed location data: \033[0m {:.4f}'.format(score))

with open('predictions_changed_location_data.iob2', 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")
    

predictionsLocation = read_iob2_file('./predictions_changed_location_data.iob2')

print(new_tagged_dataset[0][1])
print(predictionsLocation[0][1])

# python3 span_f1.py changed_location_data.iob2 predictions_changed_location_data.iob2 <- run this in terminal to get span f1 score

196
['There', 'are', 'way', 'more', 'stranger', 'names', 'in', 'the', 'U.S', 'for', 'areas', 'than', 'Bakersfield', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOC', 'O']

['How', 'about', '<PAD>', '<PAD>', 'or', 'other', '<PAD>', 'from', 'that', 'area', 'of', '<PAD>', '?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

## Change numbers

In [43]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_number = Perturb.perturb(pdataset, Perturb.change_number, n=2)
original_sentences = []
for sentences in t_number.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_number.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
with open('changed_number_data.iob2', 'w') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")


sentences = []
predictions = []
print(len(new_tagged_dataset))

changed_number_feats, dev_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_number = int(len(changed_number_feats)/BATCH_SIZE)

changed_number_feats_batches = changed_number_feats[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
changed_number_labels_batches = dev_labels[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
score = run_eval(changed_number_feats_batches, changed_number_labels_batches)

print(new_tagged_dataset[1][0])
print(new_tagged_dataset[1][1])
print('')
print(sentences[5])
print(predictions[5])
print('\033[32mAccuracy for changed number data: \033[0m {:.4f}'.format(score))

with open('predictions_changed_number_data.iob2', 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")
    

predictionsNumber = read_iob2_file('./predictions_changed_number_data.iob2')

print(new_tagged_dataset[0][1])
print(predictionsNumber[0][1])

# python3 span_f1.py changed_number_data.iob2 predictions_changed_number_data.iob2 <- run this in terminal to get span f1 score

281
['2', 'cup', 'of', 'empanadas']
['O', 'O', 'O', 'O']

['2', 'cup', 'of', 'other', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD