### Imports

In [1]:
import torch
import spacy
from checklist.perturb import Perturb
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset

### Load data

In [3]:
def read_iob2_file(path):
    """
    Read a tab-separated, CoNLL-style IOB2 file.

    Lines starting with '#' are treated as comments and skipped. Every other
    non-empty line is split on tabs; column 1 is taken as the word and
    column 2 as its label (column 0 is ignored). Blank lines separate
    sentences.

    :param path: path to read from
    :returns: list with one (words, labels) tuple per sentence
    :raises IndexError: if a data line has fewer than three tab-separated columns
    """
    data = []
    current_words = []
    current_tags = []

    # The context manager closes the file handle even if parsing fails
    with open(path, encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Sentence boundary; consecutive blank lines add nothing
                if current_words:
                    data.append((current_words, current_tags))
                current_words = []
                current_tags = []
                continue
            if line[0] == '#':
                continue  # skip comments
            tok = line.split('\t')
            current_words.append(tok[1])
            current_tags.append(tok[2])

    # The file may end without a trailing blank line
    if current_tags:
        data.append((current_words, current_tags))
    return data

# Parse the training and development files into lists of (words, tags) sentence tuples.
# Paths are relative to the notebook's working directory.
train_data= read_iob2_file('data//en_ewt-ud-train.iob2')
dev_data = read_iob2_file('data//en_ewt-ud-dev.iob2')

### Data preprocessing

In [4]:
# Hyperparameters
# These configure the LSTM tagger and the manual batching below; the Trainer used for
# DistilBERT takes its own values from TrainingArguments.
DIM_EMBEDDING = 100   # embedding size passed to LangID
LSTM_HIDDEN = 50      # LSTM hidden size passed to LangID
BATCH_SIZE = 64       # sentences per batch for LSTM training and for all evaluation batches
LEARNING_RATE = 0.01  # learning rate of the Adam optimizer for the LSTM
EPOCHS = 5            # number of passes of the LSTM training loop
PAD = '<PAD>'         # padding / unknown symbol; index 0 in both vocabularies

In [5]:
class Vocab():
    """
    Two-way mapping between symbols and integer indices.

    A single symbol, ``pad_unk``, always occupies index 0 and doubles as the
    padding entry and as the fallback for unknown symbols.
    """

    def __init__(self, pad_unk):
        """
        Create a vocabulary that initially contains only ``pad_unk``.
        """
        self.pad_unk = pad_unk
        self.word2idx = {pad_unk: 0}
        self.idx2word = [pad_unk]

    def getIdx(self, word, add=False):
        """
        Return the index of ``word``.

        An unseen word is registered under the next free index when ``add``
        is true; otherwise the index of ``pad_unk`` is returned.
        """
        known_index = self.word2idx.get(word)
        if known_index is not None:
            return known_index
        if not add:
            return self.word2idx[self.pad_unk]
        new_index = len(self.idx2word)
        self.word2idx[word] = new_index
        self.idx2word.append(word)
        return new_index

    def getWord(self, idx):
        """Return the symbol stored at position ``idx``."""
        return self.idx2word[idx]


# Length (in tokens) of the longest training sentence; used as the padded sequence length
max_len = max(len(sentence[0]) for sentence in train_data)

# One vocabulary for the tokens and one for the tags;
# index 0 of each is reserved for PAD
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

# Register every training token and tag (add=True grows the vocabulary)
for sent_tokens, sent_tags in train_data:
    for entries, vocab in ((sent_tokens, token_vocab), (sent_tags, label_vocab)):
        for entry in entries:
            vocab.getIdx(entry, add=True)

# Vocabulary sizes, PAD entry included
NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab, max_length=None):
    """
    Turn (words, labels) sentence pairs into zero-padded index tensors.

    :param inputData: sequence of sentences; item [0] holds the words and
        item [1] the labels of a sentence
    :param word_vocab: object whose getIdx(word) returns the word index
    :param label_vocab: object whose getIdx(label) returns the label index
    :param max_length: padded sequence length; defaults to the notebook-level
        ``max_len`` when omitted
    :returns: (feats, labels), two long tensors of shape
        (len(inputData), max_length); longer sentences are truncated and
        unused positions stay 0
    """
    if max_length is None:
        # Fall back to the notebook global so existing calls behave as before
        max_length = max_len
    feats = torch.zeros((len(inputData), max_length), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_length), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_length]):
            feats[sentPos, wordPos] = word_vocab.getIdx(word)
        for labelPos, label in enumerate(sent[1][:max_length]):
            labels[sentPos, labelPos] = label_vocab.getIdx(label)
    return feats, labels

# Index and pad the whole training set; both tensors have one row per sentence
train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)

In [7]:
# Pretrained checkpoint to fine-tune for token classification.
# NOTE(review): `tokenizer` is loaded but not used anywhere in the visible notebook; the
# model is later fed indices produced by `token_vocab` instead of tokenizer ids — confirm
# that this is intended.
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# One output class per entry of the tag vocabulary (NTAGS counts the PAD entry too)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=NTAGS)

In [8]:
def compute_metrics(pred):
  """
  Compute per-class precision, recall and F1 over all token positions.

  :param pred: prediction object exposing ``predictions`` (class scores with
      the classes on the last axis) and ``label_ids`` (gold label indices)
  :returns: dict with the keys 'precision', 'recall' and 'f1', each a list
      holding one value per class
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # reshape(-1) flattens NumPy arrays as well as torch tensors. On a NumPy
  # array, view(-1) is a dtype reinterpretation and raises instead of flattening.
  flat_labels = labels.reshape(-1)
  flat_preds = preds.reshape(-1)

  # average=None keeps one score per class instead of a single aggregate.
  # NOTE(review): padded positions (label 0) are scored like any other class here.
  precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_preds, average=None)

  return {
      'precision': precision.tolist(),
      'recall': recall.tolist(),
      'f1': f1.tolist(),
  }

# Fine tuning BERT

### Length of training data = 12543
Please edit the **number_of_sentences** to the desired number

In [9]:
# Training configuration for the Hugging Face Trainer that fine-tunes `model`.
# These values are independent of the LSTM hyperparameters defined earlier.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    logging_steps=250,
    weight_decay=0.0,
    warmup_ratio=0.1,
    save_strategy="epoch",
    # NOTE(review): save_steps is presumably ignored while save_strategy is "epoch" — confirm
    save_steps=100,
    lr_scheduler_type="linear",
    gradient_checkpointing=True,
)

In [10]:
# Number of training sentences (taken from the start of the training set) to fine-tune on
number_of_sentences = 12543
# input_ids are token_vocab indices and label holds the padded tag indices; no
# attention_mask column is provided.
# NOTE(review): verify that the Trainer's collator handles a per-token 'label' column as intended.
features = {'input_ids': train_features[:number_of_sentences], 'label': train_labels[:number_of_sentences]}
train_dataset = Dataset.from_dict(features)

### The part below needs to be run only **once**! If the fine-tuned model has already been saved in `distil_bert`, you can skip it.

~ 50s / it for 100 data

In [11]:
# Fine-tune `model` on train_dataset. No eval_dataset is passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,  # presumably only invoked during evaluation — confirm
)

# Long-running step: trains, then writes the model to ./distil_bert,
# the directory the "Load fine tuned BERT" section reads from.
trainer.train()
trainer.save_model("distil_bert")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/3920 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'loss': 0.4057, 'learning_rate': 1.2755102040816327e-05, 'epoch': 0.32}
{'loss': 0.0282, 'learning_rate': 1.9387755102040817e-05, 'epoch': 0.64}
{'loss': 0.0244, 'learning_rate': 1.7970521541950115e-05, 'epoch': 0.96}




{'loss': 0.022, 'learning_rate': 1.655328798185941e-05, 'epoch': 1.28}
{'loss': 0.0212, 'learning_rate': 1.5136054421768709e-05, 'epoch': 1.59}
{'loss': 0.0179, 'learning_rate': 1.3718820861678006e-05, 'epoch': 1.91}




{'loss': 0.0155, 'learning_rate': 1.2301587301587303e-05, 'epoch': 2.23}
{'loss': 0.0143, 'learning_rate': 1.0884353741496601e-05, 'epoch': 2.55}
{'loss': 0.0132, 'learning_rate': 9.467120181405896e-06, 'epoch': 2.87}




{'loss': 0.0114, 'learning_rate': 8.049886621315193e-06, 'epoch': 3.19}
{'loss': 0.0107, 'learning_rate': 6.63265306122449e-06, 'epoch': 3.51}
{'loss': 0.01, 'learning_rate': 5.2154195011337876e-06, 'epoch': 3.83}




{'loss': 0.0087, 'learning_rate': 3.7981859410430844e-06, 'epoch': 4.15}
{'loss': 0.0087, 'learning_rate': 2.380952380952381e-06, 'epoch': 4.46}
{'loss': 0.0079, 'learning_rate': 9.63718820861678e-07, 'epoch': 4.78}
{'train_runtime': 3337.8783, 'train_samples_per_second': 18.789, 'train_steps_per_second': 1.174, 'train_loss': 0.03988697303801167, 'epoch': 5.0}


In [5]:
class LangID(nn.Module):
    """
    Sequence tagger: embedding -> LSTM -> linear layer, one score vector per token.
    """

    def __init__(self, embed_dim, lstm_dim, vocab_dim, num_tags=None):
        """
        :param embed_dim: size of the word embeddings
        :param lstm_dim: hidden size of the LSTM
        :param vocab_dim: number of rows in the embedding table
        :param num_tags: number of output classes; defaults to the
            notebook-level ``NTAGS`` when omitted
        """
        super(LangID, self).__init__()
        if num_tags is None:
            num_tags = NTAGS
        self.word_embeddings = nn.Embedding(vocab_dim, embed_dim)
        # The attribute is called `bilstm` but the LSTM is unidirectional
        # (bidirectional=False); the name is kept so state-dict keys stay the same.
        self.bilstm = nn.LSTM(embed_dim, lstm_dim, bidirectional=False, batch_first=True)
        self.hidden_to_tag = nn.Linear(lstm_dim, num_tags)
        self.lstm_dim = lstm_dim
        # Normalise over the tag axis (last axis of the (batch, seq, tags) logits).
        # dim=1 would normalise across sequence positions instead.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs):
        """
        :param inputs: long tensor of word indices, (batch, seq)
        :returns: unnormalised tag scores, (batch, seq, num_tags)
        """
        word_vectors = self.word_embeddings(inputs)
        bilstm_out, _ = self.bilstm(word_vectors)
        y = self.hidden_to_tag(bilstm_out)
        return y  # raw logits; apply self.softmax to obtain probabilities

    def predict(self, inputs):
        """
        Predict tag indices for raw (words, labels) sentences.

        The sentences are indexed and padded with the notebook-level
        ``data2feats``, ``token_vocab`` and ``label_vocab``.

        :param inputs: sequence of (words, labels) sentence pairs
        :returns: long tensor of predicted tag indices, one row per sentence
        """
        with torch.no_grad():
            data_feats, data_labels = data2feats(inputs, token_vocab, label_vocab)

            logits = self.forward(data_feats)
            probabilities = self.softmax(logits)
            return torch.argmax(probabilities, 2)


# define the model
langid_model = LangID(DIM_EMBEDDING, LSTM_HIDDEN, NWORDS)
# ignore_index=0 excludes positions whose gold label is index 0 (PAD) from the loss;
# reduction='sum' adds up the per-token losses instead of averaging them
loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
optimizer = optim.Adam(langid_model.parameters(), lr=LEARNING_RATE)
print('model overview: ')
print(langid_model)
print()

model overview: 
LangID(
  (word_embeddings): Embedding(19674, 100)
  (bilstm): LSTM(100, 50, batch_first=True)
  (hidden_to_tag): Linear(in_features=50, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)



In [6]:
# Split the padded training tensors into equally sized batches.
# Floor division drops the sentences that do not fill a final batch.
num_batches = len(train_features) // BATCH_SIZE
usable_rows = BATCH_SIZE * num_batches
batch_shape = (num_batches, BATCH_SIZE, max_len)
train_feats_batches = train_features[:usable_rows].view(batch_shape)
train_labels_batches = train_labels[:usable_rows].view(batch_shape)

In [7]:
# Train the LSTM tagger and report, per epoch, the mean summed batch loss and the
# token accuracy on the training data (padded positions excluded).
print('epoch   loss      Train acc.')
for epoch in range(EPOCHS):
    langid_model.train()
    langid_model.zero_grad()

    # Running totals for this epoch
    loss = 0
    match = 0
    total = 0
    for batchIdx in range(num_batches):
        # Call the module itself rather than .forward so nn.Module hooks run
        output_scores = langid_model(train_feats_batches[batchIdx])

        # Flatten to (tokens, tags) / (tokens,) as expected by CrossEntropyLoss
        output_scores = output_scores.view(BATCH_SIZE * max_len, -1)
        flat_labels = train_labels_batches[batchIdx].view(BATCH_SIZE * max_len)
        batch_loss = loss_function(output_scores, flat_labels)

        predicted_labels = torch.argmax(output_scores, 1)

        # Run backward pass
        batch_loss.backward()
        optimizer.step()
        langid_model.zero_grad()
        loss += batch_loss.item()

        # Update the number of correct tags and total tags with tensor
        # operations instead of a per-token Python loop; label 0 is padding.
        is_real_token = flat_labels != 0
        total += int(is_real_token.sum())
        match += int(((predicted_labels == flat_labels) & is_real_token).sum())
    print('{0: <8}{1: <10}{2}'.format(epoch, '{:.2f}'.format(loss/num_batches), '{:.4f}'.format(match / total)))

epoch   loss      Train acc.
0       277.50    0.9401
1       114.62    0.9677
2       61.09     0.9822
3       37.46     0.9893
4       25.42     0.9930


# Load fine tuned BERT

In [10]:
# Load our finetuned model from the local "distil_bert" directory
# (the directory the training section writes with trainer.save_model)
fine_tuned = AutoModelForTokenClassification.from_pretrained("distil_bert")

In [11]:
# Show the architecture of the loaded checkpoint as a sanity check
print(fine_tuned)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

## Evaluate our model on dev data

In [13]:
# Filled by run_eval as a side effect; reset both lists before every call
sentences = []
predictions = []

def run_eval(feats_batches, labels_batches, model):
    """
    Evaluate one of the notebook's models and collect its predictions.

    Side effects: for every evaluated row, the decoded words are appended to
    the module-level ``sentences`` list and the predicted tag strings to
    ``predictions``. Rows keep their full padded length, so padded positions
    are included, and words unknown to ``token_vocab`` decode to the PAD symbol.

    :param feats_batches: word-index batches, (num_batches, batch, seq)
    :param labels_batches: gold tag-index batches of the same shape
    :param model: 'LSTM' (uses ``langid_model``) or 'BERT' (uses ``fine_tuned``)
    :returns: accuracy over positions whose gold label is not 0; 0.0 when no
        such position exists; None (after printing a message) for an
        unsupported model name
    """
    if model == 'LSTM':
        langid_model.eval()
    elif model == 'BERT':
        fine_tuned.eval()
    match = 0
    total = 0
    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        for sents, labels in zip(feats_batches, labels_batches):
            if model == 'LSTM':
                output_scores = langid_model(sents)
                predicted_tags = torch.argmax(output_scores, 2)
            elif model == 'BERT':
                output_scores = fine_tuned(sents)
                predicted_tags = torch.argmax(output_scores.logits, dim=-1)
            else:
                print('Please specify supported model.')
                return
            # Decode indices back to strings for the prediction files
            for sentence in sents:
                sentences.append([token_vocab.getWord(wordIndex) for wordIndex in sentence.tolist()])
            for sentenceTags in predicted_tags:
                predictions.append([label_vocab.idx2word[tag] for tag in sentenceTags.tolist()])
            # Gold label 0 marks padding and is excluded from the accuracy
            is_real_token = labels != 0
            total += int(is_real_token.sum())
            match += int(((predicted_tags == labels) & is_real_token).sum())
    # Guard against empty input (e.g. fewer sentences than one batch)
    return match / total if total else 0.0

# Index and pad the dev set, then split it into full batches
# (sentences that do not fill a final batch are dropped)
dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)
num_batches_dev = len(dev_feats) // BATCH_SIZE

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches, 'BERT')

print('Accuracy for dev data: {:.4f}'.format(score))

# Write predictions as index<TAB>word<TAB>label, one blank line between sentences.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('bert_predictions_dev.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# NOTE(review): the file above is written to the working directory, while the command below reads it from data/
# python3 span_f1.py data/en_ewt-ud-dev.iob2 data/bert_predictions_dev.iob2 <- run this in terminal to get span f1 score

In [14]:
# Parse the dev sentences with spaCy so the CheckList perturbations can use them.
nlp = spacy.load("en_core_web_sm")
# Each sentence is rebuilt as plain text by joining its gold tokens with single spaces
dataset = [" ".join(sentence[0]) for sentence in dev_data]
pdataset = list(nlp.pipe(dataset))

# Change names

In [15]:
# Perturb person names in the dev sentences and evaluate the fine-tuned BERT model on the result.
# n = number of perturbed sentences requested for each sentence the perturbation applies to
t_names = Perturb.perturb(pdataset, Perturb.change_names, n=2)

# Every entry of t_names.data is handled as one group: the unchanged sentence
# first, followed by its perturbed variants.
original_sentences = [group[0] for group in t_names.data]

# Tokenize every sentence (originals included) on whitespace
new_sentences = [text.split() for group in t_names.data for text in group]

# Assign NER tags to the generated data: each perturbed sentence inherits the gold
# tags of the dev sentence it was derived from. The dictionary replaces a linear scan
# of dev_data per sentence, and iterating per group avoids relying on exactly two
# variants per original.
# NOTE(review): assumes a perturbation keeps the token count unchanged; zip() below
# silently truncates otherwise — confirm.
gold_tags_by_tokens = {tuple(tokens): tags for tokens, tags in dev_data}
new_tagged_dataset = []
for group in t_names.data:
    ner_tag = gold_tags_by_tokens.get(tuple(group[0].split()))
    if ner_tag is None:
        continue  # original sentence not found in dev_data: no gold tags to inherit
    for perturbed in group[1:]:
        new_tagged_dataset.append((perturbed.split(), ner_tag))

# Create gold labels file: index<TAB>word<TAB>label.
with open('gold_names.iob2', 'w', encoding='utf-8') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

changed_names_feats, dev_names_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_names = len(changed_names_feats) // BATCH_SIZE

changed_names_feats_batches = changed_names_feats[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
changed_names_labels_batches = dev_names_labels[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
score = run_eval(changed_names_feats_batches, changed_names_labels_batches, 'BERT')

print('\033[32mAccuracy for changed names data: \033[0m {:.4f}'.format(score))

# Explicit UTF-8 so the output matches the encoding of the input data on every platform
with open('bert_predictions_names.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_names.iob2 data/bert_predictions_names.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed names data: [0m 0.8046


# Change location

In [12]:
# Perturb locations in the dev sentences and evaluate the fine-tuned BERT model on the result.
# n = number of perturbed sentences requested for each sentence the perturbation applies to
t_location = Perturb.perturb(pdataset, Perturb.change_location, n=2)

# Every entry of t_location.data is handled as one group: the unchanged sentence
# first, followed by its perturbed variants.
original_sentences = [group[0] for group in t_location.data]

# Tokenize every sentence (originals included) on whitespace
new_sentences = [text.split() for group in t_location.data for text in group]

# Assign NER tags to the generated data: each perturbed sentence inherits the gold
# tags of the dev sentence it was derived from. The dictionary replaces a linear scan
# of dev_data per sentence, and iterating per group avoids relying on exactly two
# variants per original.
# NOTE(review): assumes a perturbation keeps the token count unchanged; zip() below
# silently truncates otherwise — confirm.
gold_tags_by_tokens = {tuple(tokens): tags for tokens, tags in dev_data}
new_tagged_dataset = []
for group in t_location.data:
    ner_tag = gold_tags_by_tokens.get(tuple(group[0].split()))
    if ner_tag is None:
        continue  # original sentence not found in dev_data: no gold tags to inherit
    for perturbed in group[1:]:
        new_tagged_dataset.append((perturbed.split(), ner_tag))

# Create gold labels file: index<TAB>word<TAB>label.
with open('gold_location.iob2', 'w', encoding='utf-8') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

changed_location_feats, dev_location_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_location = len(changed_location_feats) // BATCH_SIZE

changed_location_feats_batches = changed_location_feats[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
changed_location_labels_batches = dev_location_labels[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
score = run_eval(changed_location_feats_batches, changed_location_labels_batches, 'BERT')

print('\033[32mAccuracy for changed location data: \033[0m {:.4f}'.format(score))

# Explicit UTF-8 so the output matches the encoding of the input data on every platform
with open('bert_predictions_location.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_location.iob2 data/bert_predictions_location.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed location data: [0m 0.8827


# Change numbers

In [13]:
# Perturb numbers in the dev sentences and evaluate the fine-tuned BERT model on the result.
# n = number of perturbed sentences requested for each sentence the perturbation applies to
t_number = Perturb.perturb(pdataset, Perturb.change_number, n=2)

# Every entry of t_number.data is handled as one group: the unchanged sentence
# first, followed by its perturbed variants.
original_sentences = [group[0] for group in t_number.data]

# Tokenize every sentence (originals included) on whitespace
new_sentences = [text.split() for group in t_number.data for text in group]

# Assign NER tags to the generated data: each perturbed sentence inherits the gold
# tags of the dev sentence it was derived from. The dictionary replaces a linear scan
# of dev_data per sentence, and iterating per group avoids relying on exactly two
# variants per original.
# NOTE(review): assumes a perturbation keeps the token count unchanged; zip() below
# silently truncates otherwise — confirm.
gold_tags_by_tokens = {tuple(tokens): tags for tokens, tags in dev_data}
new_tagged_dataset = []
for group in t_number.data:
    ner_tag = gold_tags_by_tokens.get(tuple(group[0].split()))
    if ner_tag is None:
        continue  # original sentence not found in dev_data: no gold tags to inherit
    for perturbed in group[1:]:
        new_tagged_dataset.append((perturbed.split(), ner_tag))

# Create gold labels file: index<TAB>word<TAB>label.
with open('gold_numbers.iob2', 'w', encoding='utf-8') as f:
    for sentence, tag in new_tagged_dataset:
        for index, (token, pred) in enumerate(zip(sentence, tag)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

changed_number_feats, dev_numbers_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_number = len(changed_number_feats) // BATCH_SIZE

changed_number_feats_batches = changed_number_feats[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
changed_number_labels_batches = dev_numbers_labels[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
score = run_eval(changed_number_feats_batches, changed_number_labels_batches, 'BERT')

print('\033[32mAccuracy for changed number data: \033[0m {:.4f}'.format(score))

# Explicit UTF-8 so the output matches the encoding of the input data on every platform
with open('bert_predictions_numbers.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_numbers.iob2 data/bert_predictions_numbers.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed number data: [0m 0.9080


# LSTM predictions

## Evaluating LSTM on dev

In [14]:
# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

# Evaluate the LSTM tagger on the dev batches
score = run_eval(dev_feats_batches, dev_labels_batches, 'LSTM')

print('Accuracy for dev data: {:.4f}'.format(score))

# Write predictions as index<TAB>word<TAB>label, one blank line between sentences.
# Explicit UTF-8 so the output matches the encoding of the input data on every platform.
with open('lstm_predictions_dev.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/en_ewt-ud-dev.iob2 data/lstm_predictions_dev.iob2 <- run this in terminal to get span f1 score

Accuracy for dev data: 0.9603


## Change names for LSTM

In [15]:
# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

# Evaluate the LSTM tagger on the name-perturbed batches
score = run_eval(changed_names_feats_batches, changed_names_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed names data: \033[0m {:.4f}'.format(score))

# Write predictions as index<TAB>word<TAB>label, one blank line between sentences.
# Explicit UTF-8 so the output matches the encoding of the input data on every platform.
with open('lstm_predictions_names.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_names.iob2 data/lstm_predictions_names.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed names data: [0m 0.9117


## Change location for LSTM

In [16]:
# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

# Evaluate the LSTM tagger on the location-perturbed batches
score = run_eval(changed_location_feats_batches, changed_location_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed location data: \033[0m {:.4f}'.format(score))

# Write predictions as index<TAB>word<TAB>label, one blank line between sentences.
# Explicit UTF-8 so the output matches the encoding of the input data on every platform.
with open('lstm_predictions_location.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_location.iob2 data/lstm_predictions_location.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed location data: [0m 0.9222


## Change numbers for LSTM

In [17]:
# run_eval appends to these module-level lists, so start from empty ones
sentences = []
predictions = []

# Evaluate the LSTM tagger on the number-perturbed batches
score = run_eval(changed_number_feats_batches, changed_number_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed number data: \033[0m {:.4f}'.format(score))

# Write predictions as index<TAB>word<TAB>label, one blank line between sentences.
# Explicit UTF-8 so the output matches the encoding of the input data on every platform.
with open('lstm_predictions_numbers.iob2', 'w', encoding='utf-8') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/gold_numbers.iob2 data/lstm_predictions_numbers.iob2 <- run this in terminal to get span f1 score

[32mAccuracy for changed number data: [0m 0.9381
