### Imports

In [1]:
import torch
import spacy
from checklist.perturb import Perturb
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
import os.path
import sklearn

### Load data

In [2]:
def read_iob2_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

train_data= read_iob2_file('data//en_ewt-ud-train.iob2')
dev_data = read_iob2_file('data//en_ewt-ud-dev.iob2')

### Data preprocessing

In [3]:
# Hyperparameters
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 5
PAD = '<PAD>'

In [4]:
class Vocab():
    def __init__(self, pad_unk):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]


max_len = max([len(x[0]) for x in train_data ])

# Create vocabularies for both the tokens
# and the tags
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

for tokens, tags in train_data:
    for token in tokens:
        token_vocab.getIdx(token, True)
    for tag in tags:
        label_vocab.getIdx(tag, True)

NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab):
    feats = torch.zeros((len(inputData), max_len), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_len), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_len]):
            wordIdx = word_vocab.getIdx(word)
            feats[sentPos][wordPos] = wordIdx
        for labelPos, label in enumerate(sent[1][:max_len]):
            labelIdx = label_vocab.getIdx(label)
            labels[sentPos][labelPos] = labelIdx
    return feats, labels

train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)
dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)

In [7]:
model_name = "distilbert/distilbert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=NTAGS)
def get_model():
    return AutoModelForTokenClassification.from_pretrained(model_name, num_labels=NTAGS)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  flat_labels = labels.flatten()
  flat_preds = preds.flatten()
  precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, flat_preds, average='weighted')
  
  return {
      'precision': precision,
      'recall': recall,
      'f1': f1,
  }

# Fine tuning BERT

### Length of training data = 12543
Please edit the **number_of_sentences** to the desired number

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-05, # was 2e-5 = 0.00002
    per_device_train_batch_size=4, # was 16
    num_train_epochs=5, # was 5
    logging_steps=50, # was 250
    weight_decay=0.0,
    warmup_ratio=0.1,
    save_strategy="epoch",
    save_steps=100,
    lr_scheduler_type="linear",
    gradient_checkpointing=True,
)

In [10]:
number_of_sentences = 12543 # was 12543
features = {'input_ids': train_features[:number_of_sentences], 'label': train_labels[:number_of_sentences]}
train_dataset = Dataset.from_dict(features)
dev_features = {'input_ids': dev_feats[:number_of_sentences], 'label': dev_labels[:number_of_sentences]}
dev_dataset = Dataset.from_dict(dev_features)

###  Part below needs to be run only **once**! - if we have the desired model saved in finetuned_bert, then you can skip

~ 50s / it for 100 data

In [10]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [2,3,4,5]),
        "weight_decay": trial.suggest_categorical("weight_decay", [0, 0.3]),
    }

In [11]:
trainer = Trainer(
    # model_init=get_model,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics, 
)
#BestRun(run_id='0', objective=2.978584935884372, hyperparameters={'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0}, run_summary=None) - newest





# BestRun(run_id='2', objective=2.9824558048233256, hyperparameters={'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0}, run_summary=None)
# best_run = trainer.hyperparameter_search(
#     direction="maximize",
#     backend="optuna",
#     hp_space=optuna_hp_space,
#     n_trials=10,
#     )
    
# best_run


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [12]:
# for n, v in best_run.hyperparameters.items():
#     setattr(trainer.args, n, v)

trainer.train()
trainer.save_model("distil_bert_new")

  0%|          | 0/15680 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'loss': 1.7798, 'learning_rate': 6.377551020408164e-07, 'epoch': 0.02}
{'loss': 0.9568, 'learning_rate': 1.2755102040816329e-06, 'epoch': 0.03}
{'loss': 0.3542, 'learning_rate': 1.913265306122449e-06, 'epoch': 0.05}
{'loss': 0.1699, 'learning_rate': 2.5510204081632657e-06, 'epoch': 0.06}
{'loss': 0.1047, 'learning_rate': 3.1887755102040818e-06, 'epoch': 0.08}
{'loss': 0.0931, 'learning_rate': 3.826530612244898e-06, 'epoch': 0.1}
{'loss': 0.082, 'learning_rate': 4.464285714285715e-06, 'epoch': 0.11}
{'loss': 0.0621, 'learning_rate': 5.1020408163265315e-06, 'epoch': 0.13}
{'loss': 0.0545, 'learning_rate': 5.739795918367348e-06, 'epoch': 0.14}
{'loss': 0.0343, 'learning_rate': 6.3775510204081635e-06, 'epoch': 0.16}
{'loss': 0.0325, 'learning_rate': 7.01530612244898e-06, 'epoch': 0.18}
{'loss': 0.0334, 'learning_rate': 7.653061224489796e-06, 'epoch': 0.19}
{'loss': 0.0257, 'learning_rate': 8.290816326530612e-06, 'epoch': 0.21}
{'loss': 0.0265, 'learning_rate': 8.92857142857143e-06, 'epoch



{'loss': 0.0181, 'learning_rate': 1.7757936507936507e-05, 'epoch': 1.0}
{'loss': 0.0139, 'learning_rate': 1.7687074829931973e-05, 'epoch': 1.02}
{'loss': 0.018, 'learning_rate': 1.761621315192744e-05, 'epoch': 1.04}
{'loss': 0.0211, 'learning_rate': 1.7545351473922905e-05, 'epoch': 1.05}
{'loss': 0.0186, 'learning_rate': 1.7474489795918368e-05, 'epoch': 1.07}
{'loss': 0.0208, 'learning_rate': 1.7403628117913834e-05, 'epoch': 1.08}
{'loss': 0.0139, 'learning_rate': 1.73327664399093e-05, 'epoch': 1.1}
{'loss': 0.0197, 'learning_rate': 1.7261904761904763e-05, 'epoch': 1.12}
{'loss': 0.0184, 'learning_rate': 1.7191043083900226e-05, 'epoch': 1.13}
{'loss': 0.0173, 'learning_rate': 1.7120181405895692e-05, 'epoch': 1.15}
{'loss': 0.0171, 'learning_rate': 1.7049319727891158e-05, 'epoch': 1.16}
{'loss': 0.0138, 'learning_rate': 1.6978458049886624e-05, 'epoch': 1.18}
{'loss': 0.0164, 'learning_rate': 1.6907596371882087e-05, 'epoch': 1.2}
{'loss': 0.0145, 'learning_rate': 1.6836734693877553e-05, 



{'loss': 0.0103, 'learning_rate': 1.3293650793650794e-05, 'epoch': 2.01}
{'loss': 0.0113, 'learning_rate': 1.322278911564626e-05, 'epoch': 2.02}
{'loss': 0.0107, 'learning_rate': 1.3151927437641725e-05, 'epoch': 2.04}
{'loss': 0.0093, 'learning_rate': 1.308106575963719e-05, 'epoch': 2.06}
{'loss': 0.0108, 'learning_rate': 1.3010204081632653e-05, 'epoch': 2.07}
{'loss': 0.0072, 'learning_rate': 1.293934240362812e-05, 'epoch': 2.09}
{'loss': 0.0077, 'learning_rate': 1.2868480725623584e-05, 'epoch': 2.1}
{'loss': 0.0072, 'learning_rate': 1.2797619047619048e-05, 'epoch': 2.12}
{'loss': 0.0115, 'learning_rate': 1.2726757369614513e-05, 'epoch': 2.14}
{'loss': 0.0097, 'learning_rate': 1.2655895691609979e-05, 'epoch': 2.15}
{'loss': 0.0094, 'learning_rate': 1.2585034013605443e-05, 'epoch': 2.17}
{'loss': 0.0099, 'learning_rate': 1.2514172335600908e-05, 'epoch': 2.18}
{'loss': 0.0088, 'learning_rate': 1.2443310657596372e-05, 'epoch': 2.2}
{'loss': 0.0097, 'learning_rate': 1.2372448979591838e-05



{'loss': 0.0074, 'learning_rate': 8.82936507936508e-06, 'epoch': 3.01}
{'loss': 0.0061, 'learning_rate': 8.758503401360546e-06, 'epoch': 3.03}
{'loss': 0.0067, 'learning_rate': 8.68764172335601e-06, 'epoch': 3.05}
{'loss': 0.0064, 'learning_rate': 8.616780045351474e-06, 'epoch': 3.06}
{'loss': 0.0047, 'learning_rate': 8.545918367346939e-06, 'epoch': 3.08}
{'loss': 0.0038, 'learning_rate': 8.475056689342405e-06, 'epoch': 3.09}
{'loss': 0.0043, 'learning_rate': 8.40419501133787e-06, 'epoch': 3.11}
{'loss': 0.0055, 'learning_rate': 8.333333333333334e-06, 'epoch': 3.12}
{'loss': 0.005, 'learning_rate': 8.262471655328798e-06, 'epoch': 3.14}
{'loss': 0.0049, 'learning_rate': 8.191609977324264e-06, 'epoch': 3.16}
{'loss': 0.0049, 'learning_rate': 8.120748299319729e-06, 'epoch': 3.17}
{'loss': 0.0068, 'learning_rate': 8.049886621315193e-06, 'epoch': 3.19}
{'loss': 0.0053, 'learning_rate': 7.979024943310658e-06, 'epoch': 3.2}
{'loss': 0.0061, 'learning_rate': 7.908163265306124e-06, 'epoch': 3.2



{'loss': 0.0041, 'learning_rate': 4.4359410430839e-06, 'epoch': 4.0}
{'loss': 0.0036, 'learning_rate': 4.365079365079366e-06, 'epoch': 4.02}
{'loss': 0.0042, 'learning_rate': 4.29421768707483e-06, 'epoch': 4.03}
{'loss': 0.0033, 'learning_rate': 4.223356009070295e-06, 'epoch': 4.05}
{'loss': 0.0036, 'learning_rate': 4.15249433106576e-06, 'epoch': 4.07}
{'loss': 0.0028, 'learning_rate': 4.081632653061225e-06, 'epoch': 4.08}
{'loss': 0.0038, 'learning_rate': 4.0107709750566894e-06, 'epoch': 4.1}
{'loss': 0.0043, 'learning_rate': 3.939909297052155e-06, 'epoch': 4.11}
{'loss': 0.0048, 'learning_rate': 3.869047619047619e-06, 'epoch': 4.13}
{'loss': 0.0033, 'learning_rate': 3.7981859410430844e-06, 'epoch': 4.15}
{'loss': 0.0026, 'learning_rate': 3.7273242630385492e-06, 'epoch': 4.16}
{'loss': 0.0041, 'learning_rate': 3.656462585034014e-06, 'epoch': 4.18}
{'loss': 0.0026, 'learning_rate': 3.585600907029479e-06, 'epoch': 4.19}
{'loss': 0.0042, 'learning_rate': 3.5147392290249437e-06, 'epoch': 

In [5]:
class LangID(nn.Module):
    def __init__(self, embed_dim, lstm_dim, vocab_dim):
        super(LangID, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, lstm_dim, bidirectional=False, batch_first=True)
        self.hidden_to_tag = nn.Linear(lstm_dim, NTAGS)
        self.lstm_dim = lstm_dim
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, inputs):
        word_vectors = self.word_embeddings(inputs)
        bilstm_out, _ = self.bilstm(word_vectors)
        y = self.hidden_to_tag(bilstm_out)
        return y # softmax this in order to get probs, check out for axis, has to sum up to 1
    
    def predict(self, inputs):
        with torch.no_grad():
            data_feats, data_labels = data2feats(inputs, token_vocab, label_vocab)

            logits = self.forward(data_feats)
            probabilities = self.softmax(logits)
            return torch.argmax(probabilities, 2)


# define the model
langid_model = LangID(DIM_EMBEDDING, LSTM_HIDDEN, NWORDS)
loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
optimizer = optim.Adam(langid_model.parameters(), lr=LEARNING_RATE)
print('model overview: ')
print(langid_model)
print()

model overview: 
LangID(
  (word_embeddings): Embedding(19674, 100)
  (bilstm): LSTM(100, 50, batch_first=True)
  (hidden_to_tag): Linear(in_features=50, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)



In [6]:
# convert to batches
num_batches = int(len(train_features)/BATCH_SIZE)
train_feats_batches = train_features[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)
train_labels_batches = train_labels[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)

In [7]:
print('epoch   loss      Train acc.')
for epoch in range(EPOCHS):
    langid_model.train() 
    langid_model.zero_grad()

    # Loop over batches
    loss = 0
    match = 0
    total = 0
    for batchIdx in range(0, num_batches):
        output_scores = langid_model.forward(train_feats_batches[batchIdx])
        
        output_scores = output_scores.view(BATCH_SIZE * max_len, -1)
        flat_labels = train_labels_batches[batchIdx].view(BATCH_SIZE * max_len)
        batch_loss = loss_function(output_scores, flat_labels)

        predicted_labels = torch.argmax(output_scores, 1)
        predicted_labels = predicted_labels.view(BATCH_SIZE, max_len)

        # Run backward pass
        batch_loss.backward()
        optimizer.step()
        langid_model.zero_grad()
        loss += batch_loss.item()
        # Update the number of correct tags and total tags
        for gold_sent, pred_sent in zip(train_labels_batches[batchIdx], predicted_labels):
            for gold_label, pred_label in zip(gold_sent, pred_sent):
                if gold_label != 0:
                    total += 1
                    if gold_label == pred_label:
                        match+= 1
    print('{0: <8}{1: <10}{2}'.format(epoch, '{:.2f}'.format(loss/num_batches), '{:.4f}'.format(match / total)))

epoch   loss      Train acc.
0       271.02    0.9431
1       113.19    0.9683
2       59.08     0.9827
3       35.58     0.9899
4       23.35     0.9935


# Load fine tuned BERT

In [10]:
# Load our finetuned model
fine_tuned = AutoModelForTokenClassification.from_pretrained("distil_bert_new")

In [11]:
print(fine_tuned)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

## Evaluate our model on dev data

In [12]:
sentences = []
predictions = []

def run_eval(feats_batches, labels_batches, model):
    if model == 'LSTM':
        langid_model.eval()
    match = 0
    total = 0
    for sents, labels in zip(feats_batches, labels_batches):
        if model == 'LSTM':
            output_scores = langid_model.forward(sents)
            predicted_tags  = torch.argmax(output_scores, 2)
        elif model == 'BERT':
            output_scores = fine_tuned(sents) 
            predicted_tags  = torch.argmax(output_scores.logits, dim=-1)
        else:
            print('Please specify supported model.')
            return
        for sentence in sents:
            sentenceWords = []
            for wordIndex in sentence:
                sentenceWords.append(token_vocab.getWord(wordIndex.item()))
            sentences.append(sentenceWords)
        for sentenceTags in predicted_tags:
                predictionTagOneSentence = []
                for tag in sentenceTags:
                    predictionTagOneSentence.append(label_vocab.idx2word[tag.item()])
                predictions.append(predictionTagOneSentence)
        for goldSent, predSent in zip(labels, predicted_tags):
            for goldLabel, predLabel in zip(goldSent, predSent):
                if goldLabel.item() != 0:
                    total += 1
                    if goldLabel.item() == predLabel.item():
                        match+= 1
    return(match/total)

num_batches_dev = int(len(dev_feats)/BATCH_SIZE)

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches, 'BERT')

print('Accuracy for dev data: {:.4f}'.format(score))

# with open(os.path.join('data', 'bert_predictions_dev.iob2'), 'w') as f:
#     for sent_tokens, sent_preds in zip(sentences, predictions):
#         for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

# python3 span_f1.py data/bert_predictions_dev.iob2 data/en_ewt-ud-dev.iob2  <- run this in terminal to get span f1 score

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Accuracy for dev data: 0.9278


In [13]:
nlp = spacy.load("en_core_web_sm")
dataset = []
for sentence in dev_data:
    dataset.append(" ".join(sentence[0]))
pdataset = list(nlp.pipe(dataset))

# Change names

In [15]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_names = Perturb.perturb(pdataset, Perturb.change_names, n=2)
original_sentences = []
for sentences in t_names.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_names.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
# with open(os.path.join('data', 'gold_names.iob2'), 'w') as f:
#     for sentence, tag in new_tagged_dataset:
#         for index, (token, pred) in enumerate(zip(sentence, tag)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

sentences = []
predictions = []

changed_names_feats, dev_names_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_names = int(len(changed_names_feats)/BATCH_SIZE)

changed_names_feats_batches = changed_names_feats[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
changed_names_labels_batches = dev_names_labels[:BATCH_SIZE*num_batches_changed_names].view(num_batches_changed_names, BATCH_SIZE, max_len)
# score = run_eval(changed_names_feats_batches, changed_names_labels_batches, 'BERT')

# print('\033[32mAccuracy for changed names data: \033[0m {:.4f}'.format(score))

# with open(os.path.join('data', 'bert_predictions_names.iob2'), 'w') as f:
#     for sent_tokens, sent_preds in zip(sentences, predictions):
#         for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")
    
# python3 span_f1.py data/bert_predictions_names.iob2 data/gold_names.iob2 <- run this in terminal to get span f1 score

# Change location

In [20]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_location = Perturb.perturb(pdataset, Perturb.change_location, n=2)
original_sentences = []
for sentences in t_location.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_location.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
# with open(os.path.join('data', 'gold_location.iob2'), 'w') as f:
#     for sentence, tag in new_tagged_dataset:
#         for index, (token, pred) in enumerate(zip(sentence, tag)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

sentences = []
predictions = []

changed_location_feats, dev_location_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_location = int(len(changed_location_feats)/BATCH_SIZE)

changed_location_feats_batches = changed_location_feats[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
changed_location_labels_batches = dev_location_labels[:BATCH_SIZE*num_batches_changed_location].view(num_batches_changed_location, BATCH_SIZE, max_len)
# score = run_eval(changed_location_feats_batches, changed_location_labels_batches, 'BERT')

# print('\033[32mAccuracy for changed location data: \033[0m {:.4f}'.format(score))

# with open(os.path.join('data', 'bert_predictions_location.iob2'), 'w') as f:
#     for sent_tokens, sent_preds in zip(sentences, predictions):
#         for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

# python3 span_f1.py data/bert_predictions_location.iob2 data/gold_location.iob2 <- run this in terminal to get span f1 score

# Change numbers

In [19]:
new_sentences = []
new_tagged_dataset = []
# nsamples = how many 'name' sentences we want to take into account
# n = represents number of sentences that we want to generate for each 'name' sentence
t_number = Perturb.perturb(pdataset, Perturb.change_number, n=2)
original_sentences = []
for sentences in t_number.data:
    original_sentences.append(sentences[0])
# Tokenize
for sentences in t_number.data:
    for sentence in sentences:
        new_sentences.append(sentence.split())
# Assign NER tags to the generated data
for index, new_sentence in enumerate(new_sentences):
    for sentence in dev_data:
        if new_sentence == sentence[0]:
            ner_tag = sentence[1]
    if index % 3 != 0:
        new_tagged_dataset.append((new_sentences[index],ner_tag))

# Create gold labels file: index<TAB>word<TAB>label. 
# with open(os.path.join('data', 'gold_numbers.iob2'), 'w') as f:
#     for sentence, tag in new_tagged_dataset:
#         for index, (token, pred) in enumerate(zip(sentence, tag)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

sentences = []
predictions = []

changed_number_feats, dev_numbers_labels = data2feats(new_tagged_dataset, token_vocab, label_vocab)
num_batches_changed_number = int(len(changed_number_feats)/BATCH_SIZE)

changed_number_feats_batches = changed_number_feats[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
changed_number_labels_batches = dev_numbers_labels[:BATCH_SIZE*num_batches_changed_number].view(num_batches_changed_number, BATCH_SIZE, max_len)
# score = run_eval(changed_number_feats_batches, changed_number_labels_batches, 'BERT')

# print('\033[32mAccuracy for changed number data: \033[0m {:.4f}'.format(score))

# with open(os.path.join('data', 'bert_predictions_numbers.iob2'), 'w') as f:
#     for sent_tokens, sent_preds in zip(sentences, predictions):
#         for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
#             f.write(f"{index}\t{token}\t{pred}\n")
#         f.write("\n")

# python3 span_f1.py data/bert_predictions_numbers.iob2 data/gold_numbers.iob2 <- run this in terminal to get span f1 score

# LSTM predictions

## Evuluating LSTM on dev

In [9]:
sentences = []
predictions = []

def run_eval(feats_batches, labels_batches, model):
    if model == 'LSTM':
        langid_model.eval()
    match = 0
    total = 0
    for sents, labels in zip(feats_batches, labels_batches):
        if model == 'LSTM':
            output_scores = langid_model.forward(sents)
            predicted_tags  = torch.argmax(output_scores, 2)
        elif model == 'BERT':
            output_scores = fine_tuned(sents) 
            predicted_tags  = torch.argmax(output_scores.logits, dim=-1)
        else:
            print('Please specify supported model.')
            return
        for sentence in sents:
            sentenceWords = []
            for wordIndex in sentence:
                sentenceWords.append(token_vocab.getWord(wordIndex.item()))
            sentences.append(sentenceWords)
        for sentenceTags in predicted_tags:
                predictionTagOneSentence = []
                for tag in sentenceTags:
                    predictionTagOneSentence.append(label_vocab.idx2word[tag.item()])
                predictions.append(predictionTagOneSentence)
        for goldSent, predSent in zip(labels, predicted_tags):
            for goldLabel, predLabel in zip(goldSent, predSent):
                if goldLabel.item() != 0:
                    total += 1
                    if goldLabel.item() == predLabel.item():
                        match+= 1
    return(match/total)


num_batches_dev = int(len(dev_feats)/BATCH_SIZE)

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches, 'LSTM')

print('Accuracy for dev data: {:.4f}'.format(score))

with open(os.path.join('data', 'new_lstm_predictions_dev.iob2'), 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/new_lstm_predictions_dev.iob2 data/en_ewt-ud-dev.iob2  <- run this in terminal to get span f1 score

Accuracy for dev data: 0.9606


In [21]:

# print(sentences[20])
# print(predictions[20])
# print(new_tagged_dataset[20][1])
# def toSpans(tags):
#     spans = set()
#     for beg in range(len(tags)):
#         if tags[beg][0] == 'B':
#             end = beg
#             for end in range(beg+1, len(tags)):
#                 if tags[end][0] != 'I':
#                     break
#             spans.add(str(beg) + '-' + str(end) + ':' + tags[beg][2:])
#     return spans

# predSpans = toSpans(predictions)
# goldSpans = toSpans(new_tagged_dataset)
# print(len(predSpans.intersection(goldSpans)))
# print(predictions)
print(len(predictions))
print(len(dev_feats))
goldLabels = []
for sentence in dev_data:
    goldLabels.append(sentence[1])

print(len(goldLabels))

# sklearn.metrics.confusion_matrix(new_tagged_dataset, predictions)

1984
2001
2001


## Change names for LSTM

In [16]:
sentences = []
predictions = []

score = run_eval(changed_names_feats_batches, changed_names_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed names data: \033[0m {:.4f}'.format(score))

with open(os.path.join('data', 'new_lstm_predictions_names.iob2'), 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")
        
# python3 span_f1.py data/new_lstm_predictions_names.iob2 data/gold_names.iob2  <- run this in terminal to get span f1 score

[32mAccuracy for changed names data: [0m 0.9126


## Change location for LSTM

In [21]:
sentences = []
predictions = []

score = run_eval(changed_location_feats_batches, changed_location_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed location data: \033[0m {:.4f}'.format(score))

with open(os.path.join('data', 'new_lstm_predictions_location.iob2'), 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/new_lstm_predictions_location.iob2 data/gold_location.iob2  <- run this in terminal to get span f1 score

[32mAccuracy for changed location data: [0m 0.9233


## Change numbers for LSTM

In [23]:
sentences = []
predictions = []

score = run_eval(changed_number_feats_batches, changed_number_labels_batches, 'LSTM')

print('\033[32mAccuracy for changed number data: \033[0m {:.4f}'.format(score))

with open(os.path.join('data', 'new_lstm_predictions_numbers.iob2'), 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/new_lstm_predictions_numbers.iob2 data/gold_numbers.iob2  <- run this in terminal to get span f1 score

[32mAccuracy for changed number data: [0m 0.9374
