In [10]:
import torch
import spacy
from checklist.perturb import Perturb
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
import os.path
import sklearn
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np

In [2]:
def read_iob2_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

train_data= read_iob2_file('data//en_ewt-ud-train.iob2')
dev_data = read_iob2_file('data//en_ewt-ud-dev.iob2')

In [3]:
# Hyperparameters
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.01
EPOCHS = 5
PAD = '<PAD>'

In [4]:
class Vocab():
    def __init__(self, pad_unk):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]


max_len = max([len(x[0]) for x in train_data ])

# Create vocabularies for both the tokens
# and the tags
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

for tokens, tags in train_data:
    for token in tokens:
        token_vocab.getIdx(token, True)
    for tag in tags:
        label_vocab.getIdx(tag, True)

NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab):
    feats = torch.zeros((len(inputData), max_len), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_len), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_len]):
            wordIdx = word_vocab.getIdx(word)
            feats[sentPos][wordPos] = wordIdx
        for labelPos, label in enumerate(sent[1][:max_len]):
            labelIdx = label_vocab.getIdx(label)
            labels[sentPos][labelPos] = labelIdx
    return feats, labels

train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)
dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)

## Expected data format\
{\
    'id': '0',\
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],\
 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']\
}

In [4]:
print(train_data[0])

(['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'])


In [10]:
# from datasets import load_dataset

# wnut = load_dataset("wnut_17")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# label_list = wnut["train"].features[f"ner_tags"].feature.names
# example = wnut["train"][0]

# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [12]:
# tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [20]:
# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# seqeval = evaluate.load("seqeval")

# labels = [label_list[i] for i in example[f"ner_tags"]]


# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = seqeval.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

In [15]:
# id2label = {
#     0: "O",
#     1: "B-corporation",
#     2: "I-corporation",
#     3: "B-creative-work",
#     4: "I-creative-work",
#     5: "B-group",
#     6: "I-group",
#     7: "B-location",
#     8: "I-location",
#     9: "B-person",
#     10: "I-person",
#     11: "B-product",
#     12: "I-product",
# }
# label2id = {
#     "O": 0,
#     "B-corporation": 1,
#     "I-corporation": 2,
#     "B-creative-work": 3,
#     "I-creative-work": 4,
#     "B-group": 5,
#     "I-group": 6,
#     "B-location": 7,
#     "I-location": 8,
#     "B-person": 9,
#     "I-person": 10,
#     "B-product": 11,
#     "I-product": 12,
# }

In [19]:
# model = AutoModelForTokenClassification.from_pretrained(
#     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
# )

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# training_args = TrainingArguments(
#     output_dir="wnut_model",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     # push_to_hub=True,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_wnut["train"],
#     eval_dataset=tokenized_wnut["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.29800260066986084, 'eval_precision': 0.5586734693877551, 'eval_recall': 0.20296570898980537, 'eval_f1': 0.29775662814411963, 'eval_accuracy': 0.9373690735753067, 'eval_runtime': 10.316, 'eval_samples_per_second': 124.757, 'eval_steps_per_second': 7.852, 'epoch': 1.0}


  0%|          | 0/81 [00:00<?, ?it/s]

{'eval_loss': 0.28243619203567505, 'eval_precision': 0.5871559633027523, 'eval_recall': 0.2965708989805375, 'eval_f1': 0.3940886699507389, 'eval_accuracy': 0.9416442221367193, 'eval_runtime': 6.4864, 'eval_samples_per_second': 198.414, 'eval_steps_per_second': 12.488, 'epoch': 2.0}
{'train_runtime': 128.5943, 'train_samples_per_second': 52.786, 'train_steps_per_second': 3.313, 'train_loss': 0.21657641057117444, 'epoch': 2.0}


TrainOutput(global_step=426, training_loss=0.21657641057117444, metrics={'train_runtime': 128.5943, 'train_samples_per_second': 52.786, 'train_steps_per_second': 3.313, 'train_loss': 0.21657641057117444, 'epoch': 2.0})

# Our data

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
example = train_data[0]
tokenized_input = tokenizer(example[0], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens



['[CLS]',
 'where',
 'in',
 'the',
 'world',
 'is',
 'i',
 '##gua',
 '##zu',
 '?',
 '[SEP]']

In [24]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def align_labels(tokenized_input, labels):
  word_ids = tokenized_input.word_ids()  # Get word IDs for each token
  aligned_labels = []
  previous_word_idx = None
  for word_idx in word_ids:
    # Set special tokens and padding to -100 (ignored during training)
    if word_idx is None or word_idx == tokenizer.pad_token_id:
      aligned_labels.append(-100)
    # Only label the first token of a word (assuming B-LOC etc. are for the whole word) 
    elif word_idx != previous_word_idx:
      aligned_labels.append(labels[word_idx])
    else:
      aligned_labels.append(-100)
    previous_word_idx = word_idx
  return aligned_labels

# TRAIN DATA
tokenized_train = []
for sentence, labels in train_data:
  tokenized_input = tokenizer(sentence, is_split_into_words=True)
  labelsIndices = []
  for label in labels:
    labelsIndices.append(label_vocab.getIdx(label))
  tokenized_input['labels'] = labelsIndices
  tokenized_train.append(tokenized_input)  # Store tokenized data and labels together
print(tokenized_train[0])

# Add aligned labels to each data point
for tokenized_input in tokenized_train:
  aligned_labels = align_labels(tokenized_input, tokenized_input['labels'])
  tokenized_input['labels'] = aligned_labels

# DEV DATA
tokenized_dev = []
for sentence, labels in dev_data:
  tokenized_input = tokenizer(sentence, is_split_into_words=True)
  labelsIndices = []
  for label in labels:
    labelsIndices.append(label_vocab.getIdx(label))
  tokenized_input['labels'] = labelsIndices
  tokenized_dev.append(tokenized_input)  # Store tokenized data and labels together
print(tokenized_dev[0])

# Add aligned labels to each data point
for tokenized_input in tokenized_dev:
  aligned_labels = align_labels(tokenized_input, tokenized_input['labels'])
  tokenized_input['labels'] = aligned_labels

tokenized_train[0]

{'input_ids': [101, 2073, 1999, 1996, 2088, 2003, 1045, 19696, 9759, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1, 1, 1, 1, 1, 2, 1]}
{'input_ids': [101, 2073, 2064, 1045, 2131, 22822, 6895, 25816, 1999, 9925, 3016, 1010, 1045, 2097, 2066, 1996, 23157, 15830, 2828, 1010, 2021, 1045, 2097, 2000, 3046, 2178, 2015, 3531, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


{'input_ids': [101, 2073, 1999, 1996, 2088, 2003, 1045, 19696, 9759, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, 1, 1, 1, 1, 2, -100, -100, 1, -100]}

In [63]:
# print(tokenized_wnut["train"][3])
# print(tokenized_wnut["train"][3]['ner_tags'])
print(tokenized_wnut["train"][3]['labels']) # have it 
# print(tokenized_wnut["train"][3]['input_ids'])
# print(tokenized_wnut["train"][3]['attention_mask'])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [11]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

seqeval = evaluate.load("seqeval")

# labels = [label_list[i] for i in example[1]]


# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = seqeval.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

In [7]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=NTAGS
    # , id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
training_args = TrainingArguments(
    output_dir="testing_bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

trainer.train()

MlflowException: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='testing_bert/runs/May18_10-18-20_airuzivarichard.lan' for run ID='876b6b1581644298bee74b41b6054c5e'. Attempted logging new value 'testing_bert/runs/May18_10-20-11_airuzivarichard.lan'.

In [18]:
# Load our finetuned model
fine_tuned = AutoModelForTokenClassification.from_pretrained("testing_bert/checkpoint-250/")

In [20]:
sentences = []
predictions = []

def run_eval(feats_batches, labels_batches, model):
    if model == 'LSTM':
        langid_model.eval()
    match = 0
    total = 0
    for sents, labels in zip(feats_batches, labels_batches):
        if model == 'LSTM':
            output_scores = langid_model.forward(sents)
            predicted_tags  = torch.argmax(output_scores, 2)
        elif model == 'BERT':
            output_scores = fine_tuned(sents) 
            predicted_tags  = torch.argmax(output_scores.logits, dim=-1)
        else:
            print('Please specify supported model.')
            return
        for sentence in sents:
            sentenceWords = []
            for wordIndex in sentence:
                sentenceWords.append(token_vocab.getWord(wordIndex.item()))
            sentences.append(sentenceWords)
        for sentenceTags in predicted_tags:
                predictionTagOneSentence = []
                for tag in sentenceTags:
                    predictionTagOneSentence.append(label_vocab.idx2word[tag.item()])
                predictions.append(predictionTagOneSentence)
        for goldSent, predSent in zip(labels, predicted_tags):
            for goldLabel, predLabel in zip(goldSent, predSent):
                if goldLabel.item() != 0:
                    total += 1
                    if goldLabel.item() == predLabel.item():
                        match+= 1
    return(match/total)

num_batches_dev = int(len(dev_feats)/BATCH_SIZE)

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches, 'BERT')

print('Accuracy for dev data: {:.4f}'.format(score))

with open(os.path.join('data', 'new_bert_predictions_dev.iob2'), 'w') as f:
    for sent_tokens, sent_preds in zip(sentences, predictions):
        for index, (token, pred) in enumerate(zip(sent_tokens, sent_preds)):
            f.write(f"{index}\t{token}\t{pred}\n")
        f.write("\n")

# python3 span_f1.py data/new_bert_predictions_dev.iob2 data/en_ewt-ud-dev.iob2  <- run this in terminal to get span f1 score

Accuracy for dev data: 0.9404
