### Imports

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

### Hyperparameters

In [2]:
# Hyperparameters
MAX_LEN=32
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 10
PAD = '<PAD>'
torch.manual_seed(8446)

<torch._C.Generator at 0x217eda1ffb0>

## Import data

In [3]:
def read_iob2_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

train_data= read_iob2_file('./en_ewt-ud-train.iob2')
dev_data = read_iob2_file('./en_ewt-ud-dev.iob2')

print(train_data[0])

(['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'])


### Prepare data

In [7]:
class Vocab():
    def __init__(self, pad_unk):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]


max_len = max([len(x[0]) for x in train_data ])

# Create vocabularies for both the tokens
# and the tags
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

for tokens, tags in train_data:
    for token in tokens:
        token_vocab.getIdx(token, True)
    for tag in tags:
        label_vocab.getIdx(tag, True)

NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab):
    feats = torch.zeros((len(inputData), max_len), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_len), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_len]):
            wordIdx = token_vocab.getIdx(word)
            feats[sentPos][wordPos] = wordIdx
        for labelPos, label in enumerate(sent[1][:max_len]):
            labelIdx = label_vocab.getIdx(label)
            labels[sentPos][labelPos] = labelIdx
    return feats, labels

train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)
test_data = read_iob2_file('./en_ewt-ud-test-masked.iob2')

### Batches

In [8]:
# convert to batches
num_batches = int(len(train_features)/BATCH_SIZE)
train_feats_batches = train_features[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)
train_labels_batches = train_labels[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)

# Convert test data to features and labels
test_features, test_labels = data2feats(test_data, token_vocab, label_vocab)

# Convert to batches
num_test_batches = int(len(test_features)/BATCH_SIZE)
test_feats_batches = test_features[:BATCH_SIZE*num_test_batches].view(num_test_batches, BATCH_SIZE, max_len)
test_labels_batches = test_labels[:BATCH_SIZE*num_test_batches].view(num_test_batches, BATCH_SIZE, max_len)

## Fine Tuning BERTweet


In [17]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments
import accelerate
import transformers

print(transformers.__version__, accelerate.__version__)

# Load the BERTweet model
model = BertForTokenClassification.from_pretrained("vinai/bertweet-base", num_labels=NTAGS)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Define a function to align the predictions and labels
def align_predictions(predictions, labels):
    preds = np.argmax(predictions, axis=2)
    out_label_list = [[] for _ in range(labels.shape[0])]
    preds_list = [[] for _ in range(labels.shape[0])]

    for i in range(labels.shape[0]):
        for j in range(labels.shape[1]):
            if labels[i, j] != label_vocab.getIdx(PAD):
                out_label_list[i].append(label_vocab.getWord(labels[i][j]))
                preds_list[i].append(label_vocab.getWord(preds[i][j]))

    return preds_list, out_label_list

# Define a function to compute metrics
def compute_metrics(p):
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "accuracy": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

from torch.nn.utils.rnn import pad_sequence
class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Pad the sequences to the maximum length
        features = pad_sequence([self.features[idx]], batch_first=True)
        labels = pad_sequence([self.labels[idx]], batch_first=True)
        # Flatten the batch dimension
        features = features.view(-1)
        labels = labels.view(-1)
        return {"input_ids": features, "labels": labels}

# Calculate the maximum length of the input sequences
max_len = max(max(len(seq) for seq in batch) for batch in train_feats_batches)

# Pad the sequences to the maximum length
train_features_padded = [pad_sequence(seq, batch_first=True, padding_value=0) for seq in train_feats_batches]
train_labels_padded = [pad_sequence(seq, batch_first=True, padding_value=0) for seq in train_labels_batches]
test_features_padded = [pad_sequence(seq, batch_first=True, padding_value=0) for seq in test_feats_batches]
test_labels_padded = [pad_sequence(seq, batch_first=True, padding_value=0) for seq in test_labels_batches]

# Flatten the batched data
train_features_flat = torch.cat(train_features_padded, dim=0).view(-1, max_len)
train_labels_flat = torch.cat(train_labels_padded, dim=0).view(-1, max_len)
test_features_flat = torch.cat(test_features_padded, dim=0).view(-1, max_len)
test_labels_flat = torch.cat(test_labels_padded, dim=0).view(-1, max_len)

# Create datasets for training and evaluation
train_dataset = MyDataset(train_features_flat, train_labels_flat)
eval_dataset = MyDataset(test_features_flat, test_labels_flat)

# Create the Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

4.40.2 0.30.0


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer

  0%|          | 0/1173 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (159) must match the existing size (130) at non-singleton dimension 1.  Target sizes: [32, 159].  Tensor sizes: [1, 130]

### Training

In [20]:
print('epoch   loss      Train acc.')
for epoch in range(EPOCHS):
    model.train() 
    model.zero_grad()

    # Loop over batches
    loss = 0
    match = 0
    total = 0
    for batchIdx in range(0, 1): # num_batches
        output_scores = model.forward(train_feats_batches[batchIdx])        
        output_scores = output_scores.view(BATCH_SIZE * max_len, -1)
        flat_labels = train_labels_batches[batchIdx].view(BATCH_SIZE * max_len)
        batch_loss = loss_function(output_scores, flat_labels)

        predicted_labels = torch.argmax(output_scores, 1)
        predicted_labels = predicted_labels.view(BATCH_SIZE, max_len)

        # Run backward pass
        batch_loss.backward()
        optimizer.step()
        model.zero_grad()
        loss += batch_loss.item()
        # Update the number of correct tags and total tags
        for gold_sent, pred_sent in zip(train_labels_batches[batchIdx], predicted_labels):
            for gold_label, pred_label in zip(gold_sent, pred_sent):
                if gold_label != 0:
                    total += 1
                    if gold_label == pred_label:
                        match+= 1
    print('{0: <8}{1: <10}{2}'.format(epoch, '{:.2f}'.format(loss/num_batches), '{:.4f}'.format(match / total)))

epoch   loss      Train acc.
0       0.05      0.9845
1       0.04      0.9865
2       0.03      0.9903
3       0.05      0.9884
4       0.03      0.9903
5       0.02      0.9942
6       0.03      0.9923
7       0.02      0.9923
8       0.03      0.9923
9       0.03      0.9942


### Evaluate the model

In [None]:
def run_eval(feats_batches, labels_batches):
    model.eval()
    match = 0
    total = 0
    for sents, labels in zip(feats_batches, labels_batches):
        output_scores = model.forward(sents)
        predicted_tags  = torch.argmax(output_scores, 2)
        for goldSent, predSent in zip(labels, predicted_tags):
            for goldLabel, predLabel in zip(goldSent, predSent):
                if goldLabel.item() != 0:
                    total += 1
                    if goldLabel.item() == predLabel.item():
                        match+= 1
    return(match/total)

print()

BATCH_SIZE=1
dev_feats, dev_labels = data2feats(dev_data, token_vocab, label_vocab)
num_batches_dev = int(len(dev_feats)/BATCH_SIZE)

dev_feats_batches = dev_feats[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
dev_labels_batches = dev_labels[:BATCH_SIZE*num_batches_dev].view(num_batches_dev, BATCH_SIZE, max_len)
score = run_eval(dev_feats_batches, dev_labels_batches)
print('Accuracy for dev data: {:.4f}'.format(score))


Accuracy for dev data: 0.9595
