### Mateusz Kantorski

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report
import torch

In [7]:
def load_data(filepath):
    token_lists = []
    label_lists = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or '\t' not in line:
                continue
            label_part, token_part = line.split('\t')
            labels = label_part.strip().split()
            tokens = token_part.strip().split()
            if len(tokens) != len(labels):
                print("Warning: mismatch in lengths")
                continue
            token_lists.append(tokens)
            label_lists.append(labels)
    return token_lists, label_lists

In [8]:
device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [9]:
def align_predictions(tokens, preds):
    aligned_labels = ['O'] * len(tokens)

    for ent in preds:
        start, end, label = ent['start'], ent['end'], ent['entity_group']
        sub_text = tokenizer.convert_ids_to_tokens(tokenizer.encode(ent['word'], add_special_tokens=False))
        words = ent['word'].split()
        ent_tokens = tokenizer.tokenize(ent['word'])
        word_idx = 0
        for i, token in enumerate(tokens):
            if ' '.join(tokens[i:i+len(words)]) == ent['word']:
                aligned_labels[i] = f"B-{label}"
                for j in range(1, len(words)):
                    if i + j < len(aligned_labels):
                        aligned_labels[i + j] = f"I-{label}"
                break
    return aligned_labels

In [13]:
true_labels = []
pred_labels = []

tokens_list, labels_list = load_data("train/train.tsv")

for tokens, true in zip(tokens_list, labels_list):
    preds = nlp(' '.join(tokens))
    pred_bio = align_predictions(tokens, preds)

    if len(pred_bio) != len(true):
        print("Length mismatch! Skipping.")
        continue

    true_labels.append(true)
    pred_labels.append(pred_bio)

print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

         LOC       0.96      0.51      0.67      7139
        MISC       0.91      0.43      0.58      3436
         ORG       0.89      0.55      0.68      6317
         PER       0.72      0.44      0.54      6600

   micro avg       0.86      0.49      0.63     23492
   macro avg       0.87      0.48      0.62     23492
weighted avg       0.87      0.49      0.63     23492

