In [19]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import unicodedata

In [2]:
def read_conll(filepath):
    """Parse a two-column, whitespace-separated CoNLL file into sentences.

    Each non-blank line is expected to hold exactly ``token tag``. Blank lines
    separate sentences. Lines that do not split into exactly two fields are
    reported on stdout and skipped; the sentence they belong to continues.

    Args:
        filepath: Path of the file to read (opened as UTF-8).

    Returns:
        A list of dicts, one per sentence, each with the keys ``"tokens"`` and
        ``"ner_tags"`` holding parallel lists of strings.
    """
    sentences = []
    cur_tokens, cur_tags = [], []

    def flush():
        # Close the sentence being collected, if it has any tokens.
        nonlocal cur_tokens, cur_tags
        if cur_tokens:
            sentences.append({"tokens": cur_tokens, "ner_tags": cur_tags})
            cur_tokens, cur_tags = [], []

    with open(filepath, "r", encoding="utf-8") as handle:
        for lineno, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if not stripped:
                # Blank line marks a sentence boundary.
                flush()
                continue

            fields = stripped.split()
            if len(fields) == 2:
                cur_tokens.append(fields[0])
                cur_tags.append(fields[1])
            else:
                print(f"Skipping malformed line {lineno} in {filepath}: '{stripped}'")

        # The file may end without a trailing blank line.
        flush()

    return sentences

def load_conll_folder(folder_path):
    """Read every ``.conll`` file directly inside a folder.

    Files are processed in sorted filename order so the returned list is the
    same on every run and every machine.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The concatenated list of sentence dicts produced by ``read_conll`` for
        each matching file; empty if no file name ends with ``.conll``.
    """
    all_data = []
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".conll"):
            full_path = os.path.join(folder_path, filename)
            all_data.extend(read_conll(full_path))
    return all_data

In [3]:

def infer(sentence: str):
    """Run the token-classification model on one sentence and return word-level labels.

    Relies on the module-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        sentence: Raw input text.

    Returns:
        A list of ``(word, label)`` tuples in input order. ``word`` is the
        concatenation of the raw tokenizer tokens that share a word id (a
        leading "Ġ" marker is turned into a space and then stripped), and
        ``label`` is the prediction for the first sub-token of that word.
    """
    # Unicode NFC normalization: composed and decomposed forms of the same
    # character become identical before tokenization.
    sentence = unicodedata.normalize("NFC", sentence)

    enc = tokenizer(sentence, return_tensors="pt", truncation=True)
    enc.pop("token_type_ids", None)

    with torch.no_grad():
        outputs = model(**enc)

    # Select the single batch element explicitly. A bare .squeeze() would also
    # drop the sequence dimension when it has length 1, turning the lists
    # below into scalars.
    preds = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    input_ids = enc["input_ids"][0].tolist()
    word_ids = enc.word_ids()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    results = []
    prev_word_id = None

    for tok, pred_id, wid in zip(tokens, preds, word_ids):
        # Tokens without a word id (e.g. special tokens) are not part of the text.
        if wid is None:
            continue

        label = id2label[pred_id]

        # "Ġ" is the tokenizer's leading-space marker.
        if tok.startswith("Ġ"):
            tok = " " + tok[1:]

        # First sub-token opens a new word (and fixes its label); later
        # sub-tokens are appended to the text of the current word.
        if wid != prev_word_id:
            results.append([tok, label])
        else:
            results[-1][0] += tok

        prev_word_id = wid

    # Final cleanup: strip spaces and drop anything that is a special token.
    special_tokens = set(tokenizer.all_special_tokens)
    final = []
    for word, label in results:
        word = word.strip()
        if word not in special_tokens:
            final.append((word, label))
    return final


def print_entities(results):
    """Print a header followed by every (word, label) pair whose label is not "O".

    Args:
        results: Iterable of ``(word, label)`` pairs.
    """
    print("\nNamed Entity Predictions:")
    for text, tag in results:
        if tag == "O":
            continue
        print(f"{text:25} → {tag}")

# 1. Inference from an input sentence

In [60]:

# Locations: the checkpoint handed to from_pretrained, and the CoNLL folders.
# Adjust as needed.
MODEL_DIR = "VSSA-SDSA/LT-NER-modernBERT"
TRAIN_DIR = "data/conll_train/"
TEST_DIR = "data/conll_test/"

# Read every .conll file from both folders.
train_data = load_conll_folder(TRAIN_DIR)
test_data = load_conll_folder(TEST_DIR)

# The label inventory is taken from the training split only; sorting makes the
# label -> id assignment independent of the order examples were read in.
# NOTE(review): this mapping is passed to the checkpoint below — presumably it
# matches the mapping the checkpoint was trained with; confirm.
unique_labels = sorted(set(tag for example in train_data for tag in example["ner_tags"]))
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Fast tokenizer is required: infer() calls word_ids() on the encoding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_DIR,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
# Switch to evaluation mode; assigning the result keeps the cell output empty.
_ = model.eval()


In [61]:
# Sample sentences for manual spot checks. Keeping them in one list avoids a
# chain of assignments where only the last one takes effect.
SAMPLE_TEXTS = [
    "2004 m. balandžio 21 d. Europos Parlamento reglamentas Nr. 805/2004, sukuriantis reikalavimus Europos vykdomajam raštui",
    "Ieškovas: Petras Petraitis",
    "UAB „Valstybės investicinis kapitalas“ išplatino 25 mln. eurų vertės obligacijas pagal atnaujintą 400 mln. eurų vertybinių popierių programą. Nominali obligacijos vertė – 1 000 eurų, o pajamingumas – 3,119 %. Obligacijų išpirkimo data – 2029 m. rugsėjo 24 d.",
    "UAB „Valstybės investicinis kapitalas“ valdyba taip pat pritarė UAB „EPSO-G Invest“ paprastųjų vardinių nematerialiųjų 1 791 244 vnt. akcijų, kurių bendra emisijos kaina yra lygi 17 912 440 EUR, įsigijimui.",
    "Įgalioti AB „Ignitis grupė“ vadovą (su teise perįgalioti) pasirašyti sutartis dėl AB Ignitis grupė",
    "nepriklausomam nariui 4 070 Eur (neatskaičius mokesčių)",
    "Proceso kalba: ispanų.",
]
# Index of the sample to run; -1 selects the last entry.
SAMPLE_INDEX = -1

text = SAMPLE_TEXTS[SAMPLE_INDEX]
predictions = infer(text)

print("\nNER Predictions:")
for word, label in predictions:
    # infer() returns concatenated raw tokenizer tokens; convert them back to
    # readable text before printing.
    decoded = tokenizer.convert_tokens_to_string([word])
    print(f"{decoded:20} --> {label}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



NER Predictions:
Proceso              --> O
kalba                --> O
:                    --> O
ispanų               --> B-NATIONALITY
.                    --> O
