In [1]:
!pip install datasets huggingface-hub



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
%%capture
!pip install datasets --no-build-isolation
!pip install seqeval
!pip install transformers[torch]

In [46]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback
import torch
import numpy as np
from datasets import load_metric

In [47]:
# Load dataset
dataset = load_dataset("procit002/conll2003AndNameStreetCitySep18_and_negative_words_ConfirmationAnswer")

In [48]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 98151
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 13764
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 13969
    })
})

In [49]:
# Model checkpoint
checkpoint = "bert-base-cased"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [50]:
# Tokenize and align labels without fixed padding
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/98151 [00:00<?, ? examples/s]

Map:   0%|          | 0/13764 [00:00<?, ? examples/s]

Map:   0%|          | 0/13969 [00:00<?, ? examples/s]

In [51]:
# Load pre-trained model
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Load seqeval metric
metric = load_metric("seqeval")

# Define compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [53]:
# Get label list
label_list = dataset["train"].features["ner_tags"].feature.names

# Set up data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer)

In [57]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./NER",
    eval_strategy="epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,        # Load the best model when early stopping is triggered
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping (can be adjusted)
    greater_is_better=False
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [58]:
# Initialize Trainer with compute_metrics and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

In [None]:
# Train model
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss


In [29]:
# Evaluate model
results = trainer.evaluate()
print(results)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 5.768160553998314e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 4.0089, 'eval_samples_per_second': 409.093, 'eval_steps_per_second': 12.971, 'epoch': 3.0}


In [33]:
trainer.push_to_hub("procit008/NER_test_conformation_answer")
tokenizer.push_to_hub("procit008/NER_test_conformation_answer")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/procit008/NER_test_conformation_answer/commit/7657ab8dec0fe44ceae492772c828877ea881ee3', commit_message='Upload tokenizer', commit_description='', oid='7657ab8dec0fe44ceae492772c828877ea881ee3', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import BertConfig

In [None]:
config = BertConfig.from_pretrained(checkpoint)

In [None]:
config

In [35]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch

tokenizer = AutoTokenizer.from_pretrained("procit008/NER")
model = AutoModelForTokenClassification.from_pretrained("procit008/NER")
dataset = load_dataset("conll2002", 'nl')
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.23k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

The repository for conll2002 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2002.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  Y


Downloading data:   0%|          | 0.00/571k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15807 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2896 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5196 [00:00<?, ? examples/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'LABEL_0', 'score': 0.99991894, 'index': 1, 'word': 'My', 'start': 0, 'end': 2}, {'entity': 'LABEL_0', 'score': 0.99990904, 'index': 2, 'word': 'name', 'start': 3, 'end': 7}, {'entity': 'LABEL_0', 'score': 0.99989164, 'index': 3, 'word': 'is', 'start': 8, 'end': 10}, {'entity': 'LABEL_1', 'score': 0.94205797, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'LABEL_0', 'score': 0.9997439, 'index': 5, 'word': 'and', 'start': 20, 'end': 23}, {'entity': 'LABEL_0', 'score': 0.9999007, 'index': 6, 'word': 'I', 'start': 24, 'end': 25}, {'entity': 'LABEL_0', 'score': 0.99736565, 'index': 7, 'word': 'live', 'start': 26, 'end': 30}, {'entity': 'LABEL_0', 'score': 0.7104487, 'index': 8, 'word': 'in', 'start': 31, 'end': 33}, {'entity': 'LABEL_2', 'score': 0.8114601, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [36]:
def aggregate_word_level_predictions(tokenized_sentence, predictions, dataset):
    print("tokenized_sentence",tokenized_sentence)
    print("predictions",predictions)
    aggregated_predictions = []
    current_word = ""
    current_entity = ""

    for token, pred in zip(tokenized_sentence, predictions):
        if token.startswith("##"):
            print("token",token)
            current_word += token[2:]  # Append sub-token to current word
            print("current_word",current_word)
        else:
            if current_word:  # If there's a current word, add it with its entity
                aggregated_predictions.append((current_word, current_entity))
                print("aggregated_predictions",aggregated_predictions)
            # Update the current word and entity
            current_word = token if token not in ["[CLS]", "[SEP]"] else ""
            print("current_word",current_word)
            current_entity = dataset['train'].features['ner_tags'].feature.int2str(pred.item())
            print("current_entity",current_entity)

    # Add the last word
    if current_word:
        aggregated_predictions.append((current_word, current_entity))

    return aggregated_predictions

In [37]:
def analyze(sentence):
    with torch.no_grad():
        inputs = tokenizer(sentence, return_tensors="pt")
        print("analyze_input",inputs)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        print("tokens",tokens)
        outputs = model(**inputs).logits
        print("outputs",outputs)
        predictions = torch.argmax(outputs, dim=2)
        print("predictions",predictions)
        word_predictions = aggregate_word_level_predictions(tokens, predictions[0], dataset)
        print("word_predictions",word_predictions)
        return {word: entity for word, entity in word_predictions}

In [45]:
text="not at all i am no not "  #"not", "at", "all", "i", "am", "asli", "van", "wolferen"
analyze(text)


analyze_input {'input_ids': tensor([[ 101, 1136, 1120, 1155,  178, 1821, 1185, 1136,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tokens ['[CLS]', 'not', 'at', 'all', 'i', 'am', 'no', 'not', '[SEP]']
outputs tensor([[[10.3798, -1.2104, -1.1759, -1.7893, -1.1212, -1.3278, -1.0857,
          -1.3441, -1.3477],
         [10.7368, -1.4306, -1.1729, -1.6698, -1.1264, -1.3050, -1.2019,
          -1.4658, -1.3785],
         [10.6712, -1.1913, -1.2801, -1.6008, -1.1140, -1.3716, -1.1806,
          -1.6142, -1.3119],
         [10.6658, -1.2626, -1.0853, -1.7277, -1.1187, -1.3363, -1.2859,
          -1.5139, -1.3117],
         [10.7467, -1.0718, -1.1834, -1.6853, -1.1443, -1.3569, -1.2845,
          -1.4553, -1.4184],
         [10.6376, -0.6513, -1.2209, -1.8052, -1.0854, -1.3287, -1.3430,
          -1.6658, -1.4340],
         [ 1.9586,  3.2967,  5.6409, -1.8571, -1.3867, -1.9369, -1.9718,
          -2.1170, -1.9145],


{'not': 'I-PER', 'at': 'O', 'all': 'O', 'i': 'O', 'am': 'O', 'no': 'I-PER'}