# NER with Transformers (BERT)


In [1]:
 # Print the version of the `python` executable that the shell resolves.
 # NOTE(review): this is the interpreter found on PATH, which is presumably
 # (but not necessarily) the same one the notebook kernel runs on — confirm.
 !python --version

Python 3.10.11


In [1]:
! pip install transformers==4.28.0
! pip install datasets

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
     ---------------------------------------- 0.0/7.0 MB ? eta -:--:--
      --------------------------------------- 0.2/7.0 MB 4.5 MB/s eta 0:00:02
     -- ------------------------------------- 0.4/7.0 MB 4.9 MB/s eta 0:00:02
     --- ------------------------------------ 0.6/7.0 MB 5.1 MB/s eta 0:00:02
     ----- ---------------------------------- 0.9/7.0 MB 5.0 MB/s eta 0:00:02
     ------ --------------------------------- 1.1/7.0 MB 5.1 MB/s eta 0:00:02
     ------- -------------------------------- 1.4/7.0 MB 5.0 MB/s eta 0:00:02
     --------- ------------------------------ 1.6/7.0 MB 5.1 MB/s eta 0:00:02
     ---------- ----------------------------- 1.9/7.0 MB 5.1 MB/s eta 0:00:01
     ----------- ---------------------------- 2.0/7.0 MB 5.1 MB/s eta 0:00:01
     ------------- -------------------------- 2.3/7.0 MB 5.0 MB/s eta 0:00:01
     -------------- ------------------------- 2.5/7.0 MB 

In [1]:
# Prints a fixed string; presumably a quick check that the kernel is
# responsive after the installs above — safe to delete once confirmed.
print('hello')

hello


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric
import numpy as np
from sklearn.metrics import classification_report

# Checkpoint name shared by the tokenizer here and by the model created
# further down in this cell, so both come from the same pretrained weights.
model_checkpoint = "bert-base-cased"
# Tokenizer for the checkpoint; tokenize_and_align_labels() below calls it on
# pre-split words and relies on its word_ids() mapping.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the CoNLL-2003 dataset. The rest of the notebook uses its "train",
# "validation" and "test" splits and the "tokens" / "ner_tags" columns.
dataset = load_dataset("conll2003")

# Tokenize the dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: A batch mapping with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one tag id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence, parallel to that sentence's sub-tokens. The first
        sub-token of each word carries the word's tag id; positions with no
        source word and the remaining sub-tokens of a word carry -100, the
        sentinel that the evaluation cell filters out.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word keeps the word's tag.
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Tokenize every split in batches; this adds the aligned "labels" column
# produced by tokenize_and_align_labels.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    "ner_finetuning",  # output directory for checkpoints and logs
    evaluation_strategy="epoch",  # evaluate on the validation split after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_accumulation_steps=20,
)

# Tag names of the "ner_tags" feature; list position == class id.
label_names = dataset["test"].features["ner_tags"].feature.names
# Store the id <-> name mapping in the model config so saved checkpoints and
# downstream inference report real tag names instead of generic LABEL_<i>.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Create the model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Create the trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Pads input ids and labels of each batch to a common length.
    data_collator=DataCollatorForTokenClassification(tokenizer),
    # Passing the tokenizer makes the Trainer save it alongside each
    # checkpoint, so a checkpoint directory is loadable on its own.
    tokenizer=tokenizer,
)

# Train the model
trainer.train()



Found cached dataset conll2003 (C:/Users/grego/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 11.60it/s]
Loading cached processed dataset at C:\Users\grego\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-a62f025911dcbac9.arrow
Loading cached processed dataset at C:\Users\grego\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-fb505a67c510ba66.arrow


ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [6]:
# Run the trained model over the tokenized test split. The third element of
# the result is not used in this notebook.
raw_predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
# Collapse the last (third) axis to the index of the highest score, giving
# one predicted class id per token position.
predictions = np.argmax(raw_predictions, axis=2)

In [7]:
# Flatten all sequences into (gold id, predicted id) pairs, skipping every
# position whose gold label is the -100 sentinel (special tokens, padding and
# non-initial sub-tokens), so only one position per original word is scored.
scored_pairs = [
    (gold_id, pred_id)
    for seq_preds, seq_labels in zip(predictions, labels)
    for pred_id, gold_id in zip(seq_preds, seq_labels)
    if gold_id != -100
]

# Map class ids back to their tag names for a readable report.
label_names = dataset["test"].features["ner_tags"].feature.names
true_labels = [label_names[gold_id] for gold_id, pred_id in scored_pairs]
pred_labels = [label_names[pred_id] for gold_id, pred_id in scored_pairs]

# Token-level precision / recall / F1 per tag.
print(classification_report(true_labels, pred_labels, digits=2))


              precision    recall  f1-score   support

       B-LOC       0.94      0.94      0.94      1668
      B-MISC       0.83      0.85      0.84       702
       B-ORG       0.90      0.92      0.91      1661
       B-PER       0.97      0.96      0.97      1617
       I-LOC       0.87      0.89      0.88       257
      I-MISC       0.64      0.78      0.70       216
       I-ORG       0.89      0.93      0.91       835
       I-PER       0.98      0.99      0.99      1156
           O       1.00      0.99      0.99     38323

    accuracy                           0.98     46435
   macro avg       0.89      0.92      0.90     46435
weighted avg       0.98      0.98      0.98     46435

