In [1]:
%%capture
!pip install datasets --no-build-isolation
!pip install seqeval
!pip install transformers[torch]

In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback
import torch
import numpy as np
from datasets import load_metric

In [6]:
# Load dataset
dataset = load_dataset("Procit004/NER_StreetCity_conll2003_Aug28", token = "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")

Downloading readme:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/673k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/128981 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16123 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/16123 [00:00<?, ? examples/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 128981
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 16123
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 16123
    })
})

In [8]:
# Model checkpoint
checkpoint = "bert-base-cased"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [9]:
# Tokenize and align labels without fixed padding
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/128981 [00:00<?, ? examples/s]

Map:   0%|          | 0/16123 [00:00<?, ? examples/s]

Map:   0%|          | 0/16123 [00:00<?, ? examples/s]

In [10]:
# Load pre-trained model
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=9)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Load seqeval metric
metric = load_metric("seqeval")

# Define compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [12]:
# Get label list
label_list = dataset["train"].features["ner_tags"].feature.names

# Set up data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./NER",
    eval_strategy="epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,        # Load the best model when early stopping is triggered
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping (can be adjusted)
    greater_is_better=False
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [17]:
# Initialize Trainer with compute_metrics and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

In [18]:
# Train model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111375165555728, max=1.0)…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0698,0.058902,0.953666,0.961066,0.957352,0.980371
2,0.045,0.05711,0.954019,0.962,0.957993,0.981173
3,0.0289,0.063281,0.961191,0.959663,0.960427,0.981888


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=12093, training_loss=0.060201338914136066, metrics={'train_runtime': 2843.7533, 'train_samples_per_second': 136.068, 'train_steps_per_second': 4.252, 'total_flos': 7069581575945946.0, 'train_loss': 0.060201338914136066, 'epoch': 3.0})

In [19]:
# Evaluate model
results = trainer.evaluate()
print(results)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.05711023136973381, 'eval_precision': 0.9540187262445536, 'eval_recall': 0.962000467398925, 'eval_f1': 0.9579929716772557, 'eval_accuracy': 0.981172596463598, 'eval_runtime': 43.3539, 'eval_samples_per_second': 371.893, 'eval_steps_per_second': 11.625, 'epoch': 3.0}


In [20]:
trainer.push_to_hub("Procit004/NER", token = "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")
tokenizer.push_to_hub("Procit004/NER", token = "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")

events.out.tfevents.1725939396.0bbe60347efa.36.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1725936509.0bbe60347efa.36.0:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Procit004/NER/commit/36b5c4a875c358ade9ca90743aa91a6be4bba08a', commit_message='Upload tokenizer', commit_description='', oid='36b5c4a875c358ade9ca90743aa91a6be4bba08a', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
from transformers import BertConfig

In [22]:
config = BertConfig.from_pretrained(checkpoint)

In [25]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}