Installing the Required Libraries

In [1]:
pip install transformers datasets seqeval accelerate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manyl

In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Importing and Preprocessing data

In [3]:
from datasets import load_dataset, DatasetDict

# Load dataset
dataset = load_dataset("conll2003",trust_remote_code=True)

# Split data into train, validation, and test sets
# We'll use 80% for training, 10% for validation, and 10% for testing
dataset = dataset["train"].train_test_split(test_size=0.2)
train_val = dataset["train"].train_test_split(test_size=0.125)  # 10% of original for validation

# Recombine into a DatasetDict
dataset = DatasetDict({
    "train": train_val["train"],
    "validation": train_val["test"],
    "test": dataset["test"]
})

print(dataset)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 9828
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 1404
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 2809
    })
})


In [4]:
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

Defining the Tokenizer and Model

In [5]:
from transformers import AutoTokenizer

# Load tokenizer
model_checkpoint = "bert-base-cased"  # Or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=True,
        max_length=512,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # First token of the word
            else:
                label_ids.append(label[word_idx] if label[word_idx] % 2 == 1 else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize datasets
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/9828 [00:00<?, ? examples/s]

Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

Map:   0%|          | 0/2809 [00:00<?, ? examples/s]

Training the model

In [6]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate

# Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list)
)

# Load metrics
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(-1)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]

    return metric.compute(predictions=true_predictions, references=true_labels)

# Data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mswachhith572[0m ([33mswachhith572-mahindra-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Loc,Misc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.2184,0.071983,"{'precision': 0.9556724267468069, 'recall': 0.9284671532846716, 'f1': 0.9418733802295446, 'number': 1370}","{'precision': 0.8023598820058997, 'recall': 0.8634920634920635, 'f1': 0.8318042813455657, 'number': 630}","{'precision': 0.9120703437250199, 'recall': 0.9084394904458599, 'f1': 0.9102512963701636, 'number': 1256}","{'precision': 0.9683219178082192, 'recall': 0.9800693240901213, 'f1': 0.9741602067183462, 'number': 1154}",0.923216,0.926984,0.925096,0.979414
2,0.0551,0.063608,"{'precision': 0.9539136795903438, 'recall': 0.9518248175182482, 'f1': 0.9528681037632444, 'number': 1370}","{'precision': 0.8689759036144579, 'recall': 0.9158730158730158, 'f1': 0.8918083462132922, 'number': 630}","{'precision': 0.9269261318506752, 'recall': 0.929140127388535, 'f1': 0.9280318091451292, 'number': 1256}","{'precision': 0.9766031195840554, 'recall': 0.9766031195840554, 'f1': 0.9766031195840554, 'number': 1154}",0.939469,0.946712,0.943077,0.98376
3,0.0386,0.064025,"{'precision': 0.9616801768607222, 'recall': 0.9525547445255474, 'f1': 0.9570957095709571, 'number': 1370}","{'precision': 0.8977635782747604, 'recall': 0.8920634920634921, 'f1': 0.8949044585987261, 'number': 630}","{'precision': 0.9251592356687898, 'recall': 0.9251592356687898, 'f1': 0.9251592356687898, 'number': 1256}","{'precision': 0.9692570452604612, 'recall': 0.9835355285961872, 'f1': 0.9763440860215054, 'number': 1154}",0.944218,0.944218,0.944218,0.9844


Trainer is attempting to log a value of "{'precision': 0.9556724267468069, 'recall': 0.9284671532846716, 'f1': 0.9418733802295446, 'number': 1370}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8023598820058997, 'recall': 0.8634920634920635, 'f1': 0.8318042813455657, 'number': 630}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9120703437250199, 'recall': 0.9084394904458599, 'f1': 0.9102512963701636, 'number': 1256}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9683219178082192, 'recall': 0.9800693240901213

TrainOutput(global_step=1845, training_loss=0.08959743685838653, metrics={'train_runtime': 1004.5539, 'train_samples_per_second': 29.35, 'train_steps_per_second': 1.837, 'total_flos': 2381207177755080.0, 'train_loss': 0.08959743685838653, 'epoch': 3.0})

Testing the Model

In [7]:
# Evaluate on test set
results = trainer.evaluate(tokenized_datasets["test"])
print("Test Set Results:", results)


Trainer is attempting to log a value of "{'precision': 0.9706450366937042, 'recall': 0.9562404870624048, 'f1': 0.9633889208357294, 'number': 2628}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8967343336275375, 'recall': 0.8896672504378283, 'f1': 0.8931868131868131, 'number': 1142}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9263235838578304, 'recall': 0.9321907600596125, 'f1': 0.9292479108635098, 'number': 2684}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.96470116325712, 'recall': 0.9725030327537404,

Test Set Results: {'eval_loss': 0.06231163442134857, 'eval_LOC': {'precision': 0.9706450366937042, 'recall': 0.9562404870624048, 'f1': 0.9633889208357294, 'number': 2628}, 'eval_MISC': {'precision': 0.8967343336275375, 'recall': 0.8896672504378283, 'f1': 0.8931868131868131, 'number': 1142}, 'eval_ORG': {'precision': 0.9263235838578304, 'recall': 0.9321907600596125, 'f1': 0.9292479108635098, 'number': 2684}, 'eval_PER': {'precision': 0.96470116325712, 'recall': 0.9725030327537404, 'f1': 0.968586387434555, 'number': 2473}, 'eval_overall_precision': 0.946164199192463, 'eval_overall_recall': 0.944998319704268, 'eval_overall_f1': 0.9455809000728578, 'eval_overall_accuracy': 0.9858559183582123, 'eval_runtime': 14.7755, 'eval_samples_per_second': 190.112, 'eval_steps_per_second': 11.912, 'epoch': 3.0}


In [None]:
model.save_pretrained("./bert-ner-model")
tokenizer.save_pretrained("./bert-ner-model")


('./bert-ner-model-f/tokenizer_config.json',
 './bert-ner-model-f/special_tokens_map.json',
 './bert-ner-model-f/vocab.txt',
 './bert-ner-model-f/added_tokens.json',
 './bert-ner-model-f/tokenizer.json')

Example Usage

In [None]:
from transformers import pipeline

# Load pipeline for NER
ner_pipeline = pipeline("ner", model="./bert-ner-model", tokenizer="./bert-ner-model")

# Test sample
text = "Barack Obama was born in Hawaii and worked at Microsoft."
results = ner_pipeline(text)

# Display predictions
print(results)


Device set to use cuda:0


[{'entity': 'LABEL_1', 'score': 0.99608034, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'LABEL_2', 'score': 0.99658066, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'LABEL_0', 'score': 0.9997788, 'index': 3, 'word': 'was', 'start': 13, 'end': 16}, {'entity': 'LABEL_0', 'score': 0.99919254, 'index': 4, 'word': 'born', 'start': 17, 'end': 21}, {'entity': 'LABEL_0', 'score': 0.9997615, 'index': 5, 'word': 'in', 'start': 22, 'end': 24}, {'entity': 'LABEL_5', 'score': 0.99868256, 'index': 6, 'word': 'Hawaii', 'start': 25, 'end': 31}, {'entity': 'LABEL_0', 'score': 0.9997942, 'index': 7, 'word': 'and', 'start': 32, 'end': 35}, {'entity': 'LABEL_0', 'score': 0.9997876, 'index': 8, 'word': 'worked', 'start': 36, 'end': 42}, {'entity': 'LABEL_0', 'score': 0.9996991, 'index': 9, 'word': 'at', 'start': 43, 'end': 45}, {'entity': 'LABEL_3', 'score': 0.9969868, 'index': 10, 'word': 'Microsoft', 'start': 46, 'end': 55}, {'entity': 'LABEL_0', 'score': 0.9998355,