In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
raw_datasets["train"][0]["tokens"],raw_datasets["train"][0]["ner_tags"]

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [4]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [5]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [6]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [7]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
tokenizer.is_fast

True

In [9]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [10]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [11]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [12]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [13]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [15]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [16]:
tokenized_datasets["train"][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
import evaluate

metric = evaluate.load("seqeval")

In [19]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [20]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [21]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [23]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [24]:
from transformers import get_scheduler

num_train_epochs = 20
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [25]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [26]:
import json
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import time

# Define directories
output_dir = "bert-finetuned-ner-accelerate-checkpoint"
final_output_dir = "bert-finetuned-ner-accelerate"
metrics_output_file = os.path.join(output_dir, "metrics.json")
log_history_file = os.path.join(output_dir, "log_history.json")  # New file for log history

progress_bar = tqdm(range(num_training_steps))
all_metrics = {}  # To store metrics for each epoch
log_history = []  # To store state log history

# Define a helper function to handle non-serializable types
def convert_to_serializable(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

for epoch in range(num_train_epochs):
    # Start tracking time
    epoch_start_time = time.time()

    # Training
    model.train()
    num_samples = 0
    num_steps = 0
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        num_samples += batch["input_ids"].size(0)
        num_steps += 1
        progress_bar.update(1)

    # End training time tracking
    epoch_training_time = time.time() - epoch_start_time

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # Compute metrics for the current epoch
    results = metric.compute()

    # Extract overall metrics
    overall_metrics = {
        key: results[f"overall_{key}"]
        for key in ["precision", "recall", "f1", "accuracy"]
    }

    # Extract per-label metrics
    per_label_metrics = {
        label: {
            "precision": metrics["precision"],
            "recall": metrics["recall"],
            "f1": metrics["f1"],
            "number": metrics["number"],
        }
        for label, metrics in results.items() if not label.startswith("overall")
    }

    # Compute runtime metrics
    samples_per_second = num_samples / epoch_training_time
    steps_per_second = num_steps / epoch_training_time

    # Combine all metrics for this epoch
    epoch_metrics = {
        "overall": overall_metrics,
        "per_label": per_label_metrics,
        "runtime": {
            "epoch": epoch,
            "training_time": epoch_training_time,
            "samples_per_second": samples_per_second,
            "steps_per_second": steps_per_second,
        },
    }
    all_metrics[f"epoch_{epoch}"] = epoch_metrics

    # Save metrics to the current epoch's folder
    epoch_output_dir = os.path.join(output_dir, f"epoch-{epoch}")
    accelerator.wait_for_everyone()
    if not os.path.exists(epoch_output_dir):
        os.makedirs(epoch_output_dir)
    epoch_metrics_file = os.path.join(epoch_output_dir, "metrics.json")
    with open(epoch_metrics_file, "w") as f:
        json.dump(epoch_metrics, f, indent=4, default=convert_to_serializable)

    # Update the global metrics file
    with open(metrics_output_file, "w") as f:
        json.dump(all_metrics, f, indent=4, default=convert_to_serializable)

    # Log state history (simulating `state.log_history`)
    log_entry = {
        "epoch": epoch,
        "precision": overall_metrics["precision"],
        "recall": overall_metrics["recall"],
        "f1": overall_metrics["f1"],
        "accuracy": overall_metrics["accuracy"],
        "training_time": epoch_training_time,
    }
    log_history.append(log_entry)

    # Save the log history to a file after each epoch
    with open(log_history_file, "w") as f:
        json.dump(log_history, f, indent=4, default=convert_to_serializable)

    # Clean notebook-friendly output
    print(f"Epoch {epoch}: Precision={overall_metrics['precision']:.4f}, "
          f"Recall={overall_metrics['recall']:.4f}, F1={overall_metrics['f1']:.4f}, "
          f"Accuracy={overall_metrics['accuracy']:.4f}, "
          f"Samples/sec={samples_per_second:.2f}, Steps/sec={steps_per_second:.2f}")

    # Display per-label metrics as a table
    per_label_df = pd.DataFrame(per_label_metrics).T
    per_label_df.index.name = "Label"
    per_label_df.reset_index(inplace=True)
    display(per_label_df)

    # Save the model checkpoint for the current epoch
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(epoch_output_dir, save_function=accelerator.save)

# Save the final model, tokenizer, metrics, and log history
if accelerator.is_main_process:
    if not os.path.exists(final_output_dir):
        os.makedirs(final_output_dir)

    print("Saving the final model...")

    # Save the model and tokenizer
    unwrapped_model.save_pretrained(final_output_dir, save_function=accelerator.save)
    tokenizer.save_pretrained(final_output_dir)

    # Save final metrics to the final output directory
    final_metrics_file = os.path.join(final_output_dir, "metrics.json")
    with open(final_metrics_file, "w") as f:
        json.dump(all_metrics, f, indent=4, default=convert_to_serializable)

    # Save final log history to the final output directory
    final_log_history_file = os.path.join(final_output_dir, "log_history.json")
    with open(final_log_history_file, "w") as f:
        json.dump(log_history, f, indent=4, default=convert_to_serializable)

    print(f"Final model, metrics, and log history saved to {final_output_dir}")



  attn_output = torch.nn.functional.scaled_dot_product_attention(
  5%|▌         | 1756/35120 [02:41<52:02, 10.69it/s]  

Epoch 0: Precision=0.9261, Recall=0.8996, F1=0.9127, Accuracy=0.9823, Samples/sec=87.20, Steps/sec=10.90


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.939031,0.942623,0.940824,1830.0
1,MISC,0.840564,0.786802,0.812795,985.0
2,ORG,0.923192,0.863922,0.892574,1433.0
3,PER,0.958198,0.944355,0.951226,1869.0


 10%|▉         | 3511/35120 [05:29<49:03, 10.74it/s]   

Epoch 1: Precision=0.9401, Recall=0.9113, F1=0.9254, Accuracy=0.9845, Samples/sec=87.33, Steps/sec=10.92


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.956451,0.936567,0.946405,1876.0
1,MISC,0.87961,0.839545,0.85911,966.0
2,ORG,0.917972,0.879286,0.898212,1400.0
3,PER,0.970141,0.946504,0.958177,1888.0


 15%|█▍        | 5267/35120 [08:22<47:45, 10.42it/s]   

Epoch 2: Precision=0.9512, Recall=0.9255, F1=0.9382, Accuracy=0.9869, Samples/sec=85.58, Steps/sec=10.70


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.974959,0.953167,0.96394,1879.0
1,MISC,0.883948,0.857895,0.870726,950.0
2,ORG,0.927666,0.8969,0.912023,1387.0
3,PER,0.978284,0.952935,0.965443,1891.0


 20%|█▉        | 7023/35120 [11:11<44:02, 10.63it/s]   

Epoch 3: Precision=0.9477, Recall=0.9277, F1=0.9376, Accuracy=0.9857, Samples/sec=86.56, Steps/sec=10.82


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.960261,0.956616,0.958435,1844.0
1,MISC,0.890456,0.895311,0.892877,917.0
2,ORG,0.938106,0.882188,0.909288,1426.0
3,PER,0.970684,0.949549,0.96,1883.0


 25%|██▌       | 8780/35120 [14:01<42:02, 10.44it/s]   

Epoch 4: Precision=0.9441, Recall=0.9291, F1=0.9366, Accuracy=0.9847, Samples/sec=87.33, Steps/sec=10.92


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.954818,0.96057,0.957685,1826.0
1,MISC,0.898048,0.863399,0.880383,959.0
2,ORG,0.928412,0.883605,0.905455,1409.0
3,PER,0.96797,0.96692,0.967444,1844.0


 30%|██▉       | 10535/35120 [16:49<39:27, 10.38it/s] 

Epoch 5: Precision=0.9514, Recall=0.9272, F1=0.9391, Accuracy=0.9862, Samples/sec=87.42, Steps/sec=10.93


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.977681,0.95228,0.964813,1886.0
1,MISC,0.906725,0.849593,0.87723,984.0
2,ORG,0.914243,0.912202,0.913222,1344.0
3,PER,0.974484,0.953266,0.963758,1883.0


 35%|███▍      | 12291/35120 [19:39<39:16,  9.69it/s]  

Epoch 6: Precision=0.9485, Recall=0.9320, F1=0.9402, Accuracy=0.9860, Samples/sec=87.15, Steps/sec=10.90


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.972237,0.94748,0.959699,1885.0
1,MISC,0.904555,0.89774,0.901135,929.0
2,ORG,0.91648,0.913755,0.915115,1345.0
3,PER,0.970141,0.946504,0.958177,1888.0


 40%|███▉      | 14047/35120 [22:28<31:59, 10.98it/s]  

Epoch 7: Precision=0.9485, Recall=0.9331, F1=0.9407, Accuracy=0.9858, Samples/sec=87.45, Steps/sec=10.94


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.965705,0.962039,0.963869,1844.0
1,MISC,0.907809,0.87644,0.891849,955.0
2,ORG,0.914243,0.900147,0.90714,1362.0
3,PER,0.976656,0.957424,0.966944,1879.0


 45%|████▍     | 15803/35120 [25:17<28:47, 11.18it/s]  

Epoch 8: Precision=0.9482, Recall=0.9312, F1=0.9396, Accuracy=0.9853, Samples/sec=87.39, Steps/sec=10.93


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.959717,0.955038,0.957372,1846.0
1,MISC,0.899132,0.880978,0.889962,941.0
2,ORG,0.929903,0.887544,0.90823,1405.0
3,PER,0.974484,0.966093,0.97027,1858.0


 50%|████▉     | 17559/35120 [28:06<26:34, 11.01it/s]  

Epoch 9: Precision=0.9467, Recall=0.9276, F1=0.9370, Accuracy=0.9853, Samples/sec=87.37, Steps/sec=10.93


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.964616,0.947594,0.956029,1870.0
1,MISC,0.900217,0.863684,0.881572,961.0
2,ORG,0.914989,0.910237,0.912607,1348.0
3,PER,0.975027,0.952785,0.963778,1885.0


 55%|█████▍    | 19315/35120 [30:55<24:27, 10.77it/s]  

Epoch 10: Precision=0.9488, Recall=0.9308, F1=0.9397, Accuracy=0.9854, Samples/sec=87.30, Steps/sec=10.92


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.968971,0.954424,0.961642,1865.0
1,MISC,0.895879,0.879659,0.887695,939.0
2,ORG,0.909769,0.914543,0.91215,1334.0
3,PER,0.983713,0.944242,0.963574,1919.0


 60%|█████▉    | 21071/35120 [33:44<21:34, 10.85it/s]  

Epoch 11: Precision=0.9502, Recall=0.9315, F1=0.9408, Accuracy=0.9861, Samples/sec=87.23, Steps/sec=10.91


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.968427,0.961622,0.965012,1850.0
1,MISC,0.902386,0.866667,0.884166,960.0
2,ORG,0.924683,0.904449,0.914454,1371.0
3,PER,0.974484,0.954787,0.964535,1880.0


 65%|██████▍   | 22827/35120 [36:36<15:45, 13.00it/s]  

Epoch 12: Precision=0.9517, Recall=0.9315, F1=0.9415, Accuracy=0.9859, Samples/sec=85.82, Steps/sec=10.73


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.966249,0.956873,0.961538,1855.0
1,MISC,0.904555,0.883475,0.893891,944.0
2,ORG,0.925429,0.898624,0.91183,1381.0
3,PER,0.979913,0.954521,0.967051,1891.0


 70%|███████   | 24584/35120 [39:25<16:44, 10.49it/s]  

Epoch 13: Precision=0.9507, Recall=0.9336, F1=0.9420, Accuracy=0.9858, Samples/sec=86.39, Steps/sec=10.80


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.964616,0.958874,0.961737,1848.0
1,MISC,0.902386,0.886049,0.894143,939.0
2,ORG,0.924683,0.895307,0.909758,1385.0
3,PER,0.979913,0.960617,0.970169,1879.0


 75%|███████▍  | 26339/35120 [42:14<13:28, 10.87it/s]  

Epoch 14: Precision=0.9504, Recall=0.9340, F1=0.9421, Accuracy=0.9855, Samples/sec=87.43, Steps/sec=10.93


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.961894,0.967689,0.964783,1826.0
1,MISC,0.921909,0.85,0.884495,1000.0
2,ORG,0.917226,0.914498,0.91586,1345.0
3,PER,0.977199,0.96,0.968523,1875.0


 80%|███████▉  | 28095/35120 [45:03<10:41, 10.95it/s]  

Epoch 15: Precision=0.9488, Recall=0.9355, F1=0.9421, Accuracy=0.9859, Samples/sec=87.42, Steps/sec=10.93


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.97006,0.956522,0.963243,1863.0
1,MISC,0.894794,0.886144,0.890448,931.0
2,ORG,0.919463,0.907285,0.913333,1359.0
3,PER,0.976113,0.959445,0.967707,1874.0


 85%|████████▌ | 29852/35120 [47:51<08:14, 10.65it/s]  

Epoch 16: Precision=0.9498, Recall=0.9354, F1=0.9426, Accuracy=0.9859, Samples/sec=87.96, Steps/sec=11.00


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.968427,0.957997,0.963184,1857.0
1,MISC,0.900217,0.884861,0.892473,938.0
2,ORG,0.9217,0.906158,0.913863,1364.0
3,PER,0.976656,0.959467,0.967985,1875.0


 90%|█████████ | 31608/35120 [50:40<05:26, 10.76it/s]  

Epoch 17: Precision=0.9509, Recall=0.9379, F1=0.9443, Accuracy=0.9860, Samples/sec=88.24, Steps/sec=11.03


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.964072,0.964072,0.964072,1837.0
1,MISC,0.900217,0.894397,0.897297,928.0
2,ORG,0.92692,0.905317,0.915991,1373.0
3,PER,0.980456,0.957582,0.968884,1886.0


 95%|█████████▍| 33363/35120 [53:28<02:43, 10.74it/s]  

Epoch 18: Precision=0.9524, Recall=0.9400, F1=0.9462, Accuracy=0.9863, Samples/sec=88.26, Steps/sec=11.04


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.965161,0.966739,0.965949,1834.0
1,MISC,0.907809,0.894231,0.900969,936.0
2,ORG,0.928412,0.904139,0.916115,1377.0
3,PER,0.97937,0.963161,0.971198,1873.0


100%|█████████▉| 35119/35120 [56:17<00:00, 11.05it/s]

Epoch 19: Precision=0.9517, Recall=0.9387, F1=0.9452, Accuracy=0.9862, Samples/sec=88.09, Steps/sec=11.02


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.964616,0.965142,0.964879,1836.0
1,MISC,0.907809,0.891374,0.899516,939.0
2,ORG,0.926174,0.902616,0.914244,1376.0
3,PER,0.97937,0.963161,0.971198,1873.0


Saving the final model...
Final model, metrics, and log history saved to bert-finetuned-ner-accelerate


In [27]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "bert-finetuned-ner-accelerate"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER',
  'score': 0.9999658,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.99996203,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9999542,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [28]:
import pandas as pd
from transformers import AutoModelForTokenClassification, AutoTokenizer
from tqdm.auto import tqdm
from accelerate import Accelerator
import torch
import json
import numpy as np

accelerator = Accelerator()

# Load the trained model and tokenizer
final_output_dir = "bert-finetuned-ner-accelerate"
model = AutoModelForTokenClassification.from_pretrained(final_output_dir)
tokenizer = AutoTokenizer.from_pretrained(final_output_dir)

# Prepare the test dataloader
test_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=8
)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a helper function to handle non-serializable types
def convert_to_serializable(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Testing loop
model.eval()
progress_bar = tqdm(test_dataloader)
metric_results = {}

for batch in progress_bar:
    with torch.no_grad():
        # Move batch to device
        batch = {key: val.to(device) for key, val in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        
        # Get predictions and labels
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        
        # Pad predictions and labels for distributed gathering (if necessary)
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        # Gather predictions and labels across all processes
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        
        # Postprocess to remove padding and decode tokens
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        
        # Add predictions and references to the metric
        metric.add_batch(predictions=true_predictions, references=true_labels)

# Compute final results
results = metric.compute()

# Extract overall metrics
overall_metrics = {
    key: results[f"overall_{key}"]
    for key in ["precision", "recall", "f1", "accuracy"]
}
metric_results["overall"] = overall_metrics

# Extract per-label metrics
label_metrics = {
    label: {
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1": metrics["f1"],
        "number": metrics["number"]
    }
    for label, metrics in results.items() if not label.startswith("overall")
}
metric_results["per_label"] = label_metrics

# Display overall metrics
print("Overall Metrics:")
overall_df = pd.DataFrame([overall_metrics])
display(overall_df)

# Display per-label metrics
print("\nPer-Label Metrics:")
per_label_df = pd.DataFrame(label_metrics).T
per_label_df.index.name = "Label"
per_label_df.reset_index(inplace=True)
display(per_label_df)

# Save metrics to a JSON file
test_metrics_file = f"{final_output_dir}/test_metrics.json"
if accelerator.is_main_process:
    with open(test_metrics_file, "w") as f:
        json.dump(metric_results, f, indent=4, default=convert_to_serializable)
    print(f"Test results saved to {test_metrics_file}")


100%|██████████| 35120/35120 [56:28<00:00, 10.37it/s]
100%|██████████| 432/432 [00:08<00:00, 53.50it/s]


Overall Metrics:


Unnamed: 0,precision,recall,f1,accuracy
0,0.918201,0.896766,0.907357,0.972345



Per-Label Metrics:


Unnamed: 0,Label,precision,recall,f1,number
0,LOC,0.929856,0.921021,0.925418,1684.0
1,MISC,0.820513,0.75,0.783673,768.0
2,ORG,0.906683,0.882767,0.894565,1706.0
3,PER,0.960421,0.955692,0.958051,1625.0


Test results saved to bert-finetuned-ner-accelerate/test_metrics.json


In [1]:
from transformers import pipeline

# Replace with your specific model checkpoint
model_checkpoint = "bert-finetuned-ner-accelerate"
token_classifier = pipeline(
    "token-classification", 
    model=model_checkpoint, 
    aggregation_strategy="simple"
)

# Example usage
result = token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
print(result)

  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER', 'score': 0.9999658, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity_group': 'ORG', 'score': 0.99996203, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity_group': 'LOC', 'score': 0.9999542, 'word': 'Brooklyn', 'start': 49, 'end': 57}]
