In [None]:
!pip install pandas
!pip install datasets
!pip install datasets==3.6.0



In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import numpy as np
import torch

In [None]:
from datasets import load_dataset, ClassLabel

# conll2003 ships a loading script; opting in explicitly avoids the interactive
# "[y/N]" prompt, which would otherwise stall an unattended Run All.
ds = load_dataset("conll2003", trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Display the splits with their column names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# First training record; printed further down to inspect its tokens and tags.
example = ds['train'][0]


In [None]:
# Tag vocabulary of the 'ner_tags' column plus lookup tables in both directions.
labels = ds['train'].features['ner_tags'].feature.names
label_to_id = {tag_name: tag_id for tag_id, tag_name in enumerate(labels)}
id_to_label = {tag_id: tag_name for tag_name, tag_id in label_to_id.items()}
num_labels = len(labels)

In [None]:
# Decode the integer tags of the sample record into their names before printing.
ner_feature = ds['train'].features['ner_tags'].feature
tag_names = list(map(ner_feature.int2str, example['ner_tags']))

print("Tokens:", example['tokens'])
print("NER tags:", tag_names)


Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [None]:
#  Load tokenizer
# Same checkpoint name as the model loaded later, so vocabulary and weights match.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize and align labels
def tokenize_and_align_labels(examples, label_all_tokens=False, max_length=128):
    """Tokenize pre-split words and spread the word-level NER tags over the tokens.

    Args:
        examples: batch mapping with 'tokens' (one list of words per example)
            and 'ner_tags' (one list of integer tags per example, one tag per
            word).
        label_all_tokens: when False (default, the previous behaviour) only the
            first token of every word carries the word's tag and the remaining
            pieces get -100. When True every piece of a word repeats the word's
            tag as-is (so a B- tag is repeated too).
        max_length: length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per example, aligned with the tokens; positions that have no word
        (word id None) are set to -100.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=max_length,
    )

    labels_aligned = []
    for i, word_labels in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # No source word behind this position: ignored index.
                aligned_labels.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word (or every piece when requested).
                aligned_labels.append(word_labels[word_idx])
            else:
                # Continuation piece of the same word.
                aligned_labels.append(-100)
            previous_word_idx = word_idx
        labels_aligned.append(aligned_labels)

    tokenized_inputs["labels"] = labels_aligned
    return tokenized_inputs


In [None]:
# Run the tokenise-and-align step over every split; batched=True hands the
# function a batch of examples per call instead of a single record.
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Load model
# The label maps are put into the config at load time so the model (and every
# checkpoint written during training) reports tag names rather than LABEL_n.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Metrics
def compute_metrics(p):
    """Token-level accuracy over the positions that carry a real label.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores with the classes on axis 2; ``labels`` holds the gold
            label ids, with -100 marking positions to leave out of the score.

    Returns:
        ``{"accuracy": value}`` where value is the share of scored positions
        whose arg-max class equals the gold id, or 0.0 when nothing is scored
        (previously a ZeroDivisionError).

    NOTE(review): every 'O' token counts towards this number, so it can look
    high even when entity spans are wrong; an entity-level F1 would tell more.
    """
    predictions, labels = p
    predicted_ids = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Comparing ids is equivalent to comparing the decoded tag names, so the
    # id -> name lookup is not needed for the score.
    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"accuracy": 0.0}

    n_correct = int((predicted_ids[scored] == labels[scored]).sum())
    return {"accuracy": n_correct / n_scored}

In [None]:
#  Trainer
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Without this the run stops at an interactive Weights & Biases API-key
    # prompt; set report_to="wandb" (after logging in) to re-enable tracking.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    # `tokenizer=` is deprecated for Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Train
# Runs the fine-tuning loop configured above; the returned TrainOutput
# (global_step, training_loss, metrics) is shown as the cell result.
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshreenalathiya42[0m ([33mshreenalathiya42-auro-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0764,0.059262,0.984198
2,0.0298,0.05297,0.988738
3,0.0142,0.05128,0.989849


TrainOutput(global_step=5268, training_loss=0.05253997950474523, metrics={'train_runtime': 586.3664, 'train_samples_per_second': 71.837, 'train_steps_per_second': 8.984, 'total_flos': 1376049275709696.0, 'train_loss': 0.05253997950474523, 'epoch': 3.0})

In [None]:
# Evaluate
# Called without arguments, so this scores the eval_dataset given to the
# Trainer (the validation split); the test split is not touched here.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.05128026753664017, 'eval_accuracy': 0.9898486058881982, 'eval_runtime': 12.1439, 'eval_samples_per_second': 267.623, 'eval_steps_per_second': 33.515, 'epoch': 3.0}


In [None]:
# Save model and tokenizer
output_dir = "./distilbert_ner_model"
# Write the tag-name lookup tables into the config so they are saved with the weights.
model.config.id2label = id_to_label
model.config.label2id = label_to_id
model.save_pretrained(output_dir)
# Last expression of the cell: the list of written tokenizer files is displayed.
tokenizer.save_pretrained(output_dir)

('./distilbert_ner_model/tokenizer_config.json',
 './distilbert_ner_model/special_tokens_map.json',
 './distilbert_ner_model/vocab.txt',
 './distilbert_ner_model/added_tokens.json',
 './distilbert_ner_model/tokenizer.json')

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Reload both artefacts from the saved directory; from here on `model` and
# `tokenizer` refer to the reloaded objects, not the ones used for training.
model = AutoModelForTokenClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [None]:
import torch

def test_ner_model(sentences):
    """Tag raw sentences with the notebook's `model` / `tokenizer`.

    Predictions are reported per word, not per word piece: training only put a
    label on the first piece of each word, so only that piece's prediction is
    trusted and the reported span is widened to cover the whole word. Positions
    without a source word (special tokens) are skipped.

    Args:
        sentences: iterable of strings; a single string is treated as one
            sentence instead of being iterated character by character.

    Returns:
        One list per sentence, each holding dicts with the keys
        "start", "end" (character offsets into the sentence), "entity"
        (tag name) and "text" (the covered substring) for every word whose
        predicted tag is not 'O'.
    """
    if isinstance(sentences, str):
        sentences = [sentences]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    outside_id = label_to_id['O']

    results = []

    for text in sentences:
        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
        )
        # Offsets and word ids are bookkeeping only; take them out before the
        # remaining tensors are handed to the model.
        offsets = encoding.pop("offset_mapping")[0].tolist()
        word_ids = encoding.word_ids(batch_index=0)

        with torch.no_grad():
            logits = model(**encoding.to(device)).logits

        # Index the single batch row explicitly rather than squeeze(), which
        # would also drop any other length-1 dimension.
        predictions = logits.argmax(dim=-1)[0].cpu().tolist()

        entities = []
        current = None            # entity of the word being scanned, None if 'O'
        previous_word_idx = None
        for (start, end), word_idx, label_id in zip(offsets, word_ids, predictions):
            if word_idx is None:
                # Special token: never an entity, and it ends any open word.
                current = None
                previous_word_idx = None
                continue

            if word_idx != previous_word_idx:
                # First piece of a new word decides the word's tag.
                previous_word_idx = word_idx
                if label_id == outside_id:
                    current = None
                else:
                    current = {
                        "start": start,
                        "end": end,
                        "entity": id_to_label[label_id],
                        "text": text[start:end],
                    }
                    entities.append(current)
            elif current is not None:
                # Continuation piece: stretch the span to the end of the word.
                current["end"] = end
                current["text"] = text[current["start"]:end]

        results.append(entities)

    return results

In [None]:
# Two sample sentences to sanity-check the tagger by eye.
test_sentences = [
    "Shreena works at OpenAI in San Francisco.",
    "Barack Obama was the 44th president of the United States."
]

ner_results = test_ner_model(test_sentences)

# Results come back in input order, so pair each sentence with its entities.
for sentence, sentence_entities in zip(test_sentences, ner_results):
    print(f"Sentence: {sentence}")
    for entity in sentence_entities:
        print(f"  Entity: {entity['entity']}, Text: '{entity['text']}'")
    print()


Sentence: Shreena works at OpenAI in San Francisco.
  Entity: B-PER, Text: 'S'
  Entity: I-PER, Text: 'hr'
  Entity: B-ORG, Text: 'Open'
  Entity: I-ORG, Text: 'A'
  Entity: I-ORG, Text: 'I'
  Entity: B-LOC, Text: 'San'
  Entity: I-LOC, Text: 'Francisco'

Sentence: Barack Obama was the 44th president of the United States.
  Entity: B-PER, Text: 'Barack'
  Entity: I-PER, Text: 'Obama'
  Entity: B-LOC, Text: 'United'
  Entity: I-LOC, Text: 'States'



In [None]:
from google.colab import files
import shutil

# Compress the entire folder; make_archive returns the path of the file it wrote
archive_path = shutil.make_archive("distilbert_ner_model", 'zip', output_dir)

# Download exactly that file instead of a separately typed name, so the two
# cannot drift apart if the archive name changes
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>