In [1]:
! pip install transformers datasets evaluate seqeval accelerate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     ---------------------------------------- 0.0/43.6 kB ? eta -:--:--
     --------- ------------------------------ 10.2/43.6 kB ? eta -:--:--
     -------------------------------------- 43.6/43.6 kB 529.6 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp39-cp39-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from dataset

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-groq 0.1.4 requires langchain-core<0.3,>=0.1.45, but you have langchain-core 0.1.23 which is incompatible.
langchain-ollama 0.1.1 requires langchain-core<0.3.0,>=0.2.20, but you have langchain-core 0.1.23 which is incompatible.
langchain-text-splitters 0.2.0 requires langchain-core<0.3.0,>=0.2.0, but you have langchain-core 0.1.23 which is incompatible.
llama-index-llms-langchain 0.1.3 requires langchain<0.2.0,>=0.1.3, but you have langchain 0.1.0 which is incompatible.


# Imports and Config

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [6]:
class NERDataset:

    def __init__(self, data_id, tokenizer_ckpt):
        self.data_id = data_id
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_ckpt)

    def load_data(self):
        self.dataset = load_dataset(self.data_id)
        self.train = self.dataset["train"]
        self.test = self.dataset["test"]
        ner_feature = self.dataset["train"].features["ner_tags"]
        label_names = ner_feature.feature.names
        return self.train, self.test, label_names

    def align_labels_with_tokens(self, labels, word_ids):
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id != current_word:
                current_word = word_id
                try:
                    label = -100 if word_id is None else labels[word_id]
                except:
                    label = -100
                new_labels.append(label)
            elif word_id is None:
                new_labels.append(-100)
            else:
                label = labels[word_id]
                if label % 2 == 1:
                    label += 1
                new_labels.append(label)

        return new_labels

    def preprocess_function(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        all_labels = examples["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(self.align_labels_with_tokens(labels, word_ids))

        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs

    def create_data(self):

        self.train, self.test, label_names = self.load_data()

        tokenized_train_dataset = self.train.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.train.column_names
        )

        tokenized_test_dataset = self.test.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.train.column_names
        )

        return tokenized_train_dataset, tokenized_test_dataset, label_names

# Model Training - Make sure to do login

If notebook:

```shell
from huggingface_hub import notebook_login

notebook_login()

```
If script

```shell
huggingface-cli login
```

In [8]:
from huggingface_hub import notebook_login

notebook_login()

ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

In [7]:
from huggingface_hub import login

# Log in using your token
login("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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Shivani\.cache\huggingface\token
Login successful


In [6]:
class NERTrainer:

    def __init__(self):
        self.nerdataset = NERDataset("conll2003", "bert-base-cased")
        self.train_data, self.test_data, self.ner_labels = self.nerdataset.create_data()
        self.id2label = {i: label for i, label in enumerate(self.ner_labels)}
        self.label2id = {v: k for k, v in self.id2label.items()}
        self.model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", id2label=self.id2label, label2id= self.label2id)
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def compute_metrics(self, eval_preds):
        metric = evaluate.load("seqeval")
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)

        true_labels = [[self.ner_labels[l] for l in label if l != -100] for label in labels]
        true_predictions = [
            [self.ner_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }

    def set_training_args(self):
        return TrainingArguments(
        output_dir="bert-ner-custom",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=True
    )

    def train_and_save_model(self):
        trainer = Trainer(
            model=self.model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.test_data,
            data_collator=DataCollatorForTokenClassification(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics,
            tokenizer=self.tokenizer,
        )
        trainer.train()

nertrainer = NERTrainer()
nertrainer.train_and_save_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0753,0.17008,0.864053,0.901381,0.882322,0.966767
2,0.0355,0.174188,0.891451,0.910234,0.900745,0.971778
3,0.0237,0.184328,0.885392,0.916431,0.900644,0.971888


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Inference

In [10]:
from transformers import pipeline

model_checkpoint = "ShivuuGenieExpl302001/bert-ner-custom"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Shivani lives in Delhi.")




[{'entity_group': 'PER',
  'score': 0.997952,
  'word': 'Shivani',
  'start': 0,
  'end': 7},
 {'entity_group': 'LOC',
  'score': 0.9994198,
  'word': 'Delhi',
  'start': 17,
  'end': 22}]