In [None]:
!pip install transformers sentencepiece datasets evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m126.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

# Named Entity Recognition

**Named Entity Recognition (NER)** is a process by which a system takes an input of unstructured data, such as a text, and outputs structured data, specifically the identification and classification of specific entities such as people, places, dates, and more. The purpose of NER is to extract structured data from unstructured texts. NER is a form of natural language processing (NLP) and is one of the most popular data preprocessing tasks.

## Importing necessary libraries

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW

**Note**: The CoNLL-2003 dataset is a widely used dataset for NER tasks. It provides annotated data for English and German languages, covering various named entity types. Researchers and practitioners often use this dataset to train and evaluate NER models.

In [None]:
# Fetch the CoNLL-2003 dataset (all available splits) through the `datasets` library
dataset = load_dataset("conll2003")
dataset  # bare expression: the cell displays the DatasetDict summary

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Read the NER tag names from the training split's feature schema and build
# lookup tables in both directions (id -> name and name -> id).

ner_feature = dataset["train"].features["ner_tags"]
label_names = ner_feature.feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the tokenizer and model.
# Both come from one checkpoint name so they cannot drift apart; the label maps are
# passed so the classification head and the saved config carry the NER tag names.
MODEL_CHECKPOINT = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cl

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of integer label ids, indexed by word position.
        word_ids: One entry per token; the index of the word the token belongs to,
            or ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one label per entry of ``word_ids``: ``-100`` where the word id
        is ``None``, the word's own label for the first token of a word, and for
        later tokens of the same word the word's label bumped by one when it is odd
        (B-XXX -> I-XXX in this label scheme).
    """
    aligned = []
    previous = None
    for idx in word_ids:
        if idx is None:
            # Token without a word (special token): marked with -100
            aligned.append(-100)
            previous = None
            continue

        tag = labels[idx]
        # Continuation piece of the previous word: an odd id (B-XXX) becomes I-XXX
        if idx == previous and tag % 2 == 1:
            tag += 1
        aligned.append(tag)
        previous = idx

    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (word lists) and a
            ``"ner_tags"`` entry (matching word-level label lists).

    Returns:
        The tokenizer output for ``examples["tokens"]`` with an added ``"labels"``
        entry holding, per sentence, the labels produced by
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) maps each token of sentence `row` back to its source word
    encoded["labels"] = [
        align_labels_with_tokens(tags, encoded.word_ids(row))
        for row, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Preprocess every split in batches; the original columns are dropped so that only
# what tokenize_and_align_labels returns is kept.
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

The **"align_labels_with_tokens"** function takes as input a list of word-level labels and a list of word IDs (one per token) and outputs a new list of labels aligned with the tokens. Special tokens, which have no word ID, receive the label -100 (the value the loss function ignores). The first token of each word keeps that word's label; any further sub-tokens of the same word reuse it, with a "B-XXX" label changed to the matching "I-XXX".

The **"tokenize_and_align_labels"** function takes as input a batch of examples: a dictionary whose "tokens" and "ner_tags" entries hold the word lists and label lists for several sentences. The function first tokenizes the pre-split words using the Hugging Face tokenizer and then aligns each sentence's labels with its tokens using the "align_labels_with_tokens" function. The function returns the tokenized inputs with the aligned labels stored under the "labels" key.

In [None]:
# Move the model to the GPU when one is available (faster training); otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assigning the result (Module.to returns the module itself) keeps the cell from
# printing the full module repr as its output.
model = model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
from transformers import DataCollatorForTokenClassification

# Collate function handed to the DataLoaders; built around the same tokenizer
# that produced the inputs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Define optimizer and dataloaders.
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the defaults the transformers class used, so the
# optimisation settings stay as close as possible to the original run.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

train_loader = DataLoader(tokenized_dataset['train'], batch_size=8, collate_fn=data_collator, shuffle=True)
# Per-epoch monitoring uses the validation split; the test split stays held out so it
# can still give an unbiased final score.
eval_loader = DataLoader(tokenized_dataset['validation'], batch_size=8, collate_fn=data_collator)



In [None]:
## Create dataloader to properly structure data while feeding to the model

from torch.utils.data import DataLoader

# Training batches are reshuffled; evaluation batches keep the dataset order.
train_dataloader = DataLoader(
    tokenized_dataset["train"], batch_size=8, shuffle=True, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

## Train the NER model

In [None]:
from tqdm import tqdm

# Training Loop: each epoch runs one optimisation pass over train_loader, then one
# gradient-free pass over eval_loader, printing the mean batch loss of each.
num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    # ---- optimisation pass ----
    model.train()
    train_loss_sum = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        batch_loss = outputs.loss
        train_loss_sum += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    mean_train_loss = train_loss_sum / len(train_loader)
    print(f"Epoch:{epoch+1}| Average Loss:{mean_train_loss:.4f}")

    # ---- evaluation pass (no gradients, no weight updates) ----
    model.eval()
    eval_loss_sum = 0
    with torch.no_grad():
        for batch in eval_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            eval_loss_sum += outputs.loss.item()

    mean_eval_loss = eval_loss_sum / len(eval_loader)
    print(f"Average Evaluation Loss:{mean_eval_loss:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:1| Average Loss:0.0174


 10%|█         | 1/10 [02:38<23:47, 158.60s/it]

Average Evaluation Loss:0.1615
Epoch:2| Average Loss:0.0137


 20%|██        | 2/10 [05:16<21:07, 158.39s/it]

Average Evaluation Loss:0.2145
Epoch:3| Average Loss:0.0097


 30%|███       | 3/10 [08:09<19:13, 164.79s/it]

Average Evaluation Loss:0.2007
Epoch:4| Average Loss:0.0068


 40%|████      | 4/10 [11:14<17:16, 172.67s/it]

Average Evaluation Loss:0.2259
Epoch:5| Average Loss:0.0071


 50%|█████     | 5/10 [13:55<14:03, 168.64s/it]

Average Evaluation Loss:0.2253
Epoch:6| Average Loss:0.0059


 60%|██████    | 6/10 [16:34<11:01, 165.27s/it]

Average Evaluation Loss:0.2120
Epoch:7| Average Loss:0.0053


 70%|███████   | 7/10 [19:12<08:08, 162.90s/it]

Average Evaluation Loss:0.2168
Epoch:8| Average Loss:0.0044


 80%|████████  | 8/10 [21:51<05:23, 161.74s/it]

Average Evaluation Loss:0.2217
Epoch:9| Average Loss:0.0049


 90%|█████████ | 9/10 [24:28<02:40, 160.37s/it]

Average Evaluation Loss:0.2259
Epoch:10| Average Loss:0.0043


100%|██████████| 10/10 [27:05<00:00, 162.57s/it]

Average Evaluation Loss:0.2267





In [None]:
# Save the trained model for future inference
model.save_pretrained("/content/model")

In [None]:
# Save the tokenizer into the same folder as the model
tokenizer.save_pretrained("/content/model")

('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/vocab.txt',
 '/content/model/added_tokens.json',
 '/content/model/tokenizer.json')

## Uploading the trained model and tokenizer to HuggingFace Hub

In [None]:
from huggingface_hub import notebook_login, HfApi

# Log in to the Hugging Face Hub from inside the notebook (renders a login widget).
# HfApi is imported here for the upload cell that follows.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload everything under /content/model (model weights + tokenizer files) to the
# root of the model repo on the Hub.
api = HfApi()

api.upload_folder(
    folder_path='/content/model',
    path_in_repo=".",
    repo_id="Neupane9Sujal/NER_usingBERT",
    repo_type='model',
    create_pr=True,  # boolean flag: open a pull request instead of committing directly
)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

'https://huggingface.co/Neupane9Sujal/NER_usingBERT/tree/refs%2Fpr%2F1/.'

You can check whether the trained model correctly predicts named entities using the following function.

In [None]:
def inference(model, tokenizer, sentence):
    """Predict a label id for every token position of ``sentence``.

    Args:
        model: Token-classification model. It is called with ``input_ids`` and
            ``attention_mask`` and must return an object exposing ``.logits``.
        tokenizer: Callable invoked as
            ``tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")``;
            its result must provide ``"input_ids"`` and ``"attention_mask"`` tensors.
        sentence: Text handed to the tokenizer unchanged.

    Returns:
        A nested list of ints: one inner list per sequence in the batch, with one
        predicted label id per token position. Every position the tokenizer emits is
        included (no filtering of special or padding positions is done here).
    """
    # Preprocess the input sentence
    inputs = tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")

    # Run on the device the model already lives on. Guessing from CUDA availability
    # breaks when a CPU-resident model is used on a machine that has a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Perform inference in eval mode (dropout off) and restore the caller's mode after
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
        model.train(was_training)

    # Token-classification logits are (batch, seq_len, num_labels): the label axis is
    # the last one. Reducing over dim=1 would pick a token position per label instead.
    predictions = outputs.logits.argmax(dim=-1).tolist()

    return predictions