# Named Entity Recognition (NER)
- Task: locate the named entities in unstructured text and classify them into pre-defined categories (e.g. person, organisation, location).


## POS Tagging
- Tags each word in a sentence with its part of speech (noun, verb, adjective, ...), which gives the model additional context.

# Dataset
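- [Annotated Dataset for NER](https://www.kaggle.com/datasets/abhinavwalia95/entity-annotated-corpus)
- Note: the code in this notebook loads the CoNLL-2003 corpus via `load_dataset('conll2003')`, not the Kaggle corpus linked above.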

### Reference Code
- [Kaggle Link](https://www.kaggle.com/code/eneszvo/ner-named-entity-recognition-tutorial/notebook)

In [None]:
# `%pip` installs into the environment of the running kernel.
# Pinned below 3.0 because this notebook imports `load_metric`, which is not available from datasets 3.0 onwards.
%pip install -q "datasets<3.0"



In [None]:
# `%pip` installs into the environment of the running kernel.
%pip install -q seqeval



In [None]:
import numpy as np
import pandas as pd
import os
from nltk import word_tokenize, pos_tag

import torch

from datasets import load_dataset, load_metric

from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments

In [None]:
# Load the CoNLL-2003 corpus through the `datasets` library.
dataset = load_dataset('conll2003')

# Display the returned DatasetDict to inspect its splits and columns.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Inspect the schema of the training split, including the class names behind each integer tag column.
print(dataset['train'].features)

{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}


In [None]:
# NER tag names; the position in this list is the integer class id stored in `ner_tags`.
label_names = dataset['train'].features['ner_tags'].feature.names

label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Look at the first training example (tokens plus integer-encoded tags).
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Name of the pretrained checkpoint; the tokenizer here and the model later are both loaded from it.
checkpoint = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the first training example; `is_split_into_words` marks the input as an already-split word list.
token = tokenizer(dataset['train'][0]['tokens'], is_split_into_words = True)

# Show the raw encoding, the resulting word-piece tokens, and for each token the index of its source word
# (None for the special tokens).
print(token, '\n---\n',
      token.tokens(),'\n---\n',
      token.word_ids())

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 
---
 ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]'] 
---
 [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


In [None]:
def align_target(labels, word_ids):
    """Expand word-level tag ids to one tag id per tokenizer token.

    Args:
        labels: tag id for each original word.
        word_ids: for each token, the index of the word it came from, or
            None for tokens that belong to no word (special tokens).

    Returns:
        A list as long as `word_ids`. Tokens without a word get -100; the
        first token of a word gets the word's tag; any following token of
        the same word gets the tag with a "begin" id swapped for the
        matching "inside" id.
    """
    # Begin -> inside ids, following the tag order shown by `label_names`
    # in this notebook: 1/2 = B-/I-PER, 3/4 = B-/I-ORG, 5/6 = B-/I-LOC,
    # 7/8 = B-/I-MISC.
    inside_of = {1: 2, 3: 4, 5: 6, 7: 8}

    aligned = []
    previous = None

    for current in word_ids:
        if current is None:
            aligned.append(-100)
        else:
            tag = labels[current]
            # Same word as the previous token -> this is a continuation piece.
            if current == previous:
                tag = inside_of.get(tag, tag)
            aligned.append(tag)
        previous = current

    return aligned

In [None]:
# Align the first example's word-level tags with the word-piece tokens produced above.
labels = dataset['train'][0]['ner_tags']
word_ids = token.word_ids()

aligned_target = align_target(labels, word_ids)

# Print tokens, original word-level tags and the token-level tags for comparison.
print(token.tokens(), '\n--------------------------------------------------------------------------------------\n',
      labels, '\n--------------------------------------------------------------------------------------\n',
      aligned_target)

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]'] 
--------------------------------------------------------------------------------------
 [3, 0, 7, 0, 0, 0, 7, 0, 0] 
--------------------------------------------------------------------------------------
 [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [None]:
# Map aligned tag ids back to tag names; negative ids (the -100 placeholders) become None.
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_target]

# Print each token next to its tag name.
for x, y in zip(token.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
la	O
##mb	O
.	O
[SEP]	None


In [None]:
# Define fake input data
words = ['[CLS]', 'Ger', '##man', 'call', 'to', 'Micro', '##so', '##ft', '[SEP]']
word_ids = [None, 0, 0, 1, 2, 3, 3, 3, None]
labels = [7, 0, 0, 3, 4]

# Use the align_target function to align labels
aligned_target = align_target(labels, word_ids)

# Create a list of aligned labels using label names
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_target]

# Loop through words and aligned labels and print them
for x, y in zip(words, aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
Ger	B-MISC
##man	I-MISC
call	O
to	O
Micro	B-ORG
##so	I-ORG
##ft	I-ORG
[SEP]	None


In [None]:
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: mapping with a 'tokens' entry (one word list per example) and
            a 'ner_tags' entry (one word-level tag list per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds, per example, the result of `align_target`.
    """
    encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)

    # `word_ids(idx)` gives the token -> word mapping of the idx-th example.
    encoded["labels"] = [
        align_target(example_tags, encoded.word_ids(idx))
        for idx, example_tags in enumerate(batch['ner_tags'])
    ]

    return encoded

In [None]:
# Apply tokenize_fn in batches and drop the original columns, keeping only what tokenize_fn returns.
tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [None]:
# Collator used to assemble examples of different lengths into one batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Try it on the first two training examples to see the padded tensors.
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])

batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [None]:
# Loading seqeval emits a warning asking for `trust_remote_code=True`; opt in explicitly.
metric = load_metric("seqeval", trust_remote_code=True)

# Sanity check on a toy sequence where one of three tags is wrong.
metric.compute(predictions = [['O' , 'B-ORG' , 'I-ORG']],
               references = [['O' , 'B-MISC' , 'I-ORG']])

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [None]:
def compute_metrics(logits_and_labels):
  """Turn model logits and label ids into overall seqeval scores.

  Args:
    logits_and_labels: pair `(logits, labels)`; the predicted class is the
      argmax of `logits` over the last axis.

  Returns:
    Dict with "precision", "recall", "f1" and "accuracy", taken from the
    `overall_*` entries returned by `metric.compute`.
  """
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)

  # Gold tag names, skipping positions labelled -100.
  reference_tags = []
  for gold_row in labels:
    reference_tags.append([label_names[g] for g in gold_row if g != -100])

  # Predicted tag names at exactly the positions kept above.
  predicted_tags = []
  for pred_row, gold_row in zip(predictions, labels):
    predicted_tags.append(
      [label_names[p] for p, g in zip(pred_row, gold_row) if g != -100]
    )

  scores = metric.compute(predictions=predicted_tags, references=reference_tags)

  return {
    name: scores[f"overall_{name}"]
    for name in ("precision", "recall", "f1", "accuracy")
  }

In [None]:
# Lookups between class ids and tag names, in both directions.
id2label = dict(enumerate(label_names))

label2id = {name: idx for idx, name in id2label.items()}

print(id2label , '\n--------------------\n' , label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'} 
--------------------
 {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [None]:
# Load the pretrained checkpoint with a token-classification head.
model = AutoModelForTokenClassification.from_pretrained(
  checkpoint,

  # Pass in label mappings (class id <-> tag name)
  id2label=id2label,
  label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# `%pip` installs into the environment of the running kernel.
%pip install -q -U accelerate



# Model Training

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
  # Directory that receives the training checkpoints
  output_dir = "fine_tuned_model",

  # Evaluate once per epoch
  evaluation_strategy = "epoch",

  learning_rate = 2e-5,

  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 16,

  num_train_epochs = 3,

  weight_decay = 0.01
)


In [None]:
from transformers import Trainer

# Wire model, hyper-parameters, tokenized splits, collator and metric function together.
trainer = Trainer(
  model=model,

  args=training_args,

  # Tokenized splits produced by `dataset.map(tokenize_fn, ...)`
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset["validation"],

  tokenizer=tokenizer,

  # Function that turns (logits, labels) into the reported scores
  compute_metrics=compute_metrics,

  # Data collator
  data_collator=data_collator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2277,0.065545,0.898973,0.928475,0.913486,0.981133
2,0.0467,0.058654,0.930017,0.94379,0.936853,0.985165
3,0.0259,0.05509,0.930356,0.946483,0.93835,0.986166


Checkpoint destination directory fine_tuned_model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuned_model/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuned_model/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuned_model/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuned_model/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2634, training_loss=0.07775992440199797, metrics={'train_runtime': 539.8907, 'train_samples_per_second': 78.021, 'train_steps_per_second': 4.879, 'total_flos': 1050534559887048.0, 'train_loss': 0.07775992440199797, 'epoch': 3.0})

In [None]:
# Save the trained model; later cells reload it from this directory.
trainer.save_model('fine_tuned_model')

In [None]:
from transformers import pipeline

# Inference pipeline backed by the model saved in 'fine_tuned_model'.
# Runs on the first GPU when CUDA is available and on the CPU (-1) otherwise,
# so the cell also works on machines without a GPU.
ner = pipeline(
    'token-classification',
    model = 'fine_tuned_model',
    aggregation_strategy = 'simple',
    device = 0 if torch.cuda.is_available() else -1
)

In [None]:
# Try the pipeline on a sentence containing an organisation and two locations.
ner('Apple Inc. is planning to open a new store in San Francisco, California.')

[{'entity_group': 'ORG',
  'score': 0.9982869,
  'word': 'Apple Inc.',
  'start': 0,
  'end': 10},
 {'entity_group': 'LOC',
  'score': 0.99784327,
  'word': 'San Francisco',
  'start': 46,
  'end': 59},
 {'entity_group': 'LOC',
  'score': 0.99271363,
  'word': 'California',
  'start': 61,
  'end': 71}]

# Where is the model paying attention?
[BertViz on GitHub](https://github.com/jessevig/bertviz?tab=readme-ov-file)

In [None]:
# `%pip` installs into the environment of the running kernel.
%pip install -q bertviz



In [None]:
from transformers import AutoTokenizer, AutoModel
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`.
# From here on `model` is the encoder loaded via AutoModel (configured to return
# attention weights), not the token-classification model that was trained above.
tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_model')
model = AutoModel.from_pretrained('./fine_tuned_model', output_attentions=True)

Some weights of BertModel were not initialized from the model checkpoint at ./fine_tuned_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from bertviz import head_view, model_view
from transformers import BertTokenizer, BertModel

sentence_a = "Apple is planning to open a new store in California."
sentence_b = "Apple is planning to open a new store in California."

# Encode the two sentences as a pair. Calling the tokenizer directly replaces
# the deprecated `encode_plus`.
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')
input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']

# Inference only, so no gradients are needed. `.attentions` holds one tensor per layer.
with torch.no_grad():
    attention = model(input_ids, token_type_ids=token_type_ids).attentions

# First position whose segment id is 1, i.e. where sentence_b begins.
sentence_b_start = token_type_ids[0].tolist().index(1)
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)

In [None]:
# Render the interactive BertViz head view for the sentence pair.
head_view(attention, tokens, sentence_b_start)

<IPython.core.display.Javascript object>

In [None]:
# Check what kind of object the pipeline returns (a list).
print(type(ner("Hello new World")))

<class 'list'>


In [None]:
# Run the pipeline on a shorter sentence and keep the result for the next cells.
# NOTE(review): the saved output under this cell lists 'Apple Inc.' and 'San Francisco',
# which do not occur in this sentence - it looks stale; re-run the notebook top to bottom.
test = ner('Apple is planning to open a new store in California.')

test

[{'entity_group': 'ORG',
  'score': 0.9982869,
  'word': 'Apple Inc.',
  'start': 0,
  'end': 10},
 {'entity_group': 'LOC',
  'score': 0.99784327,
  'word': 'San Francisco',
  'start': 46,
  'end': 59},
 {'entity_group': 'LOC',
  'score': 0.99271363,
  'word': 'California',
  'start': 61,
  'end': 71}]

In [None]:
# Surface text of the first detected entity.
print(test[0]['word'])

Apple Inc.


In [None]:
# Testing the keywords

# Print the surface text of every detected entity.
for entity in test:
  print(entity['word'])

Apple Inc.
San Francisco
California


In [None]:
# Create a function to get the keywords
def extract_keywords(output):
  """Collect the 'word' entry of every item in `output`.

  Args:
    output: sequence of dicts that each have a 'word' key, such as the list
      returned by the `ner` pipeline in this notebook.

  Returns:
    List of the 'word' values, in the same order as `output`.
  """
  return [entity['word'] for entity in output]

In [None]:
# Run the pipeline again on the longer sentence and reduce the result to the entity strings.
test = ner('Apple Inc. is planning to open a new store in San Francisco, California.')

extract_keywords(test)

['Apple Inc.', 'San Francisco', 'California']

## Output the weights and mapping
[StackOverflow Question and Answer](https://stackoverflow.com/questions/60120849/outputting-attention-for-bert-base-uncased-with-huggingface-transformers-torch)

In [None]:
def closest_word(sentence, target_word, layer=0, top_k=4):
  """Print the tokens of `sentence` that `target_word` attends to most strongly.

  Uses the notebook-level `tokenizer` and `model`; the model must return
  attention weights. Attention is averaged over all heads of one layer and,
  when the target consists of several word pieces, over those pieces.
  Special tokens and the target's own pieces are left out of the ranking.

  Args:
    sentence: text to run through the model.
    target_word: word or phrase occurring in `sentence`; its first
      occurrence is used.
    layer: index into the per-layer attention tuple (default: first layer).
    top_k: number of tokens to print.

  Raises:
    ValueError: if the word pieces of `target_word` do not occur in the
      tokenized sentence.
  """
  input_ids = tokenizer.encode(sentence, return_tensors='pt')
  # Derive the tokens from the ids so they include the special tokens that
  # `encode` adds; token positions then line up with attention positions.
  tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

  with torch.no_grad():
    outputs = model(input_ids)
    attentions = outputs.attentions

  # The target may be split into several pieces (sub-words or a multi-word
  # entity), so search for the whole piece sequence instead of one token.
  # Assumes re-tokenizing the target alone yields the same pieces as in context.
  target_tokens = tokenizer.tokenize(target_word)
  span = len(target_tokens)
  start = None
  if span:
    for i in range(len(tokens) - span + 1):
      if tokens[i:i + span] == target_tokens:
        start = i
        break
  if start is None:
    raise ValueError(f"'{target_word}' is not in the tokenized sentence")

  # Each layer's attention is (batch, heads, query position, key position):
  # take batch 0, average the heads, then average the target's query rows.
  head_averaged = attentions[layer][0].mean(dim=0)
  attention_weights = head_averaged[start:start + span].mean(dim=0).cpu().tolist()

  special_tokens = set(tokenizer.all_special_tokens)
  target_positions = range(start, start + span)
  closest_words = [
    (tokens[i], attention_weights[i])
    for i in range(len(tokens))
    if tokens[i] not in special_tokens and i not in target_positions
  ]

  # Rank by attention weight, strongest first (not by token text).
  closest_words.sort(key=lambda pair: pair[1], reverse=True)

  print(f"Closest words to '{target_word}' in the sentence:")
  for i in closest_words[:top_k]:
    print(i[0])

In [None]:
# Use the first entity found by the pipeline as the target word.
sentence = "India is a vast country"
target_word = ner(sentence)[0]['word']
closest_word(sentence, target_word)

Closest words to 'India' in the sentence:
India
a
country
is


In [None]:
# Run closest_word once for every entity the pipeline finds in the sentence.
sentence = "Apple is planning to open a new store in California."
ner_val = ner(sentence)
targets = extract_keywords(ner_val)

for i in targets:
  closest_word(sentence, i)

Closest words to 'Apple' in the sentence:
.
Apple
California
a
Closest words to 'California' in the sentence:
.
Apple
California
a
