In [1]:
!pip install transformers datasets tokenizers seqeval -q

!pip install --upgrade accelerate -q
!pip uninstall -y transformers accelerate -q
!pip install transformers accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.5 MB/s[0m 

In [2]:
import pandas as pd
import numpy as np
import datasets
import json

from transformers import BertTokenizerFast, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

from transformers import TrainingArguments, Trainer
from transformers import pipeline


In [3]:
# CoNLL2003 dataset

conll2003 = datasets.load_dataset("conll2003")
conll2003

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Lets take first 1000 records of the train dataset keeping computation into consideration

conll2003['train'] = conll2003['train'].select(range(1000))
print(conll2003)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [5]:
conll2003["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [6]:
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
conll2003['train'].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [8]:
# Lets see the train dataset

columns_names = conll2003['train'].features

train_df = pd.DataFrame.from_records(conll2003["train"].data).T
train_df.columns = columns_names
train_df.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"(EU, rejects, German, call, to, boycott, Briti...","(22, 42, 16, 21, 35, 37, 16, 21, 7)","(11, 21, 11, 12, 21, 22, 11, 12, 0)","(3, 0, 7, 0, 0, 0, 7, 0, 0)"
1,1,"(Peter, Blackburn)","(22, 22)","(11, 12)","(1, 2)"
2,2,"(BRUSSELS, 1996-08-22)","(22, 11)","(11, 12)","(5, 0)"
3,3,"(The, European, Commission, said, on, Thursday...","(12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","(11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","(0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,4,"(Germany, 's, representative, to, the, Europea...","(22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","(11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","(5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#### Lets review very first training data and train it

In [10]:
print(conll2003['train'][0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [11]:
example_text = conll2003['train'][0]
tokenized_input = tokenizer(example_text["tokens"], truncation=True,is_split_into_words=True)

for key, value in tokenized_input.items():
    print(f"{key}: {value}")


print("\n-----------------------------------------------------------------------------------------------\n")
# It will add [CLS] and [SEP] to the the tokens

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

input_ids: [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

-----------------------------------------------------------------------------------------------

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


In [12]:
word_ids = tokenized_input.word_ids()
print(word_ids)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [13]:
len(tokens)

11

In [14]:
len(conll2003['train'][0]['ner_tags'])

9

In [15]:
conll2003['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [16]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [17]:
print(conll2003['train'][4:5])

{'id': ['4'], 'tokens': [['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']], 'pos_tags': [[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21, 1, 0]], 'ner_tags': [[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]]}


In [18]:
q = tokenize_and_align_labels(conll2003['train'][4:5])
for key, value in q.items():
    print(f"{key}: {value}")



input_ids: [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]]
token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
labels: [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]


In [19]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]), q["labels"][0]):
    print(f"{token:_<20} {label}")

[CLS]_______________ -100
germany_____________ 5
'___________________ 0
s___________________ 0
representative______ 0
to__________________ 0
the_________________ 0
european____________ 3
union_______________ 4
'___________________ 0
s___________________ 0
veterinary__________ 0
committee___________ 0
werner______________ 1
z___________________ 2
##wing______________ 2
##mann______________ 2
said________________ 0
on__________________ 0
wednesday___________ 0
consumers___________ 0
should______________ 0
buy_________________ 0
sheep_______________ 0
##me________________ 0
##at________________ 0
from________________ 0
countries___________ 0
other_______________ 0
than________________ 0
britain_____________ 5
until_______________ 0
the_________________ 0
scientific__________ 0
advice______________ 0
was_________________ 0
clearer_____________ 0
.___________________ 0
[SEP]_______________ -100


In [20]:
## Now Applying on entire data

tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [21]:
columns_names = tokenized_datasets['train'].features

train_df = pd.DataFrame.from_records(tokenized_datasets["train"].data).T
train_df.columns = columns_names
train_df.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags,input_ids,token_type_ids,attention_mask,labels
0,0,"(EU, rejects, German, call, to, boycott, Briti...","(22, 42, 16, 21, 35, 37, 16, 21, 7)","(11, 21, 11, 12, 21, 22, 11, 12, 0)","(3, 0, 7, 0, 0, 0, 7, 0, 0)","(101, 7327, 19164, 2446, 2655, 2000, 17757, 23...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)","(-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100)"
1,1,"(Peter, Blackburn)","(22, 22)","(11, 12)","(1, 2)","(101, 2848, 13934, 102)","(0, 0, 0, 0)","(1, 1, 1, 1)","(-100, 1, 2, -100)"
2,2,"(BRUSSELS, 1996-08-22)","(22, 11)","(11, 12)","(5, 0)","(101, 9371, 2727, 1011, 5511, 1011, 2570, 102)","(0, 0, 0, 0, 0, 0, 0, 0)","(1, 1, 1, 1, 1, 1, 1, 1)","(-100, 5, 0, 0, 0, 0, 0, -100)"
3,3,"(The, European, Commission, said, on, Thursday...","(12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","(11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","(0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ...","(101, 1996, 2647, 3222, 2056, 2006, 9432, 2009...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(-100, 0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, ..."
4,4,"(Germany, 's, representative, to, the, Europea...","(22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","(11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","(5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ...","(101, 2762, 1005, 1055, 4387, 2000, 1996, 2647...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, ..."


**The CoNLL-2003 dataset has 9 labels for NER:**

'O'     ====> Non-entity or other

'B-LOC' ====> Beginning of a location entity

'I-LOC' ====> Inside or continuation of a location entity

'B-PER' ====> Beginning of a person entity

'I-PER' ====> Inside or continuation of a person entity

'B-ORG' ====> Beginning of an organization entity

'I-ORG' ====> Inside or continuation of an organization entity

'B-MISC'====> Beginning of a miscellaneous entity

'I-MISC'====> Inside or continuation of a miscellaneous entity


In [22]:
# Defining model

model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [23]:
#Defining training args
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
                        "test-ner",
                        evaluation_strategy = "epoch",
                        learning_rate=2e-5,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=2,
                        weight_decay=0.01,
)

In [24]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [25]:
metric = datasets.load_metric("seqeval")

  metric = datasets.load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [26]:
example = conll2003['train'][0]
label_list = conll2003["train"].features["ner_tags"].feature.names
label_list

for i in example["ner_tags"]:
  print(i)

labels = [label_list[i] for i in example["ner_tags"]]
labels

metric.compute(predictions=[labels], references=[labels])

3
0
7
0
0
0
7
0
0


{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [27]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
  }

In [28]:
trainer = Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

In [29]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.347237,0.541017,0.53563,0.53831,0.907859
2,No log,0.27326,0.588558,0.612261,0.600175,0.922141


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=126, training_loss=0.5802325899638827, metrics={'train_runtime': 46.6545, 'train_samples_per_second': 42.868, 'train_steps_per_second': 2.701, 'total_flos': 43733229447888.0, 'train_loss': 0.5802325899638827, 'epoch': 2.0})

In [30]:
## Save model
model.save_pretrained("ner_model")

In [31]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [32]:
id2label = {
    str(idx): label for idx,label in enumerate(label_list)
}
label2id = {
    label: str(idx) for idx,label in enumerate(label_list)
}

In [33]:
id2label

{'0': 'O',
 '1': 'B-PER',
 '2': 'I-PER',
 '3': 'B-ORG',
 '4': 'I-ORG',
 '5': 'B-LOC',
 '6': 'I-LOC',
 '7': 'B-MISC',
 '8': 'I-MISC'}

In [34]:
label2id

{'O': '0',
 'B-PER': '1',
 'I-PER': '2',
 'B-ORG': '3',
 'I-ORG': '4',
 'B-LOC': '5',
 'I-LOC': '6',
 'B-MISC': '7',
 'I-MISC': '8'}

In [44]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [45]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [46]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


In [38]:
input_text = "Google India has built a new office in Delhi"
ner_results = nlp(input_text)

output = [{result['word']: result['entity']} for result in ner_results]
print(*output, sep='\n')

{'google': 'B-ORG'}
{'india': 'B-LOC'}
{'delhi': 'B-LOC'}
