In [17]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer
import torch
from datasets import load_metric
import pickle
import json
import numpy as np

In [6]:
# Load the tokenizer from the same directory the fine-tuned model is read from.
# NOTE(review): this is an absolute, machine-specific path; the test data is
# read via a relative path, so a configurable/relative model directory would
# presumably make the notebook portable — confirm the project layout first.
tokenizer = AutoTokenizer.from_pretrained(r'E:\Work\Data_Science\Projects\Custom_NER\models\huggingface\final.model')

In [7]:
# Read the test split (a JSON array of examples; indexed by position below).
# The path is a plain string (the f-prefix had nothing to interpolate) and the
# encoding is explicit so the read does not depend on the platform default.
with open("../data/huggingface/test.json", "r", encoding="utf-8") as f:
    test = json.load(f)

In [12]:
# Take one example from the test set (the first record) and show it.
# Each record carries an 'id', the word-level 'tokens', and integer 'ner_tags'.
test_ex = test[0]
print(test_ex)

{'id': 9797, 'ner_tags': [0, 0, 0, 0, 0, 0, 2, 5, 7, 2, 0, 0, 2, 5, 5, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['In', 'a', 'letter', 'sent', 'to', 'the', 'Confederation', 'of', 'African', 'Football', 'and', 'the', 'Nigeria', 'Football', 'Association', ',', 'FIFA', 'said', 'the', 'two', 'appointments', 'are', 'in', 'direct', 'violation', 'of', 'Article', '17', 'of', 'the', 'world', 'governing', 'body', "'s", 'statutes', '.']}


In [13]:
# Rebuild a plain sentence by joining the word tokens with single spaces.
# Punctuation stays space-separated (e.g. "Association , FIFA"), because the
# tokens are joined as-is rather than detokenized.
test_sent = " ".join(test_ex['tokens'])
print(test_sent)

In a letter sent to the Confederation of African Football and the Nigeria Football Association , FIFA said the two appointments are in direct violation of Article 17 of the world governing body 's statutes .


### To tokenize, we can pass either the whole sentence or the list of word tokens; for this example both give the same result

In [16]:
# Tokenization from the word tokens.
# is_split_into_words=True tells the tokenizer the input is already a list of
# words rather than a batch of sentences; truncation=True cuts inputs that
# exceed the model's maximum length.
tokens = tokenizer(test_ex['tokens'], truncation=True, is_split_into_words=True)
print(tokens)

{'input_ids': [101, 1999, 1037, 3661, 2741, 2000, 1996, 11078, 1997, 3060, 2374, 1998, 1996, 7387, 2374, 2523, 1010, 5713, 2056, 1996, 2048, 14651, 2024, 1999, 3622, 11371, 1997, 3720, 2459, 1997, 1996, 2088, 8677, 2303, 1005, 1055, 18574, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
# Tokenization from the raw sentence.
# truncation=True matches the word-token call, so an over-long sentence is cut
# to the model's maximum length instead of failing later in the forward pass.
# This `tokens` value is the one fed to the model further down.
tokens = tokenizer(test_sent, truncation=True)
print(tokens)

{'input_ids': [101, 1999, 1037, 3661, 2741, 2000, 1996, 11078, 1997, 3060, 2374, 1998, 1996, 7387, 2374, 2523, 1010, 5713, 2056, 1996, 2048, 14651, 2024, 1999, 3622, 11371, 1997, 3720, 2459, 1997, 1996, 2088, 8677, 2303, 1005, 1055, 18574, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


As we can see, the two cells above give the same output.

### Reading the model that we trained

In [18]:
# Read the label map (label name -> integer id) saved next to this notebook.
# Plain string (nothing to interpolate) and an explicit encoding so the read
# does not depend on the platform default.
with open("label_maps.json", "r", encoding="utf-8") as f:
    label2id = json.load(f)
print(label2id)

# Load the fine-tuned token-classification model; the size of the
# classification head is taken from the number of entries in the label map.
model = AutoModelForTokenClassification.from_pretrained(
    r'E:\Work\Data_Science\Projects\Custom_NER\models\huggingface\final.model',
    num_labels=len(label2id),
)

{'O': 0, 'B-geo': 1, 'B-org': 2, 'B-tim': 3, 'I-per': 4, 'I-org': 5, 'B-per': 6, 'B-gpe': 7, 'I-geo': 8, 'I-tim': 9, 'B-art': 10, 'B-eve': 11, 'I-eve': 12, 'I-art': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16}


In [21]:
# Sanity check: wrapping the ids in an outer list adds the batch axis, so the
# displayed shape is (1, number_of_input_ids).
torch.tensor([tokens['input_ids']]).shape

torch.Size([1, 39])

### Finally making the prediction

In [24]:
# Raw prediction: the model output holds per-token logits (unnormalised
# scores), not probabilities.
# Add the leading batch axis to each input.
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)

# Inference only: disable autograd bookkeeping, and call the module itself
# rather than .forward() so any registered hooks are run.
with torch.no_grad():
    predictions = model(input_ids=input_ids, attention_mask=attention_mask)

In [26]:
# Get the predicted class id for every input position.
# Take the argmax over the label axis (last dim), then drop only the batch
# axis; a bare squeeze() would also collapse any other size-1 axis (e.g. a
# one-token sequence) and break the axis-based argmax.
# NOTE: this rebinds `predictions` from the model output to a tensor of ids,
# so the cell cannot be re-run without re-running the forward pass first.
predictions = predictions.logits.argmax(dim=-1).squeeze(0)
print(predictions)

tensor([0, 0, 0, 0, 0, 0, 0, 2, 5, 7, 5, 0, 0, 2, 5, 5, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6])


In [30]:
# Build the reverse label map (integer id -> label name) by inverting label2id.
id2label = {label_id: label_name for label_name, label_id in label2id.items()}
print(id2label)

{0: 'O', 1: 'B-geo', 2: 'B-org', 3: 'B-tim', 4: 'I-per', 5: 'I-org', 6: 'B-per', 7: 'B-gpe', 8: 'I-geo', 9: 'I-tim', 10: 'B-art', 11: 'B-eve', 12: 'I-eve', 13: 'I-art', 14: 'I-gpe', 15: 'B-nat', 16: 'I-nat'}


In [32]:
predictions = [id2label[int(i)] for i in list(predictions)]
print(predictions)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org', 'B-gpe', 'I-org', 'O', 'O', 'B-org', 'I-org', 'I-org', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per']
