# Importing necessary libraries

In [3]:
import pandas as pd
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import BertTokenizer, BertTokenizerFast




# Loading Data

In [4]:
df=pd.read_csv("NER_Dataset.csv")

In [5]:
df.head()

Unnamed: 0,Sentence_ID,Word,POS,Tag
0,Sentence: 1,"['Thousands', 'of', 'demonstrators', 'have', '...","['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 10,"['Iranian', 'officials', 'say', 'they', 'expec...","['JJ', 'NNS', 'VBP', 'PRP', 'VBP', 'TO', 'VB',...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '..."
2,Sentence: 100,"['Helicopter', 'gunships', 'Saturday', 'pounde...","['NN', 'NNS', 'NNP', 'VBD', 'JJ', 'NNS', 'IN',...","['O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', '..."
3,Sentence: 1000,"['They', 'left', 'after', 'a', 'tense', 'hour-...","['PRP', 'VBD', 'IN', 'DT', 'NN', 'JJ', 'NN', '...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 10000,"['U.N.', 'relief', 'coordinator', 'Jan', 'Egel...","['NNP', 'NN', 'NN', 'NNP', 'NNP', 'VBD', 'NNP'...","['B-geo', 'O', 'O', 'B-per', 'I-per', 'O', 'B-..."


In [6]:
len(df)

47959

In [7]:
ner_dataset=df.sample(n=2000, random_state=42)

In [8]:
ner_dataset=ner_dataset.reset_index(drop=True)

# Preprocess the dataset

In [21]:
ner_dataset=pd.read_csv("NER_Dataset_New.csv")

In [22]:
ner_dataset["Word"][0]

'[\'The\', \'report\', \'calls\', \'on\', \'President\', \'Bush\', \'and\', \'Congress\', \'to\', \'urge\', \'Chinese\', \'officials\', \'not\', \'to\', \'use\', \'the\', \'global\', \'war\', \'against\', \'terrorism\', \'as\', \'a\', \'pretext\', \'to\', \'suppress\', \'minorities\', "\'", \'rights\', \'.\']'

We need to make a sentence out of the words.

In [23]:
import ast

def get_sentence(row):
    # Safely evaluate the string representation of the list
    row["Word"] = ast.literal_eval(row["Word"])

    # Concatenate the words into a single sentence separated by spaces
    row["Sentence"] = ' '.join(row["Word"])
    
    return row  # Return the modified row

ner_dataset = ner_dataset.apply(get_sentence,axis=1)

In [24]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the Baku-T'bilisi-Ceyhan o...
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, ,...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...","Zelenovic had lived in Khanty-Mansiisk , some ..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...
...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50 .
1998,1998,Sentence: 30109,"[Last, week, ,, diplomats, at, an, Internation...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...","Last week , diplomats at an International Atom..."


I am checking the minimum, maximum, mean and median token lengths of the tokens tokenized from words to decide the max_length of the tokenizer. So that we will get same token length and the no of padded tokens will not be too high.

In [25]:
from transformers import BertTokenizer, BertTokenizerFast

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Convert the tokenizer to a fast tokenizer
tokenizer_fast = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

In [26]:
import numpy as np

tokens_lengths=[]

# Iterate through each tokenized input and find the maximum length
for text in ner_dataset['Sentence'].tolist():
    tokenized_input = tokenizer_fast.tokenize(text)
    tokens_length = len(tokenized_input)
    tokens_lengths.append(tokens_length)

print("Minimum sequence length:", min(tokens_lengths))
print("Maximum sequence length:", max(tokens_lengths))
print("Average sequence length:",np.mean(np.array(tokens_lengths)))
print("Median sequence length:",np.median(np.array(tokens_lengths)))
print(sorted(tokens_lengths))

Minimum sequence length: 5
Maximum sequence length: 77
Average sequence length: 26.3085
Median sequence length: 26.0
[5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14

In [27]:
# Count occurrences of each token length
counts = np.bincount(tokens_lengths)

# Find the index with the maximum count, which corresponds to the mode
np.argmax(counts)

25

As the dataset is loaded from a csv file. We need to make adjustment of the Tag column to get a list of labels.

In [28]:
import ast

def get_labels(tag):
    # Safely evaluate the string representation of the list
    labels = ast.literal_eval(tag)
    return labels

ner_dataset["Labels"]=ner_dataset["Tag"].apply(get_labels)

In [29]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g..."
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the Baku-T'bilisi-Ceyhan o...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp..."
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, ,...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...","Zelenovic had lived in Khanty-Mansiisk , some ...","[B-per, O, O, O, B-geo, O, O, O, O, O, O, B-ge..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-..."
...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,..."
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50 .,"[O, O, O, O, O, O, O, B-tim, O]"
1998,1998,Sentence: 30109,"[Last, week, ,, diplomats, at, an, Internation...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...","Last week , diplomats at an International Atom...","[O, O, O, O, O, O, B-org, I-org, I-org, I-org,..."


We need to make sure the labels has the same length as the no of words in sentence. This is an additional step which has been done according to my dataset requirements. I studied the data and came up with this algorithm and also make some changes manually in the training data.

In [30]:
# Function to remove last element from labels if sentence ends with '.' and remove punctuation
def process_labels(row):
    if row['Sentence'].endswith('.') | row["Sentence"].endswith('. "'):
        row['Labels'] = row['Labels'][:-1]
    row['Sentence'] = row['Sentence'].replace('.', '')
    while '-' in row['Word']:
        # Get the index of '-' in the word
        index_of_dash = row['Word'].index('-')
        # Remove the corresponding element from the Labels column
        del row['Labels'][index_of_dash]
        # Remove '-' from the word
        del row['Word'][index_of_dash]
    row['Sentence']=row["Sentence"].replace("-",'')
    while "'s" in row['Word']:
        # Get the index of "'s" in the word
        index_of_dash = row['Word'].index("'s")
        # Remove the corresponding element from the Labels column
        del row['Labels'][index_of_dash]
        # Remove "'s" from the word
        del row['Word'][index_of_dash]
    row['Sentence']=row["Sentence"].replace("'s",'')
    while ',' in row['Word']:
        # Get the index of ',' in the word
        index_of_dash = row['Word'].index(',')
        # Remove the corresponding element from the Labels column
        del row['Labels'][index_of_dash]
        # Remove ',' from the word
        del row['Word'][index_of_dash]
    row['Sentence']=row["Sentence"].replace(",",'')
    while "'" in row['Word']:
        # Get the index of "'" in the word
        index_of_dash = row['Word'].index("'")
        # Remove the corresponding element from the Labels column
        del row['Labels'][index_of_dash]
        # Remove "'" from the word
        del row['Word'][index_of_dash]
    row['Sentence']=row["Sentence"].replace("'",'')
    return row

# Apply the function to each row
ner_dataset = ner_dataset.apply(process_labels, axis=1)

In [31]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g..."
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the BakuTbilisiCeyhan oil ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-org,..."
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp..."
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, s...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...",Zelenovic had lived in KhantyMansiisk some 20...,"[B-per, O, O, O, B-geo, O, O, O, O, O, B-geo, ..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-..."
...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,..."
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50,"[O, O, O, O, O, O, O, B-tim]"
1998,1998,Sentence: 30109,"[Last, week, diplomats, at, an, International,...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...",Last week diplomats at an International Atomi...,"[O, O, O, O, O, B-org, I-org, I-org, I-org, O,..."


Now we have to check if the length of list of corresponding labels column is same as the no of words in a sentence.

In [33]:
s=0

for i in range(2000):
    if(len(ner_dataset["Sentence"][i].split())!=len(ner_dataset["Labels"][i])):
        print("Labels Mismatch")
        print(i)
        print(len(ner_dataset["Sentence"][i].split()))
        print(len(ner_dataset["Labels"][i]))
        print(len(ner_dataset["Word"][i]))
        print(ner_dataset["Sentence"][i].split())
        print(ner_dataset["Word"][i])
        s=s+1
        print("\n")
        
print("No of mismatches:",s)

No of mismatches: 0


Now we can carry forward and tokenize the sentence. We see before that the average tokens length was around 26. So we have taken max_length as 32.

In [34]:
#Get the values for input_ids, token_type_ids, attention_mask
def tokenize_text(text):
    tokenized_input = tokenizer_fast.encode_plus(text,
        max_length=32,  # Set your desired maximum sequence length
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True)
    return tokenized_input

ner_dataset["tokenized_input"]=ner_dataset['Sentence'].apply(tokenize_text)

In [35]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g...","[input_ids, token_type_ids, attention_mask]"
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the BakuTbilisiCeyhan oil ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-org,...","[input_ids, token_type_ids, attention_mask]"
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp...","[input_ids, token_type_ids, attention_mask]"
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, s...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...",Zelenovic had lived in KhantyMansiisk some 20...,"[B-per, O, O, O, B-geo, O, O, O, O, O, B-geo, ...","[input_ids, token_type_ids, attention_mask]"
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[input_ids, token_type_ids, attention_mask]"
...,...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]"
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]"
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50,"[O, O, O, O, O, O, O, B-tim]","[input_ids, token_type_ids, attention_mask]"
1998,1998,Sentence: 30109,"[Last, week, diplomats, at, an, International,...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...",Last week diplomats at an International Atomi...,"[O, O, O, O, O, B-org, I-org, I-org, I-org, O,...","[input_ids, token_type_ids, attention_mask]"


labels2id and id2tabels dictionary. To give each entity an unique id.

In [38]:
label_names=list(set([item for sublist in ner_dataset["Labels"] for item in sublist]))
label_names

['B-art',
 'B-org',
 'B-tim',
 'I-nat',
 'I-eve',
 'B-nat',
 'I-org',
 'I-per',
 'I-tim',
 'B-geo',
 'I-geo',
 'O',
 'B-eve',
 'I-gpe',
 'B-per',
 'I-art',
 'B-gpe']

In [39]:
label2id = {k: v for v, k in enumerate(label_names)}
id2label = {v: k for v, k in enumerate(label_names)}
label2id

{'B-art': 0,
 'B-org': 1,
 'B-tim': 2,
 'I-nat': 3,
 'I-eve': 4,
 'B-nat': 5,
 'I-org': 6,
 'I-per': 7,
 'I-tim': 8,
 'B-geo': 9,
 'I-geo': 10,
 'O': 11,
 'B-eve': 12,
 'I-gpe': 13,
 'B-per': 14,
 'I-art': 15,
 'B-gpe': 16}

In [40]:
id2label

{0: 'B-art',
 1: 'B-org',
 2: 'B-tim',
 3: 'I-nat',
 4: 'I-eve',
 5: 'B-nat',
 6: 'I-org',
 7: 'I-per',
 8: 'I-tim',
 9: 'B-geo',
 10: 'I-geo',
 11: 'O',
 12: 'B-eve',
 13: 'I-gpe',
 14: 'B-per',
 15: 'I-art',
 16: 'B-gpe'}

Now we need to adjust the labels with the tokens as some words can be divided into multiple tokens.

In [45]:
def adjusted_label(tokenized_samples,labels):
    #tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
    #so the new keys [input_ids, labels (after adjustment)]
    #can be added to the datasets dict for each train test validation split
    total_adjusted_labels = []
    prev_wid = -1
    word_ids_list = tokenized_samples.word_ids(batch_index=0)
    existing_label_ids = [label2id[label] for label in labels]
    i = -1
    adjusted_label_ids = []
    for wid in word_ids_list:
        if(wid is None):
            adjusted_label_ids.append(-100)
        elif((wid!=prev_wid) & (wid != None)):
            i=i+1
            if(i<len(existing_label_ids)):
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = wid
        else:
            adjusted_label_ids.append(existing_label_ids[i])
    if(len(adjusted_label_ids)!=32):
        print("mismatch")
    return adjusted_label_ids

In [46]:
ner_dataset["Adjusted Labels"] = ner_dataset.apply(lambda row: adjusted_label(row["tokenized_input"], row["Labels"]), axis=1)

In [47]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 14, 7, 11, 1, 11, 11, 1..."
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the BakuTbilisiCeyhan oil ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-org,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,..."
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 14,..."
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, s...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...",Zelenovic had lived in KhantyMansiisk some 20...,"[B-per, O, O, O, B-geo, O, O, O, O, O, B-geo, ...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 14, 11, 11, 11, 9, 9, 9, 9, 9, ..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,..."
...,...,...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 11, 11,..."
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, ..."
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50,"[O, O, O, O, O, O, O, B-tim]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 2, -100, -1..."
1998,1998,Sentence: 30109,"[Last, week, diplomats, at, an, International,...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...",Last week diplomats at an International Atomi...,"[O, O, O, O, O, B-org, I-org, I-org, I-org, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 1, 6, 6, 6, 11,..."


Now we will check if lengths of all elements of adjusted_labels is same as corresponding labels of input_ids of tokenized_inputs.

In [48]:
s=0

for i in range(2000):
    if(len(ner_dataset["Adjusted Labels"][i])!=len(ner_dataset["tokenized_input"][i]["input_ids"][0])):
        print("Mismatch")
        print(i)
        print(len(ner_dataset["Adjusted Labels"][i]))
        print(len(ner_dataset["Labels"][i]))
        s=s+1
        print("\n")
        
print("No of mismatches:",s)

No of mismatches: 0


In [52]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 14, 7, 11, 1, 11, 11, 1..."
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the BakuTbilisiCeyhan oil ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-org,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,..."
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 14,..."
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, s...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...",Zelenovic had lived in KhantyMansiisk some 20...,"[B-per, O, O, O, B-geo, O, O, O, O, O, B-geo, ...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 14, 11, 11, 11, 9, 9, 9, 9, 9, ..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,..."
...,...,...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 11, 11,..."
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, ..."
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50,"[O, O, O, O, O, O, O, B-tim]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 2, -100, -1..."
1998,1998,Sentence: 30109,"[Last, week, diplomats, at, an, International,...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...",Last week diplomats at an International Atomi...,"[O, O, O, O, O, B-org, I-org, I-org, I-org, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 1, 6, 6, 6, 11,..."


We need to make the training data as the format supported to be passed onto BERT model.

In [55]:
def make_training_data(tokenized_input,adjusted_labels):
    input_ids=tokenized_input["input_ids"][0]
    token_type_ids=tokenized_input["token_type_ids"][0]
    attention_mask=tokenized_input["attention_mask"][0]
    
    labels=torch.tensor(adjusted_labels)
    features={
        'input_ids': input_ids,
        'token_type_ids':token_type_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
    return features

In [56]:
ner_dataset["features"] = ner_dataset.apply(lambda row: make_training_data(row['tokenized_input'], row['Adjusted Labels']), axis=1)

In [59]:
ner_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
0,0,Sentence: 22048,"[The, report, calls, on, President, Bush, and,...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 14, 7, 11, 1, 11, 11, 1...","{'input_ids': [tensor(101), tensor(10117), ten..."
1,1,Sentence: 1273,"[The, construction, on, the, Baku-T'bilisi-Cey...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the BakuTbilisiCeyhan oil ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-org,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(10117), ten..."
2,2,Sentence: 1541,"[The, pact, was, initially, approved, after, d...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 14,...","{'input_ids': [tensor(101), tensor(10117), ten..."
3,3,Sentence: 41443,"[Zelenovic, had, lived, in, Khanty-Mansiisk, s...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...",Zelenovic had lived in KhantyMansiisk some 20...,"[B-per, O, O, O, B-geo, O, O, O, O, O, B-geo, ...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 14, 11, 11, 11, 9, 9, 9, 9, 9, ...","{'input_ids': [tensor(101), tensor(15536), ten..."
4,4,Sentence: 18642,"[Exports, have, grown, significantly, because,...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(108268), te..."
...,...,...,...,...,...,...,...,...,...,...
1995,1995,Sentence: 21531,"[A, Somali, government, spokesman, says, some,...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(138), tenso..."
1996,1996,Sentence: 40507,"[Israel, has, raided, the, camp, repeatedly, t...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(12991), ten..."
1997,1997,Sentence: 4093,"[The, government, puts, the, figure, at, aroun...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50,"[O, O, O, O, O, O, O, B-tim]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 2, -100, -1...","{'input_ids': [tensor(101), tensor(10117), ten..."
1998,1998,Sentence: 30109,"[Last, week, diplomats, at, an, International,...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...",Last week diplomats at an International Atomi...,"[O, O, O, O, O, B-org, I-org, I-org, I-org, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 1, 6, 6, 6, 11,...","{'input_ids': [tensor(101), tensor(14812), ten..."


We need to divide the dataset into train and test data.

In [119]:
from sklearn.model_selection import train_test_split

train_ner,test_ner=train_test_split(ner_dataset, test_size=0.2, random_state=42)

In [120]:
len(train_ner)

1600

In [121]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
968,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten..."
240,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten..."
819,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten..."
692,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten..."
420,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten..."
...,...,...,...,...,...,...,...,...,...,...
1130,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten..."
1294,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten..."
860,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten..."
1459,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten..."


In [62]:
train_dataset=train_ner["features"].tolist()

In [63]:
train_dataset

[{'input_ids': tensor([  101, 43309, 10808, 19382, 10108, 11104, 12845, 84638, 12715, 10637,
          31423, 10428, 10169, 28446, 45652, 10107, 10106, 11586, 40714, 35629,
          10108, 10485, 37307, 10114, 10105, 15441, 11830, 10873, 10531, 16118,
            102,     0]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 0]),
  'labels': tensor([-100,   11,    1,   11,   11,    1,   14,   14,   14,   14,    7,   11,
            11,   16,   11,   11,   11,    9,    2,   11,   11,   11,   11,   11,
            11,    9,   10,    2,   11,   11, -100, -100])},
 {'input_ids': tensor([  101, 60555, 10873, 10106, 17156, 14074, 50400, 14342,   169, 13213,
          53021, 39127, 11598, 82228, 23324, 10135, 62976,   102,     0,     0,
              0,     0,     0,     0,    

In [65]:
len(train_dataset)

1600

In [66]:
test_dataset=test_ner["features"].tolist()

In [67]:
test_dataset

[{'input_ids': tensor([  101, 28221, 15938, 10393, 31952, 25566, 12533, 64886, 10114, 31877,
          95167, 10188, 86786, 79965, 12141, 25121, 12373, 10108, 35396, 14179,
          16691, 12410,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
          0, 0, 0, 0, 0, 0, 0, 0]),
  'labels': tensor([-100,   14,   14,   11,   11,   11,   11,   11,   11,   11,   11,   11,
             1,   11,   11,   11,   11,   11,    9,   11,   11,   11, -100, -100,
          -100, -100, -100, -100, -100, -100, -100, -100])},
 {'input_ids': tensor([  101, 10167, 73520, 12916, 49912, 74675, 10124, 70676, 10146, 22807,
          12901, 11337, 10192, 20165, 28944, 10105, 32176, 14890, 11127, 22698,
          76305, 10822, 38031, 12277, 102

In [68]:
len(test_dataset)

400

Now we will train the model using training data and save the model as a external folder.

In [70]:
from tqdm import tqdm
model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", id2label=id2label, label2id=label2id,num_labels=len(id2label))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
from transformers import AdamW
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm

# Define training parameters
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for batch in progress_bar:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'loss': loss.item()})
    
#     # Evaluate on the validation set
#     model.eval()
#     val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch['input_ids']
#             attention_mask = batch['attention_mask']
#             labels = batch['labels']
#             outputs = model(input_ids, attention_mask=attention_mask)
#             _, predicted = torch.max(outputs.logits, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     accuracy = correct / total
#     print(f'Validation Accuracy: {accuracy}')
    model.train()  # Set the model back to training mode

# Save the trained model
output_dir = "./New_model_BERT"
model.save_pretrained(output_dir)



Epoch 1/3


Epoch 1/3: 100%|██████████| 400/400 [38:05<00:00,  5.71s/it, loss=0.0548]


Epoch 2/3


Epoch 2/3: 100%|██████████| 400/400 [38:23<00:00,  5.76s/it, loss=0.111]  


Epoch 3/3


Epoch 3/3: 100%|██████████| 400/400 [36:42<00:00,  5.51s/it, loss=0.0788] 


In [166]:
ner_dataset

Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,tokenized_input,Labels,Adjusted Labels,features,Word List,Predicted Labels,Tokens
0,Sentence: 22048,"['The', 'report', 'calls', 'on', 'President', ...","['DT', 'NN', 'VBZ', 'IN', 'NNP', 'NNP', 'CC', ...","['O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'B...",The report calls on President Bush and Congres...,"[input_ids, token_type_ids, attention_mask]","[O, O, O, O, B-per, I-per, O, B-org, O, O, B-g...","[-100, 0, 0, 0, 0, 13, 3, 0, 14, 0, 0, 0, 2, 0...","{'input_ids': [tensor(101), tensor(10117), ten...","[The, report, calls, on, President, Bush, and,...","[O, O, O, O, O, B-per, I-per, O, B-org, O, O, ...","[The, report, calls, on, President, Bush, and,..."
1,Sentence: 1273,"['The', 'construction', 'on', 'the', ""Baku-T'b...","['DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NN', ','...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",The construction on the Baku-T'bilisi-Ceyhan o...,"[input_ids, token_type_ids, attention_mask]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'input_ids': [tensor(101), tensor(10117), ten...","[The, construction, on, the, Baku-T'bilisi-Cey...","[O, O, O, O, O, B-org, O, O, O, O, O, O, O, O,...","[The, construction, on, the, Baku, -, T, ', bi..."
2,Sentence: 1541,"['The', 'pact', 'was', 'initially', 'approved'...","['DT', 'NN', 'VBD', 'RB', 'VBN', 'IN', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pe...",The pact was initially approved after discussi...,"[input_ids, token_type_ids, attention_mask]","[O, O, O, O, O, O, O, O, B-per, I-per, O, B-gp...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 3, 0, 2,...","{'input_ids': [tensor(101), tensor(10117), ten...","[The, pact, was, initially, approved, after, d...","[O, O, O, O, O, O, O, O, O, O, B-per, I-per, O...","[The, pa, ##ct, was, initially, approved, afte..."
3,Sentence: 41443,"['Zelenovic', 'had', 'lived', 'in', 'Khanty-Ma...","['NNP', 'VBD', 'VBN', 'IN', 'NNP', ',', 'DT', ...","['B-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O...","Zelenovic had lived in Khanty-Mansiisk , some ...","[input_ids, token_type_ids, attention_mask]","[B-per, O, O, O, B-geo, O, O, O, O, O, O, B-ge...","[-100, 13, 13, 13, 0, 0, 0, 6, 6, 0, 0, 0, 0, ...","{'input_ids': [tensor(101), tensor(15536), ten...","[Zelenovic, had, lived, in, Khanty-Mansiisk, ,...","[O, B-per, B-per, B-per, O, O, O, B-geo, B-geo...","[Ze, ##leno, ##vic, had, lived, in, Khan, ##ty..."
4,Sentence: 18642,"['Exports', 'have', 'grown', 'significantly', ...","['NNS', 'VBP', 'VBN', 'RB', 'IN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Exports have grown significantly because of th...,"[input_ids, token_type_ids, attention_mask]","[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'input_ids': [tensor(101), tensor(108268), te...","[Exports, have, grown, significantly, because,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-o...","[Export, ##s, have, grown, significantly, beca..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Sentence: 21531,"['A', 'Somali', 'government', 'spokesman', 'sa...","['DT', 'JJ', 'NN', 'NN', 'VBZ', 'DT', 'JJ', 'N...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",A Somali government spokesman says some coasta...,"[input_ids, token_type_ids, attention_mask]","[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[-100, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'input_ids': [tensor(101), tensor(138), tenso...","[A, Somali, government, spokesman, says, some,...","[O, O, B-gpe, O, O, O, O, O, O, O, O, O, O, O,...","[A, Somali, government, spoke, ##sman, says, s..."
1996,Sentence: 40507,"['Israel', 'has', 'raided', 'the', 'camp', 're...","['NNP', 'VBZ', 'VBN', 'DT', 'NN', 'RB', 'TO', ...","['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Israel has raided the camp repeatedly to kill ...,"[input_ids, token_type_ids, attention_mask]","[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[-100, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'input_ids': [tensor(101), tensor(12991), ten...","[Israel, has, raided, the, camp, repeatedly, t...","[O, B-geo, O, O, O, O, O, O, O, O, O, O, O, O,...","[Israel, has, raid, ##ed, the, camp, repeatedl..."
1997,Sentence: 4093,"['The', 'government', 'puts', 'the', 'figure',...","['DT', 'NN', 'VBZ', 'DT', 'NN', 'IN', 'IN', 'C...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O']",The government puts the figure at around 50 .,"[input_ids, token_type_ids, attention_mask]","[O, O, O, O, O, O, O, B-tim, O]","[-100, 0, 0, 0, 0, 0, 0, 0, 7, 0, -100, -100, ...","{'input_ids': [tensor(101), tensor(10117), ten...","[The, government, puts, the, figure, at, aroun...","[O, O, O, O, O, O, O, O, O, O, O]","[The, government, puts, the, figure, at, aroun..."
1998,Sentence: 30109,"['Last', 'week', ',', 'diplomats', 'at', 'an',...","['JJ', 'NN', ',', 'NNS', 'IN', 'DT', 'NNP', 'N...","['O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org...","Last week , diplomats at an International Atom...","[input_ids, token_type_ids, attention_mask]","[O, O, O, O, O, O, B-org, I-org, I-org, I-org,...","[-100, 0, 0, 0, 0, 0, 0, 0, 14, 8, 8, 8, 0, 0,...","{'input_ids': [tensor(101), tensor(14812), ten...","[Last, week, ,, diplomats, at, an, Internation...","[O, O, O, O, O, O, O, O, B-org, I-org, I-org, ...","[Last, week, ,, diplomat, ##s, at, an, Interna..."


Now will predict the labels for training and test dataset. 

In [72]:
def get_prediction(Sentence):
    # Step 1: Tokenization
    tokens = tokenizer.tokenize(Sentence)
    inputs = tokenizer_fast(Sentence, return_tensors="pt")
    # Step 2: Inference
    with torch.no_grad():
        outputs = model(**inputs)
    import torch.nn.functional as F

    # Assuming your TokenClassifierOutput object is named 'output'
    logits = outputs.logits

    # Apply softmax to get probabilities
    probs = F.softmax(logits, dim=-1)

    # Get the predicted labels (classes with highest probability)
    predicted_labels = torch.argmax(probs, dim=-1)

    # Convert predicted_labels to a list if necessary
    predicted_labels = predicted_labels.tolist()
    labels = [id2label[p] for p in predicted_labels[0]]
    return labels

In [73]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
968,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten..."
240,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten..."
819,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten..."
692,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten..."
420,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten..."
...,...,...,...,...,...,...,...,...,...,...
1130,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten..."
1294,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten..."
860,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten..."
1459,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten..."


In [74]:
test_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
1860,1860,Sentence: 22260,"[Rangoon, has, faced, growing, international, ...","['NNP', 'VBZ', 'VBN', 'VBG', 'JJ', 'NNS', 'TO'...","['B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Rangoon has faced growing international demand...,"[B-per, O, O, O, O, O, O, O, O, O, B-org, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(28221), ten..."
353,353,Sentence: 35257,"[In, Tehran, Mr., Rowhani, is, quoted, as, say...","['IN', 'NNP', ',', 'NNP', 'NNP', 'VBZ', 'VBN',...","['O', 'B-geo', 'O', 'B-per', 'I-per', 'O', 'O'...",In Tehran Mr Rowhani is quoted as saying Iran...,"[O, B-geo, B-per, I-per, O, O, O, O, B-geo, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 9, 14, 7, 7, 11, 11, 11, 11, 9, 11,...","{'input_ids': [tensor(101), tensor(10167), ten..."
1333,1333,Sentence: 36898,"[Russian, environmental, officials, have, dete...","['JJ', 'JJ', 'NNS', 'VBP', 'VBN', 'DT', 'NN', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Russian environmental officials have detected ...,"[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(13463), ten..."
905,905,Sentence: 12595,"[Coach, went, out, and, set, up, our, new, pit...","['NNP', 'VBD', 'RB', 'CC', 'VB', 'RP', 'PRP$',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Coach went out and set up our new pitching mac...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(38345), ten..."
1289,1289,Sentence: 41688,"[Gazprom, threatened, to, halt, gas, deliverie...","['NNP', 'VBD', 'TO', 'VB', 'NN', 'NNS', 'TO', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Gazprom threatened to halt gas deliveries to U...,"[B-org, O, O, O, O, O, O, O, O, B-tim, O, O, B...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(69699), ten..."
...,...,...,...,...,...,...,...,...,...,...
965,965,Sentence: 29660,"[Tehran, says, it, wants, to, produce, low-gra...","['NNP', 'VBZ', 'PRP', 'VBZ', 'TO', 'VB', 'JJ',...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Tehran says it wants to produce lowgrade fuel ...,"[B-org, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(73520), ten..."
1284,1284,Sentence: 17008,"[Palestinian, witnesses, say, Israeli, forces,...","['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O...",Palestinian witnesses say Israeli forces have ...,"[B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gpe, ...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 16, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(56119), ten..."
1739,1739,Sentence: 17329,"[Xinhua, news, agency, quoted, zoo, official, ...","['NNP', 'NN', 'NN', 'VBN', 'NN', 'NN', 'NNP', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I...",Xinhua news agency quoted zoo official Zhang J...,"[B-org, O, O, O, O, O, B-per, I-per, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 11, 11, 11, 11, 11, 14, 7, 7, ...","{'input_ids': [tensor(101), tensor(59876), ten..."
261,261,Sentence: 46053,"[However, protesters, in, Iran, Friday, threw,...","['RB', 'NNS', 'IN', 'NNP', 'NNP', 'VBD', 'NNS'...","['O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'O', 'O...",However protesters in Iran Friday threw bricks...,"[O, O, O, B-geo, B-tim, O, O, O, O, O, O, B-or...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 2, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(12209), ten..."


In [75]:
train_ner=train_ner.reset_index(drop=True)

In [76]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
0,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten..."
1,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten..."
2,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten..."
3,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten..."
4,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten..."
...,...,...,...,...,...,...,...,...,...,...
1595,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten..."
1596,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten..."
1597,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten..."
1598,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten..."


In [77]:
test_ner=test_ner.reset_index(drop=True)

In [78]:
test_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
0,1860,Sentence: 22260,"[Rangoon, has, faced, growing, international, ...","['NNP', 'VBZ', 'VBN', 'VBG', 'JJ', 'NNS', 'TO'...","['B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Rangoon has faced growing international demand...,"[B-per, O, O, O, O, O, O, O, O, O, B-org, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(28221), ten..."
1,353,Sentence: 35257,"[In, Tehran, Mr., Rowhani, is, quoted, as, say...","['IN', 'NNP', ',', 'NNP', 'NNP', 'VBZ', 'VBN',...","['O', 'B-geo', 'O', 'B-per', 'I-per', 'O', 'O'...",In Tehran Mr Rowhani is quoted as saying Iran...,"[O, B-geo, B-per, I-per, O, O, O, O, B-geo, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 9, 14, 7, 7, 11, 11, 11, 11, 9, 11,...","{'input_ids': [tensor(101), tensor(10167), ten..."
2,1333,Sentence: 36898,"[Russian, environmental, officials, have, dete...","['JJ', 'JJ', 'NNS', 'VBP', 'VBN', 'DT', 'NN', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Russian environmental officials have detected ...,"[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(13463), ten..."
3,905,Sentence: 12595,"[Coach, went, out, and, set, up, our, new, pit...","['NNP', 'VBD', 'RB', 'CC', 'VB', 'RP', 'PRP$',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Coach went out and set up our new pitching mac...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(38345), ten..."
4,1289,Sentence: 41688,"[Gazprom, threatened, to, halt, gas, deliverie...","['NNP', 'VBD', 'TO', 'VB', 'NN', 'NNS', 'TO', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Gazprom threatened to halt gas deliveries to U...,"[B-org, O, O, O, O, O, O, O, O, B-tim, O, O, B...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(69699), ten..."
...,...,...,...,...,...,...,...,...,...,...
395,965,Sentence: 29660,"[Tehran, says, it, wants, to, produce, low-gra...","['NNP', 'VBZ', 'PRP', 'VBZ', 'TO', 'VB', 'JJ',...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Tehran says it wants to produce lowgrade fuel ...,"[B-org, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(73520), ten..."
396,1284,Sentence: 17008,"[Palestinian, witnesses, say, Israeli, forces,...","['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O...",Palestinian witnesses say Israeli forces have ...,"[B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gpe, ...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 16, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(56119), ten..."
397,1739,Sentence: 17329,"[Xinhua, news, agency, quoted, zoo, official, ...","['NNP', 'NN', 'NN', 'VBN', 'NN', 'NN', 'NNP', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I...",Xinhua news agency quoted zoo official Zhang J...,"[B-org, O, O, O, O, O, B-per, I-per, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 11, 11, 11, 11, 11, 14, 7, 7, ...","{'input_ids': [tensor(101), tensor(59876), ten..."
398,261,Sentence: 46053,"[However, protesters, in, Iran, Friday, threw,...","['RB', 'NNS', 'IN', 'NNP', 'NNP', 'VBD', 'NNS'...","['O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'O', 'O...",However protesters in Iran Friday threw bricks...,"[O, O, O, B-geo, B-tim, O, O, O, O, O, O, B-or...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 2, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(12209), ten..."


In [79]:
train_ner["Predicted Labels"]=train_ner["Sentence"].apply(get_prediction)
test_ner["Predicted Labels"]=test_ner["Sentence"].apply(get_prediction)

In [83]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features,Predicted Labels
0,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten...","[O, O, B-org, I-org, I-org, I-org, B-per, B-pe..."
1,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten...","[O, O, O, O, O, B-geo, B-geo, O, O, O, O, O, O..."
2,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten...","[O, O, O, B-geo, O, O, O, O, O, O, B-geo, O, O..."
3,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten...","[O, B-org, O, O, O, O, O, O, O, O, O, O, O, O,..."
4,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten...","[O, O, B-gpe, O, O, O, O, O, O, B-org, I-org, ..."
...,...,...,...,...,...,...,...,...,...,...,...
1595,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1596,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten...","[O, O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O..."
1597,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten...","[O, O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O..."
1598,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten...","[O, B-per, O, B-geo, O, O, O, O, O, B-per, I-p..."


The length of predicted labels will not be same with the no of words in the sentence. So we need the tokens of each sentence.

In [84]:
def get_tokens(Sentence):
    # Step 1: Tokenization
    tokens = tokenizer.tokenize(Sentence)
    return tokens

train_ner["Tokens"]=train_ner["Sentence"].apply(get_tokens)
test_ner["Tokens"]=test_ner["Sentence"].apply(get_tokens)

In [86]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features,Predicted Labels,Tokens
0,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten...","[O, O, B-org, I-org, I-org, I-org, B-per, B-pe...","[Meanwhile, US, Secretary, of, State, Con, ##d..."
1,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten...","[O, O, O, O, O, B-geo, B-geo, O, O, O, O, O, O...","[Hours, later, in, southern, Za, ##bul, provin..."
2,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten...","[O, O, O, B-geo, O, O, O, O, O, O, B-geo, O, O...","[Police, in, Iraq, have, found, 14, bodies, in..."
3,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten...","[O, B-org, O, O, O, O, O, O, O, O, O, O, O, O,...","[Military, officials, say, security, forces, c..."
4,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten...","[O, O, B-gpe, O, O, O, O, O, O, B-org, I-org, ...","[The, French, are, trying, to, win, their, thi..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Official, ##s, say, they, found, bomb, ##maki..."
1596,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten...","[O, O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O...","[They, plan, to, meet, with, leaders, of, Paki..."
1597,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten...","[O, O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O...","[In, a, separate, incident, today, authorities..."
1598,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten...","[O, B-per, O, B-geo, O, O, O, O, O, B-per, I-p...","[Lima, says, Tokyo, is, inter, ##feri, ##ng, i..."


In [87]:
test_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features,Predicted Labels,Tokens
0,1860,Sentence: 22260,"[Rangoon, has, faced, growing, international, ...","['NNP', 'VBZ', 'VBN', 'VBG', 'JJ', 'NNS', 'TO'...","['B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Rangoon has faced growing international demand...,"[B-per, O, O, O, O, O, O, O, O, O, B-org, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(28221), ten...","[O, B-per, B-org, O, O, O, O, O, O, O, O, O, B...","[Rang, ##oon, has, faced, growing, internation..."
1,353,Sentence: 35257,"[In, Tehran, Mr., Rowhani, is, quoted, as, say...","['IN', 'NNP', ',', 'NNP', 'NNP', 'VBZ', 'VBN',...","['O', 'B-geo', 'O', 'B-per', 'I-per', 'O', 'O'...",In Tehran Mr Rowhani is quoted as saying Iran...,"[O, B-geo, B-per, I-per, O, O, O, O, B-geo, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 9, 14, 7, 7, 11, 11, 11, 11, 9, 11,...","{'input_ids': [tensor(101), tensor(10167), ten...","[O, O, B-geo, B-per, I-per, I-per, O, O, O, O,...","[In, Tehran, Mr, Row, ##hani, is, quoted, as, ..."
2,1333,Sentence: 36898,"[Russian, environmental, officials, have, dete...","['JJ', 'JJ', 'NNS', 'VBP', 'VBN', 'DT', 'NN', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Russian environmental officials have detected ...,"[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(13463), ten...","[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,...","[Russian, environmental, officials, have, det,..."
3,905,Sentence: 12595,"[Coach, went, out, and, set, up, our, new, pit...","['NNP', 'VBD', 'RB', 'CC', 'VB', 'RP', 'PRP$',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Coach went out and set up our new pitching mac...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(38345), ten...","[O, B-per, O, O, O, O, O, O, O, O, O, O, O, O,...","[Coach, went, out, and, set, up, our, new, pit..."
4,1289,Sentence: 41688,"[Gazprom, threatened, to, halt, gas, deliverie...","['NNP', 'VBD', 'TO', 'VB', 'NN', 'NNS', 'TO', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Gazprom threatened to halt gas deliveries to U...,"[B-org, O, O, O, O, O, O, O, O, B-tim, O, O, B...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(69699), ten...","[O, B-org, B-org, B-org, B-org, O, O, O, O, O,...","[Ga, ##z, ##pro, ##m, threatened, to, halt, ga..."
...,...,...,...,...,...,...,...,...,...,...,...,...
395,965,Sentence: 29660,"[Tehran, says, it, wants, to, produce, low-gra...","['NNP', 'VBZ', 'PRP', 'VBZ', 'TO', 'VB', 'JJ',...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Tehran says it wants to produce lowgrade fuel ...,"[B-org, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(73520), ten...","[O, B-org, O, O, O, O, O, O, O, O, O, O, O, O,...","[Tehran, says, it, wants, to, produce, low, ##..."
396,1284,Sentence: 17008,"[Palestinian, witnesses, say, Israeli, forces,...","['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O...",Palestinian witnesses say Israeli forces have ...,"[B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gpe, ...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 16, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(56119), ten...","[O, B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gp...","[Palestinian, witnesses, say, Israeli, forces,..."
397,1739,Sentence: 17329,"[Xinhua, news, agency, quoted, zoo, official, ...","['NNP', 'NN', 'NN', 'VBN', 'NN', 'NN', 'NNP', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I...",Xinhua news agency quoted zoo official Zhang J...,"[B-org, O, O, O, O, O, B-per, I-per, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 11, 11, 11, 11, 11, 14, 7, 7, ...","{'input_ids': [tensor(101), tensor(59876), ten...","[O, B-org, B-org, B-org, O, O, O, O, O, B-per,...","[Xi, ##nh, ##ua, news, agency, quoted, zoo, of..."
398,261,Sentence: 46053,"[However, protesters, in, Iran, Friday, threw,...","['RB', 'NNS', 'IN', 'NNP', 'NNP', 'VBD', 'NNS'...","['O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'O', 'O...",However protesters in Iran Friday threw bricks...,"[O, O, O, B-geo, B-tim, O, O, O, O, O, O, B-or...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 2, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(12209), ten...","[O, O, O, O, O, B-geo, B-tim, O, O, O, O, O, O...","[However, protest, ##ers, in, Iran, Friday, th..."


In [93]:
train_ner.to_csv("NER_Predicted_train.csv")

In [94]:
test_ner.to_csv("NER_Predicted_test.csv")

In [107]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features,Predicted Labels,Tokens
0,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten...","[O, O, B-org, I-org, I-org, I-org, B-per, B-pe...","[Meanwhile, US, Secretary, of, State, Con, ##d..."
1,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten...","[O, O, O, O, O, B-geo, B-geo, O, O, O, O, O, O...","[Hours, later, in, southern, Za, ##bul, provin..."
2,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten...","[O, O, O, B-geo, O, O, O, O, O, O, B-geo, O, O...","[Police, in, Iraq, have, found, 14, bodies, in..."
3,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten...","[O, B-org, O, O, O, O, O, O, O, O, O, O, O, O,...","[Military, officials, say, security, forces, c..."
4,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten...","[O, O, B-gpe, O, O, O, O, O, O, B-org, I-org, ...","[The, French, are, trying, to, win, their, thi..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Official, ##s, say, they, found, bomb, ##maki..."
1596,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten...","[O, O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O...","[They, plan, to, meet, with, leaders, of, Paki..."
1597,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten...","[O, O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O...","[In, a, separate, incident, today, authorities..."
1598,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten...","[O, B-per, O, B-geo, O, O, O, O, O, B-per, I-p...","[Lima, says, Tokyo, is, inter, ##feri, ##ng, i..."


In [199]:
s=0

for i in range(len(train_ner)):
    if(len(train_ner["Sentence"][i].split())!=len(train_ner["Labels"][i])):
        print("Labels Mismatch")
        print(i)
        print(len(train_ner["Sentence"][i].split()))
        print(len(train_ner["Labels"][i]))
        print(len(train_ner["Word"][i]))
        print(train_ner["Sentence"][i].split())
        print(train_ner["Word"][i])
        s=s+1
        print("\n")
        
print("No of mismatches:",s)

No of mismatches: 0


In [128]:
len(train_df_sample)

10

We will find the predicted labels.

In [163]:
import torch.nn.functional as F
from itertools import chain, repeat
import time

def process_data(dataset, tokenizer_fast, model, id2label):
    # Record start time
    start_time = time.time()

    total_tokens_list = []
    total_word_ids = []
    total_words = []
    total_predicted_labels = []
    total_actual_labels = []
    
    for i in range(len(dataset)):
        tokens = tokenizer_fast.encode_plus(dataset["Sentence"][i],
                                             max_length=32,  # Set your desired maximum sequence length
                                             padding='max_length',
                                             truncation=True,
                                             return_tensors='pt',
                                             return_attention_mask=True)
        word_ids_list = tokens.word_ids(batch_index=0)
        word_ids_notna = list(set([word for word in word_ids_list if word is not None]))
        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        token_type_ids = tokens['token_type_ids']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Apply softmax to get probabilities
        probs = F.softmax(outputs.logits, dim=-1)

        # Get the predicted labels (classes with highest probability)
        predicted_labels = torch.argmax(probs, dim=-1)
        # Convert predicted_labels to a list if necessary
        predicted_labels = predicted_labels.tolist()
        predicted_labels_list = [id2label[i] for i in predicted_labels[0]]
        tokens_list = tokenizer_fast.convert_ids_to_tokens(input_ids.squeeze())
        wordid2label = {"word_id": word_ids_list, "predicted_labels": predicted_labels_list, "Tokens": tokens_list}
        words_list = dataset["Sentence"][i].split()
        df = pd.DataFrame(wordid2label)
        words = []
        actual_labels = []
        for j in range(len(df)):
            if pd.isna(df.iloc[j, 0]):
                words.append("NA")
                actual_labels.append("NA")
            else:
                words.append(words_list[int(df.iloc[j, 0])])
                actual_labels.append(dataset["Labels"][i][int(df.iloc[j, 0])])
        total_tokens_list += tokens_list
        total_word_ids += word_ids_list
        total_words += words
        total_predicted_labels += predicted_labels_list
        total_actual_labels += actual_labels

    return total_tokens_list, total_word_ids, total_words, total_predicted_labels, total_actual_labels

In [172]:
train_ner=train_ner.reset_index(drop=True)

In [173]:
train_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
0,968,Sentence: 41313,"[Meanwhile, U.S., Secretary, of, State, Condol...","['RB', ',', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', ...","['O', 'O', 'B-org', 'O', 'O', 'B-org', 'B-per'...",Meanwhile US Secretary of State Condoleezza R...,"[O, B-org, O, O, B-org, B-per, I-per, O, O, B-...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 1, 11, 11, 1, 14, 14, 14, 14, 7, 11...","{'input_ids': [tensor(101), tensor(43309), ten..."
1,240,Sentence: 15886,"[Hours, later, in, southern, Zabul, province, ...","['NNS', 'RB', ',', 'IN', 'JJ', 'NNP', 'NN', ',...","['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', '...",Hours later in southern Zabul province a sim...,"[O, O, O, O, B-geo, O, O, O, O, O, O, B-gpe, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 9, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(60555), ten..."
2,819,Sentence: 23744,"[Police, in, Iraq, have, found, 14, bodies, in...","['NNS', 'IN', 'NNP', 'VBP', 'VBN', 'CD', 'NNS'...","['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', '...",Police in Iraq have found 14 bodies in norther...,"[O, O, B-geo, O, O, O, O, O, O, B-geo, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 9, 11, 11, 11, 11, 11, 11, 9, 1...","{'input_ids': [tensor(101), tensor(18051), ten..."
3,692,Sentence: 31835,"[Military, officials, say, security, forces, c...","['JJ', 'NNS', 'VBP', 'NN', 'NNS', 'VBN', 'CD',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Military officials say security forces capture...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(19425), ten..."
4,420,Sentence: 24700,"[The, French, are, trying, to, win, their, thi...","['DT', 'NNS', 'VBP', 'VBG', 'TO', 'VB', 'PRP$'...","['O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', '...",The French are trying to win their third Fed C...,"[O, B-gpe, O, O, O, O, O, O, B-org, I-org, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 16, 11, 11, 11, 11, 11, 11, 1, 6, 1...","{'input_ids': [tensor(101), tensor(10117), ten..."
...,...,...,...,...,...,...,...,...,...,...
1595,1130,Sentence: 32541,"[Officials, say, they, found, bomb-making, mat...","['NNS', 'VBP', 'PRP', 'VBD', 'JJ', 'NNS', 'IN'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Officials say they found bombmaking materials ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(17678), ten..."
1596,1294,Sentence: 15134,"[They, plan, to, meet, with, leaders, of, Paki...","['PRP', 'VBP', 'TO', 'VB', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', '...",They plan to meet with leaders of Pakistani Ka...,"[O, O, O, O, O, O, O, B-gpe, B-geo, O, O, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 16, 9, 11, ...","{'input_ids': [tensor(101), tensor(11696), ten..."
1597,860,Sentence: 38008,"[In, a, separate, incident, today, authorities...","['IN', 'DT', 'JJ', 'NN', 'NN', ',', 'NNS', 'VB...","['O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', '...",In a separate incident today authorities said...,"[O, O, O, O, B-tim, O, O, O, O, O, B-gpe, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 2, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(10167), ten..."
1598,1459,Sentence: 30686,"[Lima, says, Tokyo, is, interfering, in, Mr., ...","['NNP', 'VBZ', 'NNP', 'VBZ', 'VBG', 'IN', 'NNP...","['B-per', 'O', 'B-geo', 'O', 'O', 'O', 'B-per'...",Lima says Tokyo is interfering in Mr Fujimori ...,"[B-per, O, B-geo, O, O, O, B-per, B-geo, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 11, 9, 11, 11, 11, 11, 11, 14, 9, 9...","{'input_ids': [tensor(101), tensor(18671), ten..."


We will get the words, tokens and along with it's actual and predicted labels in one dataset. We are doing this for training data now.

In [174]:
start_time = time.time()

total_sentence_id = list(chain.from_iterable(repeat(i, 32) for i in range(1, len(train_ner))))

total_tokens_list_train, \
total_word_ids_train, \
total_words_train, \
total_predicted_labels_train, \
total_actual_labels_train = \
process_data(train_ner, tokenizer_fast, model, id2label)

dict_train = {
    "Sentence_id": total_tokens_list_train,
    "word_id": total_word_ids_train,
    "predicted_labels": total_predicted_labels_train,
    "Tokens": total_tokens_list_train,
    "Words": total_words_train,
    "actual_labels": total_actual_labels_train
}

total_df_train = pd.DataFrame(dict_train)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print("Execution time:", elapsed_time, "seconds")

Execution time: 544.8418350219727 seconds


In [180]:
print("Execution time:", elapsed_time/60, "minutes")

Execution time: 9.08069725036621 minutes


In [175]:
total_df_train

Unnamed: 0,Sentence_id,word_id,predicted_labels,Tokens,Words,actual_labels
0,[CLS],,O,[CLS],,
1,Meanwhile,0.0,O,Meanwhile,Meanwhile,O
2,US,1.0,B-org,US,US,B-org
3,Secretary,2.0,I-org,Secretary,Secretary,O
4,of,3.0,I-org,of,of,O
...,...,...,...,...,...,...
51195,[SEP],,O,[SEP],,
51196,[PAD],,I-org,[PAD],,
51197,[PAD],,I-org,[PAD],,
51198,[PAD],,I-org,[PAD],,


In [193]:
total_df_train.to_csv("Predicted_Total_train.csv")

Now we have to adjust the labels according to words so that for each word there will be one predicted entity.

In [176]:
import pandas as pd

def extract_data_to_dataframe(total_df):
    prev_id = -1
    word_list_2 = []
    actual_labels_list_2 = []
    sentence_id_list_2 = []
    predicted_labels_list_2 = []
    
    for i in range(len(total_df)):
        if (total_df["word_id"][i] != prev_id) and pd.notna(total_df["word_id"][i]):
            word_list_2.append(total_df["Words"][i])
            actual_labels_list_2.append(total_df["actual_labels"][i])
            predicted_labels_list_2.append(total_df["predicted_labels"][i])
            sentence_id_list_2.append(total_df["Sentence_id"][i])
            prev_id = total_df["word_id"][i]
    
    dict = {"Sentence ID": sentence_id_list_2, 
            "Words": word_list_2, 
            "Actual Labels": actual_labels_list_2, 
            "Predicted Labels": predicted_labels_list_2}
    
    df = pd.DataFrame(dict)
    return df

In [177]:
train_df_predicted = extract_data_to_dataframe(total_df_train)

In [178]:
train_df_predicted

Unnamed: 0,Sentence ID,Words,Actual Labels,Predicted Labels
0,Meanwhile,Meanwhile,O,O
1,US,US,B-org,B-org
2,Secretary,Secretary,O,I-org
3,of,of,O,I-org
4,State,State,B-org,I-org
...,...,...,...,...
30489,single,singleday,O,O
30490,record,record,O,O
30491,for,for,O,O
30492,the,the,O,O


In [194]:
train_df_predicted.to_csv("NER_train_predicted.csv")

In [179]:
len(train_df_predicted[train_df_predicted["Actual Labels"]==train_df_predicted["Predicted Labels"]])

29695

Now we will find the evaluation metric of the model.

In [181]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Example true labels and predicted labels
true_labels_train = train_df_predicted["Actual Labels"].tolist()
predicted_labels_train = train_df_predicted["Predicted Labels"].tolist()

# Calculate precision, recall, accuracy, and F1 score
accuracy_train = accuracy_score(true_labels_train, predicted_labels_train)
precision_train = precision_score(true_labels_train, predicted_labels_train,average="micro")
recall_train = recall_score(true_labels_train, predicted_labels_train,average="micro")
f1_train = f1_score(true_labels_train, predicted_labels_train,average="micro")

print("Training Accuracy:", accuracy_train)
print("Training Precision:", precision_train)
print("Training Recall:", recall_train)
print("Training F1 Score:", f1_train)

Training Accuracy: 0.9737981242211583
Training Precision: 0.9737981242211583
Training Recall: 0.9737981242211583
Training F1 Score: 0.9737981242211583


In [183]:
test_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
1860,1860,Sentence: 22260,"[Rangoon, has, faced, growing, international, ...","['NNP', 'VBZ', 'VBN', 'VBG', 'JJ', 'NNS', 'TO'...","['B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Rangoon has faced growing international demand...,"[B-per, O, O, O, O, O, O, O, O, O, B-org, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(28221), ten..."
353,353,Sentence: 35257,"[In, Tehran, Mr., Rowhani, is, quoted, as, say...","['IN', 'NNP', ',', 'NNP', 'NNP', 'VBZ', 'VBN',...","['O', 'B-geo', 'O', 'B-per', 'I-per', 'O', 'O'...",In Tehran Mr Rowhani is quoted as saying Iran...,"[O, B-geo, B-per, I-per, O, O, O, O, B-geo, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 9, 14, 7, 7, 11, 11, 11, 11, 9, 11,...","{'input_ids': [tensor(101), tensor(10167), ten..."
1333,1333,Sentence: 36898,"[Russian, environmental, officials, have, dete...","['JJ', 'JJ', 'NNS', 'VBP', 'VBN', 'DT', 'NN', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Russian environmental officials have detected ...,"[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(13463), ten..."
905,905,Sentence: 12595,"[Coach, went, out, and, set, up, our, new, pit...","['NNP', 'VBD', 'RB', 'CC', 'VB', 'RP', 'PRP$',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Coach went out and set up our new pitching mac...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(38345), ten..."
1289,1289,Sentence: 41688,"[Gazprom, threatened, to, halt, gas, deliverie...","['NNP', 'VBD', 'TO', 'VB', 'NN', 'NNS', 'TO', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Gazprom threatened to halt gas deliveries to U...,"[B-org, O, O, O, O, O, O, O, O, B-tim, O, O, B...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(69699), ten..."
...,...,...,...,...,...,...,...,...,...,...
965,965,Sentence: 29660,"[Tehran, says, it, wants, to, produce, low-gra...","['NNP', 'VBZ', 'PRP', 'VBZ', 'TO', 'VB', 'JJ',...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Tehran says it wants to produce lowgrade fuel ...,"[B-org, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(73520), ten..."
1284,1284,Sentence: 17008,"[Palestinian, witnesses, say, Israeli, forces,...","['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O...",Palestinian witnesses say Israeli forces have ...,"[B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gpe, ...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 16, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(56119), ten..."
1739,1739,Sentence: 17329,"[Xinhua, news, agency, quoted, zoo, official, ...","['NNP', 'NN', 'NN', 'VBN', 'NN', 'NN', 'NNP', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I...",Xinhua news agency quoted zoo official Zhang J...,"[B-org, O, O, O, O, O, B-per, I-per, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 11, 11, 11, 11, 11, 14, 7, 7, ...","{'input_ids': [tensor(101), tensor(59876), ten..."
261,261,Sentence: 46053,"[However, protesters, in, Iran, Friday, threw,...","['RB', 'NNS', 'IN', 'NNP', 'NNP', 'VBD', 'NNS'...","['O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'O', 'O...",However protesters in Iran Friday threw bricks...,"[O, O, O, B-geo, B-tim, O, O, O, O, O, O, B-or...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 2, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(12209), ten..."


In [184]:
test_ner=test_ner.reset_index(drop=True)

In [185]:
test_ner

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Word,POS,Tag,Sentence,Labels,tokenized_input,Adjusted Labels,features
0,1860,Sentence: 22260,"[Rangoon, has, faced, growing, international, ...","['NNP', 'VBZ', 'VBN', 'VBG', 'JJ', 'NNS', 'TO'...","['B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Rangoon has faced growing international demand...,"[B-per, O, O, O, O, O, O, O, O, O, B-org, O, O...","[input_ids, token_type_ids, attention_mask]","[-100, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(28221), ten..."
1,353,Sentence: 35257,"[In, Tehran, Mr., Rowhani, is, quoted, as, say...","['IN', 'NNP', ',', 'NNP', 'NNP', 'VBZ', 'VBN',...","['O', 'B-geo', 'O', 'B-per', 'I-per', 'O', 'O'...",In Tehran Mr Rowhani is quoted as saying Iran...,"[O, B-geo, B-per, I-per, O, O, O, O, B-geo, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 9, 14, 7, 7, 11, 11, 11, 11, 9, 11,...","{'input_ids': [tensor(101), tensor(10167), ten..."
2,1333,Sentence: 36898,"[Russian, environmental, officials, have, dete...","['JJ', 'JJ', 'NNS', 'VBP', 'VBN', 'DT', 'NN', ...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Russian environmental officials have detected ...,"[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(13463), ten..."
3,905,Sentence: 12595,"[Coach, went, out, and, set, up, our, new, pit...","['NNP', 'VBD', 'RB', 'CC', 'VB', 'RP', 'PRP$',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Coach went out and set up our new pitching mac...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(38345), ten..."
4,1289,Sentence: 41688,"[Gazprom, threatened, to, halt, gas, deliverie...","['NNP', 'VBD', 'TO', 'VB', 'NN', 'NNS', 'TO', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Gazprom threatened to halt gas deliveries to U...,"[B-org, O, O, O, O, O, O, O, O, B-tim, O, O, B...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(69699), ten..."
...,...,...,...,...,...,...,...,...,...,...
395,965,Sentence: 29660,"[Tehran, says, it, wants, to, produce, low-gra...","['NNP', 'VBZ', 'PRP', 'VBZ', 'TO', 'VB', 'JJ',...","['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",Tehran says it wants to produce lowgrade fuel ...,"[B-org, O, O, O, O, O, O, O, O, O, O, O, O]","[input_ids, token_type_ids, attention_mask]","[-100, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...","{'input_ids': [tensor(101), tensor(73520), ten..."
396,1284,Sentence: 17008,"[Palestinian, witnesses, say, Israeli, forces,...","['JJ', 'NNS', 'VBP', 'JJ', 'NNS', 'VBP', 'VBN'...","['B-gpe', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O...",Palestinian witnesses say Israeli forces have ...,"[B-gpe, O, O, B-gpe, O, O, O, O, O, O, B-gpe, ...","[input_ids, token_type_ids, attention_mask]","[-100, 16, 11, 11, 16, 11, 11, 11, 11, 11, 11,...","{'input_ids': [tensor(101), tensor(56119), ten..."
397,1739,Sentence: 17329,"[Xinhua, news, agency, quoted, zoo, official, ...","['NNP', 'NN', 'NN', 'VBN', 'NN', 'NN', 'NNP', ...","['B-org', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I...",Xinhua news agency quoted zoo official Zhang J...,"[B-org, O, O, O, O, O, B-per, I-per, O, O, O, ...","[input_ids, token_type_ids, attention_mask]","[-100, 1, 1, 1, 11, 11, 11, 11, 11, 14, 7, 7, ...","{'input_ids': [tensor(101), tensor(59876), ten..."
398,261,Sentence: 46053,"[However, protesters, in, Iran, Friday, threw,...","['RB', 'NNS', 'IN', 'NNP', 'NNP', 'VBD', 'NNS'...","['O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'O', 'O...",However protesters in Iran Friday threw bricks...,"[O, O, O, B-geo, B-tim, O, O, O, O, O, O, B-or...","[input_ids, token_type_ids, attention_mask]","[-100, 11, 11, 11, 11, 9, 2, 11, 11, 11, 11, 1...","{'input_ids': [tensor(101), tensor(12209), ten..."


In [188]:
start_time = time.time()

total_sentence_id_test = list(chain.from_iterable(repeat(i, 32) for i in range(1, (len(test_ner)+1))))

total_tokens_list_test, \
total_word_ids_test, \
total_words_test, \
total_predicted_labels_test, \
total_actual_labels_test = \
process_data(test_ner, tokenizer_fast, model, id2label)

dict_test = {
    "Sentence_id": total_sentence_id_test,
    "word_id": total_word_ids_test,
    "predicted_labels": total_predicted_labels_test,
    "Tokens": total_tokens_list_test,
    "Words": total_words_test,
    "actual_labels": total_actual_labels_test
}

total_df_test = pd.DataFrame(dict_test)

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print("Execution time:", elapsed_time, "seconds")

Execution time: 116.54774117469788 seconds


In [189]:
test_df_predicted = extract_data_to_dataframe(total_df_test)

In [190]:
test_df_predicted

Unnamed: 0,Sentence ID,Words,Actual Labels,Predicted Labels
0,1,Rangoon,B-per,B-org
1,1,has,O,O
2,1,faced,O,O
3,1,growing,O,O
4,1,international,O,O
...,...,...,...,...
7505,400,suburb,I-tim,O
7506,400,of,I-tim,O
7507,400,Beirut,I-tim,B-geo
7508,400,late,O,O


In [191]:
len(test_df_predicted[test_df_predicted["Actual Labels"]==test_df_predicted["Predicted Labels"]])

7114

Evaluation metric of test data.

In [192]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Example true labels and predicted labels
true_labels_test = test_df_predicted["Actual Labels"].tolist()
predicted_labels_test = test_df_predicted["Predicted Labels"].tolist()

# Calculate precision, recall, accuracy, and F1 score
accuracy_test = accuracy_score(true_labels_test, predicted_labels_test)
precision_test = precision_score(true_labels_test, predicted_labels_test,average="micro")
recall_test = recall_score(true_labels_test, predicted_labels_test,average="micro")
f1_test = f1_score(true_labels_test, predicted_labels_test,average="micro")

print("Test Accuracy:", accuracy_test)
print("Test Precision:", precision_test)
print("Test Recall:", recall_test)
print("Test F1 Score:", f1_test)

Test Accuracy: 0.9472703062583222
Test Precision: 0.9472703062583222
Test Recall: 0.9472703062583222
Test F1 Score: 0.9472703062583222


In [195]:
test_df_predicted.to_csv("NER_test_predicted.csv")

In [196]:
total_df_test.to_csv("Predicted_Total_test.csv")