# Data Tokenization

In [1]:
# Install Pytorch & other libraries
!pip install torch
!pip install "setuptools<71.0.0" scikit-learn
!pip install seqeval



In [2]:
!pip install --upgrade datasets
!pip install --upgrade evaluate
!pip install --upgrade accelerate
# !pip install --upgrade transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
!pip install git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1

Collecting git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1
  Cloning https://github.com/huggingface/transformers.git (to revision 6e0515e99c39444caae39472ee1b2fd76ece32f1) to /tmp/pip-req-build-eb610drf
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-eb610drf
  Running command git rev-parse -q --verify 'sha^6e0515e99c39444caae39472ee1b2fd76ece32f1'
  Running command git fetch -q https://github.com/huggingface/transformers.git 6e0515e99c39444caae39472ee1b2fd76ece32f1
  Running command git checkout -q 6e0515e99c39444caae39472ee1b2fd76ece32f1
  Resolved https://github.com/huggingface/transformers.git to commit 6e0515e99c39444caae39472ee1b2fd76ece32f1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
 

## Importing dataset

In [4]:
from datasets import load_from_disk
import os

In [5]:
ner_dataset = load_from_disk("/content/drive/MyDrive/pii_ner_dataset")

## Constants

In [6]:
MODEL_ID = "answerdotai/ModernBERT-base"

In [7]:
LOGS = "logs"
OUTPUT_DATASET_PATH = os.path.join(
    "data", "tokenized_ner_modernbert"
)  # "data"
OUTPUT_DIR = "models"
MODEL_PATH = os.path.join(OUTPUT_DIR, MODEL_ID)
OUTPUT_MODEL = os.path.join(OUTPUT_DIR, f"ner-{MODEL_ID.split('/')[-1]}")

In [8]:
EVAL_STRATEGY = "epoch"
LEARNING_RATE = 1e-5
PER_DEVICE_TRAIN_BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH_SIZE = 64
NUM_TRAIN_EPOCHS = 5
WEIGHT_DECAY = 0.01

## Importing Model Tokenizer

In [10]:
import pandas as pd
from transformers import BertTokenizerFast

In [11]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_ID)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


## Function for Tokenization

In [12]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Function to tokenize and align labels with respect to the tokens. This function is specifically designed for
    Named Entity Recognition (NER) tasks where alignment of the labels is necessary after tokenization.

    Parameters:
    examples (dict): A dictionary containing the tokens and the corresponding NER tags.
                     - "tokens": list of words in a sentence.
                     - "ner_tags": list of corresponding entity tags for each word.

    label_all_tokens (bool): A flag to indicate whether all tokens should have labels.
                             If False, only the first token of a word will have a label,
                             the other tokens (subwords) corresponding to the same word will be assigned -100.

    Returns:
    tokenized_inputs (dict): A dictionary containing the tokenized inputs and the corresponding labels aligned with the tokens.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
q = tokenize_and_align_labels(examples=ner_dataset["train"][4:5])
print(q)

{'input_ids': [[50281, 51, 1432, 44, 22711, 13, 66, 30165, 49923, 1171, 8096, 376, 49259, 13, 24765, 48998, 66, 49, 1539, 9133, 59, 40, 3519, 50, 3763, 1540, 37, 13, 41571, 21997, 10206, 1706, 1518, 2945, 17599, 20777, 395, 8701, 1194, 46, 9290, 9133, 261, 21, 5831, 23, 26937, 1166, 19635, 1549, 13, 8701, 22045, 9133, 261, 41532, 1630, 22858, 27087, 15, 50282]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 1, 1, 2, 2, 0, 0, 3, 0, 0, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 15, 15, 15, 15, 15, 15, 15, 0, 0, 0, 19, 19, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0, 11, 11, 11, 11, 0, -100]]}


In [14]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
R_______________________________________ 1
aj______________________________________ 1
K_______________________________________ 2
umar____________________________________ 2
,_______________________________________ 0
a_______________________________________ 0
male____________________________________ 3
resident________________________________ 0
of______________________________________ 0
Am______________________________________ 8
ra______________________________________ 8
oti_____________________________________ 8
,_______________________________________ 0
poss____________________________________ 0
esses___________________________________ 0
a_______________________________________ 0
P_______________________________________ 0
AN______________________________________ 0
number__________________________________ 0
Z_______________________________________ 15
G_______________________________________ 15
OD______________________________________ 15
Q____

In [15]:
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

## Saving the tokenized dataset

In [16]:
import os

In [17]:
OUTPUT_DATASET_PATH = os.path.join(
    "data", "tokenized_ner_modernbert"
)

In [18]:
tokenized_datasets.save_to_disk(OUTPUT_DATASET_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/778 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/97 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/98 [00:00<?, ? examples/s]

# Model Finetuning

In [19]:
import os
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import torch
import evaluate
import json

import pandas as pd

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading Dataset

In [20]:
tokenized_pii_ner_dataset = load_from_disk(OUTPUT_DATASET_PATH)
print(tokenized_pii_ner_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 778
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 97
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 98
    })
})


In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
label_list = tokenized_pii_ner_dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

print(f"Labels: {label_list}")
print(f"Number of labels: {num_labels}")

Labels: ['O', 'B-NAME', 'I-NAME', 'B-GENDER', 'B-DOB', 'I-DOB', 'B-NATIONALITY', 'I-NATIONALITY', 'B-CITY', 'I-CITY', 'B-EMAIL', 'B-PHONE', 'I-PHONE', 'B-AADHAR', 'I-AADHAR', 'B-PAN', 'B-VOTER', 'B-PASSPORT', 'I-PASSPORT', 'B-LICENSE', 'I-LICENSE', 'B-ACCOUNTNUMBER', 'I-ACCOUNTNUMBER', 'B-BANKIFSC', 'B-CARDNUMBER', 'B-CVV', 'B-IP']
Number of labels: 27


In [23]:
tokenized_pii_ner_dataset.shape

{'train': (778, 6), 'validation': (97, 6), 'test': (98, 6)}

## Model Training

In [24]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [25]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, num_labels=num_labels)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
model = model.to(device)

Training Arguments

```python
EVAL_STRATEGY = "epoch"
LEARNING_RATE = 1e-5
PER_DEVICE_TRAIN_BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH_SIZE = 64
NUM_TRAIN_EPOCHS = 5
WEIGHT_DECAY = 0.01
```

In [27]:
args = TrainingArguments(
    output_dir=MODEL_PATH,
    eval_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    report_to = None
)

In [28]:
#Using a Data Collator to dynamically pad inputs and ensure token-label alignment during training.
data_collator = DataCollatorForTokenClassification(tokenizer)

- The following function computes evaluation metrics such as precision, recall, F1 score, and accuracy.
- It filters out ignored tokens (label=-100) during computation.

In [29]:
def compute_metrics(eval_preds):
    """
    Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
    The function computes precision, recall, F1 score and accuracy.

    Parameters:
    eval_preds (tuple): A tuple containing the predicted logits and the true labels.

    Returns:
    A dictionary containing the precision, recall, F1 score and accuracy.
    """
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
    ]
    metric = evaluate.load("seqeval")
    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [30]:
# Initializing the Trainer with the model, training arguments, datasets, and evaluation metrics.
trainer = Trainer(
    model,
    args,
   train_dataset=tokenized_pii_ner_dataset["train"],
   eval_dataset=tokenized_pii_ner_dataset["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [31]:
# WANDB API 066634aeb54bd0fa0ce505f85c7b2784fa59ca02
# Starting model training. This step optimizes the model's weights to fit the dataset.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 1


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.942222,0.156584,0.049272,0.074957,0.478815
2,No log,1.421188,0.376258,0.334826,0.354335,0.611131
3,No log,1.096482,0.51279,0.486376,0.499234,0.693896
4,No log,0.911444,0.589458,0.592758,0.591104,0.750269
5,No log,0.842005,0.624721,0.626353,0.625536,0.771275


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=65, training_loss=1.4515242356520432, metrics={'train_runtime': 1776.6449, 'train_samples_per_second': 2.19, 'train_steps_per_second': 0.037, 'total_flos': 251335160488332.0, 'train_loss': 1.4515242356520432, 'epoch': 5.0})

## Saving the training results

In [32]:
import pandas as pd

In [35]:
!mkdir logs

In [36]:
results = pd.DataFrame(trainer.state.log_history)
results = results[['epoch', 'eval_precision', 'eval_recall', 'eval_f1', 'eval_accuracy', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second']]
results.dropna(inplace=True)
results.reset_index(drop=True, inplace=True)

# Saving evaluation results in a CSV format for easy visualization and comparison.
results.to_csv(f"logs/{OUTPUT_MODEL.split('/')[-1]}-results.csv", index=False)

## Saving the Model

In [37]:
model.save_pretrained(OUTPUT_MODEL)
tokenizer.save_pretrained(OUTPUT_MODEL)

('models/ner-ModernBERT-base/tokenizer_config.json',
 'models/ner-ModernBERT-base/special_tokens_map.json',
 'models/ner-ModernBERT-base/tokenizer.json')

In [38]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [39]:
config = json.load(open(f"{OUTPUT_MODEL}/config.json"))

In [40]:
config["id2label"] = id2label
config["label2id"] = label2id

In [41]:
json.dump(config, open(f"{OUTPUT_MODEL}/config.json","w"))

## Local Inferencing

In [42]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(OUTPUT_MODEL)
tokenizer = BertTokenizerFast.from_pretrained(OUTPUT_MODEL)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.


### Inferencing Model

In [43]:
from transformers import pipeline

In [44]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer, aggregation_strategy="simple")

example = """Hello Aadil, your aadhar number is 437623083136"""

ner_results = nlp(example)

print(ner_results)

Device set to use cuda:0


[{'entity_group': 'CVV', 'score': 0.13734789, 'word': 'Hello', 'start': 0, 'end': 5}, {'entity_group': 'CITY', 'score': 0.2062467, 'word': ' A', 'start': 5, 'end': 7}, {'entity_group': 'CITY', 'score': 0.15235835, 'word': 'ad', 'start': 7, 'end': 9}, {'entity_group': 'EMAIL', 'score': 0.1742059, 'word': 'il', 'start': 9, 'end': 11}, {'entity_group': 'PAN', 'score': 0.15270528, 'word': ' your', 'start': 12, 'end': 17}, {'entity_group': 'EMAIL', 'score': 0.14566822, 'word': 'ad', 'start': 19, 'end': 21}, {'entity_group': 'EMAIL', 'score': 0.17911491, 'word': 'har', 'start': 21, 'end': 24}, {'entity_group': 'PHONE', 'score': 0.20009741, 'word': ' is', 'start': 31, 'end': 34}, {'entity_group': 'ACCOUNTNUMBER', 'score': 0.22447583, 'word': ' 437', 'start': 34, 'end': 38}, {'entity_group': 'AADHAR', 'score': 0.37837622, 'word': '62', 'start': 38, 'end': 40}, {'entity_group': 'PASSPORT', 'score': 0.24950603, 'word': '30', 'start': 40, 'end': 42}, {'entity_group': 'AADHAR', 'score': 0.22187555

In [48]:
def clean_and_group_entities(ner_results, min_score=0.10):
    """
    Cleans and groups named entity recognition (NER) results based on a minimum score threshold.

    Args:
        ner_results (list of dict): A list of dictionaries containing NER results. Each dictionary should have the keys:
            - "word" (str): The recognized word or token.
            - "entity_group" (str): The entity group or label.
            - "start" (int): The start position of the entity in the text.
            - "end" (int): The end position of the entity in the text.
            - "score" (float): The confidence score of the entity recognition.
        min_score (float, optional): The minimum score threshold for considering an entity. Defaults to 0.40.

    Returns:
        list of dict: A list of grouped entities that meet the minimum score threshold. Each dictionary contains:
            - "entity_group" (str): The entity group or label.
            - "word" (str): The concatenated word or token.
            - "start" (int): The start position of the entity in the text.
            - "end" (int): The end position of the entity in the text.
            - "score" (float): The minimum confidence score of the grouped entity.
    """
    grouped_entities = []
    current_entity = None

    for result in ner_results:
        # Skip entities with score below threshold
        if result["score"] < min_score:
            if current_entity:
                # Add current entity if it meets threshold
                if current_entity["score"] >= min_score:
                    grouped_entities.append(current_entity)
                current_entity = None
            continue

        word = result["word"].replace("##", "")  # Remove subword token markers

        if current_entity and result["entity_group"] == current_entity["entity_group"] and result["start"] == current_entity["end"]:
            # Continue the current entity
            current_entity["word"] += word
            current_entity["end"] = result["end"]
            current_entity["score"] = min(current_entity["score"], result["score"])

            # If combined score drops below threshold, discard the entity
            if current_entity["score"] < min_score:
                current_entity = None
        else:
            # Finalize the current entity if it meets threshold
            if current_entity and current_entity["score"] >= min_score:
                grouped_entities.append(current_entity)

            # Start a new entity
            current_entity = {
                "entity_group": result["entity_group"],
                "word": word,
                "start": result["start"],
                "end": result["end"],
                "score": result["score"]
            }

    # Add the last entity if it meets threshold
    if current_entity and current_entity["score"] >= min_score:
        grouped_entities.append(current_entity)

    return grouped_entities

In [50]:
cleaned_results = clean_and_group_entities(ner_results)
cleaned_results

[{'entity_group': 'CVV',
  'word': 'Hello',
  'start': 0,
  'end': 5,
  'score': 0.13734789},
 {'entity_group': 'CITY',
  'word': ' Aad',
  'start': 5,
  'end': 9,
  'score': 0.15235835},
 {'entity_group': 'EMAIL',
  'word': 'il',
  'start': 9,
  'end': 11,
  'score': 0.1742059},
 {'entity_group': 'PAN',
  'word': ' your',
  'start': 12,
  'end': 17,
  'score': 0.15270528},
 {'entity_group': 'EMAIL',
  'word': 'adhar',
  'start': 19,
  'end': 24,
  'score': 0.14566822},
 {'entity_group': 'PHONE',
  'word': ' is',
  'start': 31,
  'end': 34,
  'score': 0.20009741},
 {'entity_group': 'ACCOUNTNUMBER',
  'word': ' 437',
  'start': 34,
  'end': 38,
  'score': 0.22447583},
 {'entity_group': 'AADHAR',
  'word': '62',
  'start': 38,
  'end': 40,
  'score': 0.37837622},
 {'entity_group': 'PASSPORT',
  'word': '30',
  'start': 40,
  'end': 42,
  'score': 0.24950603},
 {'entity_group': 'AADHAR',
  'word': '83136',
  'start': 42,
  'end': 47,
  'score': 0.14330311}]