### Importing Dataset and Dependencies

In [1]:
# Libraries Import
import pandas as pd
import ast
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the annotated dataset from "dataset.csv" (path is relative to the
# notebook's working directory).
df = pd.read_csv("dataset.csv")
# Preview the first rows to check that the columns loaded as expected.
df.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Template,Filled Template,Tokenised Filled Template,Tokens
0,"In our video conference, discuss the role of e...","In our video conference, discuss the role of e...","['in', 'our', 'video', 'conference', ',', 'dis...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,Could you draft a letter for [NAME_1] to send ...,"Could you draft a letter for Dietrich, Schulis...","['could', 'you', 'draft', 'a', 'letter', 'for'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NA..."
2,Discuss the options for [FULLNAME_1] who wants...,Discuss the options for Jeffery Pfeffer who wa...,"['discuss', 'the', 'options', 'for', 'jeff', '...","['O', 'O', 'O', 'O', 'B-FULLNAME', 'I-FULLNAME..."
3,13. Write a press release announcing [FULLNAME...,13. Write a press release announcing Gayle Wat...,"['13', '.', 'write', 'a', 'press', 'release', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FULLNAM..."
4,9. Develop an inventory management plan for [F...,9. Develop an inventory management plan for Ev...,"['9', '.', 'develop', 'an', 'inventory', 'mana...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FU..."


### Preprocessing and Tokenization

In [2]:
# Decode a cell that holds a Python literal serialized as text
def try_parse_list(cell):
    """Return the Python literal encoded in ``cell`` when it is a string.

    Args:
        cell: A single DataFrame cell value.

    Returns:
        The result of ``ast.literal_eval(cell)`` for string input; any
        non-string value is handed back unchanged.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
        string is not a valid Python literal.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell

# Both list-valued columns may arrive from the CSV as strings; decode each one
# back into a real Python list.
for list_column in ("Tokenised Filled Template", "Tokens"):
    df[list_column] = df[list_column].apply(try_parse_list)
df.head()

Unnamed: 0,Template,Filled Template,Tokenised Filled Template,Tokens
0,"In our video conference, discuss the role of e...","In our video conference, discuss the role of e...","[in, our, video, conference, ,, discuss, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,Could you draft a letter for [NAME_1] to send ...,"Could you draft a letter for Dietrich, Schulis...","[could, you, draft, a, letter, for, dietrich, ...","[O, O, O, O, O, O, B-NAME, I-NAME, I-NAME, I-N..."
2,Discuss the options for [FULLNAME_1] who wants...,Discuss the options for Jeffery Pfeffer who wa...,"[discuss, the, options, for, jeff, ##ery, p, #...","[O, O, O, O, B-FULLNAME, I-FULLNAME, I-FULLNAM..."
3,13. Write a press release announcing [FULLNAME...,13. Write a press release announcing Gayle Wat...,"[13, ., write, a, press, release, announcing, ...","[O, O, O, O, O, O, O, B-FULLNAME, I-FULLNAME, ..."
4,9. Develop an inventory management plan for [F...,9. Develop an inventory management plan for Ev...,"[9, ., develop, an, inventory, management, pla...","[O, O, O, O, O, O, O, O, B-FULLNAME, I-FULLNAM..."


In [3]:
# Length of the longest token sequence across all rows
token_counts = df["Tokenised Filled Template"].map(len)
max_len = token_counts.max()
print(f"🔢 Maximum number of tokens in any row: {max_len}")

🔢 Maximum number of tokens in any row: 127


In [4]:
# Every token needs exactly one label, so the two list columns must have the
# same length row by row. Report each row where they differ.
token_lengths = df["Tokenised Filled Template"].map(len)
label_lengths = df["Tokens"].map(len)
mismatched_index = df.index[(token_lengths != label_lengths).to_numpy()]

for idx in mismatched_index:
    print(f"Mismatch at index {idx}")

counter = len(mismatched_index)
print(f"Total mismatches found: {counter}")

Mismatch at index 701
Mismatch at index 1123
Mismatch at index 1906
Mismatch at index 2141
Mismatch at index 2192
Mismatch at index 2205
Mismatch at index 2375
Mismatch at index 2387
Mismatch at index 3218
Mismatch at index 3381
Mismatch at index 3654
Mismatch at index 3870
Mismatch at index 4265
Mismatch at index 4993
Mismatch at index 5071
Mismatch at index 5135
Mismatch at index 6312
Mismatch at index 6801
Mismatch at index 7190
Mismatch at index 7384
Mismatch at index 7779
Mismatch at index 7843
Mismatch at index 8652
Mismatch at index 8710
Mismatch at index 10344
Mismatch at index 10800
Mismatch at index 11358
Mismatch at index 12127
Mismatch at index 12358
Mismatch at index 13103
Mismatch at index 13467
Mismatch at index 13574
Mismatch at index 13642
Mismatch at index 13709
Mismatch at index 14074
Mismatch at index 14272
Mismatch at index 14604
Mismatch at index 15549
Mismatch at index 15775
Mismatch at index 15847
Mismatch at index 16239
Mismatch at index 16297
Mismatch at index

In [5]:
# Filter only rows where tokens and labels match in length.
# .copy() turns the filtered result into an independent DataFrame: a later cell
# adds a new column to `df`, and doing that on a slice of the original frame
# raises pandas' SettingWithCopyWarning.
lengths_match = df["Tokenised Filled Template"].str.len() == df["Tokens"].str.len()
df = df[lengths_match].copy()

# Re-check after filtering: count the rows whose token/label lengths still differ.
counter = int(
    (df["Tokenised Filled Template"].str.len() != df["Tokens"].str.len()).sum()
)
print(f"Total mismatches found: {counter}")

# Label alignment further down indexes labels by token position, so a
# remaining mismatch must stop the notebook here rather than fail later.
assert counter == 0, "token/label length mismatches remain after filtering"

Total mismatches found: 0


In [6]:
# Extract BIO tag scheme: gather every distinct tag that occurs in the data
all_tags = {tag for row_tags in df["Tokens"] for tag in row_tags}
unique_tags = sorted(all_tags)

# Tag -> integer id, numbered by position in the sorted tag list
label2id = dict(zip(unique_tags, range(len(unique_tags))))

# Integer id -> tag (inverse of label2id)
id2label = dict(enumerate(unique_tags))
print("\nID to label mapping:")
for tag_id, tag_name in id2label.items():
    print(f"{tag_id}: {tag_name}")

# Add numeric label ids for each row
df["Label_ids"] = df["Tokens"].apply(
    lambda row_tags: [label2id[name] for name in row_tags]
)



ID to label mapping:
0: B-ACCOUNTNAME
1: B-ACCOUNTNUMBER
2: B-AMOUNT
3: B-BIC
4: B-BITCOINADDRESS
5: B-BUILDINGNUMBER
6: B-CITY
7: B-COUNTY
8: B-CREDITCARDCVV
9: B-CREDITCARDISSUER
10: B-CREDITCARDNUMBER
11: B-CURRENCY
12: B-CURRENCYCODE
13: B-CURRENCYNAME
14: B-CURRENCYSYMBOL
15: B-DISPLAYNAME
16: B-EMAIL
17: B-ETHEREUMADDRESS
18: B-FIRSTNAME
19: B-FULLNAME
20: B-GENDER
21: B-IBAN
22: B-IP
23: B-IPV4
24: B-IPV6
25: B-JOBAREA
26: B-JOBDESCRIPTOR
27: B-JOBTITLE
28: B-JOBTYPE
29: B-LASTNAME
30: B-LITECOINADDRESS
31: B-MAC
32: B-MASKEDNUMBER
33: B-NAME
34: B-NEARBYGPSCOORDINATE
35: B-NUMBER
36: B-ORDINALDIRECTION
37: B-PASSWORD
38: B-PIN
39: B-SECONDARYADDRESS
40: B-SEX
41: B-SEXTYPE
42: B-STATE
43: B-STREET
44: B-STREETADDRESS
45: B-URL
46: B-USERAGENT
47: B-USERNAME
48: B-ZIPCODE
49: I-ACCOUNTNAME
50: I-ACCOUNTNUMBER
51: I-AMOUNT
52: I-BIC
53: I-BITCOINADDRESS
54: I-BUILDINGNUMBER
55: I-CITY
56: I-CREDITCARDCVV
57: I-CREDITCARDISSUER
58: I-CREDITCARDNUMBER
59: I-CURRENCY
60: I-CURRENCY

In [7]:
# Train-validation split: hold out 10% of the rows, with a fixed seed so the
# split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"Training set size: {train_df.shape}")
print(f"Validation set size: {val_df.shape}")


# Convert to Hugging Face datasets
def to_hf_dataset(dataframe):
    """Build a Hugging Face ``Dataset`` from the token and label-id columns.

    Args:
        dataframe: Frame holding the "Tokenised Filled Template" and
            "Label_ids" columns.

    Returns:
        A ``Dataset`` with two columns: "tokens" and "labels".
    """
    token_rows = dataframe["Tokenised Filled Template"].tolist()
    label_rows = dataframe["Label_ids"].tolist()
    return Dataset.from_dict({"tokens": token_rows, "labels": label_rows})


train_dataset, val_dataset = (
    to_hf_dataset(split_frame) for split_frame in (train_df, val_df)
)

Training set size: (20644, 5)
Validation set size: (2294, 5)


In [8]:
# Display the first few rows to verify that the "Label_ids" column was added.
df.head()

Unnamed: 0,Template,Filled Template,Tokenised Filled Template,Tokens,Label_ids
0,"In our video conference, discuss the role of e...","In our video conference, discuss the role of e...","[in, our, video, conference, ,, discuss, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 9..."
1,Could you draft a letter for [NAME_1] to send ...,"Could you draft a letter for Dietrich, Schulis...","[could, you, draft, a, letter, for, dietrich, ...","[O, O, O, O, O, O, B-NAME, I-NAME, I-NAME, I-N...","[92, 92, 92, 92, 92, 92, 33, 79, 79, 79, 79, 7..."
2,Discuss the options for [FULLNAME_1] who wants...,Discuss the options for Jeffery Pfeffer who wa...,"[discuss, the, options, for, jeff, ##ery, p, #...","[O, O, O, O, B-FULLNAME, I-FULLNAME, I-FULLNAM...","[92, 92, 92, 92, 19, 66, 66, 66, 66, 92, 92, 9..."
3,13. Write a press release announcing [FULLNAME...,13. Write a press release announcing Gayle Wat...,"[13, ., write, a, press, release, announcing, ...","[O, O, O, O, O, O, O, B-FULLNAME, I-FULLNAME, ...","[92, 92, 92, 92, 92, 92, 92, 19, 66, 66, 92, 9..."
4,9. Develop an inventory management plan for [F...,9. Develop an inventory management plan for Ev...,"[9, ., develop, an, inventory, management, pla...","[O, O, O, O, O, O, O, O, B-FULLNAME, I-FULLNAM...","[92, 92, 92, 92, 92, 92, 92, 92, 19, 66, 92, 9..."


In [9]:
# Target Label Encoding
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split token lists and align labels to the result.

    Relies on the notebook-level ``tokenizer`` being defined before this
    function is called.

    Args:
        examples: Batch mapping with "tokens" (one list of tokens per example)
            and "labels" (one list of label ids per example).

    Returns:
        The tokenizer output, padded/truncated to 128 positions, with an added
        "labels" entry. Positions that map to no input token, and every
        sub-token after the first one of an input token, are labelled -100
        (the value ``compute_metrics`` in this notebook filters out).

    NOTE(review): the displayed data already contains WordPiece pieces such as
    "##ery"; confirm that re-tokenizing them with ``is_split_into_words=True``
    is the intended behaviour.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",   # ensure uniform tensor sizes
        max_length=128,
        return_attention_mask=True,
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["labels"]):
        sequence_labels = []
        last_seen = None
        for current in encoded.word_ids(batch_index=batch_idx):
            # Only the first sub-token of each input token carries its label;
            # special/padding positions (None) and continuation pieces get -100.
            starts_new_word = current is not None and current != last_seen
            sequence_labels.append(word_labels[current] if starts_new_word else -100)
            last_seen = current
        aligned_labels.append(sequence_labels)

    encoded["labels"] = aligned_labels
    return encoded

### Tokenizer Selection


In [10]:
from transformers import BertTokenizerFast, BertForTokenClassification

model_checkpoint = "prajjwal1/bert-tiny"
# model_checkpoint = "huawei-noah/TinyBERT_General_4L_312D"

# Token-classification model with one output class per tag; the id/label maps
# are passed in alongside so they are attached to the model.
model = BertForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_tags),
    id2label=id2label,
    label2id=label2id,
)

# The tokenizer reports no maximum length of its own, which is why the
# inference pipeline logs "Asking to truncate to max_length but no maximum
# length is provided". Pin it to the model's position-embedding limit so
# truncation has a concrete bound. The model is loaded first so that limit
# can be read from its config.
tokenizer = BertTokenizerFast.from_pretrained(
    model_checkpoint,
    model_max_length=model.config.max_position_embeddings,
)

# Tokenize both splits and align the labels.
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

# Expose only the columns the model consumes, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=tensor_columns)
val_dataset.set_format(type="torch", columns=tensor_columns)

W0803 14:51:42.550000 18780 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 20644/20644 [00:04<00:00, 4610.87 examples/s]
Map: 100%|██████████| 2294/2294 [00:00<00:00, 5212.34 examples/s]


In [None]:
# Trainer and Training Arguments
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(predictions_output):
    """Score predictions with seqeval, ignoring positions labelled -100.

    Args:
        predictions_output: A pair ``(scores, label_ids)``. The predicted class
            is taken as the argmax of ``scores`` over axis 2.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names looked up through ``id2label``.
    """
    scores, gold_ids = predictions_output
    pred_ids = np.argmax(scores, axis=2)

    true_preds = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep (predicted tag, gold tag) pairs only where a real label exists.
        kept_pairs = [
            (id2label[p], id2label[g])
            for p, g in zip(pred_row, gold_row)
            if g != -100
        ]
        true_preds.append([pred_tag for pred_tag, _ in kept_pairs])
        true_labels.append([gold_tag for _, gold_tag in kept_pairs])

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(true_labels, true_preds) for name, scorer in scorers.items()}

# Training Argument
# Hyperparameters and bookkeeping options for the Trainer created in the next cell.
training_args = TrainingArguments(
    output_dir="./ner_output",          # checkpoints are written under this directory
    # NOTE(review): newer transformers releases may expect `eval_strategy` instead
    # of `evaluation_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",        # evaluate once per epoch
    save_strategy="epoch",              # save a checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,                  # log the training loss every 500 steps
    save_total_limit=2,                 # presumably keeps only the two most recent checkpoints; verify
)


In [12]:
# Trainer start
# Bundle the model, training arguments, tokenized splits and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run the training loop configured by training_args.
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print("📈 Evaluation Metrics:")
# Each value is formatted with four decimals, so every metric must be numeric.
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")



  2%|▏         | 501/25810 [00:44<36:18, 11.62it/s]

{'loss': 2.3922, 'grad_norm': 3.694591760635376, 'learning_rate': 1.9612553273924838e-05, 'epoch': 0.19}


  4%|▍         | 1001/25810 [01:31<36:03, 11.47it/s]

{'loss': 1.116, 'grad_norm': 1.670870304107666, 'learning_rate': 1.922510654784967e-05, 'epoch': 0.39}


  6%|▌         | 1501/25810 [02:18<39:54, 10.15it/s]

{'loss': 0.7546, 'grad_norm': 1.3517329692840576, 'learning_rate': 1.8837659821774508e-05, 'epoch': 0.58}


  8%|▊         | 2001/25810 [03:01<34:26, 11.52it/s]

{'loss': 0.6183, 'grad_norm': 1.772323727607727, 'learning_rate': 1.845021309569934e-05, 'epoch': 0.77}


 10%|▉         | 2501/25810 [03:44<33:02, 11.76it/s]

{'loss': 0.5289, 'grad_norm': 1.8229690790176392, 'learning_rate': 1.8062766369624178e-05, 'epoch': 0.97}


                                                    


{'eval_loss': 0.4335784614086151, 'eval_accuracy': 0.9034560605557584, 'eval_precision': 0.33130252100840335, 'eval_recall': 0.4577648766328012, 'eval_f1': 0.3843997562461914, 'eval_runtime': 6.7605, 'eval_samples_per_second': 339.323, 'eval_steps_per_second': 42.452, 'epoch': 1.0}


 12%|█▏        | 3001/25810 [04:34<33:29, 11.35it/s]  

{'loss': 0.4682, 'grad_norm': 4.563076972961426, 'learning_rate': 1.7675319643549015e-05, 'epoch': 1.16}


 14%|█▎        | 3501/25810 [05:18<34:42, 10.71it/s]

{'loss': 0.4256, 'grad_norm': 3.139315605163574, 'learning_rate': 1.7287872917473848e-05, 'epoch': 1.36}


 16%|█▌        | 4002/25810 [06:03<29:49, 12.19it/s]

{'loss': 0.4, 'grad_norm': 1.0880024433135986, 'learning_rate': 1.6900426191398685e-05, 'epoch': 1.55}


 17%|█▋        | 4502/25810 [06:46<31:34, 11.25it/s]

{'loss': 0.3687, 'grad_norm': 2.9844460487365723, 'learning_rate': 1.651297946532352e-05, 'epoch': 1.74}


 19%|█▉        | 5002/25810 [07:28<28:50, 12.03it/s]

{'loss': 0.3483, 'grad_norm': 1.4899905920028687, 'learning_rate': 1.6125532739248355e-05, 'epoch': 1.94}


                                                    


{'eval_loss': 0.2825126349925995, 'eval_accuracy': 0.9381992629726769, 'eval_precision': 0.5085821772866213, 'eval_recall': 0.6278664731494921, 'eval_f1': 0.5619641465315667, 'eval_runtime': 6.6346, 'eval_samples_per_second': 345.761, 'eval_steps_per_second': 43.258, 'epoch': 2.0}


 21%|██▏       | 5502/25810 [08:17<28:32, 11.86it/s]  

{'loss': 0.3132, 'grad_norm': 3.479875326156616, 'learning_rate': 1.573808601317319e-05, 'epoch': 2.13}


 23%|██▎       | 6002/25810 [09:00<27:41, 11.92it/s]

{'loss': 0.3178, 'grad_norm': 4.397679328918457, 'learning_rate': 1.5350639287098025e-05, 'epoch': 2.32}


 25%|██▌       | 6502/25810 [09:42<28:20, 11.35it/s]

{'loss': 0.2929, 'grad_norm': 2.8243234157562256, 'learning_rate': 1.4963192561022861e-05, 'epoch': 2.52}


 27%|██▋       | 7002/25810 [10:24<28:42, 10.92it/s]

{'loss': 0.2813, 'grad_norm': 2.372600793838501, 'learning_rate': 1.4575745834947696e-05, 'epoch': 2.71}


 29%|██▉       | 7502/25810 [11:06<25:35, 11.92it/s]

{'loss': 0.2637, 'grad_norm': 2.3474748134613037, 'learning_rate': 1.4188299108872531e-05, 'epoch': 2.91}


                                                    
 30%|███       | 7744/25810 [11:33<5:31:19,  1.10s/it]

{'eval_loss': 0.22030100226402283, 'eval_accuracy': 0.9504830516915109, 'eval_precision': 0.5842364532019705, 'eval_recall': 0.6885341074020319, 'eval_f1': 0.632111925383078, 'eval_runtime': 6.6738, 'eval_samples_per_second': 343.734, 'eval_steps_per_second': 43.004, 'epoch': 3.0}


 31%|███       | 8002/25810 [11:55<24:32, 12.09it/s]  

{'loss': 0.2518, 'grad_norm': 2.163127899169922, 'learning_rate': 1.3800852382797368e-05, 'epoch': 3.1}


 33%|███▎      | 8502/25810 [12:37<23:41, 12.17it/s]

{'loss': 0.2453, 'grad_norm': 2.2425992488861084, 'learning_rate': 1.3413405656722203e-05, 'epoch': 3.29}


 35%|███▍      | 9002/25810 [13:21<23:17, 12.03it/s]

{'loss': 0.2457, 'grad_norm': 4.6998724937438965, 'learning_rate': 1.3025958930647038e-05, 'epoch': 3.49}


 37%|███▋      | 9502/25810 [14:04<24:05, 11.28it/s]

{'loss': 0.2392, 'grad_norm': 1.7923890352249146, 'learning_rate': 1.2638512204571873e-05, 'epoch': 3.68}


 39%|███▉      | 10002/25810 [14:46<21:38, 12.18it/s]

{'loss': 0.2286, 'grad_norm': 2.4995405673980713, 'learning_rate': 1.225106547849671e-05, 'epoch': 3.87}


                                                     
 40%|████      | 10324/25810 [15:21<22:04, 11.69it/s]Checkpoint destination directory ./ner_output\checkpoint-10324 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.18404588103294373, 'eval_accuracy': 0.9564423491915939, 'eval_precision': 0.6169483223120255, 'eval_recall': 0.7312046444121916, 'eval_f1': 0.6692348565356004, 'eval_runtime': 6.974, 'eval_samples_per_second': 328.936, 'eval_steps_per_second': 41.153, 'epoch': 4.0}


 41%|████      | 10502/25810 [15:36<20:39, 12.35it/s]  

{'loss': 0.2183, 'grad_norm': 2.9204883575439453, 'learning_rate': 1.1863618752421544e-05, 'epoch': 4.07}


 43%|████▎     | 11002/25810 [16:19<21:22, 11.54it/s]

{'loss': 0.2229, 'grad_norm': 3.029761552810669, 'learning_rate': 1.147617202634638e-05, 'epoch': 4.26}


 45%|████▍     | 11502/25810 [17:02<20:01, 11.91it/s]

{'loss': 0.2093, 'grad_norm': 1.8175920248031616, 'learning_rate': 1.1088725300271212e-05, 'epoch': 4.46}


 47%|████▋     | 12002/25810 [17:44<20:47, 11.07it/s]

{'loss': 0.2083, 'grad_norm': 5.703102111816406, 'learning_rate': 1.0701278574196047e-05, 'epoch': 4.65}


 48%|████▊     | 12502/25810 [18:27<19:25, 11.42it/s]

{'loss': 0.1925, 'grad_norm': 3.301931142807007, 'learning_rate': 1.0313831848120884e-05, 'epoch': 4.84}


                                                     
 50%|█████     | 12905/25810 [19:09<18:44, 11.47it/s]Checkpoint destination directory ./ner_output\checkpoint-12905 already exists and is non-empty. Saving will proceed but saved results may be invalid.
 50%|█████     | 12906/25810 [19:09<3:56:30,  1.10s/it]

{'eval_loss': 0.1632247120141983, 'eval_accuracy': 0.9605922778128216, 'eval_precision': 0.6405472636815921, 'eval_recall': 0.7474600870827286, 'eval_f1': 0.6898861352980576, 'eval_runtime': 6.6912, 'eval_samples_per_second': 342.837, 'eval_steps_per_second': 42.892, 'epoch': 5.0}


 50%|█████     | 13002/25810 [19:17<18:36, 11.48it/s]  

{'loss': 0.1881, 'grad_norm': 3.8463759422302246, 'learning_rate': 9.926385122045719e-06, 'epoch': 5.04}


 52%|█████▏    | 13500/25810 [20:00<16:44, 12.26it/s]

{'loss': 0.2002, 'grad_norm': 2.434706687927246, 'learning_rate': 9.538938395970556e-06, 'epoch': 5.23}


 54%|█████▍    | 14002/25810 [20:42<16:05, 12.23it/s]

{'loss': 0.1814, 'grad_norm': 2.885169506072998, 'learning_rate': 9.15149166989539e-06, 'epoch': 5.42}


 56%|█████▌    | 14502/25810 [21:25<15:46, 11.95it/s]

{'loss': 0.1826, 'grad_norm': 3.2642149925231934, 'learning_rate': 8.764044943820226e-06, 'epoch': 5.62}


 58%|█████▊    | 15002/25810 [22:08<15:28, 11.65it/s]

{'loss': 0.1787, 'grad_norm': 2.0341858863830566, 'learning_rate': 8.37659821774506e-06, 'epoch': 5.81}


                                                     


{'eval_loss': 0.14847137033939362, 'eval_accuracy': 0.9635636267056207, 'eval_precision': 0.6562578066450162, 'eval_recall': 0.7625544267053701, 'eval_f1': 0.7054242749731472, 'eval_runtime': 7.0798, 'eval_samples_per_second': 324.022, 'eval_steps_per_second': 40.538, 'epoch': 6.0}


 60%|██████    | 15500/25810 [22:57<37:37,  4.57it/s]  

{'loss': 0.1744, 'grad_norm': 1.4434393644332886, 'learning_rate': 7.989151491669897e-06, 'epoch': 6.01}


 62%|██████▏   | 16002/25810 [23:41<14:10, 11.54it/s]

{'loss': 0.1737, 'grad_norm': 3.1197919845581055, 'learning_rate': 7.601704765594732e-06, 'epoch': 6.2}


 64%|██████▍   | 16502/25810 [24:25<13:13, 11.73it/s]

{'loss': 0.1791, 'grad_norm': 2.9168827533721924, 'learning_rate': 7.214258039519567e-06, 'epoch': 6.39}


 66%|██████▌   | 17002/25810 [25:08<12:41, 11.56it/s]

{'loss': 0.1672, 'grad_norm': 2.3247663974761963, 'learning_rate': 6.826811313444401e-06, 'epoch': 6.59}


 68%|██████▊   | 17502/25810 [25:51<12:03, 11.48it/s]

{'loss': 0.1608, 'grad_norm': 2.9380857944488525, 'learning_rate': 6.439364587369237e-06, 'epoch': 6.78}


 70%|██████▉   | 18002/25810 [26:34<11:07, 11.70it/s]

{'loss': 0.1675, 'grad_norm': 3.9264659881591797, 'learning_rate': 6.051917861294072e-06, 'epoch': 6.97}


                                                     
 70%|███████   | 18068/25810 [26:47<2:25:14,  1.13s/it]

{'eval_loss': 0.13884370028972626, 'eval_accuracy': 0.9659705853059327, 'eval_precision': 0.6697569531445753, 'eval_recall': 0.7759071117561683, 'eval_f1': 0.7189349112426036, 'eval_runtime': 6.8568, 'eval_samples_per_second': 334.559, 'eval_steps_per_second': 41.856, 'epoch': 7.0}


 72%|███████▏  | 18502/25810 [27:24<10:14, 11.89it/s]  

{'loss': 0.1673, 'grad_norm': 1.7235767841339111, 'learning_rate': 5.664471135218908e-06, 'epoch': 7.17}


 74%|███████▎  | 19002/25810 [28:08<09:52, 11.49it/s]

{'loss': 0.1715, 'grad_norm': 2.867598295211792, 'learning_rate': 5.277024409143743e-06, 'epoch': 7.36}


 76%|███████▌  | 19502/25810 [28:51<09:29, 11.08it/s]

{'loss': 0.1623, 'grad_norm': 2.438126564025879, 'learning_rate': 4.8895776830685785e-06, 'epoch': 7.56}


 77%|███████▋  | 20002/25810 [29:35<08:18, 11.66it/s]

{'loss': 0.1553, 'grad_norm': 3.1397852897644043, 'learning_rate': 4.5021309569934135e-06, 'epoch': 7.75}


 79%|███████▉  | 20502/25810 [30:18<07:43, 11.45it/s]

{'loss': 0.1516, 'grad_norm': 5.04334831237793, 'learning_rate': 4.1146842309182484e-06, 'epoch': 7.94}


                                                     


{'eval_loss': 0.13281629979610443, 'eval_accuracy': 0.9671657647488463, 'eval_precision': 0.6766181635725038, 'eval_recall': 0.7828737300435413, 'eval_f1': 0.7258780783205492, 'eval_runtime': 6.7716, 'eval_samples_per_second': 338.769, 'eval_steps_per_second': 42.383, 'epoch': 8.0}


 81%|████████▏ | 21002/25810 [31:07<06:48, 11.76it/s]  

{'loss': 0.157, 'grad_norm': 2.00166654586792, 'learning_rate': 3.7272375048430843e-06, 'epoch': 8.14}


 83%|████████▎ | 21502/25810 [31:51<06:15, 11.47it/s]

{'loss': 0.155, 'grad_norm': 1.28708815574646, 'learning_rate': 3.3397907787679196e-06, 'epoch': 8.33}


 85%|████████▌ | 22002/25810 [32:35<05:20, 11.87it/s]

{'loss': 0.157, 'grad_norm': 1.8121753931045532, 'learning_rate': 2.952344052692755e-06, 'epoch': 8.52}


 87%|████████▋ | 22502/25810 [33:18<04:47, 11.51it/s]

{'loss': 0.152, 'grad_norm': 3.1360034942626953, 'learning_rate': 2.5648973266175904e-06, 'epoch': 8.72}


 89%|████████▉ | 23002/25810 [34:02<04:12, 11.12it/s]

{'loss': 0.1524, 'grad_norm': 0.817374050617218, 'learning_rate': 2.1774506005424258e-06, 'epoch': 8.91}


                                                     
 90%|█████████ | 23230/25810 [34:28<48:06,  1.12s/it]

{'eval_loss': 0.12933914363384247, 'eval_accuracy': 0.967979150758607, 'eval_precision': 0.6824358329139406, 'eval_recall': 0.7872278664731495, 'eval_f1': 0.7310958350181965, 'eval_runtime': 6.8163, 'eval_samples_per_second': 336.545, 'eval_steps_per_second': 42.105, 'epoch': 9.0}


 91%|█████████ | 23501/25810 [34:52<03:15, 11.82it/s]

{'loss': 0.1569, 'grad_norm': 4.487970352172852, 'learning_rate': 1.790003874467261e-06, 'epoch': 9.1}


 93%|█████████▎| 24001/25810 [35:36<02:34, 11.70it/s]

{'loss': 0.1571, 'grad_norm': 1.1316906213760376, 'learning_rate': 1.4025571483920963e-06, 'epoch': 9.3}


 95%|█████████▍| 24501/25810 [36:19<01:55, 11.36it/s]

{'loss': 0.1484, 'grad_norm': 2.4904260635375977, 'learning_rate': 1.0151104223169315e-06, 'epoch': 9.49}


 97%|█████████▋| 25001/25810 [37:02<01:09, 11.56it/s]

{'loss': 0.1517, 'grad_norm': 2.22245717048645, 'learning_rate': 6.276636962417668e-07, 'epoch': 9.69}


 99%|█████████▉| 25501/25810 [37:46<00:25, 12.09it/s]

{'loss': 0.1517, 'grad_norm': 0.9262222051620483, 'learning_rate': 2.402169701666021e-07, 'epoch': 9.88}


                                                     
100%|██████████| 25810/25810 [38:20<00:00, 11.22it/s]


{'eval_loss': 0.1283436119556427, 'eval_accuracy': 0.9682115467613958, 'eval_precision': 0.6833626982129374, 'eval_recall': 0.7880986937590712, 'eval_f1': 0.732003235373416, 'eval_runtime': 6.8195, 'eval_samples_per_second': 336.39, 'eval_steps_per_second': 42.085, 'epoch': 10.0}
{'train_runtime': 2300.1759, 'train_samples_per_second': 89.75, 'train_steps_per_second': 11.221, 'train_loss': 0.30106123488617237, 'epoch': 10.0}


100%|██████████| 287/287 [00:06<00:00, 42.83it/s]

📈 Evaluation Metrics:
eval_loss: 0.1283
eval_accuracy: 0.9682
eval_precision: 0.6834
eval_recall: 0.7881
eval_f1: 0.7320
eval_runtime: 6.7172
eval_samples_per_second: 341.5090
eval_steps_per_second: 42.7260
epoch: 10.0000





### Model Saving

In [13]:
# Model Saving
# The output folder is named after the checkpoint's short name (the part after
# the last "/").
save_directory = f"./final_ner_model_{model_checkpoint.split('/')[-1]}"
print(f"Saving model to {save_directory}")

# Reuse the single computed path so the model and tokenizer are guaranteed to
# land in the same directory that was just printed.
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)



Saving model to ./final_ner_model_bert-tiny


('./final_ner_model_bert-tiny\\tokenizer_config.json',
 './final_ner_model_bert-tiny\\special_tokens_map.json',
 './final_ner_model_bert-tiny\\vocab.txt',
 './final_ner_model_bert-tiny\\added_tokens.json',
 './final_ner_model_bert-tiny\\tokenizer.json')

### Model Inference

In [15]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer back from the directory they were
# saved to, and wrap them in a "ner" pipeline.
model_dir = f"./final_ner_model_{model_checkpoint.split('/')[-1]}"
ner_pipeline = pipeline(
    "ner",
    model=model_dir,
    tokenizer=model_dir,
    aggregation_strategy="simple",
)

# Smoke test on a single sentence.
text = "Dr. Marvin Rolfson and Julius Daugherty attended the arbitration. Their contact number is +91-8009992328"
print(ner_pipeline(text))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'FULLNAME', 'score': np.float32(0.94410837), 'word': 'dr. marvin rolfson', 'start': 0, 'end': 18}, {'entity_group': 'FULLNAME', 'score': np.float32(0.97039366), 'word': 'julius daugherty', 'start': 23, 'end': 39}, {'entity_group': 'NUMBER', 'score': np.float32(0.33254895), 'word': '91 - 80099', 'start': 91, 'end': 99}, {'entity_group': 'STREETADDRESS', 'score': np.float32(0.20343716), 'word': '##23', 'start': 100, 'end': 102}]
