# Script to Train an NER Model on a PII Dataset

This script uses ThirdAI's NER library to train a model on a sample PII dataset.

In [None]:
from thirdai import bolt, dataset

Creates a small PII training dataset

In [None]:
import json

# Hand-labelled sample data: each entry pairs a sentence with one BIO-style tag
# per whitespace-separated token of that sentence.
pii_sentences = [
    ("John's social security number is 123-45-6789", ["B-PER", "O", "O", "O", "O", "B-PII"]),
    ("Contact Jane Doe at 555-1234 for more information", ["O", "B-PER", "I-PER", "O", "B-PII", "O", "O", "O"]),
    ("The credit card number 4111-1111-1111-1111 is invalid", ["O", "O", "O", "O", "B-PII", "O", "O"]),
    ("Bob's phone number is (123)-456-7890", ["B-PER", "O", "O", "O", "B-PII"]),
    ("Email john.doe@example.com for further details", ["O", "B-PII", "O", "O", "O"]),
]

# Validate before touching the filesystem: a token/tag count mismatch would
# otherwise be written out silently as misaligned training labels.
for sentence, tags in pii_sentences:
    token_count = len(sentence.split())
    if token_count != len(tags):
        raise ValueError(
            f"Sentence {sentence!r} has {token_count} tokens but {len(tags)} tags"
        )

# The training file holds one JSON object per line, with the token list under
# "source" and the tag list under "target".
pii_filename = "pii_ner_data.json"
# Explicit encoding so the file contents do not depend on the platform locale.
with open(pii_filename, "w", encoding="utf-8") as file:
    for sentence, tags in pii_sentences:
        tokens = sentence.split()
        data = {"source": tokens, "target": tags}
        json_line = json.dumps(data)
        file.write(json_line + "\n")

# Assign every distinct tag a stable integer index (alphabetical order).
unique_tags = sorted({tag for _, tags in pii_sentences for tag in tags})
tag_map = {tag: idx for idx, tag in enumerate(unique_tags)}


Initializes a Bolt NER model for PII.

In [None]:
# Build the NER model. "source" and "target" are the same keys under which the
# token lists and tag lists are stored in the training file, and tag_map maps
# each tag string to its integer index.
# NOTE(review): presumably the first two arguments name the token and tag
# columns — confirm against the thirdai bolt.NER signature.
ner_model = bolt.NER("source", "target", tag_map)

Trains the model on the training file created above.

In [None]:

# Wrap the JSON-lines training file in a data source the model can consume.
train_data_source = dataset.NerDataSource(pii_filename)

# Hyperparameters for this run, gathered in one place and forwarded to
# train() as keyword arguments.
training_options = {
    "epochs": 3,
    "learning_rate": 0.001,
    "batch_size": 1024,
    "train_metrics": ["loss"],
}

ner_model.train(train_data=train_data_source, **training_options)

Predicts the PII tags for the first training sentence.

In [None]:
# Use the first training sentence as the sample to tag, tokenized by
# whitespace the same way the training file was built.
sample_sentence = pii_sentences[0][0]
tokens = sample_sentence.split()

# get_ner_tags is given a batch (a list of token lists); here the batch holds
# the single sample sentence.
predicted_tags = ner_model.get_ner_tags([tokens])

Deletes the training file.

In [None]:
import os

# Best-effort cleanup of the generated training file. Tolerate only the
# "already gone" case so this cell can be re-run (or run after an earlier
# cell failed before the file was written) without raising; any other OS
# error still propagates.
try:
    os.remove(pii_filename)
except FileNotFoundError:
    pass