In [None]:
import random
import re
from pathlib import Path

import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.training.example import Example
from spacy.util import minibatch

In [None]:
# Load the 'A.25_ServiceNow_Incidents' worksheet of the data-sharing workbook into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_excel(r'./DatabaseTableUpload/Appendix_A_Capstone_DataSharingProposal.xlsx', sheet_name='A.25_ServiceNow_Incidents')


# Named Entity Recognition (NER)



#### Enhance the transformer NER model with added examples.

The process of using NER to auto-detect named entities is illustrated here:

[https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/](https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/)

Code examples from this article were adapted to complete the PII removal process.

We replaced the article's `en_core_web_sm` model with the transformer-based `en_core_web_trf` model to improve detection of names within text.

To download: `python -m spacy download en_core_web_trf`


In [None]:
# Create a bare `English` language object.
# NOTE: `nlp` is rebound to the `en_core_web_trf` pipeline in the next cell,
# and `tokenizer` is not referenced again in the visible notebook.
nlp = English()

# Instantiate a Tokenizer with the default settings for English, including punctuation rules and exceptions.
tokenizer = nlp.tokenizer

In [None]:
# Rebind `nlp` to the `en_core_web_trf` pipeline; the model package must already
# be installed (see the download command in the markdown above).
nlp = spacy.load('en_core_web_trf') # Load transformer model.
print(nlp.pipe_names) # From MachineLearningPlus.com article.

In [None]:
# Scratch cell; it appears to be a manual check of the character offsets used
# for the `train_data` annotations below (TODO confirm).
text_test = "Microsoft Excel throws error 0xC0000142"
phrase_len = len("Microsoft Outlook")
text_test[0:15]  # Evaluated but not displayed: it is not the last expression of the cell.
print(phrase_len)

In [None]:
# Training examples - do this only for *individual* NERs, e.g. single-word NERs.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are character positions into `text` with an exclusive end,
# e.g. the first text[0:16] == "Windows Defender".
# Labels used here: SOFTWARE, SECURITY and PERSON.
train_data = [
    ("Windows Defender blocked my internet access this morning, can you take a look?", {"entities": [(0, 16, "SOFTWARE")]}),
    ("BitLocker malfunctioned upon scanning this morning", {"entities": [(0,9, "SOFTWARE")]}),
    ("Please check to see if Windows Firewall is working", {"entities": [(23,39, "SOFTWARE")]}),
    ("Baird TrustDesk Migration to OneDrive - Error migrating", {"entities": [(0, 15, "SECURITY"), (29, 37, "SOFTWARE")]}),
    ("Hello. My name is spelled incorrectly on the DocuSign application. It is Veronica Fitzpatrick, some letters are all flipped around  backwards and I was hoping to have that fixed!", {"entities": [(45, 53, "SOFTWARE"), (73, 93, "PERSON")]}),
    ("I use Microsoft Office for my daily work", {"entities": [(6, 22, "SOFTWARE")]}),
    ("Microsoft Teams is crashing", {"entities": [(0, 15, "SOFTWARE")]}),
    ("Microsoft Outlook is crashing", {"entities": [(0, 17, "SOFTWARE")]}),
    ("Microsoft Excel throws error 0xC0000142", {"entities": [(0, 15, "SOFTWARE")]})
] # Example adapted from MachineLearningPlus.com article.

In [None]:
ner = nlp.get_pipe("ner")

# TRAINING THE MODEL
# Fine-tune the loaded pipeline on `train_data`, then save it to disk.
#
# Original code from MachineLearningPlus.com article no longer working
# and needed to be adapted to work with spaCy==3.5.3.
# (`minibatch` from `spacy.util` is imported in the notebook's import cell.)

# Disable pipeline components you don't need to change.
pipe_exceptions = ["transformer", "ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.select_pipes(disable=unaffected_pipes):
    for itn in range(20): # Set iterations.
        # NOTE: shuffles `train_data` in place, so its order changes for later cells.
        random.shuffle(train_data)
        losses = {}
        for batch in minibatch(train_data, size=2):
            # Build Example objects from the CURRENT mini-batch only.
            # The previous version iterated over the whole of `train_data` here,
            # so every update saw all examples and the batch size had no effect.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples=examples, losses=losses, drop=0.4)
        # `losses` accumulates over all mini-batches of this iteration.
        print("Iteration:", itn + 1, "Loss:", losses)

# Persist the updated pipeline; the next cell reloads it from this path.
nlp.to_disk("./en_web_core_trf_updated")

In [None]:
# Reload the pipeline that the training cell saved to disk (this rebinds `nlp`),
# then show its components and the labels known to its "ner" component.
nlp = spacy.load("./en_web_core_trf_updated")
print(nlp.pipe_names)
print(nlp.get_pipe("ner").labels)

In [None]:
# Sanity check: run the reloaded pipeline over every training sentence and
# print the recognised spans, followed by each token's entity type and IOB code.
for text, _ in train_data:
    doc = nlp(text)
    span_pairs = [(span.text, span.label_) for span in doc.ents]
    print("Entities", span_pairs)
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print("Tokens", token_rows)


## Using EntityRuler() to directly classify new entities based on a set of rules.


In [None]:
# Add a rule-based entity matcher in front of the statistical "ner" component
# so the listed product names are tagged as SOFTWARE.
#
# Make the cell safe to re-run: `add_pipe` raises if a component named
# "entity_ruler" is already in the pipeline, so drop any ruler left over
# from a previous execution before adding a fresh one.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
rulerSoftwares = nlp.add_pipe("entity_ruler", before="ner")

# Exact-phrase patterns; every entry maps a literal phrase to the SOFTWARE label.
patterns = [
    {"label": "SOFTWARE", "pattern": "Windows"},
    {"label": "SOFTWARE", "pattern": "Windows Defender"},
    {"label": "SOFTWARE", "pattern": "Windows OneDrive"},
    {"label": "SOFTWARE", "pattern": "BitLocker"},
    {"label": "SOFTWARE", "pattern": "BitDefender"},
    {"label": "SOFTWARE", "pattern": "OneDrive"},
    {"label": "SOFTWARE", "pattern": "Windows Firewall"},
    {"label": "SOFTWARE", "pattern": "Windows Server"},
    {"label": "SOFTWARE", "pattern": "Microsoft Teams"},
    {"label": "SOFTWARE", "pattern": "Microsoft"},
    {"label": "SOFTWARE", "pattern": "DocuSign"}
]
rulerSoftwares.add_patterns(patterns)
print(nlp.pipe_names)

In [None]:
# Quick check of the pipeline (entity ruler included) on a single sentence.
# NOTE(review): the ruler pattern is "DocuSign" while the text says "DocuSigns";
# presumably this probes whether the plural form is still tagged - confirm intent.
doc = nlp("DocuSigns is doing its job.")
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Parse the first short description and list its tokens.
doc = nlp(df['short_description'][0])
print(doc.text)
for token in doc:
    # One line per token: text, part-of-speech tag, syntactic dependency.
    fields = (token.text, token.pos_, token.dep_)
    print(*fields)

In [None]:
# Accumulator for the parsed document of every text passed to `to_doc_list`.
# Each text is run through the pipeline once; later cells reuse the stored objects.
doc_list = []


def to_doc_list(text):
    """Parse `text` with the module-level `nlp` pipeline and store the result.

    The parsed document is appended to the module-level `doc_list`.
    Nothing is returned, so applying this function over a Series yields
    a Series of None values.
    """
    parsed = nlp(text)
    doc_list.append(parsed)

In [None]:
# Takes time to generate tokens from each cell's fulltext.
# The displayed result is a Series of None because `to_doc_list` returns nothing;
# the parsed documents accumulate in `doc_list`.
# NOTE: re-running this cell without resetting `doc_list` appends every document again.
df['short_description'].apply(to_doc_list)

In [None]:
# Assign `doc_list` to `doc_series` as a Series object.
# The Series is built from whatever `doc_list` holds at this point, one parsed document per element.
doc_series = pd.Series(doc_list)
doc_series


Structure for masking entities borrowed from: [https://towardsdatascience.com/remove-personal-information-from-a-text-with-python-part-ii-ner-2e6529d409a6](https://towardsdatascience.com/remove-personal-information-from-a-text-with-python-part-ii-ner-2e6529d409a6)


In [None]:
# Structure for masking entities borrowed from: https://towardsdatascience.com/remove-personal-information-from-a-text-with-python-part-ii-ner-2e6529d409a6
# For every parsed document, print the original text followed by a version in
# which each token belonging to a masked entity type is replaced by `<LABEL>`.

# Entity labels to mask. Adjusted tokens to mask vs. original example in TDS article.
masked_labels = {'PERSON', 'PRODUCT', 'MONEY', 'CARDINAL', 'QUANTITY', 'PERCENT', 'SOFTWARE', 'SECURITY'}

for doc in doc_series:
    print(doc)
    pieces = []
    for token in doc:
        if token.ent_type_ in masked_labels:
            # One placeholder per token, so a multi-token entity yields repeated placeholders.
            new_token = " <{}>".format(token.ent_type_)
        elif token.pos_ == "PUNCT":
            # Punctuation is attached to the preceding token without a space.
            new_token = token.text
        else:
            new_token = " {}".format(token.text)
        pieces.append(new_token)
    filtered_string = "".join(pieces)
    # Remove only the separator space that was put in front of the first token.
    # Slicing off the first character unconditionally deleted a real character
    # whenever the text started with punctuation (no leading space is added there).
    if filtered_string.startswith(" "):
        filtered_string = filtered_string[1:]
    print(filtered_string, '\n')

In [None]:
# Remove phone numbers.
# (`re` is imported in the notebook's import cell.)
def remove_phone_numbers(text):
    """Replace phone-number-like digit sequences in `text` with "<PHONE NUMBER>".

    Parameters
    ----------
    text : str
        Text to scrub. Non-string values (e.g. NaN/None from empty spreadsheet
        cells) are returned unchanged instead of raising a TypeError, so the
        function can be applied to a whole DataFrame column.

    Returns
    -------
    str
        `text` with every match replaced, or the original value when it is
        not a string.

    Notes
    -----
    The pattern is deliberately broad: it needs only four digit groups of at
    least one digit each, with optional '-', '.' or whitespace separators, so
    any standalone number of four or more digits is masked as well.
    """
    if not isinstance(text, str):
        return text
    pattern = r"\b(?:\+?\d{1,3}[-.])?\(?\d{1,3}\)?[-.\s]?\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b"
    return re.sub(pattern, "<PHONE NUMBER>", text)

In [None]:
# Preview the phone-number masking on every short description.
# The result is only displayed; it is not assigned, so `df` is left unchanged.
df['short_description'].apply(remove_phone_numbers)

In [None]:
# Remove email addresses.
def remove_email_addresses(text):
    """Replace e-mail addresses in `text` with "<EMAIL>".

    Parameters
    ----------
    text : str
        Text to scrub. Non-string values (e.g. NaN/None from empty spreadsheet
        cells) are returned unchanged instead of raising a TypeError, so the
        function can be applied to a whole DataFrame column.

    Returns
    -------
    str
        `text` with every match replaced, or the original value when it is
        not a string.
    """
    if not isinstance(text, str):
        return text
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    return re.sub(pattern, "<EMAIL>", text)


# Check NER results.



Dictionary accumulator for entities based on entity type


In [None]:
# Label -> number of entity spans carrying that label, across all documents.
ent_dict = {}


def count_ent(doc):
    """Tally the entity labels of `doc` into the module-level `ent_dict`.

    Every span in `doc.ents` increments the counter stored under its label;
    a label seen for the first time starts at 1. Nothing is returned.
    """
    for span in doc.ents:
        label = span.label_
        ent_dict[label] = ent_dict.get(label, 0) + 1


doc_series.apply(count_ent)
ent_dict

In [None]:
# Show the labels known to the pipeline's "ner" component.
print(nlp.get_pipe("ner").labels)


#### PERSON.


In [None]:
# NOTE: rebinds `ent_dict`; from here on it maps PERSON entity text -> occurrence count.
ent_dict = {}


def count_person(doc):
    """Tally the text of every PERSON entity in `doc` into the module-level `ent_dict`.

    Spans with any other label are skipped. Nothing is returned.
    """
    for span in doc.ents:
        if span.label_ != 'PERSON':
            continue
        name = span.text
        ent_dict[name] = ent_dict.get(name, 0) + 1


doc_series.apply(count_person)
# Ten most frequent PERSON strings, highest count first.
sorted(ent_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]


&nbsp;
#### Save the above entity list and counts as a `pd.Series` object.


In [None]:
# Directory for the exported entity counts, relative to the notebook's working directory.
output_dir = r'./PII'

In [None]:
# Export the current `ent_dict` as (key, count) pairs, highest count first.
ner_obj = sorted(ent_dict.items(), key=lambda x: x[1], reverse=True) # specify which dict to save
ner_obj = pd.Series(ner_obj)

# Build the target path with a proper separator. Plain string concatenation
# produced './PIIner_obj.csv', i.e. a file next to, not inside, `output_dir`.
# (`Path` is imported in the notebook's import cell.)
ner_csv_path = Path(output_dir) / 'ner_obj.csv'
# Create the output directory if needed so the write does not fail on a fresh checkout.
ner_csv_path.parent.mkdir(parents=True, exist_ok=True)
ner_obj.to_csv(ner_csv_path, sep=';', encoding='utf-8-sig')