# Identify and anonymize Personally Identifiable Information #

In [75]:
import re
import spacy
from pathlib import Path

In [76]:
# English pipeline; for Dutch text use spacy.load("nl_core_news_sm") instead.
nlp = spacy.load(name="en_core_web_sm")

In [77]:
# Location of the text file whose contents are scanned for PII.
data = Path("example.txt")

In [78]:
# Read the whole file into memory. The encoding is spelled out so the decoded
# text does not depend on the platform's locale default.
with open(data, encoding="utf-8") as f:
    my_text = f.read()

### Identify PII ###

* **name**
* **email**
* **location**
* **phone number**

In [79]:
def get_pii(string):
    """List all proper nouns, email addresses and phone numbers in a string.

    The text is tokenized by the module-level spaCy pipeline ``nlp``. A token
    is reported when spaCy tags it as a proper noun (``PROPN``) or when its
    text matches the phone-number or email pattern below.

    Args:
        string: The text to scan.

    Returns:
        A list with the text of every matching token, in document order.
        A token that occurs several times is listed several times.
    """
    # Raw strings keep the backslashes intact instead of relying on Python
    # passing unknown escapes such as "\d" through unchanged. The patterns
    # are compiled once here rather than looked up again for every token.
    email_regex = re.compile(
        r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
    )
    mob_regex = re.compile(
        r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"
        r"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"
        r"|\d{3}[-\.\s]??\d{4})"
    )

    pii = []
    for token in nlp(string):
        text = token.text
        # All three checks lead to the same action, so they are combined;
        # short-circuiting keeps the original evaluation order.
        if (
            token.pos_ == 'PROPN'
            or mob_regex.search(text)
            or email_regex.search(text)
        ):
            pii.append(text)

    return pii

In [80]:
# Show every PII candidate found in the loaded text.
print(get_pii(my_text))

['Jane', 'Doe', 'Sent', 'Friday', 'September', 'President', 'U.S.', 'U.S.', 'President', 'Donald', 'Trump', 'United', 'States', 'Trump', 'US', 'Willem', 'Post', 'Jerry', 'Christopher', 'Columbus', 'Blvd', 'Philadelphia', '+31612345678', 'Jane']
