# Identify and anonymize Personally Identifiable Information #

In [8]:
import json
import re
import spacy
from pathlib import Path

## Identify PII ##

* **name**
* **email**
* **location**
* **phone number**

In [10]:
# Location of the labeled JSON input. The value looks like a placeholder path;
# point it at the real file before running the cells below.
myfile = Path('path/to/my/file.json')

### Extract from labeled JSON data ###

In [11]:
def read_json(path):
    """Load and parse a JSON file.

    Args:
        path: Location of the JSON file; anything ``open`` accepts
            (``str`` or ``pathlib.Path``).

    Returns:
        The decoded JSON document as Python objects (dict, list, str,
        int, float, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default, so files with
    # non-ASCII characters parse the same way on every machine.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [25]:
# Parse the JSON file at `myfile` into Python objects.
mydata = read_json(myfile)

In [165]:
def extract_values(obj, key):
    """Collect every value stored under ``key`` anywhere in a nested JSON tree.

    Dicts and lists are walked depth-first in their natural order. A value
    found under ``key`` is reported and, if it is itself a dict or list, is
    also searched for further occurrences of ``key``.

    Args:
        obj: Decoded JSON data (dict, list or scalar).
        key: Dictionary key whose values should be gathered.

    Returns:
        A list of the matching values in traversal order.
    """

    def walk(node):
        """Yield the values of ``key`` found in ``node`` and below it."""
        if isinstance(node, dict):
            for name, value in node.items():
                if name == key:
                    yield value
                # Containers are searched even when they were just reported.
                if isinstance(value, (dict, list)):
                    yield from walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))
    

In [174]:
# Gather every value stored under a 'sender' key anywhere in the parsed data.
s = extract_values(mydata,'sender')

In [None]:
# Display the extracted 'sender' values.
print(s)

In [176]:
# Load the spaCy English pipeline used by the cells below.
# For Dutch text, use spacy.load("nl_core_news_sm") instead.
nlp = spacy.load("en_core_web_sm")

In [177]:
# Read the whole of example.txt into one string (decoded with the platform
# default encoding, since none is passed to open()).
with open('example.txt') as f:
    my_text = f.read()

In [181]:
def get_pii(string):
    """List the tokens of a string that look like PII.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. A
    token is reported when it is tagged as a proper noun, or when its text
    matches the phone-number pattern or the e-mail pattern defined below.

    Args:
        string: Text to scan.

    Returns:
        A list of token texts in document order. Duplicates are kept.
    """
    # Raw strings hand the backslashes to the regex engine untouched; written as
    # ordinary strings, "\d", "\." and "\(" are invalid escape sequences.
    # The e-mail pattern is anchored, so the entire token must be an address.
    email_regex = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
    # The phone pattern is not anchored: any token containing a run of seven or
    # more digits (with optional separators) matches.
    mob_regex = re.compile(
        r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    )

    pii = []
    for token in nlp(string):
        text = token.text
        if (
            token.pos_ == 'PROPN'
            or mob_regex.search(text)
            or email_regex.search(text)
        ):
            pii.append(text)

    return pii

In [182]:
# Show the PII candidates found in the sample text.
print(get_pii(my_text))

['Jane', 'Doe', 'Sent', 'Friday', 'September', 'customerservices@abc.ie', 'President', 'U.S.', 'U.S.', 'President', 'Donald', 'Trump', 'United', 'States', 'Trump', 'US', 'Willem', 'Post', 'Jerry', 'Christopher', 'Columbus', 'Blvd', 'Philadelphia', '+31612345678', 'Jane']
