In [384]:
# cd "Desktop/MachineLearning/Final_Project"
import pandas as pd 
import numpy as np
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import PatternRecognizer
from presidio_image_redactor import ImageRedactorEngine
from PIL import Image


In [385]:
#df_str = str(df.to_dict(orient="list"))
#analyzer_results = analyzer.analyze(text=df_str, language='en')
#pd.DataFrame.from_dict(analyzer_results, orient='index')

In [386]:
from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint

import pandas as pd

from presidio_analyzer import AnalyzerEngine, RecognizerResult
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import EngineResult

In [569]:
# Based on the Presidio batch-processing sample:
# https://microsoft.github.io/presidio/samples/python/batch_processing/
@dataclass
class DictAnalyzerResult:
    """
    Hold the analyzer results per value or list of values.

    One instance is yielded by `BatchAnalyzerEngine.analyze_dict` for every
    entry of the analyzed dictionary.
    """
    # Dictionary key the value was stored under.
    key: str
    # The value found under `key`. Annotated as str / list of str, but
    # `analyze_dict` stores whatever the input dictionary held, unchanged.
    value: Union[str, List[str]]
    # Findings for `value`: a flat list when `value` is a single string, or a
    # list of lists (one inner list per element) when `value` is an iterable.
    recognizer_results: Union[List[RecognizerResult], List[List[RecognizerResult]]]


class BatchAnalyzerEngine(AnalyzerEngine):
    """
    Class inheriting from AnalyzerEngine and adding the functionality to analyze lists or dictionaries.
    """

    def analyze_list(self, list_of_texts: Iterable[str], **kwargs) -> List[List[RecognizerResult]]:
        """
        Analyze an iterable of strings.

        :param list_of_texts: An iterable containing strings to be analyzed.
            Elements that are not `str` are not analyzed; they get an empty
            result list so the output stays aligned with the input.
        :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
        :return: One list of `RecognizerResult` per input element, in input order.
        """
        return [
            self.analyze(text=text, **kwargs) if isinstance(text, str) else []
            for text in list_of_texts
        ]

    def analyze_dict(
        self, input_dict: Dict[str, Union[object, Iterable[object]]], **kwargs
    ) -> Iterator[DictAnalyzerResult]:
        """
        Analyze a dictionary of keys (strings) and values (either object or Iterable[object]).
        Non-string values are returned as is.

        This is a generator: entries are analyzed lazily, one per iteration step.

        :param input_dict: The input dictionary for analysis
        :param kwargs: Additional keyword arguments for the `AnalyzerEngine.analyze` method
        :return: Yields one `DictAnalyzerResult` per dictionary entry. Its
            `recognizer_results` is a flat list for a string value, a list of
            lists for an iterable value, and an empty list for anything else.
        """
        for key, value in input_dict.items():
            # Dispatch on type rather than on truthiness: `not value` raises
            # ValueError for array-like values (numpy arrays, pandas Series)
            # because their truth value is ambiguous.
            if isinstance(value, str):
                # An empty string cannot contain findings; skip the engine call.
                results = self.analyze(text=value, **kwargs) if value else []
            elif isinstance(value, collections.abc.Iterable):
                # An empty iterable naturally produces an empty result list.
                results = self.analyze_list(list_of_texts=value, **kwargs)
            else:
                # None, numbers, booleans and other scalars are not analyzed.
                results = []
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)


class BatchAnonymizerEngine(AnonymizerEngine):
    """
    Class inheriting from the AnonymizerEngine and adding additional functionality
    for anonymizing lists or dictionaries.
    """

    def anonymize_list(
        self,
        texts:List[str],
        recognizer_results_list: List[List[RecognizerResult]],
        **kwargs
    ) -> List[object]:
        """
        Anonymize a list of strings.

        :param texts: List containing the texts to be anonymized (original texts)
        :param recognizer_results_list: A list of lists of RecognizerResult,
        the output of the AnalyzerEngine on each text in the list.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A list aligned with `texts`: the anonymized text (a `str`, not
        an `EngineResult`) for every string element, and the original element
        unchanged for every non-string element. The two inputs are paired with
        `zip`, so the output stops at the shorter of the two.
        """
        return_list = []
        for text, recognizer_results in zip(texts, recognizer_results_list):
            if not isinstance(text, str):
                # Non-text cells (numbers, None, ...) are passed through untouched.
                return_list.append(text)
                continue
            res = self.anonymize(text=text, analyzer_results=recognizer_results, **kwargs)
            # Keep only the anonymized text, not the full engine result object.
            return_list.append(res.text)

        return return_list


    def anonymize_dict(self, analyzer_results: Iterator[DictAnalyzerResult],**kwargs) -> Dict[str, object]:
        """
        Anonymize values in a dictionary.

        :param analyzer_results: Iterator of `DictAnalyzerResult`
        containing the output of the AnalyzerEngine.analyze_dict on the input text.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A dictionary with the same keys as the analyzed input. String
        values are replaced by their anonymized text, iterable values by the
        list produced by `anonymize_list`, and all other values are copied as is.
        """
        return_dict = {}
        for result in analyzer_results:
            if isinstance(result.value, str):
                resp = self.anonymize(text=result.value, analyzer_results=result.recognizer_results, **kwargs)
                return_dict[result.key] = resp.text
            elif isinstance(result.value, collections.abc.Iterable):
                anonymize_responses = self.anonymize_list(texts=result.value,
                                                          recognizer_results_list=result.recognizer_results,
                                                          **kwargs)
                return_dict[result.key] = anonymize_responses
            else:
                return_dict[result.key] = result.value

        return return_dict

In [523]:
# Load the tab-separated PII sample. The first file column is read as the
# index and then dropped in favour of a fresh 0..n-1 index.
df = (
    pd.read_csv("pii_data.txt", sep="\t", index_col=0)
    .reset_index(drop=True)
)
# Column-oriented export: {column name: list of cell values}.
df_dict = df.to_dict(orient="list")
#df_dict

In [634]:
# Inspect each column's dtype. Only `str` cells are analyzed by
# BatchAnalyzerEngine.analyze_list, so numeric columns produce no findings.
df.dtypes

ID                         int64
CREATED_BY                object
CREATED_ON                object
UPDATED_BY                object
UPDATED_ON                object
DELETED                    int64
DISABLED                   int64
ADDEDFROMUI                int64
UNIQUEKEY                 object
ISCURRENT                  int64
NAME                      object
OPTLOCKVER                 int64
VALID_FROM                object
VALID_TILL                object
INACTIVE                   int64
ISADMIN                    int64
LOCKED                     int64
PASSWORD                  object
USERNAME                  object
SUPERVISOR                 int64
FIRST_NAME                object
IS_MANAGER                 int64
LAST_NAME                 object
MANAGER                    int64
ASSET_ID                   int64
phone numbers              int64
Social Security number    object
DOB                       object
email                     object
address                   object
city      

In [635]:
# Cast ZIP codes to text: the batch analyzer only scans `str` values, so a
# numeric column would never be matched by the ZIP regex.
# NOTE(review): if `zip` was parsed as an integer, leading zeros are already
# lost at this point (e.g. 07030 -> "7030") and a 5-digit pattern cannot match
# those rows. Consider reading the column as text (dtype={"zip": str}) or
# zero-padding it. TODO confirm the parsed dtype.
df['zip'] = df['zip'].astype(str)

In [651]:
# Display the dtypes again after casting `zip` to str.
df.dtypes

ID                         int64
CREATED_BY                object
CREATED_ON                object
UPDATED_BY                object
UPDATED_ON                object
DELETED                    int64
DISABLED                   int64
ADDEDFROMUI                int64
UNIQUEKEY                 object
ISCURRENT                  int64
NAME                      object
OPTLOCKVER                 int64
VALID_FROM                object
VALID_TILL                object
INACTIVE                   int64
ISADMIN                    int64
LOCKED                     int64
PASSWORD                  object
USERNAME                  object
SUPERVISOR                 int64
FIRST_NAME                object
IS_MANAGER                 int64
LAST_NAME                 object
MANAGER                    int64
ASSET_ID                   int64
phone numbers              int64
Social Security number    object
DOB                       object
email                     object
address                   object
city      

In [652]:
# Summary statistics for every column, numeric and non-numeric (include='all').
df.describe(include='all')

Unnamed: 0,ID,CREATED_BY,CREATED_ON,UPDATED_BY,UPDATED_ON,DELETED,DISABLED,ADDEDFROMUI,UNIQUEKEY,ISCURRENT,...,MANAGER,ASSET_ID,phone numbers,Social Security number,DOB,email,address,city,state,zip
count,999.0,999,999,999,999,999.0,999.0,999.0,999,999.0,...,999.0,999.0,999.0,999,999,999,999,999,999,999.0
unique,,1,1,1,1,,,,999,,...,,,,999,959,969,500,342,47,451.0
top,,gladmin,12/10/2020 14:43,gladmin,12/10/2020 14:43,,,,26d04379f776b97fb53dfe78ff146a95,,...,,,,835-32-2618,3/27/1970,rcabrera@pth.com,6649 N Blue Gum St,New York,CA,90248.0
freq,,999,999,999,999,,,,1,,...,,,,1,3,12,2,28,144,8.0
mean,501.406406,,,,,0.0,0.0,0.0,,0.0,...,100045.305305,4.0,5355331000.0,,,,,,,
std,289.223338,,,,,0.0,0.0,0.0,,0.0,...,27.724704,0.0,1803853000.0,,,,,,,
min,2.0,,,,,0.0,0.0,0.0,,0.0,...,100027.0,4.0,2012479000.0,,,,,,,
25%,251.5,,,,,0.0,0.0,0.0,,0.0,...,100028.0,4.0,5046219000.0,,,,,,,
50%,501.0,,,,,0.0,0.0,0.0,,0.0,...,100029.0,4.0,5046219000.0,,,,,,,
75%,750.5,,,,,0.0,0.0,0.0,,0.0,...,100058.0,4.0,5361004000.0,,,,,,,


In [661]:
# Distinct values present in the `state` column of the PII data.
df['state'].unique()

array(['LA', 'MI', 'NJ', 'AK', 'OH', 'IL', 'CA', 'SD', 'MD', 'PA', 'NY',
       'TX', 'AZ', 'TN', 'WI', 'KS', 'NM', 'OR', 'FL', 'MN', 'MA', 'SC',
       'RI', 'CO', 'ID', 'NC', 'IN', 'WY', 'VA', 'HI', 'GA', 'AR', 'NV',
       'ME', 'WA', 'MS', 'CT', 'MO', 'NH', 'ND', 'MT', 'IA', 'OK', 'KY',
       'UT', 'NE', 'DC'], dtype=object)

In [653]:
# (rows, columns) of the PII frame.
df.shape

(999, 33)

In [654]:
# Boolean mask of missing cells, reused for both summaries below.
null_mask = df.isna()
# Total number of missing cells across the whole frame.
print(null_mask.sum().sum())
# Names of the columns that contain at least one missing value.
print(df.columns[null_mask.any()].tolist())

132
['FIRST_NAME']


In [681]:
# Reference table of US cities / states / counties (pipe-delimited file),
# used further down to build the STATE and CITY deny lists.
location_list = (
    pd.read_csv("us_cities_states_counties.csv", sep='|')
    .reset_index(drop=True)
)
location_list.head()

Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville
2,Adjuntas,PR,Puerto Rico,ADJUNTAS,URB San Joaquin
3,Adjuntas,PR,Puerto Rico,ADJUNTAS,Jard De Adjuntas
4,Adjuntas,PR,Puerto Rico,ADJUNTAS,Colinas Del Gigante


In [709]:
# Number of distinct state abbreviations in the reference table.
location_list['State short'].nunique()

61

In [719]:
# `Pattern` is otherwise only imported in the later "Testing" section, so on a
# fresh kernel (Restart & Run All) this cell failed with NameError. Import it
# here so the cell runs top-to-bottom on its own.
from presidio_analyzer import Pattern, PatternRecognizer

batch_analyzer = BatchAnalyzerEngine()

# ZIP code recognizer: five digits, optionally followed by "-" and four more
# digits. Only `str` cells are analyzed, so the zip column must be text.
zip_pattern = Pattern(name="zip_pattern", regex='(\\b\\d{5}(?:\\-\\d{4})?\\b)', score=0.5)
zip_recognizer = PatternRecognizer(supported_entity="ZIPCODE", patterns=[zip_pattern])
batch_analyzer.registry.add_recognizer(zip_recognizer)

# State recognizer: deny list of the distinct state abbreviations found in the
# reference table (missing values removed first).
state_recognizer = PatternRecognizer(
    supported_entity="STATE",
    deny_list=list(location_list['State short'].dropna().unique()),
)
batch_analyzer.registry.add_recognizer(state_recognizer)

# City recognizer: deny list of the distinct city names in the reference table.
city_recognizer = PatternRecognizer(
    supported_entity="CITY",
    deny_list=list(location_list['City'].dropna().unique()),
)
batch_analyzer.registry.add_recognizer(city_recognizer)

In [720]:
# Re-export the frame as {column name: list of cell values} so the dictionary
# reflects the current state of `df`.
df_dict = df.to_dict(orient="list") # DataFrame converted to a dictionary
# analyze_dict is a generator: nothing is analyzed until it is iterated, and
# it can be consumed only once.
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
# Materialize the generator: one row per analyzed column, with the fields
# key / value / recognizer_results.
analyzer_df = pd.DataFrame(analyzer_results) # converted into a DataFrame
analyzer_df

Unnamed: 0,key,value,recognizer_results
0,ID,"[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 4...","[[], [], [], [], [], [], [], [], [], [], [], [..."
1,CREATED_BY,"[gladmin, gladmin, gladmin, gladmin, gladmin, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
2,CREATED_ON,"[12/10/2020 14:43, 12/10/2020 14:43, 12/10/202...","[[type: DATE_TIME, start: 0, end: 16, score: 0..."
3,UPDATED_BY,"[gladmin, gladmin, gladmin, gladmin, gladmin, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
4,UPDATED_ON,"[12/10/2020 14:43, 12/10/2020 14:43, 12/10/202...","[[type: DATE_TIME, start: 0, end: 16, score: 0..."
5,DELETED,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
6,DISABLED,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
7,ADDEDFROMUI,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
8,UNIQUEKEY,"[26d04379f776b97fb53dfe78ff146a95, 75851ebf88d...","[[], [], [], [], [], [type: LOCATION, start: 0..."
9,ISCURRENT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."


In [528]:
#print(analyzer_df['key'][29])
#print(analyzer_df['value'][29])
#print(analyzer_df['recognizer_results'][29])

#print(analyzer_df['value'][29][10])
#print(analyzer_df['recognizer_results'][29][10])
#print('PERSON' in str(analyzer_df['recognizer_results'][29][10]))

In [721]:
# Tally the findings for the `address` column (row 29 of analyzer_df).
col = 29
# Counts are taken on the printed form of the results, where every finding
# shows up as "type: <ENTITY>, start: ..., end: ..., score: ...".
results_repr = str(analyzer_df['recognizer_results'][col])
for label, needle in (
    ('PERSON:', 'PERSON'),
    ('LOCATION:', 'LOCATION'),
    ('DATE_TIME:', 'DATE_TIME'),
    ('ZIP CODE:', 'ZIPCODE'),
    ('STATE:', 'STATE'),
    ('CITY:', 'CITY'),
    ('Empty []:', '[]'),
    ("Total Types:", 'type'),
):
    print(label, results_repr.count(needle))

PERSON: 162
LOCATION: 168
DATE_TIME: 74
ZIP CODE: 176
STATE: 0
CITY: 583
Empty []: 258
Total Types: 1169


In [722]:
# Tally the findings for the `city` column (row 30 of analyzer_df).
col = 30
# Counts are taken on the printed form of the results, where every finding
# shows up as "type: <ENTITY>, start: ..., end: ..., score: ...".
results_repr = str(analyzer_df['recognizer_results'][col])
for label, needle in (
    ('PERSON:', 'PERSON'),
    ('LOCATION:', 'LOCATION'),
    ('DATE_TIME:', 'DATE_TIME'),
    ('ZIP CODE:', 'ZIPCODE'),
    ('STATE:', 'STATE'),
    ('CITY:', 'CITY'),
    ('Empty []:', '[]'),
    ("Total Types:", 'type'),
):
    print(label, results_repr.count(needle))

PERSON: 104
LOCATION: 743
DATE_TIME: 0
ZIP CODE: 0
STATE: 0
CITY: 1025
Empty []: 0
Total Types: 1872


In [723]:
# Tally the findings for the `state` column (row 31 of analyzer_df).
col = 31
# Counts are taken on the printed form of the results, where every finding
# shows up as "type: <ENTITY>, start: ..., end: ..., score: ...".
results_repr = str(analyzer_df['recognizer_results'][col])
for label, needle in (
    ('PERSON:', 'PERSON'),
    ('LOCATION:', 'LOCATION'),
    ('DATE_TIME:', 'DATE_TIME'),
    ('ZIP CODE:', 'ZIPCODE'),
    ('STATE:', 'STATE'),
    ('CITY:', 'CITY'),
    ('Empty []:', '[]'),
    ("Total Types:", 'type'),
):
    print(label, results_repr.count(needle))

PERSON: 28
LOCATION: 521
DATE_TIME: 0
ZIP CODE: 0
STATE: 999
CITY: 20
Empty []: 0
Total Types: 1568


In [672]:
# Tally the findings for the `zip` column (row 32 of analyzer_df).
col = 32
# Counts are taken on the printed form of the results, where every finding
# shows up as "type: <ENTITY>, start: ..., end: ..., score: ...".
results_repr = str(analyzer_df['recognizer_results'][col])
for label, needle in (
    ('PERSON:', 'PERSON'),
    ('LOCATION:', 'LOCATION'),
    ('DATE_TIME:', 'DATE_TIME'),
    ('ZIP CODE:', 'ZIPCODE'),
    ('STATE:', 'STATE'),
    ('Empty []:', '[]'),
    ("Total Types:", 'type'),
):
    print(label, results_repr.count(needle))

PERSON: 20
LOCATION: 0
DATE_TIME: 336
ZIP CODE: 843
STATE: 0
Empty []: 94
Total Types: 1199


## Testing

In [598]:
# Customizing Presidio with custom recognizers, see:
# https://microsoft.github.io/presidio/samples/python/customizing_presidio_analyzer/

from presidio_analyzer import Pattern, PatternRecognizer

# Regex pattern wrapped in a Presidio `Pattern`: five digits, optionally
# followed by "-" and four more digits.
zip_pattern = Pattern(
    name="zip_pattern",
    regex="(\\b\\d{5}(?:\\-\\d{4})?\\b)",
    score=0.5,
)

# A recognizer may carry one or more patterns; here it has just the ZIP one.
zip_recognizer = PatternRecognizer(
    supported_entity="ZIPCODE",
    patterns=[zip_pattern],
)
text2 = "My address is 99501"

# Step 1: call the recognizer directly, outside of any engine.
zip_result = zip_recognizer.analyze(text=text2, entities=["ZIPCODE"])
print(zip_result)

# Step 2: register the recognizer in a fresh engine and analyze the same text
# without an `entities` filter, so every registered recognizer takes part.
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(zip_recognizer)

results = analyzer.analyze(text=text2, language='en')
print(results)

[type: ZIPCODE, start: 14, end: 19, score: 0.5]
[type: ZIPCODE, start: 14, end: 19, score: 0.5]


In [718]:
# Fresh engine for this check.
analyzer = AnalyzerEngine()

# Deny-list recognizers built from the distinct, non-missing values of the
# reference table: state abbreviations and city names.
state_recognizer = PatternRecognizer(
    supported_entity="STATE",
    deny_list=list(location_list['State short'].dropna().unique()),
)
city_recognizer = PatternRecognizer(
    supported_entity="CITY",
    deny_list=list(location_list['City'].dropna().unique()),
)

# Register both recognizers (state first, then city).
analyzer.registry.add_recognizer(state_recognizer)
analyzer.registry.add_recognizer(city_recognizer)

# Ask only for CITY findings on a sentence that contains a known city name.
results = analyzer.analyze(
    text="I live in New York",
    entities=["CITY"],
    language='en',
)
print(results)

[type: CITY, start: 10, end: 18, score: 1.0]


In [714]:
# Trial run on the first 100 rows only, with a separate engine instance.
df100 = df.head(100)
df_dict100 = df100.to_dict(orient="list")

batch_analyzer100 = BatchAnalyzerEngine()

# Build the three custom recognizers: ZIP code (regex), state and city
# (deny lists taken from the reference table).
zip_pattern = Pattern(
    name="zip_pattern",
    regex='(\\b\\d{5}(?:\\-\\d{4})?\\b)',
    score=0.5,
)
zip_recognizer = PatternRecognizer(
    supported_entity="ZIPCODE",
    patterns=[zip_pattern],
)
state_recognizer = PatternRecognizer(
    supported_entity="STATE",
    deny_list=list(location_list['State short'].dropna().unique()),
)
city_recognizer = PatternRecognizer(
    supported_entity="CITY",
    deny_list=list(location_list['City'].dropna().unique()),
)

# Register them in the same order as before: ZIP, state, city.
batch_analyzer100.registry.add_recognizer(zip_recognizer)
batch_analyzer100.registry.add_recognizer(state_recognizer)
batch_analyzer100.registry.add_recognizer(city_recognizer)

# batch_analyzer100.get_supported_entities(language='en') lists the entity
# types the engine knows, which is a quick way to confirm the additions.
analyzer_results100 = batch_analyzer100.analyze_dict(df_dict100, language="en")
analyzer_df100 = pd.DataFrame(analyzer_results100)
analyzer_df100
#print('Empty []:', str(analyzer_df100['recognizer_results'][32]).count('[]'))

Unnamed: 0,key,value,recognizer_results
0,ID,"[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 4...","[[], [], [], [], [], [], [], [], [], [], [], [..."
1,CREATED_BY,"[gladmin, gladmin, gladmin, gladmin, gladmin, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
2,CREATED_ON,"[12/10/2020 14:43, 12/10/2020 14:43, 12/10/202...","[[type: DATE_TIME, start: 0, end: 16, score: 0..."
3,UPDATED_BY,"[gladmin, gladmin, gladmin, gladmin, gladmin, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
4,UPDATED_ON,"[12/10/2020 14:43, 12/10/2020 14:43, 12/10/202...","[[type: DATE_TIME, start: 0, end: 16, score: 0..."
5,DELETED,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
6,DISABLED,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
7,ADDEDFROMUI,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."
8,UNIQUEKEY,"[26d04379f776b97fb53dfe78ff146a95, 75851ebf88d...","[[], [], [], [], [], [type: LOCATION, start: 0..."
9,ISCURRENT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[], [], [], [], [], [], [], [], [], [], [], [..."


In [733]:
# Tiny hand-made input to sanity-check the custom recognizers. It reuses the
# engine configured for the 100-row trial (batch_analyzer100).
test = {
    "zip": ["12345", "12234", "19010", "45678"],
    "name": ["TOM", "Bianca", "Harry", "Mary"],
    "state": ["PA", "NY", "CA", "tx"],
}
test_result = batch_analyzer100.analyze_dict(test, language="en")
test_df = pd.DataFrame(test_result)
test_df

Unnamed: 0,key,value,recognizer_results
0,zip,"[12345, 12234, 19010, 45678]","[[type: DATE_TIME, start: 0, end: 5, score: 0...."
1,name,"[TOM, Bianca, Harry, Mary]","[[type: PERSON, start: 0, end: 3, score: 0.85]..."
2,state,"[PA, NY, CA, tx]","[[type: STATE, start: 0, end: 2, score: 1.0], ..."


In [734]:
# Per-element findings for the `state` entry (row 2 of test_df).
test_df['recognizer_results'][2]

[[type: STATE, start: 0, end: 2, score: 1.0],
 [type: STATE, start: 0, end: 2, score: 1.0,
  type: LOCATION, start: 0, end: 2, score: 0.85],
 [type: STATE, start: 0, end: 2, score: 1.0],
 [type: LOCATION, start: 0, end: 2, score: 0.85]]