In [1]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


^C


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.8/400.7 MB 2.8 MB/s eta 0:02:23
     ---------------------------------------- 1.0/400.7 MB 2.7 MB/s eta 0:02:31
     ---------------------------------------- 1.6/400.7 MB 2.2 MB/s eta 0:03:06
     ---------------------------------------- 2.1/400.7 MB 2.3 MB/s eta 0:02:53
     ---------------------------------------- 2.6/400.7 MB 2.4 MB/s eta 0:02:44
     ---------------------------------------- 3.4/400.7 MB 2.5 MB/s eta 0:02:38
     ---------------------------------------- 3.9/400.7 MB 2.5 MB/s eta 0:02:39
     ---------------------------------------- 4.7/400.7 MB 2.6 MB/s eta 0:02:30
      -------------------------------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine, OperatorConfig
from presidio_anonymizer.operators import Operator, OperatorType

from typing import Dict
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
text = "peter gave his book to heidi which later gave it to nicole. peter lives in london and nicole lives in tashkent."
print("original text:")
pprint(text)
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text, language="en")
print("analyzer results:")
pprint(analyzer_results)


original text:
('peter gave his book to heidi which later gave it to nicole. peter lives in '
 'london and nicole lives in tashkent.')
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
analyzer results:
[type: PERSON, start: 0, end: 5, score: 0.85,
 type: PERSON, start: 23, end: 28, score: 0.85,
 type: PERSON, start: 52, end: 58, score: 0.85,
 type: PERSON, start: 60, end: 65, score: 0.85,
 type: LOCATION, start: 75, end: 81, score: 0.85,
 type: PERSON, start: 86, end: 92, score: 0.85,
 type: LOCATION, start: 102, end: 110, score: 0.85]


In [4]:
class InstanceCounterAnonymizer(Operator):
    """
    Anonymizer which replaces the entity value
    with an instance counter per entity.
    """

    REPLACING_FORMAT = "<{entity_type}_{index}>"

    def operate(self, text: str, params: Dict = None) -> str:
        """Anonymize the input text."""

        entity_type: str = params["entity_type"]

        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping: Dict[Dict:str] = params["entity_mapping"]

        entity_mapping_for_type = entity_mapping.get(entity_type)
        if not entity_mapping_for_type:
            new_text = self.REPLACING_FORMAT.format(
                entity_type=entity_type, index=0
            )
            entity_mapping[entity_type] = {}

        else:
            if text in entity_mapping_for_type:
                return entity_mapping_for_type[text]

            previous_index = self._get_last_index(entity_mapping_for_type)
            new_text = self.REPLACING_FORMAT.format(
                entity_type=entity_type, index=previous_index + 1
            )

        entity_mapping[entity_type][text] = new_text
        return new_text

    @staticmethod
    def _get_last_index(entity_mapping_for_type: Dict) -> int:
        """Get the last index for a given entity type."""
        return len(entity_mapping_for_type)

    def validate(self, params: Dict = None) -> None:
        """Validate operator parameters."""

        if "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        return "entity_counter"

    def operator_type(self) -> OperatorType:
        return OperatorType.Anonymize

In [5]:
# Create Anonymizer engine and add the custom anonymizer
anonymizer_engine = AnonymizerEngine()
anonymizer_engine.add_anonymizer(InstanceCounterAnonymizer)

# Create a mapping between entity types and counters
entity_mapping = dict()

# Anonymize the text

anonymized_result = anonymizer_engine.anonymize(
    text,
    analyzer_results,
    {
        "DEFAULT": OperatorConfig(
            "entity_counter", {"entity_mapping": entity_mapping}
        )
    },
)

print(anonymized_result.text)


<PERSON_2> gave his book to <PERSON_3> which later gave it to <PERSON_0>. <PERSON_2> lives in <LOCATION_2> and <PERSON_0> lives in <LOCATION_0>.


In [6]:
pprint(entity_mapping, indent=2)

{ 'LOCATION': {'london': '<LOCATION_2>', 'tashkent': '<LOCATION_0>'},
  'PERSON': { 'heidi': '<PERSON_3>',
              'nicole': '<PERSON_0>',
              'peter': '<PERSON_2>'}}


In [7]:
class InstanceCounterDeanonymizer(Operator):
    """
    Deanonymizer which replaces the unique identifier 
    with the original text.
    """

    def operate(self, text: str, params: Dict = None) -> str:
        """Anonymize the input text."""

        entity_type: str = params["entity_type"]

        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping: Dict[Dict:str] = params["entity_mapping"]

        if entity_type not in entity_mapping:
            raise ValueError(f"Entity type {entity_type} not found in entity mapping!")
        if text not in entity_mapping[entity_type].values():
            raise ValueError(f"Text {text} not found in entity mapping for entity type {entity_type}!")

        return self._find_key_by_value(entity_mapping[entity_type], text)

    @staticmethod
    def _find_key_by_value(entity_mapping, value):
        for key, val in entity_mapping.items():
            if val == value:
                return key
        return None
    
    def validate(self, params: Dict = None) -> None:
        """Validate operator parameters."""

        if "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        return "entity_counter_deanonymizer"

    def operator_type(self) -> OperatorType:
        return OperatorType.Deanonymize


In [9]:
text="<PERSON_2> gave his book to <PERSON_1> at <LOCATION_0>"

In [10]:
deanonymizer_engine = DeanonymizeEngine()
deanonymizer_engine.add_deanonymizer(InstanceCounterDeanonymizer)

deanonymized = deanonymizer_engine.deanonymize(
    text, 
    anonymized_result.items, 
    {"DEFAULT": OperatorConfig("entity_counter_deanonymizer", 
                               params={"entity_mapping": entity_mapping})}
)
print("anonymized text:")
pprint(anonymized_result.text)
print("de-anonymized text:")
pprint(deanonymized.text)

InvalidParamError: Invalid analyzer result, start: 131 and end: 143, while text length is only 54.