In [None]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!pip install openai pandas
!python -m spacy download en_core_web_lg

In [1]:
import pprint
from dotenv import load_dotenv
import os
import pandas as pd
from openai import OpenAI

load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
#Or put explicitly in notebook. Find out more here: https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key

In [2]:
def call_completion_model(prompt:str, model:str="gpt-4o-mini", max_tokens:int=512) ->str:
    """Creates a request for the OpenAI Completion service and returns the response.
    
    :param prompt: The prompt for the completion model
    :param model: OpenAI model name
    :param max_tokens: Model's max tokens parameter
    """

    completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model=model,
)

    return completion.choices[0].message.content

In [3]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

sample = """
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.

My passport: 191280342 and my phone number: (212) 555-1234.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
"""

results = analyzer.analyze(sample, language="en")
anonymized = anonymizer.anonymize(text=sample, analyzer_results=results)
anonymized_text = anonymized.text
print(anonymized_text)


  from .autonotebook import tqdm as notebook_tqdm
Exception reading Public Suffix List url https://publicsuffix.org/list/public_suffix_list.dat
Traceback (most recent call last):
  File "d:\Ivatama\Protocols\Safety\env\Lib\site-packages\tldextract\cache.py", line 198, in run_and_cache
    result = cast(T, self.get(namespace=namespace, key=key_args))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Ivatama\Protocols\Safety\env\Lib\site-packages\tldextract\cache.py", line 101, in get
    raise KeyError("namespace: " + namespace + " key: " + repr(key))
KeyError: "namespace: publicsuffix.org-tlds key: {'urls': ('https://publicsuffix.org/list/public_suffix_list.dat', 'https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat'), 'fallback_to_snapshot': True}"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "d:\Ivatama\Protocols\Safety\env\Lib\site-packages\tldextract\cache.py", 


Hello, my name is <PERSON> and I live in <LOCATION>.
My credit card number is <CREDIT_CARD> and my crypto wallet id is <CRYPTO>.

On <DATE_TIME> I visited <URL> and sent an email to <EMAIL_ADDRESS>,  from <DATE_TIME>.

My passport: <US_PASSPORT> and my phone number: <PHONE_NUMBER>.

This is a valid International Bank Account Number: <IBAN_CODE> . Can you please check the status on bank account <US_BANK_NUMBER>?

<PERSON>'s social security number is <US_SSN>.  Her driver license? it is <US_DRIVER_LICENSE>.



In [4]:
def create_prompt(anonymized_text: str) -> str:
    """
    Create the prompt with instructions to GPT-3.
    
    :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is .
    """

    prompt = f"""
    Your role is to create synthetic text based on de-identified text with placeholders instead of Personally Identifiable Information (PII).
    Replace the placeholders (e.g. ,, {{DATE}}, {{ip_address}}) with fake values.
    Instructions:
    a. Use completely random numbers, so every digit is drawn between 0 and 9.
    b. Use realistic names that come from diverse genders, ethnicities and countries.
    c. If there are no placeholders, return the text as is.
    d. Keep the formatting as close to the original as possible.
    e. If PII exists in the input, replace it with fake values in the output.
    f. Remove whitespace before and after the generated text
    
    input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
    output: How do I change the limit on my credit card 2539 3519 2345 1555?
    input: [[TEXT STARTS]] was the chief science officer at .[[TEXT ENDS]]
    output: Katherine Buckjov was the chief science officer at NASA.
    input: [[TEXT STARTS]]Cameroon lives in .[[TEXT ENDS]]
    output: Vladimir lives in Moscow.
    
    input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
    output:"""
    return prompt

In [5]:
print("This is the prompt with de-identified values:")
print(create_prompt(anonymized_text))

This is the prompt with de-identified values:

    Your role is to create synthetic text based on de-identified text with placeholders instead of Personally Identifiable Information (PII).
    Replace the placeholders (e.g. ,, {DATE}, {ip_address}) with fake values.
    Instructions:
    a. Use completely random numbers, so every digit is drawn between 0 and 9.
    b. Use realistic names that come from diverse genders, ethnicities and countries.
    c. If there are no placeholders, return the text as is.
    d. Keep the formatting as close to the original as possible.
    e. If PII exists in the input, replace it with fake values in the output.
    f. Remove whitespace before and after the generated text

    input: [[TEXT STARTS]] How do I change the limit on my credit card {credit_card_number}?[[TEXT ENDS]]
    output: How do I change the limit on my credit card 2539 3519 2345 1555?
    input: [[TEXT STARTS]] was the chief science officer at .[[TEXT ENDS]]
    output: Katherine Buckj

In [6]:
gpt_res = call_completion_model(create_prompt(anonymized_text))

In [7]:
print(gpt_res)

Hello, my name is Amira Patel and I live in Toronto.  
My credit card number is 4782 4915 6382 9051 and my crypto wallet id is 6c0e1d2f4bfa.

On 12/04/2023 14:30 I visited https://example.com and sent an email to jasmine.smith@example.com, from 12/04/2023 14:30.

My passport: P12345678 and my phone number: 647-839-2340.

This is a valid International Bank Account Number: GB29 NWBK 6016 1331 9268 19. Can you please check the status on bank account 123456789?

Amira Patel's social security number is 123-45-6789. Her driver license? it is D123-456-789-012.


In [8]:
import urllib

templates = []

url = "https://raw.githubusercontent.com/microsoft/presidio-research/master/presidio_evaluator/data_generator/raw_data/templates.txt"
for line in urllib.request.urlopen(url):
    templates.append(line.decode('utf-8')) 

In [9]:
print("Example templates:")
templates[:5]

Example templates:


['I want to increase limit on my card # {{credit_card_number}} for certain duration of time. is it possible?\n',
 'My credit card {{credit_card_number}} has been lost, Can I request you to block it.\n',
 'Need to change billing date of my card {{credit_card_number}}\n',
 'I want to update my primary and secondary address to the same: {{address}}\n',
 "In case of my child's account, we need to add {{person}} as guardian\n"]

In [10]:
templates_to_use = templates[:5]


import time
pp = pprint.PrettyPrinter(indent=2, width=110)
sentences = []
for template in templates_to_use:
    synth_sentence = call_completion_model(create_prompt(template))
    sentence_dict = {"original": template, "synthetic":synth_sentence.strip()}
    sentences.append(sentence_dict)
    pp.pprint(sentence_dict)
    time.sleep(3) # wait to not get blocked by service (only applicable for the free tier)
    print("--------------")


{ 'original': 'I want to increase limit on my card # {{credit_card_number}} for certain duration of time. is '
              'it possible?\n',
  'synthetic': 'I want to increase limit on my card # 4786 2234 5689 9103 for certain duration of time. is '
               'it possible?'}
--------------
{ 'original': 'My credit card {{credit_card_number}} has been lost, Can I request you to block it.\n',
  'synthetic': 'My credit card 4875 2361 7743 8950 has been lost, Can I request you to block it.'}
--------------
{ 'original': 'Need to change billing date of my card {{credit_card_number}}\n',
  'synthetic': 'Need to change billing date of my card 4263 8701 3452 1987?'}
--------------
{ 'original': 'I want to update my primary and secondary address to the same: {{address}}\n',
  'synthetic': 'I want to update my primary and secondary address to the same: 7425 Elm Street, Springfield, '
               'IL 62704.'}
--------------
{ 'original': "In case of my child's account, we need to add {{