In [1]:
from langchain_core.documents import Document

In [2]:
document_content = """Date: October 19, 2021
 Witness: John Doe
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.

 Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@example.com.

 Please consider this information to be highly confidential and respect my privacy.

 The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, support@bankname.com.
 My representative there is Victoria Cherry (her business phone: 987-654-3210).

 Thank you for your assistance,

 John Doe"""


Create Document object

In [3]:
documents= [Document(page_content=document_content)]

In [4]:
documents

[Document(page_content="Date: October 19, 2021\n Witness: John Doe\n Subject: Testimony Regarding the Loss of Wallet\n\n Testimony Content:\n\n Hello Officer,\n\n My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.\n\n Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.\n\n Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.\n\n What's more, I had my polish identity card there, with the number ABC123456.\n\n I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.\n\n In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@

* Before moving to Qa we will change the data first and then carry on with next steps
* The above document has many PII values and some of them occur multiple times

In [5]:
## Function for coloring PII markers
### This is only for notebook purpose
import re
def print_colored_pii(string):
    colored_string= re.sub(
        r"(<[^>]*>)", lambda m: "\033[31m" + m.group(1) + "\033[0m", string
    )
    print(colored_string)

### Lets anonymize data

In [6]:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer


In [7]:
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators=False,
)
print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME_2>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_NUMBER>[0m,

### Look at values vs mapping

In [8]:
import pprint
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021', '<DATE_TIME_2>': '9:30 AM'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


The above mapping has same tag for date as well as time.
1. We can add different tags for date and for time as well.
2. The id number which is said as Polish Id seems to be un-annonymized. Lets add that as well into configuration.

- Solution: We will add new recognizers to anonymizer.

### Adding new recognizers

In [9]:
from presidio_analyzer import Pattern, PatternRecognizer

## Define pattern for Polish id
polish_id_pattern = Pattern(
    name= "polish_id_pattern",
    regex= "[A-Z]{3}\d{6}",
    score=1    
)
##Defining pattern for time 
time_pattern= Pattern(
    name="time_pattern",
    regex="(1[0-2]|0?[1-9]):[0-5][0-9] (AM/PM)",
    score=1
)

##Definig recognizer with one or more patterns
polish_id_recognizer= PatternRecognizer(
    supported_entity="POLISH_ID",
    patterns= [polish_id_pattern]
)
time_recognizer= PatternRecognizer(supported_entity="TIME", patterns=[time_pattern])

### Adding recognizers to anonymizers

In [10]:
anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

### Now Before applying anonymization concept to the text. It is important to remove recognizer that detects DATAE_TIME for time as it has already ben defined. Else we can reset the entire mapping

In [11]:
anonymizer.reset_deanonymizer_mapping()

##### Anonymize the text and check the results

In [12]:
print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME_2>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_N

In [13]:
import pprint
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021', '<DATE_TIME_2>': '9:30 AM'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'POLISH_ID': {'<POLISH_ID>': 'ABC123456'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


### Now we have completely anonymized the PII values correctly.

- Instead of Anonymization lets use synthetic data to mask PII
- This can be done by adding one more parameter `add_default_faker_operators=True`(by default it is True)

In [14]:
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators=True,
    
    )

anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

In [15]:
print_colored_pii(anonymizer.anonymize(document_content))

Date: 1985-03-20
 Witness: Matthew Thomas
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Matthew Thomas and on 1985-03-20, my wallet was stolen in the vicinity of South Alison during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 060482979212, which is registered under my name and linked to my bank account, GB50KIFE87332308531493.

 Additionally, the wallet had a driver's license - DL No: 132882427 issued to my name. It also houses my Social Security Number, 812-62-9741.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 2015-07-14.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 8778735875, or through my personal email, greenekelli@example.net.

 Please consider this i

* As we can see every PII value has been replaecd with synthetic data except custom recognizers.
* We will create custom operators that deal with respective data such has polish ids
* Using faker module we will fake the data/recognizer

In [16]:
from faker import Faker

In [17]:
## Bothify is combination of numerify and letterify which will add both letter and numericals for the id
fake= Faker()
def fake_polish_id(_=None):
    return fake.bothify(text="???######").upper()

fake_polish_id()

'JDO898775'

In [18]:
### Lets configure fake time as well
def fake_time(_=None):
    return fake.time(pattern="%I:%M %p")

fake_time()


'02:26 PM'

#### Lets add newly created operetors to anonymizer:

In [19]:
from presidio_anonymizer.entities import OperatorConfig

In [20]:
new_operators= {
    "POLISH_ID":OperatorConfig("custom", {"lambda": fake_polish_id}),
    "TIME":OperatorConfig("custom", {"lambda": fake_time})
}
anonymizer.add_operators(new_operators)

In [21]:
anonymizer.reset_deanonymizer_mapping()
print_colored_pii(anonymizer.anonymize(document_content))

Date: 2019-07-13
 Witness: Shannon Miranda
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Shannon Miranda and on 2019-07-13, my wallet was stolen in the vicinity of Underwoodview during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 6011953951709066, which is registered under my name and linked to my bank account, GB23IHGP23730153095518.

 Additionally, the wallet had a driver's license - DL No: 328360869 issued to my name. It also houses my Social Security Number, 894-12-1137.

 What's more, I had my polish identity card there, with the number KLV979168.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 1992-05-08.

 In case any information arises regarding my wallet, please reach out to me on my phone number, +1-803-296-6129x65114, or through my personal email, brittanyramirez@example.net.

 Please cons

In [22]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'6011953951709066': '4111 1111 1111 1111'},
 'DATE_TIME': {'1992-05-08': '9:30 AM', '2019-07-13': 'October 19, 2021'},
 'EMAIL_ADDRESS': {'brittanyramirez@example.net': 'johndoe@example.com',
                   'susan93@example.com': 'support@bankname.com'},
 'IBAN_CODE': {'GB23IHGP23730153095518': 'PL61109010140000071219812874'},
 'LOCATION': {'Underwoodview': 'Kilmarnock'},
 'PERSON': {'Amy Adams': 'Victoria Cherry', 'Shannon Miranda': 'John Doe'},
 'PHONE_NUMBER': {'+1-803-296-6129x65114': '999-888-7777'},
 'POLISH_ID': {'KLV979168': 'ABC123456'},
 'UK_NHS': {'8160080313': '987-654-3210'},
 'US_DRIVER_LICENSE': {'328360869': '999000680'},
 'US_SSN': {'894-12-1137': '602-76-4532'}}


### Now all the values has been replaced with synthetic data and we have mapping as well to deanonymize.

### Q and A system with Langchain and Anonymization
* We use `PresidioReversibleAnonymizer` and Langchain Expression Language (LCEL)

In [23]:
### initialize anonymizer
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators= True
    
)

anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

### INitiate LLm 
* Using FAISS as Vector Db 

In [24]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

### Steps to use:
- Load the data
- Anonymize data before indexing



In [25]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY= os.getenv('OPENAI_API_KEY')

### Lets load document from file

In [26]:
with open("/Users/vikaslakka/Desktop/FSDS/GenAI/poc/data_privacy/data_privacy/cases/theft_case.txt", 'r') as theft:
    theft_case= theft.read()
documents= [Document(page_content= theft_case)]

In [27]:
## Load data
## Anonymize data before indexing
for doc in documents:
    doc.page_content= anonymizer.anonymize(doc.page_content)

# Split the documents into chunks
text_splitter= RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks= text_splitter.split_documents(documents)

## Indexing the chunks (We use this using OPENAI because the data is already anonymized)
embeddings= OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)
docsearch= FAISS.from_documents(chunks, embeddings)
retriever= docsearch.as_retriever()

In [28]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_openai import ChatOpenAI

In [29]:
# Create anonymier chain
template= """" Answer the question based only on the following context:
{context}

Question: {anonymized_question}
"""

prompt = ChatPromptTemplate.from_template(template=template)
model= ChatOpenAI(temperature=0.3, model_name='gpt-3.5-turbo', openai_api_key= OPENAI_API_KEY)

* RunnableParallel: It will let us run multiple prompts at sametime. eg: Telling joke and writing poem, you can pass two prompts in runnnableparallel and it will provide.
* RunnablePassthrough: This says that the value will be picked from invoke. (what ever the text we pass in invoke it will go to this.)

In [30]:


###Define Runnableparallel
_inputs= RunnableParallel(
    question= RunnablePassthrough(),
    ### Important to add question anonymization
    anonymized_question= RunnableLambda(anonymizer.anonymize),
    )

anonymize_chain= (
    _inputs
    | {"context":itemgetter("anonymized_question")|retriever,
       "anonymized_question": itemgetter("anonymized_question"),
       }
    |prompt
    |model
    |StrOutputParser()
)


In [31]:
pprint.pprint(anonymize_chain.invoke(
    "What was there in wallet? can you provide as bullet points?"
))

('- Credit card with number 4545963852993388\n'
 "- Driver's license with DL No: 738179484\n"
 '- Social Security Number: 575-63-3248\n'
 '- Polish identity card with number <POLISH_ID>')


In [32]:
### Add De-anonymization to the step
chain_with_deanonymization= anonymize_chain|RunnableLambda(anonymizer.deanonymize)

pprint.pprint(chain_with_deanonymization.invoke(
    "What was there in wallet? can you provide as bullet points?"
))

('- Credit card with number 4111 1111 1111 1111\n'
 "- Driver's license with DL No: 999000680\n"
 '- Social Security Number: 602-76-4532\n'
 '- Polish identity card with number ABC123456')


In [None]:
while True:
    ques= input("ask about the incident")
    if ques=='exit':
        break
    else:
        pprint.pprint(chain_with_deanonymization.invoke(ques))

### Lets include guardrails as well into the picture

In [34]:
from nemoguardrails import RailsConfig, LLMRails

In [96]:
config= RailsConfig.from_path("/Users/vikaslakka/Desktop/FSDS/GenAI/poc/data_privacy/data_privacy/experiment/config")
rails= LLMRails(config= config)

In [125]:
# Create anonymier chain
template= """" Answer the question based only on the following context:
{context}

Question: {question}
"""



from langchain.prompts import PromptTemplate
prompt= PromptTemplate(
    template= template,
    input_variables=["context", "question"]
)
_inputs= RunnableParallel(
    question= RunnablePassthrough(),
    ### Important to add question anonymization
    anonymized_question= RunnableLambda(anonymizer.anonymize),
    )
#prompt = ChatPromptTemplate.from_template(template=template)
from langchain.chains import RetrievalQA
##Qa retriever
chain_type_kwargs= {"prompt": prompt}
qa= RetrievalQA.from_chain_type(
    llm= rails.llm,
    chain_type="stuff",
    retriever= retriever,
    chain_type_kwargs= chain_type_kwargs
    
)
rails.register_action(qa, name="qa_chain")

In [111]:
from pprint import pprint

In [128]:
anonymizer.anonymize("what is the id of John Doe?")

'what is the id of Mitchell Jones?'

In [127]:

pprint(await rails.generate_async(
    "what is the id of John Doe?"))

('Based on the provided context, there is no information about John Doe or his '
 'ID. The information provided is about Mitchell Jones and the details of his '
 'stolen wallet.')


In [103]:

await rails.generate_async(
    "Can a Dog open account?")

'Sir, This is a cat bank and we do not entertain dogs here. Hope you understand'