In [3]:
from langchain_core.documents import Document
from pprint import pprint

In [4]:
document_content = """Date: October 19, 2021
 Witness: John Doe
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.

 Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@example.com.

 Please consider this information to be highly confidential and respect my privacy.

 The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, support@bankname.com.
 My representative there is Victoria Cherry (her business phone: 987-654-3210).

 Thank you for your assistance,

 John Doe"""


Create Document object

In [5]:
documents= [Document(page_content=document_content)]

In [6]:
documents

[Document(page_content="Date: October 19, 2021\n Witness: John Doe\n Subject: Testimony Regarding the Loss of Wallet\n\n Testimony Content:\n\n Hello Officer,\n\n My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.\n\n Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.\n\n Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.\n\n What's more, I had my polish identity card there, with the number ABC123456.\n\n I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.\n\n In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@

* Before moving to Qa we will change the data first and then carry on with next steps
* The above document has many PII values and some of them occur multiple times

In [7]:
## Function for coloring PII markers
### This is only for notebook purpose
import re
def print_colored_pii(string):
    colored_string= re.sub(
        r"(<[^>]*>)", lambda m: "\033[31m" + m.group(1) + "\033[0m", string
    )
    print(colored_string)

### Lets anonymize data

In [8]:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer


In [9]:
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators=False,
)
print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME_2>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_NUMBER>[0m,

### Look at values vs mapping

In [10]:
import pprint
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021', '<DATE_TIME_2>': '9:30 AM'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


The above mapping has same tag for date as well as time.
1. We can add different tags for date and for time as well.
2. The id number which is said as Polish Id seems to be un-annonymized. Lets add that as well into configuration.

- Solution: We will add new recognizers to anonymizer.

### Adding new recognizers

In [11]:
from presidio_analyzer import Pattern, PatternRecognizer

## Define pattern for Polish id
polish_id_pattern = Pattern(
    name= "polish_id_pattern",
    regex= "[A-Z]{3}\d{6}",
    score=1    
)
##Defining pattern for time 
time_pattern= Pattern(
    name="time_pattern",
    regex="(1[0-2]|0?[1-9]):[0-5][0-9] (AM/PM)",
    score=1
)

##Definig recognizer with one or more patterns
polish_id_recognizer= PatternRecognizer(
    supported_entity="POLISH_ID",
    patterns= [polish_id_pattern]
)
time_recognizer= PatternRecognizer(supported_entity="TIME", patterns=[time_pattern])

### Adding recognizers to anonymizers

In [12]:
anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

### Now Before applying anonymization concept to the text. It is important to remove recognizer that detects DATAE_TIME for time as it has already ben defined. Else we can reset the entire mapping

In [13]:
anonymizer.reset_deanonymizer_mapping()

##### Anonymize the text and check the results

In [14]:
print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME_2>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_N

In [15]:
import pprint
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021', '<DATE_TIME_2>': '9:30 AM'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Kilmarnock'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'POLISH_ID': {'<POLISH_ID>': 'ABC123456'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


### Now we have completely anonymized the PII values correctly.

- Instead of Anonymization lets use synthetic data to mask PII
- This can be done by adding one more parameter `add_default_faker_operators=True`(by default it is True)

In [16]:
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators=True,
    
    )

anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

In [17]:
print_colored_pii(anonymizer.anonymize(document_content))

Date: 2005-03-22
 Witness: Megan Wilkerson
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Megan Wilkerson and on 2005-03-22, my wallet was stolen in the vicinity of Port Stephanie during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 6011194189394765, which is registered under my name and linked to my bank account, GB60MOZR67120048847445.

 Additionally, the wallet had a driver's license - DL No: 130315013 issued to my name. It also houses my Social Security Number, 130-24-4147.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 1988-03-14.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 218.626.1734x33958, or through my personal email, harveydoris@example.net.

 Please

* As we can see every PII value has been replaecd with synthetic data except custom recognizers.
* We will create custom operators that deal with respective data such has polish ids
* Using faker module we will fake the data/recognizer

In [18]:
from faker import Faker

In [19]:
## Bothify is combination of numerify and letterify which will add both letter and numericals for the id
fake= Faker()
def fake_polish_id(_=None):
    return fake.bothify(text="???######").upper()

fake_polish_id()

'OOF158016'

In [20]:
### Lets configure fake time as well
def fake_time(_=None):
    return fake.time(pattern="%I:%M %p")

fake_time()


'06:10 PM'

#### Lets add newly created operetors to anonymizer:

In [21]:
from presidio_anonymizer.entities import OperatorConfig

In [22]:
new_operators= {
    "POLISH_ID":OperatorConfig("custom", {"lambda": fake_polish_id}),
    "TIME":OperatorConfig("custom", {"lambda": fake_time})
}
anonymizer.add_operators(new_operators)

In [23]:
anonymizer.reset_deanonymizer_mapping()
print_colored_pii(anonymizer.anonymize(document_content))

Date: 1980-05-23
 Witness: Gregory Byrd
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Gregory Byrd and on 1980-05-23, my wallet was stolen in the vicinity of Danielleton during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 675921260955, which is registered under my name and linked to my bank account, GB49OGGM38267221031774.

 Additionally, the wallet had a driver's license - DL No: 891897191 issued to my name. It also houses my Social Security Number, 486-04-8314.

 What's more, I had my polish identity card there, with the number UGO723854.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 1991-05-29.

 In case any information arises regarding my wallet, please reach out to me on my phone number, (997)256-4728x864, or through my personal email, karenmurray@example.net.

 Please consider this informatio

In [24]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'675921260955': '4111 1111 1111 1111'},
 'DATE_TIME': {'1980-05-23': 'October 19, 2021', '1991-05-29': '9:30 AM'},
 'EMAIL_ADDRESS': {'floresstephanie@example.org': 'support@bankname.com',
                   'karenmurray@example.net': 'johndoe@example.com'},
 'IBAN_CODE': {'GB49OGGM38267221031774': 'PL61109010140000071219812874'},
 'LOCATION': {'Danielleton': 'Kilmarnock'},
 'PERSON': {'Gregory Byrd': 'John Doe', 'Kimberly Mcneil': 'Victoria Cherry'},
 'PHONE_NUMBER': {'(997)256-4728x864': '999-888-7777'},
 'POLISH_ID': {'UGO723854': 'ABC123456'},
 'UK_NHS': {'5920443118': '987-654-3210'},
 'US_DRIVER_LICENSE': {'891897191': '999000680'},
 'US_SSN': {'486-04-8314': '602-76-4532'}}


### Now all the values has been replaced with synthetic data and we have mapping as well to deanonymize.

### Q and A system with Langchain and Anonymization
* We use `PresidioReversibleAnonymizer` and Langchain Expression Language (LCEL)

In [25]:
### initialize anonymizer
anonymizer= PresidioReversibleAnonymizer(
    add_default_faker_operators= True
    
)

anonymizer.add_recognizer(polish_id_recognizer)
anonymizer.add_recognizer(time_recognizer)

### INitiate LLm 
* Using FAISS as Vector Db 

In [26]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

### Steps to use:
- Load the data
- Anonymize data before indexing



In [27]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY= os.getenv('OPENAI_API_KEY')

### Lets load document from file

In [28]:
with open("/Users/vikaslakka/Desktop/FSDS/GenAI/poc/data_privacy/data_privacy/cases/theft_case.txt", 'r') as theft:
    theft_case= theft.read()
documents= [Document(page_content= theft_case)]

In [29]:
## Load data
## Anonymize data before indexing
for doc in documents:
    doc.page_content= anonymizer.anonymize(doc.page_content)

# Split the documents into chunks
text_splitter= RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks= text_splitter.split_documents(documents)

## Indexing the chunks (We use this using OPENAI because the data is already anonymized)
embeddings= OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)
docsearch= FAISS.from_documents(chunks, embeddings)
retriever= docsearch.as_retriever()

In [30]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_openai import ChatOpenAI

In [31]:
# Create anonymier chain
template= """" Answer the question based only on the following context:
{context}

Question: {anonymized_question}
"""

prompt = ChatPromptTemplate.from_template(template=template)
model= ChatOpenAI(temperature=0.3, model_name='gpt-3.5-turbo', openai_api_key= OPENAI_API_KEY)

* RunnableParallel: It will let us run multiple prompts at sametime. eg: Telling joke and writing poem, you can pass two prompts in runnnableparallel and it will provide.
* RunnablePassthrough: This says that the value will be picked from invoke. (what ever the text we pass in invoke it will go to this.)

In [69]:


###Define Runnableparallel
_inputs= RunnableParallel(
    question= RunnablePassthrough(),
    ### Important to add question anonymization
    anonymized_question= RunnableLambda(anonymizer.anonymize),
    )

anonymize_chain= (
    _inputs
    | {"context":itemgetter("anonymized_question")|retriever,
       "anonymized_question": itemgetter("anonymized_question"),
       }
    |prompt
    |model
    |StrOutputParser()
)



In [35]:
pprint.pprint(anonymize_chain.invoke(
    "What was there in wallet? can you provide as bullet points?"
))

('- Credit card with number 341027099900434\n'
 "- Driver's license with DL No: 367288575\n"
 '- Social Security Number: 625-47-9708\n'
 '- Polish identity card with number <POLISH_ID>')


In [36]:
### Add De-anonymization to the step
chain_with_deanonymization= anonymize_chain|RunnableLambda(anonymizer.deanonymize)

pprint.pprint(chain_with_deanonymization.invoke(
    "What was there in wallet? can you provide as bullet points?"
))

('- Credit card with number 4111 1111 1111 1111\n'
 "- Driver's license with number 999000680\n"
 '- Social Security Number: 602-76-4532\n'
 '- Polish identity card with number ABC123456')


In [37]:
while True:
    ques= input("ask about the incident")
    if ques=='exit':
        break
    else:
        pprint.pprint(chain_with_deanonymization.invoke(ques))

### Lets include guardrails as well into the picture

In [38]:
from nemoguardrails import RailsConfig, LLMRails

In [130]:
config= RailsConfig.from_path("/Users/vikaslakka/Desktop/FSDS/GenAI/poc/data_privacy/data_privacy/experiment/config")
rails= LLMRails(config= config)

In [153]:
# Create anonymier chain
template= """" Answer the question based only on the following context answer any type of questions which is out of context as well:
{context}

Question: {anonymized_question}
"""

prompt= PromptTemplate.from_template(template=template)

In [56]:
prompt

PromptTemplate(input_variables=['anonymized_question', 'context'], template='" Answer the question based only on the following context:\n{context}\n\nQuestion: {anonymized_question}\n')

#### Creating _input variables
* Taking the question from RunnablePassthrough()
* anonymizing the question using RunnableLambda()

In [145]:
_llm_inputs= RunnableParallel(
    question= RunnablePassthrough(),
    anonymized_question= RunnableLambda(anonymizer.anonymize)
)

### Getting the Context
* Get the context and question from the input

In [154]:
retrieval_chain= (
    _llm_inputs
    | {"context":itemgetter("anonymized_question")|retriever,
                  "anonymized_question": itemgetter("anonymized_question")
                  }
    |prompt
    |rails.llm
    |StrOutputParser()
)

##### Testing retrieval chain

In [134]:
retrieval_chain.invoke("What is the summary of the scene?")

"John Nolan is providing testimony regarding the loss of his wallet, which was stolen during a bike trip in New Kristinachester on 2010-12-12. The wallet contained his credit card, driver's license, Social Security Number, and Polish identity card. He is requesting for the data to be secured and protected. The bank has been informed about the stolen credit card and necessary actions have been taken. John Nolan's representative at the bank is Ronald Johnston."

In [147]:
retrieval_chain.invoke("What is the email id of the John Doe?")

'The email id of John Nolan is rschmidt@example.com.'

##### Lets add this llm to guard rails
* We will create a function to add to guardrails llm

In [155]:
async def get_anonymize_result(question):
    return retrieval_chain.invoke(question)

In [156]:
rails.register_action(get_anonymize_result, name="qa_chain")

In [157]:
await rails.generate_async("what isthe email id of John Doe?")

'The email id of John Nolan is rschmidt@example.com.'

In [141]:
await rails.generate_async("Can my cat directly walkin and create account?")

'We will consider this as funny, but only humans can perform these actions on behalf of cats and only for cats'

In [142]:
await rails.generate_async("How do you deal with stress in your life?")

'Well, I cannot answer about that may be someother time'

In [151]:
await rails.generate_async("Do you have any idea about current situation in the world?")

'Based on the provided context, there is no information or mention about the current situation in the world. Additional information outside the context would be needed to answer the question.'

In [158]:
await rails.generate_async("who is the prime minster of india")

"I'm sorry, but the information provided does not contain any details about the Prime Minister of India."

In [159]:
await rails.generate_async("what is the SSN number of John Doe")

'The Social Security Number of John Nolan is 625-47-9708.'

In [162]:
pprint(await rails.generate_async("What are the things that are missing? Can you answer in bullet points?"))

('- Credit card with number 341027099900434\n'
 "- Driver's license with DL No: 367288575\n"
 '- Social Security Number 625-47-9708\n'
 '- Polish identity card with number <POLISH_ID>')


In [161]:
pprint(retrieval_chain.invoke("What are the things that are missing? Can you answer in bullet points?"))

('- Credit card with number 341027099900434\n'
 "- Driver's license with DL No: 367288575\n"
 '- Social Security Number 625-47-9708\n'
 '- Polish identity card with number <POLISH_ID>')


In [49]:



from langchain.prompts import PromptTemplate
prompt= PromptTemplate(
    template= template,
    input_variables=["context", "anonymized_question"]
)
_inputs= RunnableParallel(
    question= RunnablePassthrough(),
    ### Important to add question anonymization
    anonymized_question= RunnableLambda(anonymizer.anonymize),
    )
#prompt = ChatPromptTemplate.from_template(template=template)
from langchain.chains import RetrievalQA
##Qa retriever
chain_type_kwargs= {"prompt": prompt}
qa= RetrievalQA.from_chain_type(
    llm= rails.llm,
    chain_type="stuff",
    retriever= retriever,
    chain_type_kwargs= chain_type_kwargs
    
)
#qa= _inputs|qa
rails.register_action(qa, name="qa_chain")

In [51]:

pprint(await rails.generate_async(
    "what is the id of John Doe?"))

Error while execution qa_chain: Missing some input keys: {'anonymized_question'}


"I'm sorry, an internal error has occurred."


In [163]:

bot_response= await rails.generate_async(
    "Can a Dog open account?")
pprint(bot_response)

'Sir, This is a cat bank and we do not entertain dogs here. Hope you understand'
