In [5]:
import hashlib
import os
from typing import List, Dict, Tuple

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langchain_google_genai import ChatGoogleGenerativeAI
from presidio_analyzer import Pattern, PatternRecognizer


In [None]:

class PIIAgent:
    """Masks and unmasks PII using a ``PresidioReversibleAnonymizer``.

    The anonymizer is configured with a fixed list of entity types and with
    ``add_default_faker_operators=False``. On top of that, one regex-based
    ``PatternRecognizer`` is registered for every entry of ``_CUSTOM_PATTERNS``.
    """

    # Entity name -> regex for the custom recognizers. Insertion order is the
    # registration order. The pattern name handed to Presidio is derived as
    # "<entity lower-cased>_pattern" (e.g. "order_id_pattern").
    _CUSTOM_PATTERNS = {
        # Order ID (eCommerce), e.g. ORD-20240215
        "ORDER_ID": r"\bORD-\d{8}\b",
        # Employee ID (HR), e.g. EMP123456
        "EMPLOYEE_ID": r"\bEMP\d{6}\b",
        # Patient ID (Healthcare), e.g. PAT7654321
        "PATIENT_ID": r"\bPAT\d{7}\b",
        # Phone number: optional "+<1-3 digit country code>" followed by 10
        # digits. The prefixed form is anchored with a negative look-behind
        # rather than "\b": "\b" directly before "+" only matches when a word
        # character precedes the plus sign, so "+91 9876543210" after a space
        # or at the start of the text would lose its country code.
        "PHONE_NUMBER": r"(?<!\w)\+\d{1,3}\s?\d{10}\b|\b\d{10}\b",
        # Street address of the form "<number> <words>, <city>, <XX> <5 digits>"
        "ADDRESS": r"\d{1,5}\s\w+(\s\w+)*,\s\w+,\s[A-Z]{2}\s\d{5}",
        # Dotted quad; octet ranges are not validated.
        "IP_ADDRESS": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
        # Social Security Number, e.g. 123-45-6789
        "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
    }

    def __init__(self):
        """Create the reversible anonymizer and register the custom recognizers."""
        self.anonymizer = PresidioReversibleAnonymizer(
            analyzed_fields=[
                "EMAIL_ADDRESS",
                "PERSON",
                "LOCATION",
                "DATE_TIME",
                "PHONE_NUMBER",
                "ADDRESS",
                "IP_ADDRESS",
                "SSN",
            ],
            add_default_faker_operators=False,
        )

        self._add_recognizers()

    def _add_recognizers(self):
        """Register one ``PatternRecognizer`` (score 1) per ``_CUSTOM_PATTERNS`` entry."""
        for entity, regex in self._CUSTOM_PATTERNS.items():
            pattern = Pattern(
                name=f"{entity.lower()}_pattern",
                regex=regex,
                score=1,
            )
            recognizer = PatternRecognizer(
                supported_entity=entity, patterns=[pattern]
            )
            self.anonymizer.add_recognizer(recognizer)

    def mask(self, text):
        """Return the result of ``self.anonymizer.anonymize(text)``.

        Args:
            text: Raw text that may contain PII.
        """
        return self.anonymizer.anonymize(text)

    def unmask(self, anonymized_text):
        """Return the result of ``self.anonymizer.deanonymize(anonymized_text)``.

        Args:
            anonymized_text: Text previously produced by ``mask`` on this
                instance (or text derived from it, such as an LLM answer).
        """
        return self.anonymizer.deanonymize(anonymized_text)


In [None]:

class RAGPII:
    """Retrieval-augmented QA that masks PII before text is indexed or sent to the LLM.

    Documents and queries are passed through ``PIIAgent.mask`` first; the masked
    text is what gets embedded into the Chroma store and placed in the prompt.
    The LLM answer is passed through ``PIIAgent.unmask`` before it is returned.

    Attributes set in ``__init__``:
        pii_agent: The ``PIIAgent`` used for masking and unmasking.
        embeddings: HuggingFace embeddings ("sentence-transformers/all-MiniLM-L6-v2").
        llm: The ``ChatGoogleGenerativeAI`` chat model.
        persist_directory: Directory handed to Chroma.
        vector_store: The Chroma store holding masked documents.
        text_mappings: In-memory dict ``doc_id -> (original_text, masked_text)``.
        qa_prompt: ``PromptTemplate`` used by ``query``.
    """

    def __init__(self, persist_directory = "./chroma_db", model_name = "gemini-pro", temperature = 0.1, top_p = 0.8, top_k = 40, max_output_tokens = 2048, api_key = None):
        """Build the masking agent, embeddings, LLM, vector store and default prompt.

        Args:
            persist_directory: Directory for the Chroma store.
            model_name: Model name passed to ``ChatGoogleGenerativeAI``.
            temperature, top_p, top_k, max_output_tokens: Sampling settings
                forwarded to the chat model.
            api_key: Optional Google API key. When omitted, ``.env`` is loaded
                and ``GOOGLE_API_KEY`` is read from the environment.
        """
        self.pii_agent = PIIAgent()
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Never hard-code the credential: take it from the caller or from the
        # environment (populated from a .env file when one is present).
        load_dotenv()
        resolved_key = api_key or os.getenv("GOOGLE_API_KEY")
        llm_kwargs = {
            "model": model_name,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "max_output_tokens": max_output_tokens,
        }
        if resolved_key:
            # Only pass a key we actually have; an empty string would override
            # any credential lookup the client library performs on its own.
            llm_kwargs["api_key"] = resolved_key
        self.llm = ChatGoogleGenerativeAI(**llm_kwargs)

        self.persist_directory = persist_directory
        self.vector_store = Chroma(persist_directory=persist_directory, embedding_function=self.embeddings)

        self.text_mappings = {}
        self.qa_prompt = PromptTemplate(
            input_variables=["context", "question"],
            template="""
            Context: {context}
            
            Question: {question}
            
            Please provide a clear and concise answer based on the context above. If the information isn't available in the context, please say so.
            
            Answer:
            """
        )

    @staticmethod
    def _content_id(content):
        """Return a SHA-256 hex digest of ``content``, used as ``doc_id``.

        The built-in ``hash()`` is salted per interpreter process for ``str``,
        so it must not be used for identifiers written to a persistent store.
        """
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    def _build_anonymized_document(self, content, metadata):
        """Mask ``content``, record the mapping, and wrap the masked text in a ``Document``.

        The returned document carries ``metadata`` plus a ``doc_id`` key.
        """
        anonymized_text = self.pii_agent.mask(content)
        doc_id = self._content_id(content)
        self.text_mappings[doc_id] = (content, anonymized_text)
        return Document(page_content=anonymized_text, metadata={**metadata, "doc_id": doc_id})

    def process_document(self, document_dir, batch_size=100):
        """Load every ``*.txt`` under ``document_dir``, split, mask and index it.

        Args:
            document_dir: Root directory searched recursively for ``.txt`` files.
            batch_size: Number of chunks masked and added to the store per batch.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make the range below empty and silently
            # index nothing; zero is rejected by range() itself.
            raise ValueError("batch_size must be a positive integer")

        loader = DirectoryLoader(document_dir, glob="**/*.txt", loader_cls=TextLoader)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(documents)

        for i in range(0, len(splits), batch_size):
            self._process_batch(splits[i:i+batch_size])

        self.vector_store.persist()

    def _process_batch(self, batch):
        """Mask each document in ``batch`` and add the masked copies to the store."""
        anony_docs = [
            self._build_anonymized_document(doc.page_content, doc.metadata)
            for doc in batch
        ]
        if not anony_docs:
            return

        self.vector_store.add_documents(anony_docs)

    def query(self, query, k=4):
        """Answer ``query`` from the indexed documents.

        The query is masked, a "stuff" ``RetrievalQA`` chain is built with the
        current ``qa_prompt`` and a retriever returning ``k`` documents, and
        the chain's answer is unmasked before being returned.

        Args:
            query: The user's question in plain text.
            k: Number of documents the retriever fetches.
        """
        anonymized_query = self.pii_agent.mask(query)
        # Built per call so that a prompt changed via set_custom_prompt is used.
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": k}),
            chain_type_kwargs={"prompt": self.qa_prompt, "verbose": True}
        )

        response = qa_chain.run(anonymized_query)
        return self.pii_agent.unmask(response)

    def add_single_document(self, content, metadata=None):
        """Mask ``content``, add it to the store as one document, and persist.

        Args:
            content: Raw document text.
            metadata: Optional metadata dict; a ``doc_id`` key is added to a copy.
        """
        if metadata is None:
            metadata = {}

        doc = self._build_anonymized_document(content, metadata)

        self.vector_store.add_documents([doc])
        self.vector_store.persist()

    def set_custom_prompt(self, template, input_variables=None):
        """Replace ``qa_prompt`` with a new ``PromptTemplate``.

        Args:
            template: Prompt template string.
            input_variables: Variable names used by ``template``; defaults to
                ``["context", "question"]``.
        """
        if input_variables is None:
            input_variables = ["context", "question"]
        self.qa_prompt = PromptTemplate(input_variables=input_variables, template=template)


In [None]:
# Build the pipeline with all constructor defaults (Chroma store in "./chroma_db").
rag_app = RAGPII()

In [None]:




# Sample e-commerce record: order ID, customer name, shipping address.
ecommerce_doc = """
Order Confirmation
Order ID: ORD-20240215
Customer: John Doe
Shipping Address: 1234 Elm Street, Dubai
Total Amount: $299.99
Payment Status: Paid
"""

# Sample HR record: employee ID, employee name, joining date.
hr_doc = """
Employee Record
Employee ID: EMP123456
Name: Sarah Connor
Department: Finance
Joining Date: 12 March 2023
Salary: Confidential
"""

# Sample healthcare record: patient ID, patient name, appointment date.
healthcare_doc = """
Patient Report
Patient ID: PAT7654321
Name: Alice Johnson
Diagnosis: Hypertension
Prescription: Beta Blockers
Next Appointment: 15th March 2024
"""

In [None]:
# Index each sample document under its category tag, in the order shown.
for sample_text, sample_category in (
    (ecommerce_doc, "eCommerce"),
    (hr_doc, "HR"),
    (healthcare_doc, "Healthcare"),
):
    rag_app.add_single_document(sample_text, {"category": sample_category})


In [None]:
# Question aimed at the e-commerce sample (asks for the order ID).
query = "What is the order ID for the recent purchase?"

In [None]:
# Run the masked retrieval + LLM round trip for the current `query`.
response = rag_app.query(query)
response  # bare last expression so the notebook renders the answer

In [None]:
# Question aimed at the HR sample; the query itself contains an employee ID.
query = "Who is the employee with ID EMP123456?"

In [None]:
# Run the masked retrieval + LLM round trip for the current `query`.
response = rag_app.query(query)
response  # bare last expression so the notebook renders the answer

In [None]:
# Question aimed at the healthcare sample; the query itself contains a patient ID.
query = "What is the diagnosis for patient PAT7654321?"

In [None]:
# Run the masked retrieval + LLM round trip for the current `query`.
response = rag_app.query(query)
response  # bare last expression so the notebook renders the answer