In [None]:
# Install the Milvus server and client, the LangChain packages, and Privacera Shield
# into the notebook runtime (quiet mode).
# NOTE(review): `milvus` and `pymilvus` are not version-pinned, unlike the other
# packages here — consider pinning them so the notebook stays reproducible.
!pip -q install  \
  milvus \
  pymilvus \
  langchain==0.2.0 \
  langchain-core==0.2.0 \
  langchain-community==0.2.0 \
  langchain-openai==0.1.7 \
  langchain-text-splitters==0.2.0 \
  privacera_shield==1.1.5


In [None]:
get_ipython().system_raw('milvus-server &')
!while ! (ps aux | grep -q '[m]ilvus' && ps aux | grep -q '[m]ilvus-server'); do sleep 1; done; echo 'Milvus is ready'

# Replace with your actual Milvus server parameters if different
MILVUS_HOST = "127.0.0.1"
MILVUS_PORT = "19530"

while True:
    try:
        import time
        from pymilvus import connections

        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
        print("Connected to Milvus")
        break
    except Exception as e:
        print(f"Connection failed: {e}")
        time.sleep(1)

In [None]:
from pymilvus import CollectionSchema, FieldSchema, DataType

# Name of the Milvus collection used throughout this notebook.
COLLECTION_NAME = "PrivaceraSampleCollection"

def create_collection():
    """Create the sample Milvus collection and build an HNSW index on its vectors.

    The schema holds, per document chunk: the source path, the chunk text, an
    auto-generated INT64 primary key, a 1536-dimensional float vector, two
    VARCHAR arrays (`users`, `groups`) and a free-form JSON `metadata` field.
    Dynamic fields are enabled, so extra keys may be inserted as well.

    Reads the module-level COLLECTION_NAME, MILVUS_HOST and MILVUS_PORT.
    Prints a confirmation message when done; returns nothing.
    """
    from pymilvus import Collection, connections

    source = FieldSchema(
        name="source",
        dtype=DataType.VARCHAR,
        max_length=65535
    )
    text = FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535
    )
    pk = FieldSchema(
        name="pk",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True
    )
    vector = FieldSchema(
        name="vector",
        dtype=DataType.FLOAT_VECTOR,
        dim=1536
    )
    users = FieldSchema(
        name="users",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_length=65535,
        max_capacity=1024
    )
    groups = FieldSchema(
        name="groups",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_length=65535,
        max_capacity=1024
    )
    metadata = FieldSchema(
        name="metadata",
        dtype=DataType.JSON
    )

    schema = CollectionSchema(
        fields=[source, text, pk, vector, users, groups, metadata],
        description="Sample Privacera Milvus Collection",
        enable_dynamic_field=True
    )

    # Register the "default" connection here so the function does not depend on
    # an earlier cell having connected already.
    connections.connect(
        alias="default",
        host=MILVUS_HOST,
        port=MILVUS_PORT
    )

    # Passing a schema creates the collection; the returned handle is reused
    # for indexing instead of looking the collection up a second time.
    collection = Collection(
        name=COLLECTION_NAME,
        schema=schema,
        using='default'
    )

    index_params = {
        "index_type": "HNSW",
        "metric_type": "L2",
        "params": {
            "M": 10,
            "efConstruction": 8
        }
    }

    collection.create_index(
        field_name="vector",
        index_params=index_params,
        index_name="index"
    )
    print(f"Collection = {COLLECTION_NAME} created")

# Create the collection and its vector index now.
create_collection()

In [None]:
import os

def create_raw_data(raw_data_dir: str = "raw_data") -> None:
    """Write the five sample customer text files into ``raw_data_dir``.

    The directory is created if it does not exist, and existing files with the
    same names are overwritten, so the function can be called repeatedly
    (for example when the notebook cell is re-run).

    Args:
        raw_data_dir: Target directory. Defaults to ``"raw_data"`` relative to
            the current working directory.
    """
    file_contents = {
        "Nancy.txt": """Nancy is a customer in our dataset.
She resides in the United States and can be reached at her email address, nancy@yahoo.com.
Nancy's unique identifier, often associated with individuals in the United States, is her Social Security Number (SSN), which is 201-99-5532.
She can be contacted via her US phone number, 856-232-9702, and her physical address is 939 Park Avenue.
Her account is identified by the account ID 159635478, and her zipcode is 33317.""",

        "Gene.txt": """Gene is a customer,
and he is based in the United Kingdom.
His email address is gene@google.us.
His unique identifier, 202-99-5532, is his Social Security Number (SSN).
Gene's contact number is 954-583-0575.
He resides at 303 Johnston Blvd and has an account with the ID 236854569. His UK postal code is 95202.""",

        "Edward.txt": """Edward is a customer,
and is based in the United States.
You can contact him at edward@facebook.com.
His unique identifier is his Social Security Number (SSN), which is 203-99-5532.
Edward's US phone number is 209-626-9041, and his address is 130 Hollister.
He has an account with the ID 365412985 and resides in the zipcode 60173.""",

        "Pearlene.txt": """Pearlene is a customer,
based in the United States.
You can contact her at pearlene@gmail.com.
Her unique identifier is her Social Security Number (SSN), which is 204-99-5532.
Pearlene's US phone number is 708-471-6810, and her address is 17 Warren Rd.
She has an account with the ID 452189732 and resides in the zipcode 90017.""",

        "Pamela.txt": """Pamela, is a customer and resides in the United Kingdom.
You can contact her at pamela@cuvox.de.
Her unique identifier is her Social Security Number (SSN), which is 206-99-5532.
Pamela's UK phone number is 650-526-5259, and her address is 861 Strick Rd.
She has an account with the ID 685231473 and lives in the postal code 80214."""
    }

    # exist_ok avoids FileExistsError when the directory is already there.
    os.makedirs(raw_data_dir, exist_ok=True)

    for file_name, content in file_contents.items():
        target_path = os.path.join(raw_data_dir, file_name)
        # Explicit encoding so the files do not depend on the platform default.
        with open(target_path, 'w', encoding="utf-8") as file:
            file.write(content)

    print("Raw data created successfully.")

# Write the sample customer files to disk.
create_raw_data()

In [None]:
from typing import Optional, List, Iterator
from langchain_community.document_loaders import TextLoader
from langchain.schema import Document

class CustomTextLoader(TextLoader):
    """TextLoader that tags known sample files with access-control metadata.

    Every document produced by the parent loader is passed through unchanged,
    except that documents coming from one of the known sample files get three
    extra metadata entries: ``users``, ``groups`` and a nested ``metadata``
    dict (which always carries the file name). Files with any other name are
    yielded untouched.
    """

    # file name -> (users, groups, extra fields for the nested "metadata" dict)
    _TAGS_BY_FILE = {
        "Nancy.txt": (
            ["tom", "tobin", "john", "bob"],
            ["accounts", "privacera-all", "privacera-us"],
            {"security": "confidential", "country": "US"},
        ),
        "Gene.txt": (
            ["testuser", "hannah", "john", "bob"],
            ["hr", "privacera-all", "privacera-us"],
            {"security": "confidential", "country": "UK"},
        ),
        "Edward.txt": (
            ["testuser", "ryan", "john", "bob"],
            ["sales", "privacera-all", "privacera-us"],
            {"country": "US"},
        ),
        "Pearlene.txt": (
            ["mark", "mary", "john", "bob"],
            ["marketing", "privacera-all", "privacera-us"],
            {"country": "US"},
        ),
        "Pamela.txt": (
            ["aaron", "adam", "john", "bob"],
            ["sales", "privacera-all", "privacera-us"],
            {"country": "UK"},
        ),
    }

    def __init__(self, file_path: str, encoding: Optional[str] = None, autodetect_encoding: bool = False):
        """Forward all arguments to TextLoader and log the path being loaded."""
        super().__init__(file_path, encoding, autodetect_encoding)
        print(f"inside CustomTextLoader init, file_path={file_path}")

    def lazy_load(self) -> Iterator[Document]:
        """Yield the parent's documents, tagged when the file name is known."""
        parent_docs = super().lazy_load()

        print(f"lazy_load: before fore loop")
        for loaded in parent_docs:
            file_name = os.path.basename(self.file_path)
            print(f"lazy_load: file_name={file_name}")

            tags = self._TAGS_BY_FILE.get(file_name)
            if tags is not None:
                users, groups, extra_fields = tags
                # Copy the lists so each document owns its own metadata values.
                loaded.metadata["users"] = list(users)
                loaded.metadata["groups"] = list(groups)
                loaded.metadata["metadata"] = {**extra_fields, "file_name": file_name}

            yield loaded

In [None]:
from getpass import getpass

# The key is always prompted for: the environment check below is disabled, so
# any OPENAI_API_KEY already present in the environment is overwritten.
#if os.environ.get("OPENAI_API_KEY") is None:
# getpass reads the key without echoing it, which keeps it out of the cell output.
openai_api_key = getpass("🔑 Enter your OpenAI API key and hit Enter:")
# Also export the key through the process environment.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.milvus import Milvus

# Read every .txt file under raw_data/ through CustomTextLoader, letting the
# loader detect each file's encoding.
text_loader_kwargs = dict(autodetect_encoding=True)
loader = DirectoryLoader(
    "raw_data",
    glob="**/*.txt",
    loader_cls=CustomTextLoader,
    loader_kwargs=text_loader_kwargs,
)
docs = loader.load()

print(f"len docs = {len(docs)}")

# Split the loaded documents into chunks (chunk_size=1024, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs = text_splitter.split_documents(docs)

# Embedding model used to vectorize the chunks.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embed the chunks and insert them into the Milvus collection.
vector_store = Milvus.from_documents(
    docs,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args=dict(host=MILVUS_HOST, port=MILVUS_PORT),
)

print("Loaded data into collection successfully.")

In [None]:
from google.colab import files
# Ask the user to upload the Privacera Shield application config JSON file.
uploaded = files.upload()

# Use a separate name for the uploaded file names so the imported `files`
# module is not shadowed.
uploaded_names = list(uploaded.keys())

# Exactly one file is required. Fail here with a clear message rather than
# leaving `app_config_file_content` undefined (several files) or hitting an
# IndexError (no file).
if len(uploaded_names) != 1:
    raise ValueError(
        "Upload only the application config json file "
        f"(received {len(uploaded_names)} files)"
    )

# The uploaded content is bytes; decode it to text for later use.
app_config_file_content = uploaded[uploaded_names[0]].decode('UTF-8')

In [None]:
import privacera_shield
from privacera_shield import client as privacera_shield_client
from langchain.memory import ConversationBufferWindowMemory
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain


# Conversation memory stored under the "chat_history" key, returned as message
# objects and limited to a window of k=3.
memory = ConversationBufferWindowMemory(memory_key="chat_history", return_messages=True, k=3)

# Open the Milvus collection as a LangChain vector store, using the same
# embeddings object and connection parameters as the rest of the notebook.
vector_store = Milvus(embeddings, COLLECTION_NAME, connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT})

# Expose the vector store through a retriever interface (similarity search).
# NOTE(review): k=100 is far larger than the five sample files; presumably this
# is meant to return every chunk and leave filtering to Shield — confirm.
milvus_retriever = vector_store.as_retriever(
    search_type="similarity", search_kwargs={"k": 100}
)

# Initialize Privacera Shield for the Milvus and LangChain frameworks, using the
# application config held in app_config_file_content.
# NOTE(review): setup runs after the vector store and retriever are created;
# confirm that Shield does not need to be set up before those objects exist.
privacera_shield_client.setup(frameworks=["milvus", "langchain"], application_config=app_config_file_content)

# Chat model used to answer questions.
llm = ChatOpenAI(openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")
# Prompt template with a single {question} placeholder.
# NOTE(review): `prompt` is not passed to anything in this cell — check whether
# a later cell uses it or whether it is leftover.
template = """Question: {question}

Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])



In [None]:
# Identity the question is asked under; the Shield context below is opened for
# this username.
user = "testuser"
prompt_text = "Give the contact details of our customers."
print(f"Prompt: {prompt_text}", end="\n\n")

# Conversational RAG chain built from the LLM, the Milvus retriever and the
# chat memory.
llm_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=milvus_retriever,
    memory=memory,
    verbose=True,
)

try:
    # Run the chain inside a Shield context for `user`.
    with privacera_shield_client.create_shield_context(username=user):
        response = llm_chain.invoke({"question": prompt_text})
        print(f"LLM Response: {response.get('answer')}")
except privacera_shield.exception.AccessControlException as e:
    # Raised when access is denied; report it instead of failing the cell.
    print(f"AccessControlException: {e}")