### Create and Test Collections

In [20]:
import os
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
import chromadb
from src.utils import get_device

# Build the "enterprise_docs" Chroma collection from the CUAD contract PDFs:
# extract text page by page, split it into overlapping chunks, embed the chunks
# with a SentenceTransformer model and persist everything under ./audit_chromadb_dir.

device = get_device()
print(f"Using device: {device}")

model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# 1000-character chunks with a 200-character overlap; separators are tried in order.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", " ", ""]
)

texts = []
metadatas = []
ids = []

# The corpus location can be overridden with the CUAD_DIR environment variable;
# when it is unset the original local path is used.
base_dir = os.environ.get(
    "CUAD_DIR",
    '/Users/silaspenda/Desktop/workspace/my_projects/Marketing_AI_Agent/CUAD_v1',
)

for root, dirs, files in os.walk(base_dir):
    # os.walk order depends on the filesystem; sort for a reproducible traversal.
    dirs.sort()
    # The immediate folder name is stored as the document type of every PDF in it.
    parent_folder = os.path.basename(root)

    for file in sorted(files):
        if not file.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(root, file)
        # Ids are built from the path relative to the corpus root so that PDFs
        # sharing a file name in different folders cannot produce duplicate ids.
        rel_path = os.path.relpath(pdf_path, base_dir)
        reader = PdfReader(pdf_path)
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                # Pages with no extractable text are skipped.
                continue
            for idx, chunk in enumerate(splitter.split_text(text)):
                texts.append(chunk)
                # "page" is the 0-based page index within the PDF.
                metadatas.append({"page": i, "document_type": parent_folder, "source": file})
                ids.append(f"{rel_path}_page{i}_chunk{idx}")

# Fail before touching the existing collection: a wrong or empty base_dir would
# otherwise silently replace it with an empty one.
if not texts:
    raise FileNotFoundError(
        f"No extractable PDF text found under {base_dir!r}; "
        "the existing collection was left untouched."
    )

# Embed the texts in batches
embeddings = model.encode(texts, show_progress_bar=True, batch_size=32)

persist_dir = "./audit_chromadb_dir"
os.makedirs(persist_dir, exist_ok=True)

client = chromadb.PersistentClient(path=persist_dir)

collection_name = "enterprise_docs"

# Remove existing collection if it exists so the rebuild starts from scratch
existing_collections = [col.name for col in client.list_collections()]
if collection_name in existing_collections:
    client.delete_collection(name=collection_name)

collection = client.create_collection(name=collection_name)

# Add embeddings and metadata to collection in smaller batches
BATCH_SIZE = 5000

for start in range(0, len(texts), BATCH_SIZE):
    stop = start + BATCH_SIZE
    collection.add(
        documents=texts[start:stop],
        embeddings=embeddings[start:stop].tolist(),
        metadatas=metadatas[start:stop],
        ids=ids[start:stop]
    )

print("Added documents to Chroma via chromadb client")


Using device: mps


Batches:   0%|          | 0/1165 [00:00<?, ?it/s]

Added documents to Chroma via chromadb client


In [14]:
# import chromadb
# from sentence_transformers import SentenceTransformer

# Semantic search over the persisted "enterprise_docs" collection.
# chromadb.Client() is an in-memory client and does not see collections stored
# on disk, so open the same persistence directory the collection was written to.
client = chromadb.PersistentClient(path="./audit_chromadb_dir")
collection = client.get_collection(name="enterprise_docs")

# Embed the query with the same model name used for indexing; the device is
# resolved at runtime instead of being pinned to Apple's "mps" backend.
model = SentenceTransformer("all-MiniLM-L6-v2", device=get_device())

query_text = "contract termination clauses"
query_embedding = model.encode([query_text])[0].tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

# Results are nested per query; index 0 is the single query issued above.
hits = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
for rank, (doc, meta, dist) in enumerate(hits, start=1):
    print(f"Result {rank}:")
    print("Document:", doc)
    print("Metadata:", meta)
    print("Distance:", dist)
    print()


Result 1:
Document: 12.5 Accrued Rights and Obligations. Expiration or termination of this Agreement shall not diminish either Party’s rights, or relieve either Party of
any of its obligations, in each case that have been accrued prior to the effective date of such expiration or termination.
12.6 Termination Not Sole Remedy. Except as set forth in Section 5.7, termination is not the sole remedy under this Agreement and, whether or not
termination is effected and notwithstanding anything contained in this Agreement to the contrary, all other remedies shall remain available except as
agreed to otherwise herein.
Article XIII.
REPRESENTATIONS, WARRANTIES AND COVENANTS; CLOSING CONDITIONS
13.1 Representations and Warranties of Each Party. Each Party hereby represents and warrants, as of the Execution, and covenants (as
applicable) to the other Party as follows:
(a) It is a company or corporation duly organized, validly existing, and in good standing under the laws of the jurisdiction in whi

In [21]:
import yaml
from sentence_transformers import SentenceTransformer
import chromadb
from src.utils import get_device

# Build the "policies" Chroma collection from the rules in a YAML policy file.
# Each top-level key is a policy category holding a list of rules; every rule
# provides a "content" string and an optional "metadata" mapping.
yaml_path = "./src/policies.yaml"  # adjust to your actual YAML file path

# Load YAML (explicit UTF-8 so decoding does not depend on the platform default;
# an empty file loads as None and is treated as "no categories")
with open(yaml_path, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f) or {}

texts = []
metadatas = []
ids = []

# Iterate through all top-level keys (policy categories)
for policy_category, rules in data.items():
    # A category with no rules loads as None; treat it as empty.
    for i, rule in enumerate(rules or []):
        content = rule.get("content")
        if not isinstance(content, str) or not content.strip():
            # Fail here with a clear message instead of deep inside the encoder.
            raise ValueError(
                f"Rule {i} in policy category {policy_category!r} has no text content."
            )
        # Copy so the loaded YAML structure is never mutated.
        metadata = dict(rule.get("metadata") or {})
        # Add policy category as extra metadata field (optional)
        # metadata["policy_category"] = policy_category

        texts.append(content)
        # Chroma rejects empty metadata dicts; None marks "no metadata".
        metadatas.append(metadata or None)
        ids.append(f"{policy_category}_{i}")

# Fail before touching the existing collection when there is nothing to index.
if not texts:
    raise ValueError(f"No policy rules found in {yaml_path!r}.")

device = get_device()
print(f"Using device: {device}")

model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

embeddings = model.encode(texts, show_progress_bar=True, batch_size=32)

persist_dir = "./audit_chromadb_dir"
os.makedirs(persist_dir, exist_ok=True)

client = chromadb.PersistentClient(path=persist_dir)

collection_name = "policies"  # change or parameterize as you want

# Remove existing collection if it exists so the rebuild starts from scratch
existing_collections = [col.name for col in client.list_collections()]
if collection_name in existing_collections:
    client.delete_collection(name=collection_name)

collection = client.create_collection(name=collection_name)

collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=ids
)

print(f"Added {len(texts)} rules to Chroma collection '{collection_name}'.")


Using device: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 25 rules to Chroma collection 'policies'.


In [12]:
# Semantic search over the persisted "policies" collection.
# The collection is fetched by name so this cell does not depend on whichever
# cell last assigned `collection`.
client = chromadb.PersistentClient(path="./audit_chromadb_dir")
collection = client.get_collection(name="policies")

query_text = "What are the GDPR rules about data breaches?"

# Embed the query with `model`, the all-MiniLM-L6-v2 SentenceTransformer already
# loaded in this notebook, so the query is encoded by the same model that
# produced the stored vectors rather than by the collection's default
# embedding function.
query_embedding = model.encode([query_text])[0].tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,  # number of closest matches to return
    include=["documents", "metadatas", "distances"]
)

# Results are nested per query; index 0 is the single query issued above.
hits = zip(results["documents"][0], results["metadatas"][0], results["distances"][0])
for rank, (doc, meta, dist) in enumerate(hits, start=1):
    print(f"Result {rank} (distance: {dist:.4f}):")
    print(f"Content: {doc}")
    print(f"Metadata: {meta}")
    print("-" * 40)


Result 1 (distance: 0.5217):
Content: GDPR requires that data breaches must be reported to the relevant supervisory authority within 72 hours, unless unlikely to pose a risk to rights and freedoms.
Metadata: {'law': 'GDPR', 'source': 'policy_manual', 'category': 'data_breach', 'severity': 'critical'}
----------------------------------------
Result 2 (distance: 0.8126):
Content: Under GDPR, data subjects have the right to access their personal data and obtain information about how it is being used.
Metadata: {'category': 'data_access', 'source': 'policy_manual', 'severity': 'medium', 'law': 'GDPR'}
----------------------------------------
Result 3 (distance: 0.8567):
Content: Data subjects have the right under GDPR to have their personal data rectified if it is inaccurate or incomplete.
Metadata: {'source': 'policy_manual', 'severity': 'medium', 'category': 'data_accuracy', 'law': 'GDPR'}
----------------------------------------
Result 4 (distance: 0.9277):
Content: GDPR mandates data c