In [None]:
# !pip install -q langchain langchain-community langchain-mistralai chromadb pypdf numpy regex
# !pip install scikit-learn
# pip install ollama
# !pip install langchain-community

In [1]:

import re
import json
import requests
from pathlib import Path
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from typing import List
from langchain_core.documents import Document
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import requests.exceptions as req_exc

In [2]:
# Numeric rank for each classification label. Retrieval compares these ranks
# with "$lte", so a larger number means a more sensitive document.
SECURITY_RANKS = dict(public=1, confidential=2, restricted=3)

# Regexes screened by security_check(), keyed by the level passed to it.
# A chunk that matches any pattern listed for that level is rejected.
PROHIBITED_PATTERNS = {
    "public": [
        r"\b(internal[\s-]?ip)\b",
        r"\b(passw(o)?r?d)\b",
    ],
    "confidential": [
        r"\b(database[\s-]?access)\b",
    ],
    "restricted": [
        r"\b(top[\s-]?secret\b)",
        r"\b(executive[\s-]?compensation\b)",
    ],
}

# LM Studio server and the models requested from it
LM_STUDIO_ENDPOINT = "http://10.1.82.21:8080"
EMBEDDING_MODEL = "text-embedding-nomic-embed-text-v1.5@f32"
LLM_MODEL = "gemma-3-27b-it"

# Text splitting: chunk size and overlap handed to the text splitter
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

# Retrieval: candidates fetched from the vector store, and how many of them
# are kept after deduplication and screening
RETRIEVAL_LIMIT = 20
FINAL_LIMIT = 5

In [3]:
from langchain_core.embeddings import Embeddings
from typing import List

class LMStudioEmbedder(Embeddings):
    """Embeddings backend that calls the `/v1/embeddings` route of the
    server at LM_STUDIO_ENDPOINT, requesting EMBEDDING_MODEL."""

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=(
            retry_if_exception_type(req_exc.ConnectionError) |
            retry_if_exception_type(req_exc.Timeout) |
            retry_if_exception_type(req_exc.HTTPError)
    ))
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts with a single POST request.

        The call is retried (up to 3 attempts, exponential back-off) on
        connection errors, timeouts and HTTP error statuses.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding vector per input text, in input order. An empty
            input yields an empty list without contacting the server.
        """
        # Nothing to embed: skip the network round-trip entirely.
        if not texts:
            return []

        response = requests.post(
            f"{LM_STUDIO_ENDPOINT}/v1/embeddings",
            json={"model": EMBEDDING_MODEL, "input": texts},
            timeout=10
        )
        response.raise_for_status()

        items = response.json()["data"]
        # Align vectors with the inputs using each item's "index" field when
        # the server supplies one; otherwise keep the response order.
        ordered = sorted(
            enumerate(items),
            key=lambda pair: pair[1].get("index", pair[0])
        )
        return [item["embedding"] for _, item in ordered]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string by delegating to embed_documents."""
        return self.embed_documents([text])[0]
    


embedder = LMStudioEmbedder()

# Load and process documents
loader = PyPDFDirectoryLoader("data")
raw_docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# A PDF whose parent folder is not one of the SECURITY_RANKS labels is tagged
# with the most restrictive rank. Falling back to "public" would expose any
# misfiled or misspelled folder to every user.
fallback_level = max(SECURITY_RANKS.values())

processed_docs = []
for chunk in text_splitter.split_documents(raw_docs):
    file_path = Path(chunk.metadata["source"])
    # The classification label is the name of the folder holding the PDF.
    folder_name = file_path.parent.name.lower()

    chunk_metadata = {
        "security_level": SECURITY_RANKS.get(folder_name, fallback_level),
        # Keep provenance so a retrieved chunk can be traced to its file.
        "source": str(file_path),
    }
    if "page" in chunk.metadata:
        chunk_metadata["page"] = chunk.metadata["page"]

    processed_docs.append({
        "page_content": chunk.page_content,
        "metadata": chunk_metadata
    })

# NOTE(review): persist_directory is reused between runs; re-running this cell
# may add the same chunks to the stored collection again - confirm, and clear
# the directory or pass stable ids if so.
vector_store = Chroma.from_documents(
    documents=[Document(**doc) for doc in processed_docs],
    embedding=embedder,
    persist_directory="secure_chroma_db"
)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [4]:
# vector_store._collection.embedding_function = embedder.embed_documents

def security_check(text, level, patterns=None):
    """Return True when `text` matches none of the patterns for `level`.

    Args:
        text: Content to screen.
        level: Security level label. Surrounding whitespace and letter case
            are ignored, so "Public" and "public" select the same patterns.
        patterns: Optional mapping of level label -> list of regex strings.
            Defaults to the module-level PROHIBITED_PATTERNS.

    Returns:
        False if any pattern for the level matches (case-insensitively),
        True otherwise. A level with no entry in the mapping has no patterns
        and therefore always passes.
    """
    if patterns is None:
        patterns = PROHIBITED_PATTERNS

    # The pattern table is keyed by lower-case labels; without normalising,
    # a level such as "Restricted" would silently match no patterns at all.
    key = level.strip().lower() if isinstance(level, str) else level

    return not any(
        re.search(pattern, text, re.IGNORECASE)
        for pattern in patterns.get(key, [])
    )

def format_response(prompt, context, security_level):
    """Build a two-message chat payload.

    The system message carries the security level and the context; the user
    message carries the prompt unchanged.

    Returns:
        A list of two dicts, each with "role" and "content" keys, system
        message first.
    """
    system_message = {
        "role": "system",
        "content": f"Security Level: {security_level}\nContext: {context}",
    }
    user_message = {"role": "user", "content": prompt}
    return [system_message, user_message]


In [6]:
# Question sent to the retriever and, later, to the chat model as the user turn.
question = "what are three phases of implementation of policy?"
# Clearance label of the requesting user; looked up in SECURITY_RANKS below.
user_level = "restricted"  # Example user level

def normalize_content(text: str) -> str:
    """Return a canonical form of `text` for duplicate detection.

    Every run of whitespace (including newlines) is collapsed to a single
    space, leading and trailing whitespace is dropped, and the result is
    lower-cased.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    trimmed = collapsed.strip()
    return trimmed.lower()


try:
    # Normalise the clearance label once so that the rank lookup and the
    # content screen below both use the same key.
    level_key = user_level.strip().lower()
    if level_key not in SECURITY_RANKS:
        raise ValueError(f"Unknown security level: {user_level!r}")

    # Get adjusted retrieval limit (never ask for more than the store holds)
    total_docs = vector_store._collection.count()
    retrieval_limit = min(RETRIEVAL_LIMIT, total_docs)

    # Retrieve documents at or below the caller's clearance. An empty
    # collection is treated as "nothing found" instead of querying with k=0.
    if retrieval_limit > 0:
        docs = vector_store.similarity_search(
            question,
            k=retrieval_limit,
            filter={"security_level": {"$lte": SECURITY_RANKS[level_key]}}
        )
    else:
        docs = []

    # Deduplicate and filter. The normalised text itself is stored, so two
    # different chunks can never be confused by a hash collision.
    seen_contents = set()
    filtered_docs = []
    for doc in docs:
        normalized = normalize_content(doc.page_content)

        if normalized not in seen_contents and security_check(doc.page_content, level_key):
            seen_contents.add(normalized)
            filtered_docs.append(doc)
        if len(filtered_docs) >= FINAL_LIMIT:
            break

    # Print retrieved context
    print("=== RETRIEVED CONTEXT ===")
    context = "\n".join([doc.page_content for doc in filtered_docs])
    print(context)
    print("="*30 + "\n")

    if not filtered_docs:
        print("Couldn't find context")
    else:
        # Create strict system prompt
        messages = [
            {
                "role": "system",
                "content": f"""You are a security analyst. Answer ONLY using the context below.
                If the answer isn't in the context, respond with "Couldn't find context".
                DO NOT use any prior knowledge.
                
                Context:
                {context}"""
            },
            {"role": "user", "content": question}
        ]

        # Make API call; the response is constrained to a JSON object with a
        # single required "answer" string.
        response = requests.post(
            f"{LM_STUDIO_ENDPOINT}/v1/chat/completions",
            json={
                "model": LLM_MODEL,
                "messages": messages,
                "temperature": 0.0,  # Lower temperature for more deterministic responses
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {
                        "name": "security_response",
                        "strict": "true",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "answer": {"type": "string"}
                            },
                            "required": ["answer"]
                        }
                    }
                },
                "max_tokens": 500
            },
            timeout=300
        )

        # Handle response
        response.raise_for_status()
        response_data = response.json()

        if not response_data.get("choices"):
            raise ValueError("Empty choices in API response")

        try:
            content = json.loads(response_data["choices"][0]["message"]["content"])
            if "couldn't find context" in content["answer"].lower():
                print("Response: Couldn't find relevant information in documents")
            else:
                print("Response:", content["answer"])
        except (KeyError, TypeError):
            # TypeError covers a payload that parsed as JSON but is not an
            # object (for example a bare string or list).
            print("Invalid response format - missing 'answer' field")
        except json.JSONDecodeError:
            print("Received invalid JSON:", response_data["choices"][0]["message"]["content"])

except requests.exceptions.HTTPError as e:
    error_message = f"HTTP Error {e.response.status_code}: "
    try:
        error_details = e.response.json()
    except ValueError:
        # Body is not JSON; ValueError also covers the decode errors raised
        # by alternative JSON backends.
        error_message += e.response.text
    else:
        if isinstance(error_details, dict):
            detail = error_details.get("error", "Unknown error")
        else:
            detail = error_details
        # Some servers nest the text as {"error": {"message": ...}}.
        if isinstance(detail, dict):
            detail = detail.get("message", detail)
        # str() keeps the concatenation safe when the detail is not a string.
        error_message += str(detail)
    print(error_message)

except Exception as e:
    print(f"Error: {str(e)}")

=== RETRIEVED CONTEXT ===
Policy Internal Analysis - Restricted
Access Level: Restricted
This document contains internal analysis and implementation details of the policy.
It includes internal discussions, risk assessments, and strategies for phased implementation.
Only authorized personnel should access this document.
Key Points:
1. Implementation will occur in three phases: Pilot, Rollout, and Full Integration.
2. Risk assessment includes financial, operational, and reputational factors.
3. Key performance indicators (KPIs) will measure the success of the policy.
Policy Overview - Public Information
Access Level: Public
This document outlines the general guidelines and objectives of the new policy. 
The policy aims to ensure transparency, efficiency, and compliance with legal standards. 
It covers the rights and responsibilities of stakeholders, operational procedures, and expected
outcomes.
Key Points:
1. The policy is designed to improve operational efficiency and legal compliance.