In [3]:
# Sanity check: confirm that FAISS imports and print the installed version.
import faiss
print(faiss.__version__)

1.10.0


In [9]:
import ollama
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from typing import List
import pdfplumber

# 1. PDF Processing and Chunking
class DataPrivacyProcessor:
    """Turn PDF documents into text chunks and embed those chunks.

    The instance owns a single SentenceTransformer model (``self.embedder``)
    so the same model can be reused for embedding queries at retrieval time.
    """

    # Chunks whose stripped length is not greater than this are discarded.
    MIN_CHUNK_CHARS = 50

    def __init__(self):
        # Lightweight general-purpose sentence-embedding model.
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _split_into_chunks(text: str, min_chars: int = MIN_CHUNK_CHARS) -> List[str]:
        """Split *text* on blank lines and keep only the substantial pieces.

        Each piece is stripped *before* its length is checked, so padding or
        whitespace-only paragraphs can never produce empty chunks.
        """
        stripped_pieces = (piece.strip() for piece in text.split('\n\n'))
        return [piece for piece in stripped_pieces if len(piece) > min_chars]

    def extract_text_from_pdfs(
        self,
        pdf_paths: List[str],
        min_chunk_chars: int = MIN_CHUNK_CHARS,
    ) -> List[str]:
        """Extract paragraph-level text chunks from multiple PDFs.

        Args:
            pdf_paths: Paths of the PDF files to read, processed in order.
            min_chunk_chars: A chunk is kept only if its stripped length is
                strictly greater than this value.

        Returns:
            All kept chunks, in document order then page order.
        """
        all_chunks = []
        for pdf_path in pdf_paths:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    # Pages without extractable text yield a falsy value; skip them.
                    if not text:
                        continue
                    # NOTE(review): if the extractor separates lines with single
                    # newlines only, a whole page becomes one chunk -- confirm
                    # that blank-line splitting gives the granularity you want.
                    all_chunks.extend(self._split_into_chunks(text, min_chunk_chars))
        return all_chunks

    def vectorize_chunks(self, chunks: List[str]) -> np.ndarray:
        """Convert text chunks to embeddings using ``self.embedder``.

        Args:
            chunks: Text chunks to embed.

        Returns:
            Whatever ``self.embedder.encode`` returns for *chunks*
            (a progress bar is shown while encoding).
        """
        embeddings = self.embedder.encode(chunks, show_progress_bar=True)
        return embeddings

# 2. Retrieval System
class PrivacyRetriever:
    """Nearest-neighbour lookup of text chunks via a flat L2 FAISS index.

    Row ``i`` of the embeddings passed to the constructor must be the
    embedding of ``chunks[i]``; search results are mapped back to chunk text
    through that positional correspondence.
    """

    def __init__(self, embeddings: np.ndarray, chunks: List[str], embedder: "SentenceTransformer"):
        """Build the index.

        Args:
            embeddings: 2-D array with one row per chunk.
            chunks: Chunk texts, aligned row-for-row with *embeddings*.
            embedder: Model used later to embed queries; it should be the
                same model that produced *embeddings*.

        Raises:
            ValueError: If *embeddings* is empty, is not 2-D, or has a
                different number of rows than there are chunks.
        """
        # FAISS expects contiguous float32 input; this is a no-op when the
        # array already has that layout.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        if vectors.ndim != 2 or vectors.shape[0] == 0:
            raise ValueError(
                "embeddings must be a non-empty 2-D array; "
                f"got shape {vectors.shape}"
            )
        if vectors.shape[0] != len(chunks):
            raise ValueError(
                f"got {vectors.shape[0]} embeddings for {len(chunks)} chunks"
            )

        self.chunks = chunks
        self.embedder = embedder  # Store the embedder passed from DataPrivacyProcessor
        self.index = faiss.IndexFlatL2(vectors.shape[1])  # L2 distance index
        self.index.add(vectors)  # Add embeddings to FAISS

    def retrieve(self, query: str, top_k: int = 5) -> List[str]:
        """Retrieve up to *top_k* chunks closest to *query*.

        Args:
            query: Free-text query to embed and search for.
            top_k: Maximum number of chunks to return.

        Returns:
            The matching chunks in the order the index returns them. Fewer
            than *top_k* items are returned when fewer chunks are indexed;
            an empty list is returned when *top_k* is not positive.
        """
        # Asking FAISS for more neighbours than it holds pads the result with
        # label -1, which would otherwise index the *last* chunk in Python.
        k = min(top_k, len(self.chunks))
        if k <= 0:
            return []

        query_embedding = np.ascontiguousarray(
            self.embedder.encode([query]), dtype=np.float32
        )
        _, indices = self.index.search(query_embedding, k)
        # Keep the -1 filter as a second line of defence against padding.
        return [self.chunks[idx] for idx in indices[0] if idx >= 0]

# 3. RAG with LLaMA3:8B
class PrivacyRAG:
    """Answer questions by combining retrieved context with an Ollama model.

    For each query the retriever supplies context chunks, which are placed
    into a single prompt together with a fixed instruction text and sent to
    the model named by ``model_name``.
    """

    def __init__(self, retriever: "PrivacyRetriever", model_name='llama3:8b'):
        """Store the retriever, the Ollama model name and the instruction text.

        Args:
            retriever: Object whose ``retrieve(query, top_k=...)`` returns the
                context chunks for a query.
            model_name: Name of the model passed to ``ollama.generate``.
        """
        self.retriever = retriever
        self.model_name = model_name
        self.system_prompt = """You are a data privacy expert. Using the provided context from data privacy laws and guidelines, answer the query accurately and concisely in formal legal language."""

    def _build_prompt(self, query: str, context_chunks: List[str]) -> str:
        """Assemble the instruction text, the context and the query into one prompt."""
        context = "\n\n".join(context_chunks)
        return f"{self.system_prompt}\n\nContext:\n{context}\n\nQuery:\n{query}"

    def generate_response(self, query: str, top_k: int = 5) -> str:
        """Generate a response using retrieved context and the configured model.

        Args:
            query: The user's question.
            top_k: How many context chunks to request from the retriever.

        Returns:
            The text stored under the ``'response'`` key of the value returned
            by ``ollama.generate``.
        """
        # Retrieve relevant chunks
        context_chunks = self.retriever.retrieve(query, top_k=top_k)

        # Construct prompt
        full_prompt = self._build_prompt(query, context_chunks)

        # Generate response
        response = ollama.generate(
            model=self.model_name,
            prompt=full_prompt,
        )
        return response['response']

# Example Usage
if __name__ == "__main__":
    # Step 1: Process PDFs into text chunks and embed them
    processor = DataPrivacyProcessor()
    pdf_paths = ["RBI-Guidelines.pdf"]  # Replace with your 25 PDFs
    chunks = processor.extract_text_from_pdfs(pdf_paths)
    if not chunks:
        # Without any chunks there is nothing to index; stop here with a clear
        # message instead of failing later during index construction.
        raise SystemExit(f"No text chunks could be extracted from: {pdf_paths}")
    embeddings = processor.vectorize_chunks(chunks)

    # Step 2: Set up retriever, passing the embedder so queries are embedded
    # with the same model as the chunks
    retriever = PrivacyRetriever(embeddings, chunks, processor.embedder)

    # Step 3: Initialize RAG system
    rag = PrivacyRAG(retriever)

    # Test query
    query = "What are the penalties for data privacy violations in DPDPA 2023"
    response = rag.generate_response(query)
    print(response)

Batches: 100%|██████████| 6/6 [00:01<00:00,  3.50it/s]


Based on the provided context from data privacy laws and guidelines, it appears that there is no mention of a specific law or regulation called "DPDPA 2023". However, I can provide information on the general principles and regulations related to data privacy.

In India, the General Data Protection Regulation (GDPR) came into effect on August 24, 2018, which provides guidelines for data protection. The GDPR is applicable to any organization that processes personal data of individuals in the European Union, regardless of their location.

Regarding penalties for data privacy violations, under the GDPR, organizations can face severe penalties if they fail to comply with the regulation's provisions. These penalties include:

1. Fines: The maximum fine for non-compliance is €20 million or 4% of the organization's global annual revenue, whichever is higher.
2. Compensation: Individuals who have suffered damage due to a data breach can seek compensation from the responsible organization.

In a