In [1]:
import ollama
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pdfplumber
import pytesseract
from PIL import Image
from transformers import LlamaTokenizer
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from langchain_ollama import OllamaLLM
from langchain.embeddings import HuggingFaceEmbeddings  # Local embeddings
from typing import List, Dict, Union, Tuple
import logging


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. PDF Processing and Chunking with OCR
class DataPrivacyProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b")
        
    def extract_text_from_pdfs(self, pdf_paths: List[str]) -> List[str]:
        all_chunks = []
        for pdf_path in pdf_paths:
            try:
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        text = page.extract_text()
                        if text and len(text.strip()) > 10:
                            chunks = [chunk.strip() for chunk in text.split('\n\n') if len(chunk) > 50]
                            all_chunks.extend(chunks)
                        else:
                            img = page.to_image(resolution=300).original
                            text = pytesseract.image_to_string(img)
                            chunks = [chunk.strip() for chunk in text.split('\n\n') if len(chunk) > 50]
                            all_chunks.extend(chunks)
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")
        return all_chunks

    def vectorize_chunks(self, chunks: List[str]) -> np.ndarray:
        try:
            embeddings = self.embedder.encode(chunks, show_progress_bar=True)
            return embeddings
        except Exception as e:
            raise ValueError(f"Vectorization failed: {e}")

In [3]:
class PrivacyRetriever:
    def __init__(self, embeddings: np.ndarray, chunks: List[str], embedder: SentenceTransformer):
        self.chunks = chunks
        self.embedder = embedder
        # Normalize embeddings for cosine similarity
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        self.index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product for cosine similarity
        self.index.add(embeddings.astype(np.float32))  # FAISS requires float32
        
    def retrieve(self, query: str, top_k: int = 5, min_similarity: float = 0.5) -> List[Tuple[str, float]]:
        """
        Retrieve relevant chunks with similarity scores
        
        Args:
            query: Search query string
            top_k: Number of results to return
            min_similarity: Minimum similarity score (0-1)
            
        Returns:
            List of tuples containing (chunk_text, similarity_score)
            
        Raises:
            ValueError: If query encoding fails
            RuntimeError: If retrieval fails
        """
        try:
            # Encode and normalize query
            query_embedding = self.embedder.encode([query], show_progress_bar=False)
            query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
            query_embedding = query_embedding.astype(np.float32)
            
            # Search index
            similarities, indices = self.index.search(query_embedding, top_k)
            
            # Process results
            results = []
            for sim, idx in zip(similarities[0], indices[0]):
                if sim >= min_similarity and 0 <= idx < len(self.chunks):
                    results.append((self.chunks[idx], float(sim)))
            
            return results
            
        except ValueError as ve:
            raise ValueError(f"Query encoding failed: {ve}")
        except IndexError as ie:
            raise RuntimeError(f"FAISS index error: {ie}")
        except Exception as e:
            raise RuntimeError(f"Retrieval failed unexpectedly: {e}")
        


In [4]:
# 3. RAG with LLaMA3:8B (Updated to handle tuples)
class PrivacyRAG:
    def __init__(self, retriever: PrivacyRetriever, model_name='llama3:8b', max_tokens=4096):
        self.retriever = retriever
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b")
        self.system_prompt = """You are a data privacy expert. Using the provided context from data privacy laws and guidelines, answer the query accurately and concisely in formal legal language."""
    
    def _truncate_context(self, context: str, query: str) -> str:
        prompt_tokens = len(self.tokenizer.encode(self.system_prompt))
        query_tokens = len(self.tokenizer.encode(query))
        available_tokens = self.max_tokens - prompt_tokens - query_tokens - 100
        context_tokens = self.tokenizer.encode(context)
        if len(context_tokens) > available_tokens:
            context_tokens = context_tokens[:available_tokens]
            return self.tokenizer.decode(context_tokens)
        return context
    
    def generate_response(self, query: str) -> str:
        try:
            context_chunks_with_scores = self.retriever.retrieve(query)
            # Extract only the chunks (first element of each tuple)
            context_chunks = [chunk for chunk, _ in context_chunks_with_scores]
            context = "\n\n".join(context_chunks)
            context = self._truncate_context(context, query)
            full_prompt = f"{self.system_prompt}\n\nContext:\n{context}\n\nQuery:\n{query}"
            response = ollama.generate(model=self.model_name, prompt=full_prompt)
            return response['response']
        except Exception as e:
            raise RuntimeError(f"Generation failed: {e}")

In [5]:
# 4. RAGAS Evaluation with Ollama
def evaluate_rag(rag: PrivacyRAG, test_data: List[dict]) -> dict:
    """Evaluate RAG system using RAGAS with Ollama and local embeddings."""
    # Set up Ollama as the evaluation LLM
    ollama_llm = OllamaLLM(model="llama3:8b")
    
    # Set up local embeddings
    local_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Prepare dataset
    questions = [item['question'] for item in test_data]
    ground_truths = [item['ground_truth'] for item in test_data]
    contexts = []
    answers = []
    
    for q in questions:
        context_chunks = rag.retriever.retrieve(q)
        contexts.append(context_chunks)
        answers.append(rag.generate_response(q))
    
    dataset = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths
    }
    
    # Evaluate with Ollama LLM and local embeddings
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision],
        llm=ollama_llm,
        embeddings=local_embeddings  # Override default OpenAI embeddings
    )
    return result

In [6]:
# Example Usage
if __name__ == "__main__":
    processor = DataPrivacyProcessor()
    pdf_paths = ["./data-privacy-pdf/RBI-Guidelines.pdf", 
                 "./data-privacy-pdf/4.-CBPR-Policies-Rules-and-Guidelines-Revised-For-Posting-3-16-updated-1709-2019.pdf", 
                 "./data-privacy-pdf/16MC9102DB7D5FE742CCB5D0715A77F6666E.pdf", 
                 "./data-privacy-pdf/2024-0118-Policy-SEBI_Circular_on_Cybersecurity_and_Cyber_Resilience_Framework_(CSCRF)_for_SEBI_Regulated.pdf", 
                 "./data-privacy-pdf/20240905112741.pdf", 
                 "./data-privacy-pdf/Aadhaar_Act_2016_as_amended.pdf", 
                 "./data-privacy-pdf/book_indiacybersecurity.pdf", 
                 "./data-privacy-pdf/DPDPA - 2023.pdf", 
                 "./data-privacy-pdf/EPRS_ATA(2020)659275_EN.pdf", 
                 "./data-privacy-pdf/GBS300411F.pdf", 
                 "./data-privacy-pdf/health_management_policy_bac9429a79.pdf", 
                 "./data-privacy-pdf/in098en.pdf", 
                 "/data-privacy-pdf/Information-Technology-Intermediary-Guidelines-and-Digital-Media-Ethics-Code-Rules-2021-updated-06.04.202.pdf", 
                 "./data-privacy-pdf/it_act_2000_updated.pdf", 
                 "./data-privacy-pdf/Legal Framework for Data Protection and Security and Privacy norms.pdf", 
                 "./data-privacy-pdf/Personal Data Protection Bill, 2019.pdf", 
                 "./data-privacy-pdf/Privacy and Data Protection.pdf", 
                 "./data-privacy-pdf/rti-act.pdf", 
                 "./data-privacy-pdf/Takshashila_07_11_2017.pdf"]
    chunks = processor.extract_text_from_pdfs(pdf_paths)
    embeddings = processor.vectorize_chunks(chunks)
    
    retriever = PrivacyRetriever(embeddings, chunks, processor.embedder)
    rag = PrivacyRAG(retriever)
    
    query = "How to plan an IS Audit as per the RBI guidelines"
    response = rag.generate_response(query)
    print("Response:", response)
    
'''    test_data = [
        {
            "question": "What are the penalties for data privacy violations in the updated IT act?",
            "ground_truth": "Under DPDPA 2023, penalties can include fines up to INR 2 lakhs per instance."
        }
    ]
    evaluation_results = evaluate_rag(rag, test_data)
    print("RAGAS Results:", evaluation_results)
    '''

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to 

Error processing /data-privacy-pdf/Information-Technology-Intermediary-Guidelines-and-Digital-Media-Ethics-Code-Rules-2021-updated-06.04.202.pdf: [Errno 2] No such file or directory: '/data-privacy-pdf/Information-Technology-Intermediary-Guidelines-and-Digital-Media-Ethics-Code-Rules-2021-updated-06.04.202.pdf'


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Response: As a data privacy expert, I'd be happy to help you with that!

According to the RBI guidelines, planning an IS Audit involves using a Risk-Based Approach. Here's a step-by-step guide to help you plan an IS Audit as per the RBI guidelines:

1. **Define the IS Audit Universe**: Identify the scope of the audit by determining which areas of the bank's operations will be audited.
2. **Conduct IT Risk Assessment**: Assess the IT risks associated with each area identified in step 1. This involves identifying, evaluating, and prioritizing potential IT risks.
3. **Develop an Audit Program**: Based on the risk assessment, develop an audit program that outlines the scope, objectives, and procedures for the IS Audit.
4. **Identify Audit Procedures**: Determine which audit procedures will be performed to achieve the audit's objectives. This may include reviewing records, testing controls, observing processes, or conducting interviews.
5. **Develop a Testing Plan**: Create a plan outlining

'    test_data = [\n        {\n            "question": "What are the penalties for data privacy violations in the updated IT act?",\n            "ground_truth": "Under DPDPA 2023, penalties can include fines up to INR 2 lakhs per instance."\n        }\n    ]\n    evaluation_results = evaluate_rag(rag, test_data)\n    print("RAGAS Results:", evaluation_results)\n    '