In [30]:
import ollama
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from typing import List
import pdfplumber
import pytesseract
from PIL import Image
from transformers import LlamaTokenizer
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from langchain_ollama import OllamaLLM
from langchain.embeddings import HuggingFaceEmbeddings  # Local embeddings

In [31]:
# 1. PDF Processing and Chunking with OCR
class DataPrivacyProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b")
        
    def extract_text_from_pdfs(self, pdf_paths: List[str]) -> List[str]:
        all_chunks = []
        for pdf_path in pdf_paths:
            try:
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        text = page.extract_text()
                        if text and len(text.strip()) > 10:
                            chunks = [chunk.strip() for chunk in text.split('\n\n') if len(chunk) > 50]
                            all_chunks.extend(chunks)
                        else:
                            img = page.to_image(resolution=300).original
                            text = pytesseract.image_to_string(img)
                            chunks = [chunk.strip() for chunk in text.split('\n\n') if len(chunk) > 50]
                            all_chunks.extend(chunks)
            except Exception as e:
                print(f"Error processing {pdf_path}: {e}")
        return all_chunks

    def vectorize_chunks(self, chunks: List[str]) -> np.ndarray:
        try:
            embeddings = self.embedder.encode(chunks, show_progress_bar=True)
            return embeddings
        except Exception as e:
            raise ValueError(f"Vectorization failed: {e}")

In [32]:
#2. Retrieval System
class PrivacyRetriever:
    def __init__(self, embeddings: np.ndarray, chunks: List[str], embedder: SentenceTransformer):
        self.chunks = chunks
        self.embedder = embedder
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)
        
    def retrieve(self, query: str, top_k: int = 5) -> List[str]:
        try:
            query_embedding = self.embedder.encode([query])
            distances, indices = self.index.search(query_embedding, top_k)
            return [self.chunks[idx] for idx in indices[0]]
        except Exception as e:
            raise RuntimeError(f"Retrieval failed: {e}")

In [33]:
# 3. RAG with LLaMA3:8B
class PrivacyRAG:
    def __init__(self, retriever: PrivacyRetriever, model_name='llama3:8b', max_tokens=4096):
        self.retriever = retriever
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b")
        self.system_prompt = """You are a data privacy expert. Using the provided context from data privacy laws and guidelines, answer the query accurately and concisely in formal legal language."""
    
    def _truncate_context(self, context: str, query: str) -> str:
        prompt_tokens = len(self.tokenizer.encode(self.system_prompt))
        query_tokens = len(self.tokenizer.encode(query))
        available_tokens = self.max_tokens - prompt_tokens - query_tokens - 100
        context_tokens = self.tokenizer.encode(context)
        if len(context_tokens) > available_tokens:
            context_tokens = context_tokens[:available_tokens]
            return self.tokenizer.decode(context_tokens)
        return context
    
    def generate_response(self, query: str) -> str:
        try:
            context_chunks = self.retriever.retrieve(query)
            context = "\n\n".join(context_chunks)
            context = self._truncate_context(context, query)
            full_prompt = f"{self.system_prompt}\n\nContext:\n{context}\n\nQuery:\n{query}"
            response = ollama.generate(model=self.model_name, prompt=full_prompt)
            return response['response']
        except Exception as e:
            raise RuntimeError(f"Generation failed: {e}")

In [34]:
# 4. RAGAS Evaluation with Ollama
def evaluate_rag(rag: PrivacyRAG, test_data: List[dict]) -> dict:
    """Evaluate RAG system using RAGAS with Ollama and local embeddings."""
    # Set up Ollama as the evaluation LLM
    ollama_llm = OllamaLLM(model="llama3:8b")
    
    # Set up local embeddings
    local_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Prepare dataset
    questions = [item['question'] for item in test_data]
    ground_truths = [item['ground_truth'] for item in test_data]
    contexts = []
    answers = []
    
    for q in questions:
        context_chunks = rag.retriever.retrieve(q)
        contexts.append(context_chunks)
        answers.append(rag.generate_response(q))
    
    dataset = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths
    }
    
    # Evaluate with Ollama LLM and local embeddings
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision],
        llm=ollama_llm,
        embeddings=local_embeddings  # Override default OpenAI embeddings
    )
    return result

In [35]:
# Example Usage
if __name__ == "__main__":
    processor = DataPrivacyProcessor()
    pdf_paths = ["it_act_2000_updated.pdf"]
    chunks = processor.extract_text_from_pdfs(pdf_paths)
    embeddings = processor.vectorize_chunks(chunks)
    
    retriever = PrivacyRetriever(embeddings, chunks, processor.embedder)
    rag = PrivacyRAG(retriever)
    
    query = "What are the penalties for data privacy violations in the updated IT act?"
    response = rag.generate_response(query)
    print("Response:", response)
    
'''    test_data = [
        {
            "question": "What are the penalties for data privacy violations in the updated IT act?",
            "ground_truth": "Under DPDPA 2023, penalties can include fines up to INR 2 lakhs per instance."
        }
    ]
    evaluation_results = evaluate_rag(rag, test_data)
    print("RAGAS Results:", evaluation_results)
    '''

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Response: Based on the provided context from the updated IT Act, here are some of the penalties for data privacy violations:

1. **Punishment for contravention**: Whoever contravenes any of the provisions of this Act shall be punishable with imprisonment for a term which may extend to three years or with fine which may extend to two lakh rupees or with both.
2. **Punishment for abetment of offences**: Whoever abets any offence shall, if the act abetted is committed in consequence of the abetment, and no express provision is made by this Act for the punishment of such abetment, be punished with the punishment provided for the offence under this Act (Section 84B).
3. **Punishment for failure to follow procedures**: Every Certifying Authority shall make use of hardware, software, and procedures that are secure from intrusion and misuse, provide a reasonable level of reliability in its services which are reasonably suited to the performance of intended functions, adhere to security procedu

'    test_data = [\n        {\n            "question": "What are the penalties for data privacy violations in the updated IT act?",\n            "ground_truth": "Under DPDPA 2023, penalties can include fines up to INR 2 lakhs per instance."\n        }\n    ]\n    evaluation_results = evaluate_rag(rag, test_data)\n    print("RAGAS Results:", evaluation_results)\n    '