In [15]:
# Install required libraries
!pip install PyPDF2 langchain-google-genai google-generativeai langchain langchain-community langchain-core chromadb


ERROR: Operation cancelled by user

In [16]:
# Import necessary libraries
import re
from PyPDF2 import PdfReader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
import os
import configparser


In [17]:
# Function to configure Google Generative AI with the API key
def configure_genai(api_key):
    """Configure the Google Generative AI client and return a Gemini model.

    Args:
        api_key: Google API key supplied by the caller. It must never be
            hard-coded in the notebook.

    Returns:
        A ``genai.GenerativeModel`` for ``'gemini-pro'``.
    """
    # Export the caller's key instead of a literal. The embeddings object in this
    # notebook is built without an explicit key, so it presumably reads
    # GOOGLE_API_KEY from the environment (confirm against langchain-google-genai).
    os.environ["GOOGLE_API_KEY"] = api_key
    genai.configure(api_key=api_key)
    return genai.GenerativeModel('gemini-pro')


In [18]:
# Class to split text into manageable chunks
class RecursiveCharacterTextSplitter:
    """Split a string into fixed-size, overlapping character windows.

    Despite the name, no recursion or separator handling is involved: the text
    is sliced every ``chunk_size - chunk_overlap`` characters.

    Args:
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Raises:
        ValueError: If the sizes would give a zero or negative stride, or would
            leave gaps between chunks.
    """

    def __init__(self, chunk_size=2000, chunk_overlap=100):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # An overlap >= chunk_size makes the stride zero (range() raises) or
        # negative (range() is empty, silently dropping all text). A negative
        # overlap would skip characters between chunks.
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
                f"for chunk_size {chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, context):
        """Return the chunks of ``context`` in order (``[]`` for an empty string)."""
        stride = self.chunk_size - self.chunk_overlap
        return [
            context[start:start + self.chunk_size]
            for start in range(0, len(context), stride)
        ]


# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, one newline after each page.

    Best-effort: if reading fails at any point, the error is printed and the
    text gathered so far (possibly the empty string) is returned.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_file)
        for pdf_page in reader.pages:
            page_texts.append(pdf_page.extract_text() + "\n")
    except Exception as exc:
        reason = str(exc)
        print(f"Error processing PDF file: {reason}")
    # Joining after the try block keeps pages collected before a failure.
    return "".join(page_texts)


# Function to clean the extracted text
def clean_text(text):
    """Normalise extracted text.

    Removes every character that is not an ASCII letter, a digit or whitespace,
    then collapses each whitespace run to a single space and trims both ends.
    """
    non_alphanumeric = r'[^A-Za-z0-9\s]+'
    whitespace_run = r'\s+'
    stripped_of_symbols = re.sub(non_alphanumeric, '', text)
    single_spaced = re.sub(whitespace_run, ' ', stripped_of_symbols)
    return single_spaced.strip()


# Function to process PDF files and return cleaned text chunks
def process_documents(files):
    """Extract, clean and chunk the text of the given PDF files.

    Each file is run through ``extract_text_from_pdf`` and ``clean_text``. The
    cleaned documents are joined with newlines and cut into 2000-character
    chunks that overlap by 100 characters. Returns ``[]`` when ``files`` is empty.
    """
    cleaned_documents = [clean_text(extract_text_from_pdf(pdf)) for pdf in files]
    if not cleaned_documents:
        return []

    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    return splitter.split_text('\n'.join(cleaned_documents))


# Function to create embeddings and build a retriever using Google Generative AI
def create_embeddings_and_index(texts):
    """Embed ``texts`` into a Chroma store and return it as a retriever.

    Uses the ``models/embedding-001`` Google Generative AI embedding model.
    """
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = Chroma.from_texts(texts, embedding_model)
    return vector_store.as_retriever()


In [19]:
# Function to generate a response from the model using a prompt
def generate_response(prompt, model, vector_index):
    """Answer ``prompt`` using documents retrieved from ``vector_index``.

    Args:
        prompt: The user's question.
        model: Object exposing ``generate_content(text)`` whose result has a
            ``.text`` attribute.
        vector_index: Retriever exposing ``invoke(query)`` that returns documents
            with a ``page_content`` attribute.

    Returns:
        The model's answer with surrounding whitespace removed and, if present,
        a single leading ``"AI:"`` label stripped.
    """
    docs = vector_index.invoke(prompt)
    retrieved_texts = [doc.page_content for doc in docs]

    # Combine the question and the retrieved documents into one context string.
    context = f"User: {prompt}\nDocuments: {' '.join(retrieved_texts)}"

    # Render the chat template to plain text; the context is passed as a template
    # variable rather than embedded in the template itself.
    chat_prompt_template = ChatPromptTemplate.from_messages([
        ("system", "Hello! How can I assist you today?"),
        ("user", "{query}")
    ])
    prompt_text = chat_prompt_template.invoke({"query": context}).to_string()

    # Generate the AI response.
    response = model.generate_content(prompt_text)
    response_text = response.text.strip()

    # Strip only a leading "AI:" role label. Splitting on every "AI:" would drop
    # text before the first occurrence and after the second whenever the answer
    # itself contains that string.
    role_label = "AI:"
    if response_text.startswith(role_label):
        response_text = response_text[len(role_label):].strip()

    return response_text


In [20]:
# Example main function to tie everything together
def main(pdf_files, query):
    """Run the whole pipeline: configure the model, index the PDFs, answer ``query``.

    Args:
        pdf_files: Iterable of PDF files accepted by ``process_documents``.
        query: The question to ask about the documents.

    Returns:
        The response text produced by ``generate_response``.

    Note:
        The text is re-extracted and re-embedded on every call.
    """
    # Never keep the API key in the notebook: take it from the environment and
    # fall back to an interactive prompt that does not echo the key.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        from getpass import getpass
        api_key = getpass("Google API key: ")
    model = configure_genai(api_key)

    # Process the documents
    processed_texts = process_documents(pdf_files)

    # Create embeddings and build the vector index
    vector_index = create_embeddings_and_index(processed_texts)

    # Generate response from the AI based on user query
    response = generate_response(query, model, vector_index)

    return response


Example

In [21]:
# Colab-only: open the browser upload dialog and keep the result for main().
# NOTE(review): `uploaded` is later iterated by process_documents; presumably it maps
# each saved filename to the file's bytes, so iteration yields filenames — confirm
# against the google.colab documentation.
from google.colab import files
uploaded = files.upload()

Saving Digital Personal Data Protection Act 2023.pdf to Digital Personal Data Protection Act 2023 (1).pdf


In [24]:
# Ask for an overall summary. main() re-extracts the PDF text and rebuilds the
# embeddings on every call.
query = "Summarize the file"
response = main(uploaded, query)
print(response)


Individuals have the right to request a summary of their personal data and processing activities from data fiduciaries (companies processing their data). They can also request a list of other data fiduciaries and processors the data has been shared with. However, this right does not apply to data sharing with other data fiduciaries authorized by law for law enforcement purposes.


In [23]:
# Ask for the key takeaways; `query` and `response` are overwritten by each query cell.
query = "Highlight the Key takeaways of the file"
response = main(uploaded, query)
print(response)

Key Takeaways:
- Data Principals have the right to access information about their personal data being processed.
- Data Fiduciaries must undertake measures for data protection, including impact assessments, audits, and other prescribed measures.
- Data Principals have the right to request corrections, updates, or deletions of their personal data.
- Data Fiduciaries must not track or behaviorally monitor children or target them with advertising without obtaining consent.
- The government may notify Significant Data Fiduciaries based on factors like data volume, sensitivity, and potential impact on India.
- Significant Data Fiduciaries have additional obligations, including appointing a Data Protection Officer and an independent data auditor.


In [25]:
# Targeted question: cross-border transfer of personal data.
query = "How does the Act regulate the transfer of personal data outside the country?"
response = main(uploaded, query)
print(response)

Section 16 of the Act states that the Central Government may restrict the transfer of personal data by a Data Fiduciary for processing to such country or territory outside India as may be so notified.


In [26]:
# Targeted question: penalties for non-compliance.
query = "What are the penalties for non-compliance with the Digital Personal Data Protection Act 2023?"
response = main(uploaded, query)
print(response)

- Breach of provisions of the Act or rules: May extend to two hundred and fifty crore rupees
- Breach in observing the obligation of Data Fiduciary to take reasonable security safeguards: May extend to two hundred crore rupees
- Breach in observing the obligation to give notice of a personal data breach: May extend to two hundred crore rupees
- Breach in observance of additional obligations in relation to children: May extend to one hundred and fifty crore rupees
- Breach in observance of additional obligations of Significant Data Fiduciary: May extend to ten thousand rupees
- Breach in observance of the duties under section 15: Up to the extent applicable for the breach in respect of which the proceedings under section 28
- Breach of any term of voluntary undertaking accepted by the Board: Up to the extent applicable for the breach in respect of which the proceedings under section 28
- Breach of any other provision of the Act or the rules made thereunder: Up to the extent applicable f