In [None]:
!pip install -qU semantic-chunkers==0.0.3 datasets==2.19.1 PyPDF2 google-generativeai>=0.7.2 pdfplumber chromadb langdetect

In [None]:
import os
import pdfplumber
from datasets import Dataset
import pandas as pd
import chromadb
import google.generativeai as genai
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from typing import List
import numpy as np
from transformers import AutoTokenizer, AutoModel, MarianMTModel, MarianTokenizer
import torch
from sklearn.cluster import KMeans
import nltk
from langdetect import detect
nltk.download('punkt')

# Two-letter language codes the user may pick from, mapped to the English
# display name of each language.
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "nl": "Dutch",
    "pl": "Polish",
    "ru": "Russian",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
}

class TranslationManager:
    """Lazily load MarianMT models and translate text by pivoting through English.

    Attributes:
        translators: maps ``'<lang>_to_en'`` / ``'en_to_<lang>'`` to a dict
            holding the ``'model'`` and ``'tokenizer'`` for that direction.
        initialized_langs: language codes whose two directions are loaded.
            ``'en'`` is always present because it needs no model.
    """

    def __init__(self):
        self.translators = {}
        # Translation models are loaded only when a language is first used.
        self.initialized_langs = {'en'}

    @staticmethod
    def _load_translator(model_name):
        """Load the model/tokenizer pair published under ``model_name``."""
        return {
            'model': MarianMTModel.from_pretrained(model_name),
            'tokenizer': MarianTokenizer.from_pretrained(model_name),
        }

    @staticmethod
    def _run_translator(translator, text):
        """Run ``text`` through one loaded translator and return the decoded output.

        Input is truncated to 512 tokens, so very long texts are only
        partially translated.
        """
        tokenizer = translator['tokenizer']
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = translator['model'].generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    def initialize_language(self, lang_code):
        """Ensure both translation directions for ``lang_code`` are loaded.

        Args:
            lang_code: language code used to build the
                ``Helsinki-NLP/opus-mt-*`` model names.

        Returns:
            True if the language is usable (already loaded, or English),
            False if either model failed to load. On failure nothing is
            registered, so ``translators`` never holds a half-loaded language.
        """
        if lang_code == 'en' or lang_code in self.initialized_langs:
            return True

        try:
            # Build both directions first and commit them together below.
            loaded = {
                f'{lang_code}_to_en': self._load_translator(f'Helsinki-NLP/opus-mt-{lang_code}-en'),
                f'en_to_{lang_code}': self._load_translator(f'Helsinki-NLP/opus-mt-en-{lang_code}'),
            }
        except Exception as e:
            # Deliberately broad: loading is best-effort and callers fall
            # back to the untranslated text.
            print(f"Warning: Could not initialize {lang_code} translation: {e}")
            return False

        self.translators.update(loaded)
        self.initialized_langs.add(lang_code)
        return True

    def translate(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate ``text`` from ``source_lang`` to ``target_lang``.

        Non-English pairs are translated in two hops (source -> English ->
        target). The call is best-effort: if the languages are identical, the
        text is empty or whitespace-only, a model cannot be loaded, or
        translation raises, the original ``text`` is returned unchanged.
        """
        if source_lang == target_lang:
            return text

        # Nothing to translate; feeding blank input to the model would only
        # produce spurious generated text.
        if not text or not text.strip():
            return text

        if not self.initialize_language(source_lang) or not self.initialize_language(target_lang):
            return text

        try:
            translated = text
            if source_lang != 'en':
                translated = self._run_translator(self.translators[f'{source_lang}_to_en'], translated)
            if target_lang != 'en':
                translated = self._run_translator(self.translators[f'en_to_{target_lang}'], translated)
            return translated
        except Exception as e:
            print(f"Translation error: {e}")
            return text

class SemanticSimilarityChunker:
    """Group the sentences of a text into chunks by clustering their embeddings."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the tokenizer and encoder published under ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_embeddings(self, sentences: List[str], batch_size: int = 32) -> np.ndarray:
        """Convert sentences to L2-normalised embedding vectors.

        Each vector is the attention-masked mean of the token states, which is
        how sentence-transformers models such as all-MiniLM-L6-v2 define their
        sentence embedding. The first-token state alone is not trained to
        represent the sentence and padding must not contribute to the mean.

        Args:
            sentences: texts to embed.
            batch_size: number of texts per forward pass, bounding peak memory.

        Returns:
            Array with one row per input text, in input order. An empty input
            yields an empty ``(0, 0)`` array.
        """
        if not sentences:
            return np.empty((0, 0), dtype=np.float32)

        pooled_batches = []
        with torch.no_grad():
            for start in range(0, len(sentences), batch_size):
                batch = sentences[start:start + batch_size]
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512
                )
                token_states = self.model(**inputs).last_hidden_state

                # Zero out padded positions, then average over real tokens.
                mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
                summed = (token_states * mask).sum(dim=1)
                token_counts = mask.sum(dim=1).clamp(min=1e-9)
                pooled = torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)
                pooled_batches.append(pooled.numpy())

        return np.concatenate(pooled_batches, axis=0)

    def chunk_by_similarity(self, text: str, num_chunks: int = 20) -> List[str]:
        """Split ``text`` into at most ``num_chunks`` semantically grouped chunks.

        Sentences are first bundled in runs of five. If there are fewer
        bundles than ``num_chunks`` the bundles are returned as-is; otherwise
        they are clustered with KMeans and each non-empty cluster is joined
        into one chunk, ordered by cluster id rather than document position.

        Raises:
            ValueError: if ``num_chunks`` is less than 1.
        """
        if num_chunks < 1:
            raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")

        sentences = nltk.sent_tokenize(text)
        grouped_sentences = [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]

        if len(grouped_sentences) < num_chunks:
            return grouped_sentences

        embeddings = self.get_embeddings(grouped_sentences)
        # n_init is pinned so results do not depend on the scikit-learn
        # version's default.
        kmeans = KMeans(n_clusters=num_chunks, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(embeddings)

        chunked_text = [[] for _ in range(num_chunks)]
        for sentence, cluster_id in zip(grouped_sentences, clusters):
            chunked_text[cluster_id].append(sentence)

        return [" ".join(chunk) for chunk in chunked_text if chunk]

def extract_text_from_pdf(pdf_path):
    """Return the extracted text of each non-empty page of a PDF, in page order.

    Pages for which no text is extracted are skipped and reported on stdout
    with their 1-based page number.
    """
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        print(f"Processing PDF with {page_count} pages")
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                print(f"Empty text on page {page_number}")
                continue
            pages_text.append(page_text)
    return pages_text
def process_text_semantically(text: str, num_chunks: int = 20, chunk_size: int = 10, overlap: int = 3) -> List[str]:
    """Split ``text`` into sentence-window chunks, clustering when there are too many.

    The text is cut into windows of ``chunk_size`` sentences, consecutive
    windows sharing ``overlap`` sentences to keep context across boundaries.
    If that yields more than ``num_chunks`` windows, the original text is
    instead grouped into ``num_chunks`` clusters by semantic similarity.

    Args:
        text: document text to split.
        num_chunks: maximum number of chunks before clustering is applied.
        chunk_size: sentences per window.
        overlap: sentences shared by consecutive windows; must be smaller
            than ``chunk_size``.

    Returns:
        The list of chunks. Texts with fewer than ``chunk_size`` sentences
        are returned whole as a single chunk.

    Raises:
        ValueError: if ``chunk_size`` < 1 or ``overlap`` is not in
            ``[0, chunk_size)``.
    """
    if chunk_size < 1 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"Require chunk_size >= 1 and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )

    sentences = nltk.sent_tokenize(text)
    if len(sentences) < chunk_size:  # For very small documents
        return [text]

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(sentences), step):
        chunks.append(" ".join(sentences[start:start + chunk_size]))
        # Once a window reaches the last sentence, any further window would
        # only repeat sentences already covered by this one.
        if start + chunk_size >= len(sentences):
            break

    if len(chunks) > num_chunks:
        # The chunker (and its embedding model) is created only when
        # clustering is actually needed. It is given the original text, not
        # the overlapping windows, so no sentence is clustered twice.
        return SemanticSimilarityChunker().chunk_by_similarity(text, num_chunks)

    return chunks

def process_pdf_directory(directory_path):
    """Chunk every PDF found directly inside ``directory_path``.

    Files are matched by a case-insensitive ``.pdf`` suffix. Each document's
    page texts are joined with spaces and split into chunks, targeting one
    chunk per 1000 characters with a minimum of five. PDFs that yield no text
    are skipped.

    Returns:
        A flat list of the chunks from all documents.
    """
    collected = []
    for entry in os.listdir(directory_path):
        if not entry.lower().endswith('.pdf'):
            continue

        pages = extract_text_from_pdf(os.path.join(directory_path, entry))
        if not pages:
            continue

        full_text = " ".join(pages)
        target_chunks = max(5, len(full_text) // 1000)
        collected += process_text_semantically(full_text, target_chunks)
    return collected

def get_user_language_preference():
    """List the supported languages and prompt until a valid code is entered.

    Input is lower-cased before it is checked against ``SUPPORTED_LANGUAGES``.

    Returns:
        The chosen language code.
    """
    print("\nAvailable languages:")
    for code in SUPPORTED_LANGUAGES:
        print(f"{code}: {SUPPORTED_LANGUAGES[code]}")

    valid_codes = ', '.join(SUPPORTED_LANGUAGES.keys())
    choice = input("\nPlease enter your preferred language code (e.g., 'en' for English): ").lower()
    while choice not in SUPPORTED_LANGUAGES:
        print(f"Invalid language code. Please choose from: {valid_codes}")
        choice = input("\nPlease enter your preferred language code (e.g., 'en' for English): ").lower()
    return choice

def translate_query_response(text: str, source_lang: str, target_lang: str, translation_manager: TranslationManager) -> str:
    """Hand ``text`` to ``translation_manager`` and return what it produces."""
    translated = translation_manager.translate(text, source_lang, target_lang)
    return translated

def query_chroma(query, user_lang, translation_manager, collection):
    """Answer ``query`` from the indexed documents and print the response.

    The query is translated to English, the five closest chunks are fetched
    from ``collection``, chunks below a similarity threshold are dropped, and
    the rest are given to Gemini as context. Every outcome (answer, "nothing
    found", error) is printed; the function returns None.

    Args:
        query: the user's question, written in ``user_lang``.
        user_lang: a key of ``SUPPORTED_LANGUAGES``.
        translation_manager: translator used for the query and for the fixed
            English status messages.
        collection: Chroma collection to search.
    """
    def _localized(message):
        # Fixed status messages are authored in English.
        return translate_query_response(message, 'en', user_lang, translation_manager)

    model = genai.GenerativeModel('gemini-1.5-flash')

    # Translate query to English for processing
    english_query = translate_query_response(query, user_lang, 'en', translation_manager)

    # Actually search the collection using query embeddings
    query_results = collection.query(
        query_texts=[english_query],
        n_results=5,
        include=['documents', 'distances', 'metadatas']
    )

    # Check if we got any results
    if not query_results or not query_results['documents'] or not query_results['documents'][0]:
        print("\nResponse:", _localized("No relevant information found in the documents."))
        return

    # Get the relevant chunks and their similarity scores
    relevant_chunks = query_results['documents'][0]
    distances = query_results['distances'][0] if 'distances' in query_results else [1.0] * len(relevant_chunks)

    # Convert distances to similarity scores (closer to 1 is better)
    similarity_scores = [1 / (1 + d) for d in distances]

    # Filter out low-relevance chunks
    MIN_SIMILARITY_SCORE = 0.3
    filtered_results = [
        (chunk, score)
        for chunk, score in zip(relevant_chunks, similarity_scores)
        if score > MIN_SIMILARITY_SCORE
    ]

    if not filtered_results:
        print("\nResponse:", _localized("Found some content but it wasn't relevant enough to your query."))
        return

    # Prepare context from relevant chunks
    context = "\n\n".join([
        f"Relevant text (similarity: {score:.2f}):\n{chunk}"
        for chunk, score in filtered_results
    ])

    # Prepare the prompt for the model
    prompt = f"""
    Please answer this question based on the following excerpts from documents:

    Question: {english_query}

    Document excerpts:
    {context}

    Instructions:
    - Base your answer only on the provided excerpts
    - If the excerpts don't contain enough information, say so
    - Include relevant details and cite specific information from the excerpts
    - Be clear and concise
    - dont give answers more than 1000 characters
    - Answer based on the provided context
    - If the context is partially relevant, provide a partial answer based on available information
    - Focus on information with higher relevance scores
    - Cite specific details from the context
    - If you're unsure about any details, acknowledge the uncertainty
    - Keep the response clear and factual

    Answer in {SUPPORTED_LANGUAGES[user_lang]}:
    """

    try:
        # Generate response using the model
        response = model.generate_content(prompt)

        if hasattr(response, 'text') and response.text:
            # The prompt already asks for the answer in the user's language.
            # It must not be passed through the English->user translator:
            # that model expects English input and garbles anything else.
            print("\nResponse:", response.text)
        else:
            print("\nResponse:", _localized("Sorry, I couldn't generate a response. Please try rephrasing your question."))

    except Exception as e:
        print("\nError:", _localized(f"An error occurred while processing your query: {str(e)}"))

def main():
    """Build the document index from the PDFs and run the interactive query loop.

    Steps: configure the Gemini client, extract and chunk the PDFs under
    ``Datasets``, embed the chunks, load them into an in-memory Chroma
    collection, ask the user for a language, then answer queries until the
    user types ``exit``.

    The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
    variable; the function prints an error and returns if it is not set.
    """
    # Configure Google API. The key is never stored in source: a key committed
    # to a file is exposed to everyone with access to it and must be revoked.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("Error: GOOGLE_API_KEY environment variable is not set.")
        return
    genai.configure(api_key=api_key)

    # Directory containing PDF files
    directory_path = "Datasets"

    # Process PDFs with debug info
    print("Processing PDF documents...")
    datasets_list = process_pdf_directory(directory_path)

    if not datasets_list:
        print("Error: No text was extracted from PDFs. Please check your PDF files.")
        return

    print(f"Successfully extracted text from {len(datasets_list)} document sections")

    # Join all text and process semantically
    # NOTE(review): the per-document chunks are merged and re-chunked here with
    # the default chunk limit, so a large corpus ends up in few, very large
    # chunks — confirm this second pass is intended.
    complete_text = " ".join(datasets_list)
    print(f"Total text length: {len(complete_text)} characters")

    # Process text into chunks
    print("Processing text semantically...")
    text_chunks = process_text_semantically(complete_text)
    print(f"Created {len(text_chunks)} text chunks")

    # Initialize embedding model
    print("Initializing embedding model...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Create embeddings with progress indicator
    print("Creating embeddings...")
    embeddings = []
    for i, chunk in enumerate(text_chunks):
        # Stored as plain lists of floats so the collection receives the same
        # embedding format regardless of how it treats numpy arrays.
        embeddings.append(model.encode(chunk).tolist())
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{len(text_chunks)} chunks")

    # Initialize Chroma
    print("Initializing Chroma database...")
    client = chromadb.Client()
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-MiniLM-L6-v2"
    )

    # Verify data integrity
    if len(text_chunks) != len(embeddings):
        raise ValueError(f"Mismatch in lengths: chunks={len(text_chunks)}, embeddings={len(embeddings)}")

    # Reset collection
    try:
        client.delete_collection(name="my_collection")
    except Exception as e:
        print(f"Note: Could not delete existing collection: {e}")

    # Create and populate collection
    collection = client.create_collection(
        name="my_collection",
        embedding_function=embedding_function
    )

    # Add documents to collection with metadata
    collection.add(
        documents=text_chunks,
        ids=[str(i) for i in range(len(text_chunks))],
        embeddings=embeddings,
        metadatas=[{"chunk_id": i} for i in range(len(text_chunks))]
    )

    print(f"Successfully added {len(text_chunks)} chunks to the database")

    # Initialize translation manager
    translation_manager = TranslationManager()

    # Get user's preferred language
    user_lang = get_user_language_preference()

    # Translate the input prompt once, outside the loop
    exit_msg = translate_query_response(
        "Enter your query (or 'exit' to quit): ",
        'en',
        user_lang,
        translation_manager
    )

    # Main query loop
    print("\nReady for queries!")
    while True:
        user_query = input(exit_msg)
        if user_query.lower() == 'exit':
            break
        query_chroma(user_query, user_lang, translation_manager, collection)


# Run the interactive pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()