In [0]:
%pylab inline

In [0]:
import dataiku
from langchain_core.messages import HumanMessage, SystemMessage

In [0]:
client = dataiku.api_client()
project = client.get_default_project()
llm_list = project.list_llms()
for llm in llm_list:
    print(f"- {llm.description} (id: {llm.id})")


In [0]:

LLM_ID = "custom:iliad-plugin-conn-prod:gpt-4o"  # Fill with a valid LLM_ID
llm = project.get_llm(LLM_ID)
lcllm = llm.as_langchain_llm()
lcllmResp = lcllm.invoke("When was the movie Citizen Kane released?")
print(lcllmResp)

question = "When was the movie Citizen Kane released?"
system_msg = """You are an expert in the history of American cinema.
You always answer questions with a lot of passion and enthusiasm.
"""

messages = [
    SystemMessage(content=system_msg),
    HumanMessage(content=question)
]

lcllmResp = lcllm.invoke(messages)
print(lcllmResp)

# ivector store chromadb

In [0]:
import dataiku
import pandas as pd
import numpy as np
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.vectorstores import FAISS, Chroma
from langchain.docstore.document import Document
from langchain.schema import Document as LangChainDocument
from langchain.embeddings.base import Embeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
import pickle
import os
from typing import List
import re
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Tuple, Optional, Union
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import tiktoken
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory
from langchain.schema import SystemMessage, HumanMessage, AIMessage
import en_core_web_lg
nlp = en_core_web_lg.load()


class ModelDefination:
    def __init__(self, embedding_model, llm_id, vector_store_type="FAISS"):
        # Store model parameters
        self.embedding_model_name = embedding_model
        self.llm_id = llm_id
        self.vector_store_type = vector_store_type.upper()  # Normalize to uppercase
        self.embedding_model = None
        self.llm = None
        self.client = dataiku.api_client()
        self.project = self.client.get_default_project()

        # Define paths for vector stores
        self.faiss_index_path = "./faiss_index"
        self.chromadb_index_path = "./chromadb_index"

        # Initialize models
        self._initialize_embedding_model()
        self._initialize_llm()

        if self.embedding_model is None:
            raise ValueError("Embedding model could not be initialized")
        if self.llm is None:
            raise ValueError("LLM could not be initialized")

        self.chat_history = ChatMessageHistory()
        self.buffer_memory = ConversationBufferMemory(
            memory_key="chat_history",
            chat_memory=self.chat_history,
            return_messages=True,
            output_key="answer" 
        )

        self.summary_memory = ConversationSummaryBufferMemory(
            llm=self.llm,
            memory_key="chat_history",
            return_messages=True,
            max_token_limit=1000,
            output_key="answer"  
        )

        print(f"Model Definition initialized with vector store type: {self.vector_store_type}")

    def _initialize_embedding_model(self):
        try:
            if self.embedding_model_name == "text-embedding-ada-002":
                client = dataiku.api_client()
                connection = client.get_connection("text-embedding-ada-002")
                connection_params = connection.get_info()["params"]

                available_deployments = connection_params.get("availableDeployments", [])
                if not available_deployments:
                    raise ValueError("No deployments found for embedding model.")

                self.embedding_deployment_name = available_deployments[0]["name"]
                model_name = available_deployments[0]["underlyingModelName"]
                azure_openai_endpoint = f"https://{connection_params['resourceName']}.openai.azure.com/"

                self.embedding_model = AzureOpenAIEmbeddings(
                    azure_endpoint=azure_openai_endpoint,
                    api_key=connection_params.get("apiKey"),
                    deployment=self.embedding_deployment_name,
                    model=model_name,
                    chunk_size=1000
                )
                print(f"Initialized embedding model: {model_name}")

            elif "custom:iliad-plugin-conn-prod" in self.embedding_model_name:
                # Get the embedding model from the project and use the LangChain wrapper
                emb_model = self.project.get_llm(self.embedding_model_name)
                self.embedding_model = emb_model.as_langchain_embeddings()
                print(f"Initialized LangChain embedding model: {self.embedding_model_name}")

        except Exception as e:
            print(f"Error initializing embedding model: {str(e)}")
            import traceback
            traceback.print_exc()

    def _initialize_llm(self):
        try:
            if self.llm_id == "gpt-35-turbo-16k":
                connection = self.client.get_connection("gpt-35-turbo-16k-2")
                connection_params = connection.get_info()["params"]

                available_deployments_llm = connection_params.get("availableDeployments", [])
                if not available_deployments_llm:
                    raise ValueError("No deployments found for LLM.")

                llm_deployment_name = available_deployments_llm[0]["name"]
                llm_model_name = available_deployments_llm[0]["underlyingModelName"]
                azure_llm_endpoint = f"https://{connection_params['resourceName']}.openai.azure.com/"

                self.llm = AzureChatOpenAI(
                    azure_endpoint=azure_llm_endpoint,
                    api_key=connection_params.get("apiKey"),
                    deployment_name=llm_deployment_name,
                    model_name=llm_model_name,
                    temperature=0.1,
                    api_version="2024-02-01"
                )
                print(f"Initialized Azure LLM: {llm_model_name}")

            elif "custom:iliad-plugin-conn-prod" in self.llm_id:
                # Get the LLM model from the project and use the LangChain wrapper
                llm_model = self.project.get_llm(self.llm_id)
                self.llm = llm_model.as_langchain_llm()
                print(f"Initialized LangChain LLM: {self.llm_id}")

        except Exception as e:
            print(f"Failed to initialize LLM: {str(e)}")
            raise Exception(f"Error initializing LLM: {str(e)}")

    def document_preparation(self, df):
        """
        Prepare documents and create vector store index.
        Supports both FAISS and ChromaDB.
        """
        try:
            if not isinstance(df, pd.DataFrame):
                raise ValueError("The provided 'df' is not a pandas DataFrame")

            documents = [
                LangChainDocument(
                    page_content=row["chunk_text"],
                    metadata={"id": str(index), "metadata": row["metadata"]}
                )
                for index, row in df.iterrows()
            ]

            # Create success flag
            success = False

            if self.vector_store_type.upper() == "FAISS":
                store_path = self.faiss_index_path
                os.makedirs(store_path, exist_ok=True)

                if "embeddings" in df.columns:
                    # Use pre-computed embeddings if available
                    embeddings = np.array([eval(embed) if isinstance(embed, str) else embed for embed in df["embeddings"]])
                    vectorstore = FAISS.from_embeddings(
                        text_embeddings=zip(df["chunk_text"], embeddings),
                        embedding=self.embedding_model,
                        metadatas=[{"id": str(index), "metadata": row["metadata"]} for index, row in df.iterrows()]
                    )
                else:
                    # Compute embeddings on-the-fly
                    vectorstore = FAISS.from_documents(documents, embedding=self.embedding_model)

                vectorstore.save_local(store_path)
                print(f"FAISS index saved successfully at {store_path}.")
                success = True

            elif self.vector_store_type.upper() == "CHROMADB":
                store_path = self.chromadb_index_path
                os.makedirs(store_path, exist_ok=True)

                try:
                    # First check if we have pre-computed embeddings
                    has_embeddings = "embeddings" in df.columns
                    print(f"Creating ChromaDB with {'pre-computed' if has_embeddings else 'on-the-fly'} embeddings")

                    # With newer versions of ChromaDB and LangChain integration, we handle this differently
                    # Create basic documents without embeddings first
                    vectorstore = Chroma.from_documents(
                        documents=documents,
                        embedding=self.embedding_model,
                        persist_directory=store_path
                    )

                    # If we have pre-computed embeddings, we'll try to add them
                    if has_embeddings:
                        try:
                            # Get the raw client from the LangChain wrapper
                            collection = vectorstore._collection

                            # Get the document IDs
                            ids = [str(i) for i in range(len(documents))]

                            # Process embeddings from DataFrame
                            processed_embeddings = []
                            for embed in df["embeddings"]:
                                if isinstance(embed, str):
                                    try:
                                        # Convert string representation to list
                                        embed_list = eval(embed)
                                        processed_embeddings.append(embed_list)
                                    except:
                                        print(f"Warning: Could not parse embedding string")
                                        # Return None for this embedding to flag it
                                        processed_embeddings.append(None)
                                else:
                                    # If it's already a list/array, just append it
                                    processed_embeddings.append(embed)

                            # Check if we have valid embeddings for all documents
                            if all(embed is not None for embed in processed_embeddings) and len(processed_embeddings) == len(documents):
                                print(f"Adding {len(processed_embeddings)} pre-computed embeddings to ChromaDB")

                                # Use lower-level API to update the embeddings
                                # This approach varies by ChromaDB version, so we'll try multiple methods

                                try:
                                    # Method 1: Try using the newer ChromaDB API (v0.4.0+)
                                    # This recreates the collection with embeddings
                                    texts = [doc.page_content for doc in documents]
                                    metadatas = [doc.metadata for doc in documents]

                                    # Delete existing collection
                                    vectorstore.delete_collection()

                                    # Create new collection with embeddings
                                    import chromadb
                                    from chromadb.config import Settings

                                    # Try to get client with proper settings
                                    try:
                                        client = chromadb.PersistentClient(path=store_path)
                                    except:
                                        # Fallback to basic client
                                        client = chromadb.Client()

                                    # Create collection
                                    collection = client.create_collection(name="langchain")

                                    # Add documents with embeddings
                                    collection.add(
                                        documents=texts,
                                        embeddings=processed_embeddings,
                                        metadatas=metadatas,
                                        ids=ids
                                    )

                                    # Reconnect LangChain wrapper to the collection
                                    vectorstore = Chroma(
                                        client=client,
                                        collection_name="langchain",
                                        embedding_function=self.embedding_model,
                                        persist_directory=store_path 
                                    )

                                    print("Successfully added embeddings using ChromaDB native API")

                                except Exception as e1:
                                    print(f"Method 1 failed: {e1}")

                                    try:
                                        # Method 2: Try using the LangChain wrapper's API
                                        # This might work with some versions
                                        for i, doc in enumerate(documents):
                                            vectorstore.add_embeddings(
                                                texts=[doc.page_content],
                                                embeddings=[processed_embeddings[i]],
                                                metadatas=[doc.metadata]
                                            )
                                        print("Successfully added embeddings using LangChain wrapper")

                                    except Exception as e2:
                                        print(f"Method 2 failed: {e2}")

                                        try:
                                            # Method 3: Fallback to direct collection update
                                            # This works with some versions
                                            for i, doc_id in enumerate(ids):
                                                collection.update(
                                                    ids=[doc_id],
                                                    embeddings=[processed_embeddings[i]]
                                                )
                                            print("Successfully updated embeddings using direct collection update")

                                        except Exception as e3:
                                            print(f"Method 3 failed: {e3}")
                                            print("Could not add pre-computed embeddings, using model-generated embeddings instead")
                            else:
                                print("Some embeddings could not be parsed, using model-generated embeddings instead")
                        except Exception as embed_error:
                            print(f"Error working with pre-computed embeddings: {embed_error}")

                    # Persist ChromaDB to disk
                    try:
                        vectorstore.persist()
                        print(f"ChromaDB index saved successfully at {store_path}.")
                        success = True
                    except Exception as persist_error:
                        print(f"Warning: Could not persist ChromaDB: {persist_error}")
                        # Even if persist fails, we might still have a working in-memory DB
                        success = True

                except Exception as chroma_error:
                    print(f"ChromaDB integration error: {chroma_error}")
                    import traceback
                    traceback.print_exc()

                    # Try a simplified approach as last resort
                    try:
                        print("Trying simplified ChromaDB approach...")
                        vectorstore = Chroma.from_documents(
                            documents=documents,
                            embedding=self.embedding_model,
                            persist_directory=store_path
                        )
                        success = True
                        print("Simplified ChromaDB approach succeeded")
                    except Exception as simple_error:
                        print(f"Simplified ChromaDB approach failed: {simple_error}")
                        success = False

            else:
                raise ValueError(f"Unsupported vector store type: {self.vector_store_type}")

            return success

        except Exception as e:
            print(f"Failed to create index in Vector Store: {str(e)}")
            import traceback
            traceback.print_exc()
            return False

        

    def _compute_semantic_similarity(self, query_vector, document_vectors):
        """
        Compute semantic similarity between query vector and document vectors.
        """
        if not document_vectors:
            return []

        query_vector = np.array(query_vector).reshape(1, -1)
        document_vectors = np.array(document_vectors)
        return cosine_similarity(query_vector, document_vectors).flatten()

    def _preprocess_query(self, query):
        """
        Preprocess the query to extract key terms and concepts.
        """
        stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 'when', 'where', 'how', 'is', 'are', 'was', 'were'}
        query = re.sub(r'[^\w\s]', ' ', query.lower())
        words = query.split()
        key_terms = [word for word in words if word not in stop_words and len(word) > 2]
        entities = re.findall(r'\b[A-Z][a-zA-Z]+\b', query)
        return {"original": query, "key_terms": key_terms, "entities": entities}

    def _rerank_documents(self, query, documents, embedded_query=None):
        """
        Rerank documents based on semantic and lexical relevance
        """
        if not documents:
            return []
        
        # Get query components
        query_info = self._preprocess_query(query)
        key_terms = query_info['key_terms']
        
        # Get embeddings for reranking if not provided
        if embedded_query is None and hasattr(self.embedding_model, 'embed_query'):
            embedded_query = self.embedding_model.embed_query(query)
        
        # Extract document embeddings
        doc_embeddings = []
        for doc in documents:
            if hasattr(doc, 'metadata') and 'embedding' in doc.metadata:
                doc_embeddings.append(doc.metadata['embedding'])
            else:
                # If no embedding in metadata, create one
                if hasattr(self.embedding_model, 'embed_documents'):
                    doc_embedding = self.embedding_model.embed_documents([doc.page_content])[0]
                    doc_embeddings.append(doc_embedding)
        
        # Compute semantic similarity if we have embeddings
        if embedded_query is not None and doc_embeddings:
            semantic_scores = self._compute_semantic_similarity(embedded_query, doc_embeddings)
        else:
            semantic_scores = [0] * len(documents)
        
        # Compute lexical similarity (term frequency)
        lexical_scores = []
        for doc in documents:
            content = doc.page_content.lower()
            term_matches = sum(1 for term in key_terms if term in content)
            lexical_scores.append(term_matches / max(1, len(key_terms)))
        
        # Combine scores (0.7 semantic, 0.3 lexical)
        combined_scores = [0.7 * sem + 0.3 * lex for sem, lex in zip(semantic_scores, lexical_scores)]
        
        # Create result tuples with score and document
        results = [(score, doc) for score, doc in zip(combined_scores, documents)]
        
        # Sort by score (descending)
        results.sort(reverse=True, key=lambda x: x[0])
        
        # Return only the documents with their scores
        return [(doc, score) for score, doc in results]
    
    def get_conversational_chain(self, retriever, memory_type="summary"):
        if memory_type == "summary":
            memory = ConversationSummaryBufferMemory(
                llm=self.llm,
                memory_key="chat_history",
                return_messages=True,
                max_token_limit=1000,
                output_key="answer"  
            )
        else:
            memory = ConversationBufferMemory(
                memory_key="chat_history",
                chat_memory=self.chat_history,
                return_messages=True,
                output_key="answer"  
            )

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=retriever,
            memory=memory,
            return_source_documents=True,
            verbose=True
        )
        return conversation_chain
    
    def get_retriever(self, df=None, top_k=10):
        """
        Get a retriever based on the configured vector store type.
        Will use an existing index or create a new one if df is provided.
        """
        try:
            vectorstore = None
            
            # Check if we need to create the index
            if df is not None:
                self.document_preparation(df)
            
            # Try to load the appropriate vector store
            if self.vector_store_type.upper() == "FAISS":
                if os.path.exists(self.faiss_index_path):
                    vectorstore = FAISS.load_local(
                        self.faiss_index_path, 
                        self.embedding_model, 
                        allow_dangerous_deserialization=True
                    )
                    print(f"Successfully loaded FAISS vector store from {self.faiss_index_path}")
                else:
                    print(f"FAISS index not found at {self.faiss_index_path}")
                    if df is not None:
                        print("Creating FAISS index from provided DataFrame...")
                        success = self.document_preparation(df)
                        if success:
                            vectorstore = FAISS.load_local(
                                self.faiss_index_path, 
                                self.embedding_model, 
                                allow_dangerous_deserialization=True
                            )
                            print(f"Successfully created and loaded FAISS vector store")
                        else:
                            raise ValueError("Failed to create FAISS index")
                    else:
                        raise FileNotFoundError(f"FAISS index not found at {self.faiss_index_path} and no DataFrame provided to create it")
            
            elif self.vector_store_type.upper() == "CHROMADB":
                if os.path.exists(self.chromadb_index_path):
                    vectorstore = Chroma(
                        persist_directory=self.chromadb_index_path,
                        embedding_function=self.embedding_model
                    )
                    print(f"Successfully loaded ChromaDB vector store from {self.chromadb_index_path}")
                else:
                    print(f"ChromaDB index not found at {self.chromadb_index_path}")
                    if df is not None:
                        print("Creating ChromaDB index from provided DataFrame...")
                        success = self.document_preparation(df)
                        if success:
                            vectorstore = Chroma(
                                persist_directory=self.chromadb_index_path,
                                embedding_function=self.embedding_model
                            )
                            print(f"Successfully created and loaded ChromaDB vector store")
                        else:
                            raise ValueError("Failed to create ChromaDB index")
                    else:
                        raise FileNotFoundError(f"ChromaDB index not found at {self.chromadb_index_path} and no DataFrame provided to create it")
            
            # Create retriever from the loaded vector store
            if vectorstore:
                retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": top_k})
                return retriever
            else:
                raise ValueError(f"Could not create retriever for {self.vector_store_type}")
                
        except Exception as e:
            print(f"Error getting retriever: {str(e)}")
            import traceback
            traceback.print_exc()
            return None
    
    def retrieve_relevant_chunks(self, df, query, top_k=10):
        """
        Retrieve relevant chunks based on query similarity with improved relevance
        using the configured vector store type (FAISS or ChromaDB).
        """
        try:
            print(f"Retrieving documents for query: {query[:50]}... using {self.vector_store_type}")

            # Get or create a retriever for the specified vector store type
            retriever = self.get_retriever(df, top_k * 2)  # Get more for reranking
            
            if not retriever:
                print("Failed to create retriever")
                return []

            # Make sure the query is embedded 
            embedded_query = None
            try:
                if hasattr(self.embedding_model, 'embed_query'):
                    embedded_query = self.embedding_model.embed_query(query)
                elif hasattr(self.embedding_model, 'encode'):
                    embedded_query = self.embedding_model.encode(query)
            except Exception as embed_error:
                print(f"Warning: Failed to explicitly embed query: {embed_error}")

            # Retrieve relevant documents
            try:
                print("Attempting to retrieve documents...")
                # Use the standard LangChain invoke method
                initial_docs = retriever.invoke(query)
                print(f"Successfully retrieved {len(initial_docs)} documents for reranking")
            except Exception as invoke_error:
                print(f"Failed to use invoke method: {invoke_error}")
                # Fall back to the older method
                try:
                    if self.vector_store_type.upper() == "FAISS":
                        # For FAISS, try similarity_search_by_vector if invoke failed
                        vectorstore = FAISS.load_local(
                            self.faiss_index_path, 
                            self.embedding_model, 
                            allow_dangerous_deserialization=True
                        )
                        if embedded_query is not None:
                            initial_docs = vectorstore.similarity_search_by_vector(embedded_query, k=top_k * 2)
                        else:
                            initial_docs = vectorstore.similarity_search(query, k=top_k * 2)
                    
                    elif self.vector_store_type.upper() == "CHROMADB":
                        # For ChromaDB, try direct search methods
                        chromadb_store = Chroma(
                            persist_directory=self.chromadb_index_path,
                            embedding_function=self.embedding_model
                        )
                        if embedded_query is not None:
                            # For ChromaDB with embedded query
                            initial_docs = chromadb_store.similarity_search_by_vector(embedded_query, k=top_k * 2)
                        else:
                            # Standard text search
                            initial_docs = chromadb_store.similarity_search(query, k=top_k * 2)
                    
                    print(f"Successfully retrieved {len(initial_docs)} documents using fallback method")
                except Exception as retrieve_error:
                    print(f"Failed to retrieve documents: {retrieve_error}")
                    return []

            # Rerank the documents for better relevance
            reranked_docs = self._rerank_documents(query, initial_docs, embedded_query)

            # Take top k after reranking
            top_docs = reranked_docs[:top_k]

            # Format the results
            results = []
            for doc, score in top_docs:
                result = {
                    "chunk_text": doc.page_content,
                    "metadata": doc.metadata,
                    "score": score
                }
                results.append(result)

            return results

        except Exception as e:
            print(f"Failed to retrieve data from Vector Store: {str(e)}")
            import traceback
            traceback.print_exc()
            return []
    
    def mask_pii_from_chunks(self, retrieved_chunks):
        """
        Mask PII in retrieved document chunks using Presidio.
        """
        # Validate the input
        if not isinstance(retrieved_chunks, list):
            raise ValueError("retrieved_chunks must be a list of dictionaries")

        analyzer = AnalyzerEngine()
        anonymizer = AnonymizerEngine()

        masked_results = []

        for chunk in retrieved_chunks:
            original_text = chunk.get("chunk_text", "")
            metadata = chunk.get("metadata", {})
            score = chunk.get("score", 0)

            # Analyze for PII with error handling
            try:
                analysis_results = analyzer.analyze(text=original_text, entities=None, language='en')
            except Exception as e:
                print(f"PII analysis failed for chunk: {original_text[:30]}... Error: {e}")
                analysis_results = []

            # Convert PII detection results to JSON-like format
            pii_entities = [
                {
                    "entity_type": result.entity_type,
                    "start": result.start,
                    "end": result.end,
                    "score": result.score
                }
                for result in analysis_results
            ]

            # Anonymize the text with error handling
            try:
                anonymized_result = anonymizer.anonymize(text=original_text, analyzer_results=analysis_results)
                masked_text = anonymized_result.text
            except Exception as e:
                print(f"Anonymization failed for chunk: {original_text[:30]}... Error: {e}")
                masked_text = original_text  # Fallback to original if anonymization fails

            masked_results.append({
                "original_text": original_text,
                "masked_text": masked_text,
                "pii_entities": pii_entities,
                "metadata": metadata,
                "score": score
            })

        return masked_results

    def ask_question(self, query, df=None, memory_type="summary"):
        """
        Ask a question to the bot using the specified memory type.
        memory_type: 'summary' or 'buffer'
        """
        try:
            # Get retriever, creating index if needed
            retriever = self.get_retriever(df)
            if not retriever:
                return f"Error: Could not create retriever for {self.vector_store_type}"
                
            # Create the conversation chain
            chain = self.get_conversational_chain(retriever, memory_type=memory_type)
            result = chain.invoke({"question": query})
            return result["answer"]
        except Exception as e:
            print(f"Error asking question: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
    
    def generate_llm_response(self, user_query, df, system_prompt=None, return_context=True):
        """
        Generate LLM response based on retrieved chunks with improved relevance and conversational memory.
        Uses LangChain-compatible LLM invocation.
        """
        print(f"Generating LLM response for query: {user_query[:50]}... using {self.vector_store_type}")

        if self.llm is None:
            return ("Error: No LLM instance available", []) if return_context else "Error: No LLM instance available"

        try:
            # Step 1: Extract key terms from the user query for better retrieval
            query_info = self._preprocess_query(user_query)
            key_terms = query_info['key_terms']

            print(f"Identified key terms: {key_terms}")

            # Step 2: Create an augmented query with the key terms emphasized
            augmented_query = user_query
            if key_terms:
                augmented_query = f"{user_query} [KEY TERMS: {', '.join(key_terms)}]"

            # Step 3: Retrieve relevant chunks using the configured vector store
            relevant_chunks = self.retrieve_relevant_chunks(df, augmented_query, top_k=10)

            if not relevant_chunks:
                return (
                    "No relevant information found in the provided documents.",
                    []
                ) if return_context else "No relevant information found in the provided documents."

            print("Retrieved Relevant Chunks:", len(relevant_chunks))
            for i, chunk in enumerate(relevant_chunks, 1):
                print(f"\nResult {i}:")
                print(f"Chunk Text: {chunk['chunk_text'][:100]}...")  # Print just the beginning
                print(f"Metadata: {chunk['metadata']}")
                if chunk["score"] is not None:
                    print(f"Relevance Score: {chunk['score']:.4f}")

            # Step 4: Mask PII in the retrieved chunks
            print("Masking PII from retrieved chunks...")
            masked_chunks = self.mask_pii_from_chunks(relevant_chunks)

            # Step 5: Prepare formatted context using the masked text
            formatted_chunks = ""
            for i, chunk in enumerate(masked_chunks, 1):
                formatted_chunks += f"CHUNK {i}:\n{chunk['masked_text']}\n"
                formatted_chunks += f"Source: {chunk['metadata'].get('source', 'Unknown')}\n\n"

            # Step 6: Create custom prompt
            system_message_content = system_prompt if system_prompt else """
                You are an AI assistant that delivers precise, concise answers from provided documents. Follow these guidelines:

                1. Analyze ALL content thoroughly for ANY information related to the query, even partial matches.
                2. Present information directly without phrases like "the document chunks provide" or "Document 1/2/3 states."
                3. If referencing a specific document, use its actual name (e.g., "According to the iEngage Quick Reference Guide").
                4. Provide ONLY relevant facts without adding commentary, introductions, or conclusions.
                5. Format information as direct statements rather than numbered lists unless specifically requested.
                6. Include ALL potentially relevant details no matter how minimal.
                7. Present information in a concise, summarized format.
                8. Only state information is insufficient if absolutely nothing related can be found.
                9. Never generate content containing hate speech, offensive language, violence, threats, or misinformation.

                User Query:
                {query}

                Document Chunks:
                {context}

                Answer:
                """

            # Step 7: Create LangChain compatible messages
            messages = [
                SystemMessage(content=system_message_content),
                HumanMessage(content=f"""
                User Query:
                {user_query}

                Document Chunks:
                {formatted_chunks}
                """)
            ]

            # Step 8: Call the LLM with the messages using LangChain invoke
            print("Sending prompt to LLM...")
            try:
                response = self.llm.invoke(messages)
                print("Successfully received LLM response")

                # Step 9: Extract the response text
                if hasattr(response, "content"):
                    final_response = response.content
                else:
                    final_response = str(response)

                # Step 10: Update memory using ask_question
                try:
                    # Create retriever for memory update
                    retriever = self.get_retriever(df)
                    if retriever:
                        memory_chain = self.get_conversational_chain(retriever, memory_type="summary")
                        _ = memory_chain.invoke({"question": user_query})
                except Exception as mem_error:
                    print(f"Warning: Could not update conversation memory: {mem_error}")

                # Step 11: Prepare context for optional return
                context_dict = {
                    "relevant_chunks": masked_chunks,
                    "formatted_context": formatted_chunks,
                    "query": user_query,
                    "key_terms": key_terms,
                    "vector_store_used": self.vector_store_type
                }

                if return_context:
                    return final_response, context_dict
                else:
                    return final_response

            except ValueError as ve:
                print(f"ValueError encountered: {ve}")
                error_msg = f"Error generating response: {ve}"
                return (error_msg, []) if return_context else error_msg

        except Exception as e:
            print(f"Error in generate_llm_response: {str(e)}")
            import traceback
            traceback.print_exc()
            error_msg = f"Error generating response: {str(e)}"
            return (error_msg, []) if return_context else error_msg
        
    def count_tokens(self, text, encoding_name="cl100k_base"):  # cl100k_base is used by GPT-4 models
        """Count the number of tokens in the given text using tiktoken library."""
        try:
            encoding = tiktoken.get_encoding(encoding_name)
            return len(encoding.encode(text))
        except Exception as e:
            print(f"Error counting tokens: {str(e)}")
            # Fallback method: roughly estimate tokens as 4 characters per token
            return len(text) // 4

    def generate_response_from_extracted_text(self, user_query: str, extracted_text: str, token_threshold: int = 3500):
        """
        Generate LLM response from raw extracted document content and user query.
        If the input size exceeds the LLM threshold, return a message to update the knowledge base, or call update_knowledgebank endpoint.

        Args:
            user_query (str): The question or query from the user.
            extracted_text (str): Full extracted content from documents.
            token_threshold (int): Max allowed token count before triggering overflow condition.

        Returns:
            str: Final LLM response or a message indicating token overflow.
        """
        print(f"Received query: {user_query}")
        if self.llm is None:
            return "Error: No LLM instance available."

        try:
            # Step 1: Count LLM input token threshold
            estimated_tokens = self.count_tokens(extracted_text)
            print(f"Estimated token count of context: {estimated_tokens}")

            if estimated_tokens > token_threshold:
                print("Token threshold exceeded.")
                return ("LLM input threshold exceeded. Please use the 'update_knowledgebank' endpoint to reduce document size or chunk it appropriately.")

            # Step 2: Preprocess query to extract key terms
            query_info = self._preprocess_query(user_query)
            key_terms = query_info['key_terms']
            print(f"Identified key terms: {key_terms}")

            # Step 3: Analyze the text for PII and mask it
            try:
                analyzer = AnalyzerEngine()
                anonymizer = AnonymizerEngine()

                analysis_results = analyzer.analyze(text=extracted_text, entities=None, language='en')
                anonymized_result = anonymizer.anonymize(text=extracted_text, analyzer_results=analysis_results)
                masked_text = anonymized_result.text

                print("PII masking applied to extracted text")
            except Exception as pii_error:
                print(f"Warning: PII masking failed: {pii_error}")
                masked_text = extracted_text  # Fallback to original if anonymization fails

            # Step 4: Define prompt
            system_prompt = """
            You are an AI assistant that delivers precise, concise answers from provided documents. Follow these guidelines:
                1. Analyze ALL content thoroughly for ANY information related to the query, even partial matches.
                2. Present information directly without phrases like "the document chunks provide" or "Document 1/2/3 states."
                3. If referencing a specific document, use its actual name (e.g., "According to the iEngage Quick Reference Guide").
                4. Provide ONLY relevant facts without adding commentary, introductions, or conclusions.
                5. Format information as direct statements rather than numbered lists unless specifically requested.
                6. Include ALL potentially relevant details no matter how minimal.
                7. Present information in a concise, summarized format.
                8. Only state information is insufficient if absolutely nothing related can be found.
                9. Never generate content containing hate speech, offensive language, violence, threats, or misinformation.

                User Query:
                {query}

                Document Context:
                {context}

                Answer:
            """

            # Step 5: Create LangChain-compatible messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=f"""
                User Query:
                {user_query}

                Key Terms: {', '.join(key_terms)}

                Document Context:
                {masked_text}
                """)
            ]

            # Step 6: Call the LLM
            print("Calling LLM with query and context...")
            try:
                response = self.llm.invoke(messages)
                print("Successfully received LLM response")

                # Step 7: Parse and return response
                if hasattr(response, "content"):
                    return response.content
                return str(response)

            except ValueError as ve:
                print(f"ValueError encountered: {ve}")
                return f"Error generating response: {ve}"

            except Exception as llm_error:
                print(f"LLM invocation error: {llm_error}")
                return f"Error generating response: {str(llm_error)}"

        except Exception as e:
            print(f"Exception in generate_response_from_extracted_text: {str(e)}")
            import traceback
            traceback.print_exc()
            return f"Error generating response: {str(e)}"

    

In [0]:
# Libraries
import dataiku
import pandas as pd
import numpy as np
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.vectorstores import FAISS, Chroma
from langchain.docstore.document import Document
from langchain.schema import Document as LangChainDocument
from langchain.embeddings.base import Embeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
import pickle
import os
from typing import List
import re
from sklearn.metrics.pairwise import cosine_similarity
import json
from langchain.schema import SystemMessage, HumanMessage

"""
host = "https://cdl-dku-dev-desi.commercial-datalake-prod.awscloud.abbvienet.com/"
apiKey = "BDxYeBpzVM2ZMlFD9wEcUTkpsxfsPvxk"
os.environ["DKU_CURRENT_PROJECT_KEY"] = "CDLADMIN" 
dataiku.set_remote_dss(host, apiKey, no_check_certificate=True)
"""

            
class VectorStoreGeneration:
    def __init__(self, input_dataset_name, output_dataset_name, user_query, embedding_model, llm, vector_store_type, top_k, use_compression, azure_openai_key):
        self.input_dataset_name = input_dataset_name
        self.output_dataset_name = output_dataset_name
        self.user_query = user_query
        
        # Define the embedding model and LLM first
        self.embedding_model_name = embedding_model
        self.llm_model_name = llm
        self.vector_store_type = vector_store_type
        
        # Create the model definition object with the model names
        self.ModelDef = ModelDefination(
            embedding_model=self.embedding_model_name, 
            llm_id=self.llm_model_name, 
            vector_store_type=vector_store_type
        )
        
        self.top_k = top_k
        # We don't use compression as per the requirement
        self.use_compression = False

    
    def process(self):
        try:
            dataset = dataiku.Dataset(self.input_dataset_name)
            df = dataset.get_dataframe()

            success = self.ModelDef.document_preparation(df)
            if not success:
                print("Failed to build vector index.")
                return
            
            # Generate LLM response and get context
            llm_response, context_dict = self.ModelDef.generate_llm_response(
                self.user_query, 
                df
            )

            # Extract the formatted context from the context dictionary
            formatted_context = ""
            if isinstance(context_dict, dict) and "formatted_context" in context_dict:
                formatted_context = context_dict["formatted_context"]
            elif isinstance(context_dict, dict) and "relevant_chunks" in context_dict:
                # Create formatted context from relevant chunks if needed
                chunks = context_dict["relevant_chunks"]
                for i, chunk in enumerate(chunks, 1):
                    formatted_context += f"CHUNK {i}:\n{chunk['chunk_text']}\n"
                    if 'metadata' in chunk and isinstance(chunk['metadata'], dict):
                        formatted_context += f"Source: {chunk['metadata'].get('source', 'Unknown')}\n\n"
            

            # Construct the final result with additional metadata
            result = {
                "question": self.user_query,
                "answer": llm_response,
                "contexts": formatted_context,
            }
            # Use json.dumps with a default function that converts non-serializable objects to strings.
            json_result = json.dumps(result, default=str)
            #return json_result
            return result

        except Exception as e:
            print(f"Failed to generate LLM response: {str(e)}")
            import traceback
            traceback.print_exc()
            return {
                "question": self.user_query,
                "answer": f"Error: {str(e)}",
                "contexts": ""
            }

# Example usage

In [0]:

# Import necessary libraries
# from vector_store_creation import VectorStoreGeneration
import dataiku
import os
from transformers import pipeline
# Import ChromaDB monkeypatch to ensure compatibility
from dataiku.core.vector_stores.chroma_vector_store import ChromaVectorStore
ChromaVectorStore.run_the_ugly_chromadb_monkeypatch()

# host = "https://cdl-dku-desi-p.commercial-datalake-prod.awscloud.abbvienet.com/"
# apiKey = "dkuaps-9ALuuZLhFJg9dTcrSgPMcsdtfP8bpPXC"
# os.environ["DKU_CURRENT_PROJECT_KEY"] = "GENAIPOC" 
# dataiku.set_remote_dss(host, apiKey, no_check_certificate=True)

# Dataset and vector store configuration parameters
input_dataset_name = "input_data_embedded"
output_dataset_name = "tripadvisor_hotel_reviews_with_names"
vector_store_type = 'CHROMADB'

# Other configuration parameters

EMBEDDING_MODEL = "custom:iliad-plugin-conn-prod:text-embedding-ada-002"
LLM_MODEL = "custom:iliad-plugin-conn-prod:gpt-4o"

vector_store = VectorStoreGeneration(
            input_dataset_name=input_dataset_name,
            output_dataset_name=output_dataset_name,
            user_query="who to call for rinvoq reactions?",
            embedding_model=EMBEDDING_MODEL,
            llm=LLM_MODEL,
            vector_store_type=vector_store_type,
            top_k=5,
            use_compression=False,
            azure_openai_key="None"
        )
        
# Process and get response
response = vector_store.process()
    
print(response)


# using knowledge bank

In [0]:
import dataiku
# Import ChromaDB monkeypatch to ensure compatibility
from dataiku.core.vector_stores.chroma_vector_store import ChromaVectorStore
ChromaVectorStore.run_the_ugly_chromadb_monkeypatch()
client = dataiku.api_client()
project = client.get_default_project()
# llm_list = project.list_llms()
kb_list = project.list_knowledge_banks(as_type='listitems')

In [0]:
for kb in kb_list:
    print(f"- {kb.name} - (id: {kb.id})")

In [0]:
kb_object = project.get_knowledge_bank("dV3dIQCo") 
kb_core = kb_object.as_core_knowledge_bank()

In [0]:
vectorstore_object = kb_core.as_langchain_retriever(search_type="similarity", search_kwargs={"k": 5})

In [0]:
from Digitization.Core.ModelManager import ModelManager
EMBEDDING_MODEL = "custom:iliad-plugin-conn-prod:text-embedding-ada-002"
LLM_MODEL = "custom:iliad-plugin-conn-prod:gpt-4o"
model_manager = ModelManager(EMBEDDING_MODEL,LLM_MODEL)
llm = model_manager.get_llm()
embedding_model = model_manager.get_embedding_model()

In [3]:
import dataiku
from dataiku.core.dataset import Dataset
from langchain.chains import RetrievalQA
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from Digitization.Core.ModelManager import ModelManager

class DataikuQASystem:
    """
    A knowledge base question answering system for Dataiku.
    This class handles the entire process of setting up and querying the knowledge base.
    """
    
    def __init__(self, kb_id="1to7O1GS", embedding_model_name="custom:iliad-plugin-conn-prod:text-embedding-ada-002", 
                 llm_model_name="custom:iliad-plugin-conn-prod:gpt-4o", k=5):
        """
        Initialize the QA system with knowledge bank and models.
        
        Args:
            kb_id (str): The ID of the knowledge bank to use. If None, must be set later with set_kb.
            embedding_model_name (str): The name of the embedding model to use.
            llm_model_name (str): The name of the LLM model to use.
            k (int): Number of documents to retrieve from the vector store.
        """
        # Ensure ChromaDB compatibility
        from dataiku.core.vector_stores.chroma_vector_store import ChromaVectorStore
        ChromaVectorStore.run_the_ugly_chromadb_monkeypatch()
        
        # Initialize API client and project
        self.client = dataiku.api_client()
        self.project = self.client.get_default_project()
        
        # Set up models
        self.model_manager = ModelManager(embedding_model_name, llm_model_name)
        self.llm = self.model_manager.get_llm()
        self.embedding_model = self.model_manager.get_embedding_model()
        
        # Retrieval parameters
        self.k = k
        self.vectorstore_object = None
        
        # Set up knowledge bank if ID is provided
        if kb_id:
            self.set_kb(kb_id)
        # Using summary buffer memory
        self.memory = ConversationSummaryBufferMemory(
            llm=self.llm,              # needs the LLM to produce summaries
            max_token_limit=512,       # how big the summary can grow
            memory_key="chat_history", # name under which the summary is exposed
            return_messages=False,      # summary is plain text, not Message objects
            input_key="question",       # matches the key passed into the chain
            output_key="answer"         # pick the LLM’s answer as the memory output
        )
        
        # QA chain
        self.qa_chain = None
    
    def set_kb(self, kb_id):
        """
        Set the knowledge bank to use.
        
        Args:
            kb_id (str): The ID of the knowledge bank.
        """
        kb_object = self.project.get_knowledge_bank(kb_id)
        kb_core = kb_object.as_core_knowledge_bank()
        self.vectorstore_object = kb_core.as_langchain_retriever(
            search_type="similarity", 
            search_kwargs={"k": self.k}
        )
        # Reset the QA chain since we have a new knowledge bank
        self.qa_chain = None
        
        return self
    
    def _setup_qa_system(self):
        """
        Set up the QA system with the current LLM and vector store.
        This is called internally when needed.
        
        Returns:
            RetrievalQA: A configured QA chain
        """
        if not self.vectorstore_object:
            raise ValueError("No knowledge bank set. Call set_kb() first.")
            
        # Create a custom prompt template that includes instructions
        template = """
        You are an AI assistant that delivers precise, concise answers from provided documents. 
        Users may rephrase their question in many ways, but your job is to understand the intent and always answer from our knowledge base. 
        Below is a brief summary of our conversation so far:
        {chat_history}
        Follow these guidelines:

                1. Analyze ALL content thoroughly for ANY information related to the query, even partial matches.
                2. Present information directly without meta commentary
                3. If referencing a specific document, use its actual name (e.g., "According to the iEngage Quick Reference Guide").
                4. Provide ONLY relevant facts without adding commentary, introductions, or conclusions.
                5. Format information as direct statements rather than numbered lists unless specifically requested.
                6. Include ALL potentially relevant details no matter how minimal.
                7. Present information in a concise, summarized format.
                8. Only state information is insufficient if absolutely nothing related can be found.
                9. Never generate content containing hate speech, offensive language, violence, threats, or misinformation.
        
        Context:
        {context}
        
        Question: {question}
        
        Please provide a detailed answer based only on the context provided.
        """
        
        prompt = PromptTemplate(
            template=template,
            input_variables=["chat_history", "context", "question"]
        )
        
        # Create the QA chain
#         self.qa_chain = RetrievalQA.from_chain_type(
#             llm=self.llm,
#             chain_type="stuff",  # Simple approach that "stuffs" all retrieved docs into the prompt
#             retriever=self.vectorstore_object,
#             return_source_documents=True,  # Include source documents in the response
#             chain_type_kwargs={"prompt": prompt}
#         )
        self.qa_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.vectorstore_object,
            memory=self.memory,
            return_source_documents=True,
            combine_docs_chain_kwargs={"prompt": prompt}
#             chain_type_kwargs={"prompt": prompt}
        )
        
        return self.qa_chain
    
    def query(self, user_query):
        """
        Process a user query against the knowledge base and return answer + context.
        """
        if not self.qa_chain:
            self._setup_qa_system()

        response = self.qa_chain({"question": user_query})

        answer = response["answer"]
        source_documents = response.get("source_documents", [])

        # Return raw answer and context separately
        return answer, self._format_sources(source_documents)
    
    def _format_sources(self, sources):
        """
        Format the context sources separately for display.
        """
        formatted_sources = ""

        if sources:
            for i, doc in enumerate(sources):
                try:
                    source_text = doc.page_content[:150] + "..." if len(doc.page_content) > 150 else doc.page_content
                    metadata = doc.metadata if hasattr(doc, 'metadata') else {}
                    source_info = f"Document {i+1}"

                    if metadata:
                        if 'source' in metadata:
                            source_info += f" (Source: {metadata['source']})"
                        elif 'title' in metadata:
                            source_info += f" (Title: {metadata['title']})"

                    formatted_sources += f"\n{source_info}:\n{source_text}\n"
                except AttributeError:
                    formatted_sources += f"\nDocument {i+1}: [Format not recognized]\n"

        return formatted_sources.strip()


# Example usage:
"""
# Initialize the QA system with a knowledge bank ID
qa_system = DataikuQASystem(kb_id="dV3dIQCo")

# Or initialize and set the knowledge bank separately
qa_system = DataikuQASystem()
qa_system.set_kb("dV3dIQCo")

# Query the system
response = qa_system.query("What is the policy on medical leave?")
print(response)
"""

'\n# Initialize the QA system with a knowledge bank ID\nqa_system = DataikuQASystem(kb_id="dV3dIQCo")\n\n# Or initialize and set the knowledge bank separately\nqa_system = DataikuQASystem()\nqa_system.set_kb("dV3dIQCo")\n\n# Query the system\nresponse = qa_system.query("What is the policy on medical leave?")\nprint(response)\n'

In [5]:
# Initialize the QA system with a knowledge bank ID
qa_system = DataikuQASystem(kb_id="1to7O1GS")

# Or initialize and set the knowledge bank separately
qa_system = DataikuQASystem()
# qa_system.set_kb("1to7O1GS")

# Query the system
response = qa_system.query("What are some common reactions?")
print(response)

Initialized LangChain embedding model: custom:iliad-plugin-conn-prod:text-embedding-ada-002
Initialized LangChain LLM: custom:iliad-plugin-conn-prod:gpt-4o
Initialized LangChain embedding model: custom:iliad-plugin-conn-prod:text-embedding-ada-002
Initialized LangChain LLM: custom:iliad-plugin-conn-prod:gpt-4o
('For reactions related to RINVOQ, you can call 1-800-2RINVOQ (1-800-274-6867) for assistance. This number provides 24/7 access to registered nurses, with immediate assistance available Monday through Friday from 8 am to 8 pm ET.', "Document 1:\n.  It starts with your RINVOQ Complete Nurse Ambassador who is always there to get to know you.', 'CTA 1': 'Call 1-800-2RINVOQ (1-800-274-6867)', 'CTA...\n\nDocument 2:\n.  It starts with your RINVOQ Complete Nurse Ambassador who is always there to get to know you.', 'CTA 1': 'Call 1-800-2RINVOQ (1-800-274-6867)', 'CTA...\n\nDocument 3:\n. Message and data rates may apply.\nDEALING WITH A TREATMENT DISRUPTION? Visit SeeRTerms.com and see 