In [0]:
# NOTE(review): %pylab populates the interactive namespace with numpy and
# matplotlib names in addition to enabling inline figures. Nothing visible in
# this notebook relies on those names; consider `%matplotlib inline` plus
# explicit imports if later cells do not need them either — TODO confirm.
%pylab inline

In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Handle on the Dataiku dataset named "input_data_extracted_custom".
dataset = dataiku.Dataset("input_data_extracted_custom")
# NOTE(review): interactive exploration only; this prints the full help text
# of the dataset object and can be dropped once the API is known.
help(dataset)

In [0]:
# Inspect the models declared on a Dataiku connection.
# NOTE(review): `dataiku` is already imported in an earlier cell; this import
# only matters if the cell is run on its own.
import dataiku
client = dataiku.api_client()
# NOTE(review): `project` is not used in this cell; later cells may rely on it.
project = client.get_default_project()
# Name of the connection whose parameters are inspected below.
connection_name = "iliad-plugin-conn-prod" 
connection = client.get_connection(connection_name)
connection_info = connection.get_info()
# The code expects a "params" entry that itself contains a "models" list.
connection_params = connection_info["params"]
models = connection_params['models']
# Print each model's "capability" value followed by the full model entry.
for model in models:
    print(f"{model['capability']} {model} \n")

# test

In [1]:
"""
Modular LangChain-based RAG System
"""
import uuid
import os
import re
import json
from typing import List, Dict, Any, Optional, Tuple,Union

import pandas as pd
from tqdm.auto import tqdm
import dataiku
from PIL import Image
import io
import pickle
import hashlib

#from langchain.retrievers import MultiVectorRetriever
#from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.documents import Document
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever, SearchType
from langchain_core.stores import InMemoryStore
from langchain.storage import LocalFileStore 


# Apply Dataiku's ChromaDB monkeypatch once, at import time, before any of the
# classes below instantiates a Chroma store.
# NOTE(review): presumably this works around a ChromaDB/sqlite compatibility
# issue inside DSS code environments — confirm against the Dataiku docs.
from dataiku.core.vector_stores.chroma_vector_store import ChromaVectorStore
ChromaVectorStore.run_the_ugly_chromadb_monkeypatch()


class ModelManager:
    """
    Resolve and hold the LangChain embedding model and LLM of a Dataiku project.

    Both models are looked up by id with ``project.get_llm`` and wrapped with
    their LangChain adapters as soon as the manager is constructed.
    """
    def __init__(
        self, 
        embedding_model_id: str,
        llm_id: str,
        client=None,
        embedding_chunk_size: int = 1000
    ):
        """
        Args:
            embedding_model_id: Id passed to ``project.get_llm`` for the embedding model.
            llm_id: Id passed to ``project.get_llm`` for the completion model.
            client: Optional Dataiku API client; ``dataiku.api_client()`` is
                used when it is ``None``.
            embedding_chunk_size: Value assigned to the embedding wrapper's
                ``chunk_size`` attribute when the wrapper exposes one.

        Raises:
            Exception: Any error raised while resolving either model is
                printed and re-raised.
        """
        # Compare against None so that a client object that happens to be
        # falsy is still honoured.
        self.client = client if client is not None else dataiku.api_client()
        self.project = self.client.get_default_project()
        self.embedding_model_id = embedding_model_id
        self.llm_id = llm_id
        self.embedding_chunk_size = embedding_chunk_size
        self.embedding_model = None
        self.llm = None
        
        # Initialize models
        self._initialize_embedding_model()
        self._initialize_llm()
    
    def _initialize_embedding_model(self):
        """Resolve the embedding model and wrap it as LangChain embeddings."""
        try:
            emb_model = self.project.get_llm(self.embedding_model_id)
            self.embedding_model = emb_model.as_langchain_embeddings()
            # Only tune the batch size when the wrapper actually exposes it.
            if hasattr(self.embedding_model, "chunk_size"):
                self.embedding_model.chunk_size = self.embedding_chunk_size
            print(f"Initialized LangChain embedding model: {self.embedding_model_id}")
        except Exception as e:
            print(f"Error initializing embedding model: {str(e)}")
            import traceback
            traceback.print_exc()
            raise

    def _initialize_llm(self):
        """Resolve the LLM and wrap it as a LangChain LLM."""
        try:
            llm_model = self.project.get_llm(self.llm_id)
            self.llm = llm_model.as_langchain_llm()
            print(f"Initialized LangChain LLM: {self.llm_id}")
        except Exception as e:
            print(f"Failed to initialize LLM: {str(e)}")
            import traceback
            traceback.print_exc()
            raise

    def get_embedding_model(self):
        """
        Return the initialized embedding model.

        Raises:
            ValueError: If no embedding model has been initialized.
        """
        # `is None` rather than truthiness: a valid wrapper may define
        # __len__/__bool__ and evaluate as falsy.
        if self.embedding_model is None:
            raise ValueError("Embedding model not initialized")
        return self.embedding_model
        
    def get_llm(self):
        """
        Return the initialized LLM.

        Raises:
            ValueError: If no LLM has been initialized.
        """
        if self.llm is None:
            raise ValueError("LLM not initialized")
        return self.llm


class ContentSummarizer:
    """
    Produce short summaries of text, tables and images with an LLM.

    Text and table summaries pipe a prompt template into the LLM; image
    handling is currently a placeholder that returns fixed strings.
    """
    def __init__(self, llm):
        """
        Args:
            llm: LangChain-compatible model that can be composed with a
                prompt using ``|`` and then invoked.
        """
        self.llm = llm
        self.text_summary_prompt = PromptTemplate(
            input_variables=["content"],
            template="""
            Summarize the following text in a concise manner that captures the key information:
            
            {content}
            
            Summary:
            """
        )
        
        self.table_summary_prompt = PromptTemplate(
            input_variables=["content"],
            template="""
            Analyze the following table data and provide a concise summary of what it contains:
            
            {content}
            
            Table Summary:
            """
        )

    @staticmethod
    def _fallback_summary(content: str, limit: int = 200) -> str:
        """Return a plain truncation; the ellipsis is added only if text was cut."""
        if len(content) <= limit:
            return content
        return content[:limit] + "..."

    def _summarize(self, prompt, content: str, max_length: int, label: str) -> str:
        """Run `prompt | llm` on the truncated content, falling back to truncation on error."""
        if not content:
            return ""

        try:
            # Truncate content to avoid token limit issues
            chain = prompt | self.llm
            result = chain.invoke({"content": content[:max_length]})
        except Exception as e:
            print(f"Error generating {label} summary: {str(e)}")
            return self._fallback_summary(content)

        # Plain LLMs return a string. NOTE(review): chat-style models return a
        # message object exposing the text as `.content`; unwrap it so callers
        # always receive the text itself.
        return getattr(result, "content", result)
    
    def summarize_text(self, content: str, max_length: int = 4000) -> str:
        """
        Summarize free text.

        Args:
            content: Text to summarize; empty content yields "".
            max_length: Number of leading characters sent to the LLM.

        Returns:
            The LLM summary, or the first 200 characters of `content`
            (with "..." when truncated) if the LLM call fails.
        """
        return self._summarize(self.text_summary_prompt, content, max_length, "text")
    
    def summarize_table(self, content: str, max_length: int = 4000) -> str:
        """
        Summarize table content.

        Args:
            content: Table rendered as text; empty content yields "".
            max_length: Number of leading characters sent to the LLM.

        Returns:
            The LLM summary, or the first 200 characters of `content`
            (with "..." when truncated) if the LLM call fails.
        """
        return self._summarize(self.table_summary_prompt, content, max_length, "table")
    
    def process_image_with_llm(self, image_data: bytes) -> Tuple[str, str]:
        """
        Placeholder for multimodal image processing.

        Args:
            image_data: Raw image bytes (currently unused).
            
        Returns:
            Tuple[str, str]: (description, summary), both fixed placeholder
            strings until a multimodal model is wired in.
        """
        # A real implementation would format the image for a multimodal model
        # and call it; that depends on the concrete LLM in use.
        print("Warning: Generic image processing used. May need adaptation for specific models.")
        description = "Image description not available in this implementation."
        summary = "Image summary not available in this implementation."
        return description, summary


class TextProcessor:
    """
    Text helpers: chunk splitting and lightweight table detection.
    """
    # A Markdown header-separator fragment: a dash on each side of a pipe,
    # optionally with alignment colons and spaces in between. This covers
    # "-|-", "--- | ---" and ":---:|:---:". Horizontal whitespace only, so the
    # match never spans two lines.
    _MD_SEPARATOR = re.compile(r"-[ \t]*:?[ \t]*\|[ \t]*:?[ \t]*-")

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        """
        Args:
            chunk_size: Maximum chunk size handed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
    
    def split_text(self, text: str) -> List["Document"]:
        """Split text into chunk documents using the configured splitter."""
        return self.text_splitter.create_documents([text])
    
    def extract_table_from_text(self, text: str) -> Tuple[bool, str]:
        """
        Detect a table embedded in text.

        Two heuristics are tried in order:
        1. Markdown: the text contains a pipe and a header-separator
           fragment; every line containing a pipe is returned.
        2. CSV-like: more than five commas overall, at least two non-empty
           lines, and every non-empty line has the same non-zero comma
           count; the whole text is returned.

        Args:
            text: Text to inspect; empty or None yields no table.

        Returns:
            Tuple[bool, str]: (has_table, table_content); table_content is
            "" when no table is detected.
        """
        if not text:
            return False, ""

        # Check for markdown tables
        if "|" in text and self._MD_SEPARATOR.search(text):
            table_lines = [line for line in text.strip().split("\n") if "|" in line]
            return True, "\n".join(table_lines)

        # Check for CSV-like content
        if text.count(",") > 5 and "\n" in text:
            comma_counts = [line.count(",") for line in text.split("\n") if line.strip()]
            uniform = len(comma_counts) > 1 and max(comma_counts) == min(comma_counts)
            if uniform and max(comma_counts) > 0:
                return True, text

        return False, ""


class ImageManager:
    """
    Look up and read images stored in a Dataiku managed folder.
    """
    def __init__(self, image_folder_name: Optional[str] = None):
        """
        Args:
            image_folder_name: Name of the Dataiku folder holding images.
                When omitted, every lookup returns None.
        """
        self.image_folder_name = image_folder_name
        self.image_folder = dataiku.Folder(image_folder_name) if image_folder_name else None

    @staticmethod
    def _normalize(value) -> str:
        """Replace every non-word character with an underscore."""
        return re.sub(r'[^\w]', '_', str(value))
    
    def find_image_for_document(self, doc_id: str, doc_metadata: Dict) -> Optional[str]:
        """
        Find the image path associated with a document.

        The normalized document id is tried first, then the normalized base
        name (text before the first dot) of ``doc_metadata["source"]``. A
        candidate matches when it is a substring of a path in the folder.

        Args:
            doc_id: Document identifier.
            doc_metadata: Document metadata; only "source" is read.

        Returns:
            The first matching path, or None when there is no folder, no
            match, or the lookup raises.
        """
        if self.image_folder is None:
            return None
            
        try:
            # Get list of files in the folder
            image_files = self.image_folder.list_paths_in_partition()

            candidates = []
            if doc_id is not None:
                candidates.append(self._normalize(doc_id))

            source = (doc_metadata or {}).get("source", "")
            if source:
                stem = os.path.basename(str(source)).split('.')[0]
                candidates.append(self._normalize(stem))

            for needle in candidates:
                # An empty needle is a substring of every path and would
                # wrongly return the first image for unrelated documents.
                if not needle:
                    continue
                for img_file in image_files:
                    if needle in img_file:
                        return img_file
            
            return None
        except Exception as e:
            print(f"Error finding image for document {doc_id}: {str(e)}")
            return None
    
    def get_image_data(self, image_filename: str) -> Optional[bytes]:
        """
        Read an image from the Dataiku folder.

        Args:
            image_filename: Path of the image inside the folder.

        Returns:
            The raw bytes, or None when there is no folder or reading fails.
        """
        if self.image_folder is None:
            return None
        
        try:
            with self.image_folder.get_download_stream(image_filename) as stream:
                return stream.read()
        except Exception as e:
            print(f"Error reading image {image_filename}: {str(e)}")
            return None


class VectorStoreManager:
    """
    Create, persist, load and inspect a LangChain vector store (Chroma or FAISS).

    The store lives either in a Dataiku managed folder or in a local
    directory. Documents are split into small chunks before embedding to stay
    below model token limits.
    """
    def __init__(
        self,
        embedding_model,
        store_type: str = "CHROMADB",
        persist_directory: Optional[str] = None,
        dataiku_folder_name: Optional[str] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ):
        """
        Args:
            embedding_model: LangChain embeddings object used by the store.
            store_type: "CHROMADB" (case-insensitive) selects Chroma; any
                other value selects FAISS.
            persist_directory: Local directory used when no Dataiku folder is
                given or when the folder cannot be used; defaults to
                "./vector_store".
            dataiku_folder_name: Optional Dataiku managed folder to persist into.
            chunk_size: Chunk size for the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.embedding_model = embedding_model
        self.store_type = store_type.upper()
        self.dataiku_folder_name = dataiku_folder_name
        self.vector_store = None
        # Always defined, so other methods never need hasattr() checks.
        self.dataiku_folder = None
        self.using_dataiku_folder = False
        local_directory = persist_directory or "./vector_store"

        # Use Dataiku folder if provided, otherwise use local directory
        if dataiku_folder_name:
            print(f"Using Dataiku folder: {dataiku_folder_name}")
            try:
                folder = dataiku.Folder(dataiku_folder_name)
                # NOTE(review): get_path() presumably only works for folders
                # on the local filesystem — confirm for remote connections.
                self.persist_directory = folder.get_path()
            except Exception as e:
                print(f"Error accessing Dataiku folder: {str(e)}")
                # Fall back completely to local storage: the folder flags stay
                # off so that existence checks, tracking files and directory
                # creation all target the same local directory.
                self.persist_directory = local_directory
                print(f"Falling back to local directory: {self.persist_directory}")
            else:
                self.dataiku_folder = folder
                self.using_dataiku_folder = True
                print(f"Dataiku folder path: {self.persist_directory}")

                # List folder contents for debugging
                try:
                    files = folder.list_paths_in_partition()
                    print(f"Dataiku folder contents: {files}")
                except Exception as e:
                    print(f"Error listing Dataiku folder contents: {str(e)}")
        else:
            self.persist_directory = local_directory
            print(f"Using local directory: {self.persist_directory}")
            
        # Small chunks keep each embedding request below token limits
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        
        # Create directory for vector store if it doesn't exist
        if not self.using_dataiku_folder:
            os.makedirs(self.persist_directory, exist_ok=True)
            print(f"Created/verified directory exists: {self.persist_directory}")
    
    def vector_store_exists(self) -> bool:
        """
        Check whether a persisted vector store is present.

        Chroma: any stored file whose name contains "chroma".
        FAISS: a stored "index.faiss" file.

        Returns:
            True when a matching file is found; False otherwise or on error.
        """
        is_chroma = self.store_type == "CHROMADB"
        label = "ChromaDB" if is_chroma else "FAISS"
        marker = "chroma" if is_chroma else "index.faiss"

        try:
            if self.using_dataiku_folder:
                # For Dataiku folders, check if files exist in the listing
                try:
                    files = self.dataiku_folder.list_paths_in_partition()
                except Exception as e:
                    print(f"Error checking for {label} files in Dataiku folder: {str(e)}")
                    return False

                matches = [f for f in files if marker in f]
                print(f"Looking for {label} files in Dataiku folder")
                print(f"Found {len(matches)} {label} files: {matches}")
                return len(matches) > 0

            if is_chroma:
                # Same rule as the Dataiku branch: any entry whose name
                # contains "chroma". NOTE(review): the exact on-disk layout
                # (e.g. chroma.sqlite3) depends on the chromadb version.
                if os.path.isdir(self.persist_directory):
                    entries = os.listdir(self.persist_directory)
                else:
                    entries = []
                exists = any(marker in name for name in entries)
                print(f"Checking ChromaDB files in: {self.persist_directory}, exists: {exists}")
                return exists

            faiss_path = os.path.join(self.persist_directory, "index.faiss")
            exists = os.path.exists(faiss_path)
            print(f"Checking FAISS path: {faiss_path}, exists: {exists}")
            return exists
        except Exception as e:
            print(f"Error checking if vector store exists: {str(e)}")
            import traceback
            traceback.print_exc()
            return False
    
    def load_vector_store(self):
        """
        Load the persisted vector store from `persist_directory`.

        Returns:
            The loaded store, also kept on `self.vector_store`.

        Raises:
            Exception: Any loading error is printed and re-raised.
        """
        print(f"Attempting to load vector store from {self.persist_directory}")
        print(f"Store type: {self.store_type}, Using Dataiku folder: {self.using_dataiku_folder}")
        
        try:
            if self.store_type == "CHROMADB":
                print(f"Loading ChromaDB from: {self.persist_directory}")
                self.vector_store = Chroma(
                    persist_directory=self.persist_directory,
                    embedding_function=self.embedding_model
                )
                
                # Report the collection size when the private handle is available
                if hasattr(self.vector_store, '_collection'):
                    count = self.vector_store._collection.count()
                    print(f"ChromaDB loaded with {count} vectors")
            else:  # FAISS
                print(f"Loading FAISS from: {self.persist_directory}")
                # SECURITY: FAISS stores are pickled; this flag deserializes
                # them without checks. Only load directories you wrote yourself.
                self.vector_store = FAISS.load_local(
                    self.persist_directory, 
                    self.embedding_model, 
                    allow_dangerous_deserialization=True
                )
                print(f"FAISS loaded successfully")
            
            return self.vector_store
        except Exception as e:
            print(f"Error loading vector store: {str(e)}")
            import traceback
            traceback.print_exc()
            raise
    
    def track_processed_documents(self, documents: List["Document"], tracking_file: str = "processed_docs.pkl"):
        """
        Record MD5 hashes of the given documents' content.

        The tracking file is overwritten with the hashes of `documents`.

        Args:
            documents: Documents whose `page_content` is hashed.
            tracking_file: File name inside the Dataiku folder or the local
                persist directory.

        Returns:
            List[str]: The hex digests, in document order.
        """
        doc_hashes = [
            hashlib.md5(doc.page_content.encode()).hexdigest()
            for doc in documents
        ]

        folder = getattr(self, "dataiku_folder", None)
        if folder is not None:
            # For Dataiku folder
            with folder.get_writer(tracking_file) as writer:
                pickle.dump(doc_hashes, writer)
        else:
            # Local file system
            tracking_path = os.path.join(self.persist_directory, tracking_file)
            with open(tracking_path, "wb") as f:
                pickle.dump(doc_hashes, f)

        print(f"Tracked {len(doc_hashes)} processed documents")
        return doc_hashes

    def get_processed_documents(self, tracking_file: str = "processed_docs.pkl") -> List[str]:
        """
        Load the hashes written by `track_processed_documents`.

        Returns:
            List[str]: Stored hashes, or [] when the file is missing or
            cannot be read.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; the
        # tracking file must only ever be one this class wrote.
        try:
            folder = getattr(self, "dataiku_folder", None)
            if folder is not None:
                # For Dataiku folder
                with folder.get_download_stream(tracking_file) as stream:
                    doc_hashes = pickle.load(stream)
            else:
                # Local file system
                tracking_path = os.path.join(self.persist_directory, tracking_file)
                if not os.path.exists(tracking_path):
                    return []
                    
                with open(tracking_path, "rb") as f:
                    doc_hashes = pickle.load(f)
                    
            print(f"Found {len(doc_hashes)} previously processed documents")
            return doc_hashes
        except Exception as e:
            print(f"Error loading processed documents: {str(e)}")
            return []

    def _chunk_documents(self, documents: List["Document"]) -> List["Document"]:
        """Split documents into chunk documents carrying chunk-position metadata."""
        all_chunks = []
        for doc in documents:
            pieces = self.text_splitter.split_text(doc.page_content)
            for position, piece in enumerate(pieces):
                all_chunks.append(Document(
                    page_content=piece,
                    metadata={**doc.metadata, "chunk": position, "total_chunks": len(pieces)}
                ))
        return all_chunks

    def _build_chroma(self, chunks: List["Document"], batch_size: int) -> None:
        """Create the Chroma store and add chunks batch by batch."""
        print(f"Creating ChromaDB vector store at {self.persist_directory}")
        try:
            # First create an empty collection
            self.vector_store = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embedding_model
            )

            for batch_number, start in enumerate(range(0, len(chunks), batch_size)):
                if batch_number % 10 == 0:  # Print progress every 10 batches
                    print(f"Processing chunk {start+1}/{len(chunks)}")

                self.vector_store.add_documents(chunks[start:start + batch_size])

                # Persist after each batch to be safe. NOTE(review): newer
                # Chroma wrappers persist automatically and may not expose
                # persist() at all.
                if hasattr(self.vector_store, "persist"):
                    self.vector_store.persist()
        except Exception as e:
            print(f"Error creating ChromaDB vector store: {str(e)}")
            import traceback
            traceback.print_exc()
            raise

    def _build_faiss(self, chunks: List["Document"], batch_size: int) -> None:
        """Create the FAISS store from the first batch, add the rest, then save."""
        print(f"Creating FAISS vector store")
        if not chunks:
            # The index dimension comes from the first embedded batch, so a
            # FAISS store cannot be created with nothing to embed.
            raise ValueError("Cannot create a FAISS vector store without any document chunks")

        try:
            # FAISS() itself requires an index, a docstore and an id mapping;
            # from_documents builds all three from the first batch.
            print(f"Processing chunk 1/{len(chunks)}")
            self.vector_store = FAISS.from_documents(chunks[:batch_size], self.embedding_model)

            for batch_number, start in enumerate(range(batch_size, len(chunks), batch_size), start=1):
                if batch_number % 10 == 0:  # Print progress every 10 batches
                    print(f"Processing chunk {start+1}/{len(chunks)}")

                self.vector_store.add_documents(chunks[start:start + batch_size])

            # Save FAISS index
            self.vector_store.save_local(self.persist_directory)
        except Exception as e:
            print(f"Error creating FAISS vector store: {str(e)}")
            import traceback
            traceback.print_exc()
            raise
    
    def create_vector_store(self, documents: List["Document"], batch_size: int = 1) -> Any:
        """
        Create a vector store from documents, chunking them first.

        Args:
            documents: Source documents. With no documents an empty Chroma
                store is created; FAISS raises ValueError.
            batch_size: Number of chunks embedded per request. Defaults to 1
                to stay well below token limits.

        Returns:
            The created store, also kept on `self.vector_store`.

        Raises:
            ValueError: If `batch_size` is below 1, or FAISS is requested
                with nothing to embed.
            Exception: Errors from the underlying store are printed and re-raised.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        print(f"Creating vector store with {len(documents)} documents, using smaller chunks")

        all_chunks = self._chunk_documents(documents)
        print(f"Split {len(documents)} documents into {len(all_chunks)} smaller chunks")

        if self.store_type == "CHROMADB":
            self._build_chroma(all_chunks, batch_size)
        else:  # Default to FAISS
            self._build_faiss(all_chunks, batch_size)

        print(f"Successfully created vector store with {len(all_chunks)} chunks")
        return self.vector_store
    
    


class DocumentProcessor:
    """
    Turn DataFrame rows into parent documents and searchable summary documents
    for multi-vector retrieval.
    """
    def __init__(
        self,
        summarizer: ContentSummarizer,
        text_processor: TextProcessor,
        image_manager: Optional[ImageManager] = None
    ):
        """
        Args:
            summarizer: Produces text, table and image summaries.
            text_processor: Detects tables embedded in text.
            image_manager: Optional image lookup; images are skipped when None.
        """
        self.summarizer = summarizer
        self.text_processor = text_processor
        self.image_manager = image_manager

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None and pandas missing scalars (NaN, NaT, pd.NA)."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Array-like values have no single missing flag.
            return False

    @staticmethod
    def _normalize_metadata(raw, idx) -> Dict:
        """
        Build a fresh metadata dict for one row.

        JSON strings are parsed; a string that is not a JSON object becomes
        {"source": <string>}; missing or unsupported values become {}. A
        "source" entry is always present afterwards.
        """
        if isinstance(raw, str):
            try:
                parsed = json.loads(raw)
            except (ValueError, TypeError):
                parsed = None
            metadata = dict(parsed) if isinstance(parsed, dict) else {"source": raw}
        elif hasattr(raw, "keys"):
            # Copy so the DataFrame's own dict is never mutated below.
            metadata = dict(raw)
        else:
            metadata = {}

        # Add source info if not present
        if "source" not in metadata:
            metadata["source"] = f"document_{idx}"
        return metadata

    def _append_table_documents(self, doc_id, table_content, metadata, all_docs, child_docs) -> None:
        """Add the full table to the docstore list and its summary to the child list."""
        all_docs.append(Document(
            page_content=table_content,
            metadata={**metadata, "id": f"{doc_id}_table", "type": "table"}
        ))

        table_summary = self.summarizer.summarize_table(table_content)
        child_docs.append(Document(
            page_content=table_summary,
            metadata={**metadata, "id": f"{doc_id}_table_summary", "parent_id": doc_id, "type": "table_summary"}
        ))

    def _append_image_documents(self, doc_id, metadata, all_docs, child_docs) -> None:
        """Add image description and summary documents when an image is found."""
        image_filename = self.image_manager.find_image_for_document(doc_id, metadata)
        if not image_filename:
            return

        try:
            image_data = self.image_manager.get_image_data(image_filename)
            if not image_data:
                return

            image_description, image_summary = self.summarizer.process_image_with_llm(image_data)

            # Store full image description in docstore
            all_docs.append(Document(
                page_content=image_description,
                metadata={
                    **metadata,
                    "id": f"{doc_id}_image",
                    "image_filename": image_filename,
                    "type": "image"
                }
            ))

            # Store image summary for vector search
            child_docs.append(Document(
                page_content=image_summary,
                metadata={
                    **metadata,
                    "id": f"{doc_id}_image_summary",
                    "parent_id": doc_id,
                    "image_filename": image_filename,
                    "type": "image_summary"
                }
            ))
            print(f"Processed image {image_filename} for document {doc_id}")
        except Exception as e:
            print(f"Error processing image {image_filename} for document {doc_id}: {str(e)}")
        
    def prepare_multi_vector_documents(
        self, 
        df: pd.DataFrame, 
        text_column: str, 
        table_column: Optional[str] = None
    ) -> Tuple[List[Document], List[Document], Dict]:
        """
        Prepare documents for multi-vector storage.

        For every row a parent document (the full text) and a summary child
        are created. A table (from `table_column`, or detected in the text)
        and an associated image each add one more docstore document and one
        more summary child.

        Generated keys ("id", "parent_id", "type", "image_filename") always
        take precedence over same-named keys in the row's metadata, so that
        identifiers stay unique.

        Args:
            df: Source rows; an optional "metadata" column may hold a dict or
                a JSON string.
            text_column: Column holding the document text.
            table_column: Optional column holding table content.

        Returns:
            Tuple[List[Document], List[Document], Dict]:
                (parent_docs, child_docs, id_to_children_map)
        """
        all_docs = []       # Original documents for the docstore
        child_docs = []     # Summary documents for the vector store
        id_to_children = {}

        print("Preparing documents for multi-vector retrieval...")

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing documents"):
            doc_id = str(uuid.uuid4())
            text_content = str(row[text_column])
            metadata = self._normalize_metadata(row.get("metadata", {}), idx)
            # Children appended from here on belong to this row.
            first_child = len(child_docs)

            # Base document (for docstore)
            all_docs.append(Document(
                page_content=text_content,
                metadata={**metadata, "id": doc_id, "type": "original"}
            ))

            # Create summary for vector store
            summary = self.summarizer.summarize_text(text_content)
            child_docs.append(Document(
                page_content=summary,
                metadata={**metadata, "id": f"{doc_id}_summary", "parent_id": doc_id, "type": "summary"}
            ))

            # Prefer the dedicated table column. A missing cell (NaN/None) is
            # truthy for NaN and would otherwise become the table "nan".
            raw_table = row[table_column] if (table_column and table_column in row) else None
            if not self._is_missing(raw_table) and bool(raw_table):
                table_content = str(raw_table)
            else:
                # Check for tables in the text content
                has_table, detected = self.text_processor.extract_table_from_text(text_content)
                table_content = detected if has_table else None

            if table_content is not None:
                self._append_table_documents(doc_id, table_content, metadata, all_docs, child_docs)

            # Process associated images if image manager is configured
            if self.image_manager:
                self._append_image_documents(doc_id, metadata, all_docs, child_docs)

            # Store children relationships
            id_to_children[doc_id] = [
                child.metadata["id"] for child in child_docs[first_child:]
            ]

        return all_docs, child_docs, id_to_children


class MultiVectorRetrieverBuilder:
    """
    Class to build and configure the multi-vector retriever
    """
    def __init__(
        self,
        vector_store_manager: VectorStoreManager,
        search_type: str = "similarity",
        k: int = 5,
        persist_directory: str = "./vector_store"
    ):
        self.vector_store_manager = vector_store_manager
        self.search_type = search_type
        self.k = k
        self.persist_directory = persist_directory
        self.docstore = InMemoryStore()  # Document store to keep parent documents
        
        # Store whether we're using Dataiku folder
        self.using_dataiku_folder = hasattr(self.vector_store_manager, 'dataiku_folder') and self.vector_store_manager.dataiku_folder is not None
        
        # Set up storage paths
        if self.using_dataiku_folder:
            self.metadata_filename = "doc_metadata.pkl"
        else:
            # For local storage
            self.doc_metadata_path = os.path.join(persist_directory, "doc_metadata.pkl")
    
    def save_document_metadata(self, parent_docs):
        """Save parent document metadata to enable retriever reconstruction"""
        doc_metadata = {}
        for doc in parent_docs:
            doc_id = doc.metadata["id"]
            doc_metadata[doc_id] = {
                "content": doc.page_content,
                "metadata": doc.metadata
            }
        
        # Save to the appropriate location
        if self.using_dataiku_folder:
            # Use Dataiku folder for storage
            with self.vector_store_manager.dataiku_folder.get_writer(self.metadata_filename) as writer:
                pickle.dump(doc_metadata, writer)
        else:
            # Save to local file system
            with open(self.doc_metadata_path, "wb") as f:
                pickle.dump(doc_metadata, f)
                
        print(f"Saved metadata for {len(doc_metadata)} parent documents")
    
    def load_document_metadata(self):
        """Load document metadata from file"""
        try:
            if self.using_dataiku_folder:
                # Load from Dataiku folder
                with self.vector_store_manager.dataiku_folder.get_download_stream(self.metadata_filename) as stream:
                    doc_metadata = pickle.load(stream)
            else:
                # Load from local file system
                if not os.path.exists(self.doc_metadata_path):
                    print("No document metadata file found")
                    return {}
                    
                with open(self.doc_metadata_path, "rb") as f:
                    doc_metadata = pickle.load(f)
                    
            print(f"Loaded metadata for {len(doc_metadata)} parent documents")
            return doc_metadata
        except FileNotFoundError:
            print("No document metadata file found")
            return {}
        except Exception as e:
            print(f"Error loading document metadata: {str(e)}")
            return {}
    
    def build_retriever(
        self, 
        parent_docs: List[Document], 
        child_docs: List[Document],
        update_existing: bool = False
    ) -> MultiVectorRetriever:
        """
        Build a multi-vector retriever with full media access
        """
        # Add ALL parent documents to docstore (including tables and images)
        for doc in parent_docs:
            self.docstore.mset([(doc.metadata["id"], doc)])

        # Save document metadata for later reconstruction
        self.save_document_metadata(parent_docs)

        # Create or update vector store for child documents (summaries only)
        if update_existing and self.vector_store_manager.vector_store is not None:
            # Add new documents to existing vector store
            self.vector_store_manager.vector_store.add_documents(child_docs)
            vector_store = self.vector_store_manager.vector_store
        else:
            # Create new vector store
            vector_store = self.vector_store_manager.create_vector_store(child_docs)

        # Create the multi-vector retriever
        retriever = MultiVectorRetriever(
            vectorstore=vector_store,
            docstore=self.docstore,
            id_key="parent_id",  # This links summaries to parents
            search_type=SearchType.similarity,
            search_kwargs={"k": self.k}
        )

        print(f"Multi-vector retriever built successfully with {len(parent_docs)} parent documents and {len(child_docs)} summary documents")

        return retriever
    
    def load_retriever(self) -> MultiVectorRetriever:
        """
        Load an existing retriever from a saved vector store
        
        Returns:
            MultiVectorRetriever: Loaded retriever
        """
        # Load existing vector store
        vector_store = self.vector_store_manager.load_vector_store()
        
        # Load document metadata and populate docstore
        doc_metadata = self.load_document_metadata()
        if doc_metadata:
            for doc_id, doc_data in doc_metadata.items():
                doc = Document(
                    page_content=doc_data["content"],
                    metadata=doc_data["metadata"]
                )
                self.docstore.mset([(doc_id, doc)])
        
        # Create the multi-vector retriever
        retriever = MultiVectorRetriever(
            vectorstore=vector_store,
            docstore=self.docstore,
            id_key="parent_id",
            search_type=SearchType.similarity,
            search_kwargs={"k": min(self.k, 3)} 
        )
        
        print(f"Multi-vector retriever loaded successfully with {len(list(self.docstore.yield_keys()))} parent documents")
        return retriever


class QueryEngine:
    """
    Class to handle document retrieval and response generation
    """
    def __init__(
        self,
        retriever: MultiVectorRetriever,
        embedding_model,
        llm,
        similarity_threshold: float = 0.5
    ):
        self.retriever = retriever
        self.embedding_model = embedding_model
        self.llm = llm
        
        # Add a relevance filter for better results
        self.embeddings_filter = EmbeddingsFilter(
            embeddings=embedding_model,
            similarity_threshold=similarity_threshold
        )
        
        # Create prompt template for response generation
        self.response_prompt = PromptTemplate(
            input_variables=["query", "context"],
            template="""
            You are an AI assistant that provides precise answers from provided document chunks. Follow these steps:

            1. Carefully read all the document chunks provided.
            2. Determine what specific detail the user is asking for in their query.
            3. Identify the most relevant information in the chunks that answers this query.
            4. Extract and return a precise answer with all details, without any extra commentary.
            5. If no relevant information is found, state that the data is insufficient.
            6. Never generate content that includes hate speech, offensive language, violence, threats, or misinformation.

            User Query:
            {query}

            Document Chunks:
            {context}

            Answer:
            """
        )
    
    def retrieve(self, query: str, k: int = 5, filter_metadata: Optional[Dict] = None) -> List[Document]:
        """
        Retrieve documents relevant to the query

        Args:
            query: Query string
            k: Number of documents to retrieve
            filter_metadata: Optional filter for metadata fields

        Returns:
            List[Document]: Retrieved documents
        """
        try:
            # Apply filters if provided            
            # build the kwargs
            search_args = {"k": k}
            if filter_metadata:
                search_args["filter"] = filter_metadata

            print("[DEBUG] docstore keys:", list(self.retriever.docstore.yield_keys()))
            if hasattr(self.retriever.vectorstore, '_collection'):
                print("[DEBUG] vectorstore has", self.retriever.vectorstore._collection.count(), "vectors")

            # Retrieve documents safely
            results = self.retriever.get_relevant_documents(query, **search_args)
            
            # Filter out any non-Document objects
            retrieved_docs = []
            for item in results:
                if hasattr(item, 'metadata') and hasattr(item, 'page_content'):
                    retrieved_docs.append(item)
                else:
                    print(f"WARNING: Retrieved non-Document object: {type(item)}")
                
            print(f"Retrieved {len(retrieved_docs)} valid documents")
            
            if not retrieved_docs:
                return []
                
            # Apply relevance filtering
            try:
                filtered_docs = self.embeddings_filter.compress_documents(
                    retrieved_docs, query
                )
                print(f"Filtered to {len(filtered_docs)} relevant documents")
                return filtered_docs
            except Exception as e:
#                 print(f"Error filtering documents: {str(e)}")
                # Return original results if filtering fails
                return retrieved_docs

        except Exception as e:
            print(f"Error retrieving documents: {str(e)}")
            import traceback
            traceback.print_exc()
            return []
    
    def generate_response(self, query: str, return_context: bool = True) -> Union[str, Tuple[str, Dict]]:
        """
        Generate a response based on retrieved summaries and also return original images and tables
        """
        try:
            # Retrieve relevant documents based on summary embeddings
            docs = self.retrieve(query, k=5)

            if not docs:
                return (
                    "No relevant information found in the provided documents.", 
                    []
                ) if return_context else "No relevant information found in the provided documents."

            # Collect related media from the docstore for all retrieved summary documents
            related_media = {
                'images': [],
                'tables': []
            }

            # Keep track of processed parent IDs to avoid duplicates
            processed_parent_ids = set()

            # For each retrieved document
            for doc in docs:
                doc_type = doc.metadata.get('type', '')
                parent_id = doc.metadata.get('parent_id')

                # Skip if we've already processed this parent document
                if parent_id in processed_parent_ids:
                    continue

                processed_parent_ids.add(parent_id)

                # If this is an image summary, add the image reference to related media
                if doc_type == 'image_summary' and 'image_filename' in doc.metadata:
                    # Find the original image description if available
                    image_filename = doc.metadata['image_filename']
                    image_doc_id = f"{parent_id}_image" if parent_id else None

                    image_description = ""
                    if image_doc_id and hasattr(self.retriever, 'docstore'):
                        try:
                            image_doc = self.retriever.docstore.get(image_doc_id)
                            if image_doc:
                                image_description = image_doc.page_content
                        except:
                            pass

                    related_media['images'].append({
                        'filename': image_filename,
                        'parent_id': parent_id,
                        'summary': doc.page_content,
                        'description': image_description
                    })

                # If this is a table summary, add the table content to related media
                if doc_type == 'table_summary':
                    # Try to find original table content
                    table_doc_id = f"{parent_id}_table" if parent_id else None

                    if table_doc_id and hasattr(self.retriever, 'docstore'):
                        try:
                            table_doc = self.retriever.docstore.get(table_doc_id)
                            if table_doc:
                                related_media['tables'].append({
                                    'content': table_doc.page_content,
                                    'parent_id': parent_id,
                                    'summary': doc.page_content
                                })
                        except Exception as e:
                            print(f"Error retrieving table: {str(e)}")

            # Prepare formatted context using only summaries for the LLM
            formatted_context = ""
            for i, doc in enumerate(docs, 1):
                formatted_context += f"DOCUMENT {i} ({doc.metadata.get('type', 'unknown')}):\n"
                formatted_context += f"{doc.page_content}\n"
                formatted_context += f"Source: {doc.metadata.get('source', 'Unknown')}\n\n"

            # Generate response using the chain
            chain = self.response_prompt | self.llm
            final_response = chain.invoke({
                "query": query,
                "context": formatted_context
            })

            # Return response with all related content if requested
            if return_context:
                context_dict = {
                    "retrieved_docs": [
                        {
                            "content": doc.page_content,
                            "metadata": doc.metadata
                        } for doc in docs
                    ],
                    "formatted_context": formatted_context,
                    "related_media": related_media,
                    "query": query
                }
                return final_response, context_dict
            else:
                return final_response

        except Exception as e:
            print(f"Error generating response: {str(e)}")
            import traceback
            traceback.print_exc()
            error_msg = f"Error generating response: {str(e)}"
            return (error_msg, []) if return_context else error_msg
    
    


class RAGPipeline:
    """
    Main pipeline class to orchestrate the entire RAG process
    """
    def __init__(
        self,
        embedding_model_id: str,
        llm_id: str,
        vector_store_type: str = "CHROMADB",
        persist_directory: str = "./vector_store",
        image_folder_name: Optional[str] = None,
        vector_store_folder_name: Optional[str] = None  # Add this parameter
    ):
        # Initialize model manager
        self.model_manager = ModelManager(embedding_model_id, llm_id)

        # Get initialized models
        self.embedding_model = self.model_manager.get_embedding_model()
        self.llm = self.model_manager.get_llm()

        # Initialize components
        self.text_processor = TextProcessor()
        self.summarizer = ContentSummarizer(self.llm)
        self.image_manager = ImageManager(image_folder_name) if image_folder_name else None
        self.vector_store_manager = VectorStoreManager(
            self.embedding_model,
            store_type=vector_store_type,
            persist_directory=persist_directory,
            dataiku_folder_name=vector_store_folder_name  # Pass to the manager
        )
        self.retriever_builder = MultiVectorRetrieverBuilder(self.vector_store_manager)
        
        # Will be set during processing
        self.document_processor = DocumentProcessor(
            self.summarizer,
            self.text_processor,
            self.image_manager
        )
        self.retriever = None
        self.query_engine = None
        
        # Store configuration
        self.config = {
            "embedding_model_id": embedding_model_id,
            "llm_id": llm_id,
            "vector_store_type": vector_store_type,
            "persist_directory": persist_directory,
            "image_folder_name": image_folder_name,
            "vector_store_folder_name": vector_store_folder_name
        }
        
        # Enhanced logging for initialization
        print(f"RAGPipeline initialized with: vector_store_type={vector_store_type}, "
              f"persist_directory={persist_directory}, "
              f"vector_store_folder_name={vector_store_folder_name}")
        
        if vector_store_folder_name:
            try:
                folder = dataiku.Folder(vector_store_folder_name)
                print(f"Dataiku folder path: {folder.get_path()}")
                print(f"Dataiku folder files: {folder.list_paths_in_partition()}")
            except Exception as e:
                print(f"Error accessing Dataiku folder: {str(e)}")
    
    def process_dataset(
        self,
        df: pd.DataFrame,
        text_column: str,
        table_column: Optional[str] = None
    ) -> MultiVectorRetriever:
        """
        Process a dataset to build a multi-vector retriever, 
        only processing new documents not already in the vector store
        """
        # More robust check for vector store existence
        vector_store_exists = self.vector_store_manager.vector_store_exists()
        
        print(f"Checking for vector store: {vector_store_exists}")
        
        # Log key paths for debugging
        if hasattr(self.vector_store_manager, 'persist_directory'):
            print(f"Vector store directory: {self.vector_store_manager.persist_directory}")
            
            # Check if the directory exists and is accessible
            if os.path.exists(self.vector_store_manager.persist_directory):
                print(f"Directory exists and is accessible")
                try:
                    files = os.listdir(self.vector_store_manager.persist_directory)
                    print(f"Directory contents: {files}")
                except Exception as e:
                    print(f"Cannot list directory contents: {str(e)}")
            else:
                print(f"Directory does not exist or is not accessible")

        if vector_store_exists:
            print(f"Found existing vector store")
            # Get list of processed document IDs
            processed_doc_ids = self.vector_store_manager.get_processed_documents()
            print(f"Found {len(processed_doc_ids)} processed document hashes")

            # If no tracking file, assume we need to rebuild
            if not processed_doc_ids:
                print("No document tracking information found, processing all documents")
                return self._process_all_documents(df, text_column, table_column)

            # Try to load existing retriever
            try:
                print("Attempting to load existing retriever")
                self.retriever = self.retriever_builder.load_retriever()
                print("Successfully loaded existing retriever")

                # Process only new documents
                return self._process_new_documents(df, text_column, processed_doc_ids, table_column)
            except Exception as e:
                print(f"Error loading existing retriever: {str(e)}")
                import traceback
                traceback.print_exc()
                print("Rebuilding vector store from scratch")
                return self._process_all_documents(df, text_column, table_column)
        else:
            print("No existing vector store found, processing all documents")
            return self._process_all_documents(df, text_column, table_column)

    def _process_all_documents(
        self,
        df: pd.DataFrame,
        text_column: str,
        table_column: Optional[str] = None
    ) -> MultiVectorRetriever:
        """Process all documents in the dataset"""
        # Prepare documents
        parent_docs, child_docs, id_to_children = self.document_processor.prepare_multi_vector_documents(
            df, text_column, table_column
        )

        # Build retriever
        self.retriever = self.retriever_builder.build_retriever(parent_docs, child_docs)

        # Track processed documents
        self.vector_store_manager.track_processed_documents(parent_docs)

        # Initialize query engine
        self.query_engine = QueryEngine(
            self.retriever,
            self.embedding_model,
            self.llm
        )

        return self.retriever

    def _process_new_documents(
        self,
        df: pd.DataFrame,
        text_column: str,
        processed_doc_ids: List[str],
        table_column: Optional[str] = None,
        max_token_length: int = 4000  # Reduced from 7000 to ensure batches stay under limits
    ) -> MultiVectorRetriever:
        """
        Process only new documents not in processed_doc_ids

        Args:
            df: DataFrame containing documents
            text_column: Column name containing the text
            processed_doc_ids: List of already processed document IDs/hashes
            table_column: Optional column name for table data
            max_token_length: Maximum token length before truncation (reduced to stay well under model limits)

        Returns:
            MultiVectorRetriever: The retriever with updated documents
        """
        if not self.retriever:
            try:
                self.retriever = self.retriever_builder.load_retriever()
            except Exception as e:
                print(f"Error loading existing retriever: {str(e)}")
                print("Will create a new retriever from scratch")
                return self._process_all_documents(df, text_column, table_column)

        # Function to estimate token count - approximate but faster than calling tokenizer
        def estimate_token_count(text):
            # Rough estimate: 4 chars per token for English text
            return len(text) // 4

        # Function to truncate text to max token length
        def truncate_to_max_tokens(text, max_tokens=max_token_length):
            estimated_tokens = estimate_token_count(text)
            if estimated_tokens <= max_tokens:
                return text

            # If we need to truncate, use character count as proxy
            # (4 chars per token approximation)
            max_chars = max_tokens * 4
            truncated = text[:max_chars]
            print(f"Truncated document from ~{estimated_tokens} tokens to {max_tokens} tokens")
            return truncated

        print("Checking for new documents...")

        # Generate content hashes for all current documents
        current_doc_hashes = []
        for idx in range(len(df)):
            # Get the text content, truncate if needed to avoid tokenization issues
            text_content = str(df.iloc[idx][text_column])
            text_content = truncate_to_max_tokens(text_content)

            # Generate hash from the (potentially truncated) content
            content_hash = hashlib.md5(text_content.encode()).hexdigest()
            current_doc_hashes.append(content_hash)

        # Determine which documents are new by comparing hashes
        new_indices = [
            idx for idx, hash_value in enumerate(current_doc_hashes)
            if hash_value not in processed_doc_ids
        ]

        if not new_indices:
            print("No new documents to process")
            # Initialize query engine with existing retriever
            self.query_engine = QueryEngine(
                self.retriever,
                self.embedding_model,
                self.llm
            )
            return self.retriever

        print(f"Processing {len(new_indices)} new documents")
        new_df = df.iloc[new_indices].copy()

        # Truncate large documents to avoid token limit issues
        for idx in new_df.index:
            text_content = str(new_df.loc[idx, text_column])
            token_count = estimate_token_count(text_content)

            if token_count > max_token_length:
                print(f"Document at index {idx} has ~{token_count} tokens, truncating to {max_token_length}")
                new_df.loc[idx, text_column] = truncate_to_max_tokens(text_content)

        # Process documents in small batches to avoid token limits
        batch_size = 5  # Process only 5 documents at a time
        all_parent_docs = []
        all_child_docs = []
        all_id_to_children = {}

        # Split the dataframe into smaller batches
        for i in range(0, len(new_df), batch_size):
            end_idx = min(i + batch_size, len(new_df))
            batch_df = new_df.iloc[i:end_idx]
            print(f"Preparing batch {i//batch_size + 1}/{(len(new_df) + batch_size - 1)//batch_size}")

            # Process this small batch of documents
            parent_docs, child_docs, id_to_children = self.document_processor.prepare_multi_vector_documents(
                batch_df, text_column, table_column
            )

            all_parent_docs.extend(parent_docs)
            all_child_docs.extend(child_docs)
            all_id_to_children.update(id_to_children)

        # Track the new document hashes that we're processing
        new_doc_hashes = [current_doc_hashes[idx] for idx in new_indices]

        # Add new documents to existing vector store
        if self.vector_store_manager.vector_store is None:
            try:
                self.vector_store_manager.load_vector_store()
            except Exception as e:
                print(f"Error loading vector store: {str(e)}")
                print("Creating new vector store")
                self.vector_store_manager.create_vector_store([])  # Create empty store

        # Process child documents in VERY small batches to avoid token limit issues
        small_batch_size = 1  # Process one document at a time for embedding
        try:
            for i in range(0, len(all_child_docs), small_batch_size):
                end_idx = min(i + small_batch_size, len(all_child_docs))
                batch = all_child_docs[i:end_idx]
                print(f"Adding document {i+1}/{len(all_child_docs)} to vector store")

                # Check total tokens in this batch
                batch_tokens = sum(estimate_token_count(doc.page_content) for doc in batch)
                print(f"Batch token count: ~{batch_tokens}")

                # Add to vector store
                self.vector_store_manager.vector_store.add_documents(batch)

                # Persist after each batch if supported
                if hasattr(self.vector_store_manager.vector_store, 'persist'):
                    self.vector_store_manager.vector_store.persist()
        except Exception as e:
            print(f"Error adding documents to vector store: {str(e)}")
            import traceback
            traceback.print_exc()

            # If we failed, we need to rebuild from scratch
            print("Embedding failed. Rebuilding vector store with smaller chunks...")
            return self._process_all_documents(df, text_column, table_column)

        # Add parent documents to docstore
        for doc in all_parent_docs:
            self.retriever.docstore.mset([(doc.metadata["id"], doc)])

        # Track all processed documents (existing + new)
        all_processed_hashes = processed_doc_ids + new_doc_hashes

        # Update tracking file with all processed hashes
        tracking_file = "processed_docs.pkl"
        
        # Use the same storage mechanism as the document metadata
        using_dataiku_folder = hasattr(self.vector_store_manager, 'dataiku_folder') and self.vector_store_manager.dataiku_folder is not None
        
        if using_dataiku_folder:
            # For Dataiku folder
            with self.vector_store_manager.dataiku_folder.get_writer(tracking_file) as writer:
                pickle.dump(all_processed_hashes, writer)
        else:
            # Local file system
            tracking_path = os.path.join(self.vector_store_manager.persist_directory, tracking_file)
            with open(tracking_path, "wb") as f:
                pickle.dump(all_processed_hashes, f)

        print(f"Updated tracking with {len(new_doc_hashes)} new document hashes, total: {len(all_processed_hashes)}")

        # Initialize query engine
        self.query_engine = QueryEngine(
            self.retriever,
            self.embedding_model,
            self.llm
        )

        return self.retriever
    
    def load_retriever(self) -> MultiVectorRetriever:
        """
        Load an existing retriever from a saved vector store
        
        Returns:
            MultiVectorRetriever: Loaded retriever
        """
        self.retriever = self.retriever_builder.load_retriever()
        
        # Initialize query engine
        self.query_engine = QueryEngine(
            self.retriever,
            self.embedding_model,
            self.llm
        )
        
        return self.retriever
    
    def query(self, query: str, return_context: bool = True):
        """
        Query the RAG system
        
        Args:
            query: Query string
            return_context: Whether to return the retrieved context
            
        Returns:
            str or Tuple[str, Dict]: Response or (response, context)
        """
        if not self.query_engine:
            if not self.retriever:
                self.load_retriever()
            self.query_engine = QueryEngine(
                self.retriever,
                self.embedding_model,
                self.llm
            )
        
        return self.query_engine.generate_response(query, return_context)
    


  from .autonotebook import tqdm as notebook_tqdm
/opt/dataiku-dss-13.4.3/python/dataikuapi/dss/langchain/llm.py:138: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
  extra = pydantic.Extra.forbid
/opt/dataiku-dss-13.4.3/python/dataikuapi/dss/langchain/llm.py:302: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
  extra = pydantic.Extra.forbid
/opt/dataiku-dss-13.4.3/python/dataikuapi/dss/langchain/embeddings.py:24: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 

In [0]:
# processor = process_dataset_with_multi_vector(
#     dataset_name="input_data_extracted_custom",
#     output_dataset_name="output_test",
#     text_column="text_content",
#     table_column="table_content",
#     image_folder_name="input_images_extracted_custom"
# )
# response = query_multi_vector_store(
#     vector_store_path="./vector_store_input_data_extracted_custom",
#     query="What is Electronic Certificate Document Number: US-IMM-180289?",
#     embedding_model_id="custom:iliad-plugin-conn-prod:text-embedding-ada-002",
#     llm_id="custom:iliad-plugin-conn-prod:gpt-4o",
# )


# orchestrator

In [0]:
# # Load dataset
# dataset_name = "input_data_extracted_custom"
# dataset = dataiku.Dataset("input_data_extracted_custom")
# df = dataset.get_dataframe()

# # Set default persist directory if not provided
# persist_directory = f"./vector_store_{dataset_name}"

# text_column="text_content"
# table_column="table_content"
# embedding_model_id="custom:iliad-plugin-conn-prod:text-embedding-ada-002"
# llm_id="custom:iliad-plugin-conn-prod:gpt-4o"
# vector_store_type="CHROMADB"

# pipeline = create_and_use_rag_system(
#     df = df,
#     text_column = text_column,
#     table_column = table_column,
#     embedding_model_id = embedding_model_id,
#     llm_id = llm_id,
#     vector_store_type = vector_store_type,
#     persist_directory = persist_directory,
#     chunk_size = 1000,
#     chunk_overlap = 100,
#     image_folder_name = "images_test",
#     k = 5
# )

In [2]:
import uuid
import os
import re
import json
from typing import List, Dict, Any, Optional, Tuple,Union

import pandas as pd
from tqdm.auto import tqdm
import dataiku
from PIL import Image
import io
import pickle
# Import ChromaDB monkeypatch to ensure compatibility
from dataiku.core.vector_stores.chroma_vector_store import ChromaVectorStore
ChromaVectorStore.run_the_ugly_chromadb_monkeypatch()

# from Digitization.Core.RagPipeline import RAGPipeline

# Load dataset
# The dataset name is declared once and reused for both the Dataiku handle and
# the persist directory, so the two cannot drift apart when the name is edited.
dataset_name = "input_data_extracted_custom"
dataset = dataiku.Dataset(dataset_name)
df = dataset.get_dataframe()

# Set default persist directory if not provided
persist_directory = f"./vector_store_{dataset_name}"

# Names of the dataframe columns handed to process_dataset below.
text_column = "text_content"
table_column = "table_content"

# Initialize RAG pipeline
# RAGPipeline is not imported in this cell (the import line above is commented
# out), so it must already exist in the kernel namespace when this cell runs —
# presumably defined by an earlier cell; TODO confirm for Restart & Run All.
pipeline = RAGPipeline(
    embedding_model_id="custom:iliad-plugin-conn-prod:text-embedding-ada-002",
    llm_id="custom:iliad-plugin-conn-prod:gpt-4o",
    vector_store_type="CHROMADB",
    persist_directory=persist_directory,
    vector_store_folder_name="vector_store",
)


# Process the dataset to build the retriever
print(f"Processing dataset {dataset_name} with {len(df)} documents...")
# Kept as the final bare expression so the notebook displays the returned object.
pipeline.process_dataset(df, text_column, table_column)



Initialized LangChain embedding model: custom:iliad-plugin-conn-prod:text-embedding-ada-002
Initialized LangChain LLM: custom:iliad-plugin-conn-prod:gpt-4o
Using Dataiku folder: vector_store
Dataiku folder path: /data/dataiku/dss_data/managed_folders/GENAIPOC/tPuZfwpr
Dataiku folder contents: ['/doc_metadata.pkl', '/chroma.sqlite3', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/header.bin', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/data_level0.bin', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/length.bin', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/link_lists.bin', '/processed_docs.pkl']
RAGPipeline initialized with: vector_store_type=CHROMADB, persist_directory=./vector_store_input_data_extracted_custom, vector_store_folder_name=vector_store
Dataiku folder path: /data/dataiku/dss_data/managed_folders/GENAIPOC/tPuZfwpr
Dataiku folder files: ['/doc_metadata.pkl', '/chroma.sqlite3', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/header.bin', '/24a7e438-50a8-426f-a249-1cb8a5ed0b6b/data_level0.bin', '/24a7e438-50a8-4

  self.vector_store = Chroma(


MultiVectorRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7fffe82ad1f0>, docstore=<langchain_core.stores.InMemoryStore object at 0x7fffe8245bb0>, id_key='parent_id', search_kwargs={'k': 3})

In [0]:
# Alternative questions, kept commented out for quick manual switching:
# query = "How are bridge programs utilized in the treatment process when certain authorizations are denied?"
# query = "Who to call for RINVOQ reactions?"
query = "What are support programs currently available under each drug brand?"

# Execute the query
print(f"Executing query: {query}")
# With return_context=True the result is unpacked as a (response, context)
# pair; `context` is only consumed by the optional source dump further down.
response, context = pipeline.query(query, return_context=True)
# response, context = pipeline.run(query)

# Plain string: the original used an f-string with no placeholders.
print("Query completed successfully")
print("\n=== RAG RESPONSE ===")
print(response)

# Optional: uncomment to list the retrieved source documents.
# print("\n=== RETRIEVED SOURCES ===")
# for i, doc in enumerate(context["retrieved_docs"], 1):
#     print(f"SOURCE {i}: {doc['metadata'].get('source', 'Unknown')}")
#     print(f"TYPE: {doc['metadata'].get('type', 'Unknown')}")
#     print(f"CONTENT: {doc['content'][:1500]}...")
#     print()


# New heading