## Imports

In [None]:
import asyncio
import json
import logging # Use standard logging or integrate loguru directly if preferred
import os
import re
from typing import Annotated, TypedDict, Optional

import pandas as pd # Assuming pandas is a project dependency for data manipulation

# Import actual classes from your project
from DataDistributor import DataDistributor
from Constants import Constants # For API keys, database names etc.
from RequestContext import RequestContext as ctx # For context-aware logging
from Chain import LangChainAgent # Assuming LangChainAgent is the primary LLM wrapper
# Assuming your project uses loguru and it's configured globally or via RequestContext
# from Logging.ErrorLogger import logger # If you need direct access outside context

# You will need to replace this with your actual LLM initialization
# from langchain_core.language_models import BaseChatModel
# from langchain_core.output_parsers import JsonOutputParser
# from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage # Assuming LangChainAgent returns HumanMessage or similar

# --- LangGraph Setup ---
from langgraph.graph import StateGraph, END, START
# from langgraph.prebuilt import ToolExecutor # You will likely use this in your real implementation
# from langchain.tools import BaseTool # Not directly used if DataDistributor methods are called directly

# Placeholder for LIDA Manager - requires installation: pip install lida
# And you'll need an API key for LIDA's text generation model
from lida import Manager, TextGenerationConfig, DataConfig

# Setup basic logging for notebook execution context
logging.basicConfig(level=logging.INFO)

## Tools (Implemented via DataDistributor)

In [None]:
# Actual tool implementations are within DataDistributor methods:
# DataDistributor.vector_search
# DataDistributor.parameter_search

# The GraphBuilder will call these methods directly or via a wrapper
# No separate tool classes like MockVectorSearchTool are needed here.

## Data Framing

In [None]:
# This function transforms raw data fetched from the database
# into a Pandas DataFrame, which LIDA requires.
def data_to_dataframe(raw_data: list) -> pd.DataFrame:
    """
    Transform raw database records into the Pandas DataFrame that LIDA requires.

    Args:
        raw_data: Records as a list of dictionaries (one dictionary per row).

    Returns:
        A DataFrame built from ``raw_data``. When the input is not a list of
        dictionaries the problem is logged and an empty DataFrame is returned
        instead of raising.
    """
    print("Transforming data to DataFrame...")
    # Add flattening/processing here if the schema ever contains nested records;
    # for a flat list of dicts a direct conversion is sufficient.
    is_list_of_dicts = isinstance(raw_data, list) and all(
        isinstance(item, dict) for item in raw_data
    )
    if not is_list_of_dicts:
        ctx.logger().error("data_to_dataframe received data not in expected list of dicts format.")
        return pd.DataFrame()

    return pd.DataFrame(raw_data)

## LLM Classes (Implemented via LangChainAgent)

In [None]:
# Actual LLM interactions are handled by LangChainAgent.
# We will initialize different instances or use methods of LangChainAgent
# for different purposes (decision, tool calling prompt, summarization, answer generation).
# Based on QueryHandler, LangChainAgent is initialized with an API key and optionally a summary_workflow flag.
# We might need to add specific methods to LangChainAgent or QueryHandler
# for decision making and tool call prompt generation if they don't exist.

# For this refactor, we'll assume LangChainAgent can handle different prompts
# based on how it's invoked or configured.

# Placeholder for LLM initialization based on project's Constants
def initialize_llm_agent(summary_workflow=False, model_name=''):
    """
    Build a LangChainAgent configured with the Google AI key from Constants.

    Args:
        summary_workflow: Forwarded unchanged to LangChainAgent.
        model_name: Forwarded unchanged to LangChainAgent.

    Returns:
        The newly created LangChainAgent.

    Raises:
        ValueError: When Constants returns no API key.
        Exception: Any error raised while creating the agent is logged and
            re-raised unchanged.
    """
    try:
        google_key = Constants._get_google_ai_key()
        if google_key:
            # Different tasks (decision, generation, ...) may need different
            # model names; the caller selects one through model_name.
            return LangChainAgent(
                api_key=google_key,
                model_name=model_name,
                summary_workflow=summary_workflow,
            )

        ctx.logger().error("Google AI API key not found in Constants.")
        raise ValueError("Google AI API key not configured.")
    except Exception as e:
        # Log with an event id for tracing, then let the caller decide what to do.
        ctx.logger().error(f"Failed to initialize LangChainAgent: {e}", eventid="tag_llm_init_fail")
        raise

## State and Graph Builder

In [None]:
# Placeholder for your add_messages function
# This should align with how messages are handled in your project, likely in QueryHandler or similar.
# If message history is managed externally and passed into the state, this function might be simplified or removed.
# Assuming for now it just appends messages as in the original notebook.
def add_messages(left: list, right: list):
    """
    Return a new list holding the existing messages followed by the new ones.

    This is the function attached to ``State.messages`` via ``Annotated``. It is
    a placeholder and should be replaced with the project's actual logic for
    managing messages.

    Args:
        left: Messages accumulated so far (``None`` is treated as empty).
        right: Messages to append. A list/tuple is appended element by element;
            ``None`` adds nothing; any other single value is appended as one
            message.

    Returns:
        A new list; neither ``left`` nor ``right`` is modified, so a previously
        stored message list is never altered behind the caller's back.
    """
    def _as_list(value):
        # Normalise None / single message / sequence into a fresh list.
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    return _as_list(left) + _as_list(right)

# Update State TypedDict to match project's schema and include visualization fields
# Align these fields with your project's actual state/context schema
class State(TypedDict):
    """
    Shared state dictionary received and returned by the GraphBuilder node methods.

    The fields cover both the text-answer workflow and the visualization
    workflow. Align them with the project's actual state/context schema.
    """
    messages: Annotated[list, add_messages] # Message history; add_messages is attached as the merge function
    history: str # Text representation of history, used in prompts
    question: str # Current user question
    # Splitting / needs_* flags carried over from the original notebook; keep if relevant
    should_split: bool
    split_questions: list
    needs_history: bool
    needs_context: bool
    context: str # Fetched data converted to string for text workflow
    raw_context_data: Optional[list] # Raw fetched rows (list of dicts) kept for visualization
    summarized_context: str # Summarized context for text answer
    final_answer: str # Final text answer

    # Fields for visualization workflow
    visualization_request: Optional[dict] # Structured request from the decision step; None means text workflow
    # visualization_data: Optional[list] # Renamed to raw_context_data for clarity
    visualization_output: Optional[dict] # Vega-Lite spec or image data
    visualization_error: Optional[str] # Error message if visualization fails

class GraphBuilder:
    def __init__(self):
        """
        Create the collaborators used by the graph nodes.

        Instantiates the project's QueryHandler and tries to create a LIDA
        Manager. If no API key is available, or LIDA cannot be initialized,
        ``self.lida`` is set to None so visualization is disabled instead of
        the constructor raising.
        """
        # QueryHandler wraps the project's LLM calls; it is imported locally,
        # exactly as the original code did.
        from QueryHandler import QueryHandler
        self.query_handler = QueryHandler()

        try:
            # Key lookup order: project Constants first, then the environment.
            # (A dedicated LIDA key / "LIDA_API_KEY" variable could be used instead.)
            lida_api_key = Constants._get_google_ai_key() or os.environ.get("GOOGLE_API_KEY")

            if lida_api_key:
                # NOTE(review): confirm that Manager(text_gen_config=...) and
                # TextGenerationConfig(api_key=...) match the installed LIDA API;
                # any failure here is caught below and disables visualization.
                generation_config = TextGenerationConfig(
                    model="gemini-flash", # Or the model you intend to use for LIDA
                    api_key=lida_api_key
                )
                self.lida = Manager(text_gen_config=generation_config)
            else:
                ctx.logger().error("LIDA API key not found in Constants or environment variables.", eventid="tag_lida_key_missing")
                self.lida = None # Disable LIDA if key is missing
                print("Warning: LIDA API key is not configured. Visualization may not work.")
        except ImportError:
            ctx.logger().error("LIDA or its dependencies not installed.", eventid="tag_lida_import_error")
            self.lida = None
            print("Warning: LIDA library is not installed (pip install lida). Visualization will be disabled.")
        except Exception as e:
            ctx.logger().error(f"Failed to initialize LIDA Manager: {e}", eventid="tag_lida_init_fail")
            self.lida = None
            print(f"Warning: Failed to initialize LIDA Manager: {e}. Visualization will be disabled.")

        # DataDistributor handles database interaction. Its methods are invoked
        # on the class itself, so no instance is created or stored here.
        # self.data_distributor = DataDistributor() # If it were an instance class

    # --- New and Modified Node Methods ---

    async def decide_next_step(self, state: State) -> State:
        """
        Decides whether to generate a visualization or continue with the text workflow.
        Sets the visualization_request field in the state.
        Uses the project's LLM agent for decision.
        """
        question = state['question']
        history = state.get('history', '') # Use get for safety

        # Craft a prompt for the decision LLM. This might need refinement
        # based on how your LangChainAgent handles such specific instructions.
        decision_prompt = f"""
Based on the following user question and chat history, determine if the user is requesting a data visualization (chart, graph, plot). Respond with a JSON object.

If the user explicitly asks for a visualization, respond with:
{{ "request_type": "visualization", "chart_type": "[infer type or null]", "data_fields": ["list", "of", "relevant", "fields"] }}

If the user is NOT requesting a visualization, respond with:
{{ "request_type": "continue_text_workflow" }}

Your response should be a JSON object and contain ONLY the JSON object.

Question: {question}
Chat History: {history}
"""
        ctx.logger().debug("Calling LLM for decision on next step.", eventid="tag_decision_llm")
        try:
            # Use the QueryHandler or a dedicated LangChainAgent for this decision
            # Assuming query_handler.ask_llm can handle this type of prompt and return structured JSON
            # This might require specific prompt engineering or model fine-tuning.\
            # Alternatively, if a separate LangChainAgent is used just for decisions:\
            # response, metric = await self.decision_agent.process_input(user_input=decision_prompt)
            # For now, assuming query_handler can do it, might need to adapt or create a new method.\

            # **Adaptation Required:** The structure of the original project's QueryHandler.ask_llm\
            # seems designed for generating chat responses, not structured decisions.\
            # You likely need a new method in QueryHandler (e.g., `decide_workflow`) or\
            # a separate LangChainAgent instance specifically configured for this task.\
            # For this refactor, we'll simulate the expected output based on the prompt logic.\
            # REPLACE THE FOLLOWING SIMULATION WITH ACTUAL LLM CALL:
            # This is a placeholder - replace with actual LLM call using your project's classes
            # Example using a hypothetical new method:
            # decision_response_content, metric = await self.query_handler.decide_workflow(question, history)
            # decision = json.loads(decision_response_content)

            # --- Mock Simulation (REPLACE THIS) ---
            # This simulates the LLM reading the prompt and deciding.
            # In a real scenario, the LLM's response would be parsed.
            lower_question = question.lower()
            if any(word in lower_question for word in ["visualization", "chart", "graph", "plot"]):
                # Attempt to infer fields based on keywords (highly simplified)
                data_fields = []
                if "organization" in lower_question: data_fields.append("organization")
                if "category" in lower_question: data_fields.append("category")
                if "research_areas" in lower_question: data_fields.append("research_areas")
                if "keywords" in lower_question: data_fields.append("keywords")
                if "contacts" in lower_question: data_fields.append("contacts")
                chart_type = "bar" if "bar chart" in lower_question else ("line" if "line chart" in lower_question else None)
                decision = {"request_type": "visualization", "chart_type": chart_type, "data_fields": data_fields if data_fields else None}
            else:
                decision = {"request_type": "continue_text_workflow"}
            # --- End Mock Simulation ---

            ctx.logger().debug(f"Decision from LLM: {decision}", eventid="tag_decision_result")

            if decision.get("request_type") == "visualization":
                state['visualization_request'] = decision # Store the structured request
                # Data fetching will happen in fetch_context or a dedicated node before viz generation
            else:
                state['visualization_request'] = None # No visualization requested

        except json.JSONDecodeError:
            ctx.logger().error("LLM response for decision was not valid JSON.", eventid="tag_decision_json_error")
            state['visualization_request'] = None # Default to text workflow on JSON error
            state['visualization_error'] = "Error parsing LLM decision." # Use visualization_error for consistency
        except Exception as e:
            ctx.logger().error(f"Error in decide_next_step: {e}", eventid="tag_decide_error")
            state['visualization_request'] = None # Default to text workflow on other errors
            state['visualization_error'] = f"Error determining visualization request: {e}"

        return state

    def route_request(self, state: State) -> str:
        """
        Choose the next branch of the graph.

        Returns "generate_visualization" when ``visualization_request`` is set
        to a truthy value, otherwise "continue_text_workflow".
        """
        # Guard clause: anything falsy (missing key, None, empty dict) means text.
        if not state.get('visualization_request'):
            print("Routing to continue_text_workflow")
            return "continue_text_workflow"

        print("Routing to generate_visualization")
        return "generate_visualization"

    async def fetch_context(self, state: State) -> State:
        """
        Fetches data based on the user's question, potentially guided by LLM tool call decision.
        Stores raw data in raw_context_data if visualization is requested, otherwise in context string.
        Uses the project's DataDistributor for database interaction.
        """
        print("Executing fetch_context")
        question = state['question']
        history = state.get('history', '') # Use get for safety
        viz_request = state.get('visualization_request')

        ctx.logger().debug("Calling LLM for tool call decision and query generation.", eventid="tag_tool_llm")
        try:
            # Craft a prompt for the tool calling LLM.
            # Similar to the decision LLM, this might need a specific LangChainAgent or QueryHandler method.
            tool_decision_prompt = f"""
Question Instructions:

Inputs:

Question: {question}

Chat History: {history}

Task: Your task is to generate an effective search query and determine the appropriate tool (vector_search_tool or parameter_search_tool) to retrieve the necessary information from the database. Consider both the question and the chat history (if relevant) when constructing your query and selecting the tool.
If the question asks for specific attributes or uses precise terms, favor parameter_search_tool. If it's a more general or conceptual query, favor vector_search_tool.

Output: Return the tool call in the specified format (e.g., using function_call in additional_kwargs).

End of Instructions
"""
            # **Adaptation Required:** Replace with actual LLM call using your project's classes.
            # This requires your LLM agent to support function calling or return a structure you can parse.
            # Example using a hypothetical new method:
            # tool_response, metric = await self.query_handler.determine_tool_call(question, history)
            # tool_call = tool_response.additional_kwargs.get("function_call", {})

            # --- Mock Simulation (REPLACE THIS) ---
            # This simulates the LLM deciding on a tool and query.
            # In a real scenario, the LLM's response would be parsed.
            lower_question = question.lower()
            tool_call = {}
            tool_query = question # Default query
            desired_fields = ["description"] # Default fields for text context

            if any(term in lower_question for term in ["specific attributes", "precise terms", "details"]):\
                 tool_call["name"] = "parameter_search_tool"
                 # Attempt to parse parameters - highly simplified mock
                 # In reality, the LLM should provide structured arguments.
                 parameter_query = {} # Example: {"name": "Org A"}
                 # If visualization is requested, try to fetch specific fields identified by the decision LLM
                 if viz_request and viz_request.get('data_fields'):
                      desired_fields = viz_request['data_fields']
                 else:
                      desired_fields = ["organization", "category", "value"] # Example fields for parameter search data
                 tool_query = parameter_query # Use the parameter dict as the query

            else:\
                 tool_call["name"] = "vector_search_tool"
                 tool_query = question # Use the question as the vector search query
                 # Vector search usually returns chunks/documents, format for text context initially
                 desired_fields = ["description", "file", "page"] # Example fields for vector search results
            # --- End Mock Simulation ---

            func_name = tool_call.get("name", "")

            raw_output = None
            if func_name == "vector_search_tool":
                ctx.logger().debug(f"Invoking vector search tool with query: {tool_query}", eventid="tag_vector_search_invoke")
                # Call the actual DataDistributor method
                # DataDistributor.vector_search expects query: str, desired_field: str
                # The current DataDistributor.vector_search seems to return a list of strings (descriptions)
                # If you need more structured data for viz, you might need to modify vector_search
                # or call a different method.
                # Assuming for now it returns list of strings or dicts depending on internal logic.
                # Need to adapt based on what DataDistributor.vector_search actually returns.

                # **Adaptation Required:** DataDistributor.vector_search currently returns a list of descriptions.\
                # If you need full document objects for visualization, modify DataDistributor.vector_search.\
                # For now, assume we call it with the question and expect a list of strings/dicts.\
                # We'll pass a single desired_field, assuming the method handles multiple internally if needed.\
                # If vector_search needs different logic for viz data, create a new method in DataDistributor.\
                raw_output = await DataDistributor.vector_search(query=tool_query, desired_field="description")
                ctx.logger().debug(f"Vector search returned {len(raw_output) if raw_output else 0} results.", eventid="tag_vector_search_results")

            elif func_name == "parameter_search_tool":
                 ctx.logger().debug(f"Invoking parameter search tool with query: {tool_query} and fields: {desired_fields}", eventid="tag_parameter_search_invoke")
                 # Call the actual DataDistributor method
                 # DataDistributor.parameter_search expects name: str, query: dict, time: bool, limit: int, des_field: list
                 # Need collection name and query dict from the LLM's tool call arguments.
                 # **Adaptation Required:** The mock LLM simulation doesn't provide these. The tool calling LLM needs to provide\
                 # collection_name and the query dict based on the user's question.\
                 # Assuming for the refactor we use a default collection and the LLM's 'tool_query' (which is currently just the question)\
                 # needs to be parsed into a query dict by the LLM or a preceding step.\
                 # For now, we'll use a placeholder query dict and collection name.

                 # Placeholder query_dict - replace with actual parsed arguments from LLM
                 parameter_query_dict = {"user_query": tool_query} # This is incorrect, needs to be a proper MongoDB query dict
                 collection_name = Constants.config.get("collection1", "default_collection") # Example: get default collection name

                 # Need to decide on limit and time parameters - perhaps based on LLM decision or defaults?\
                 limit = 5 # Example default limit
                 use_time = False # Example default

                 # **Adaptation Required:** The tool_query from the LLM needs to be a proper MongoDB filter dict\
                 # and the collection name needs to be determined. This is a major integration point.\
                 # Assuming for demonstration, parameter_query_dict is derived correctly and desired_fields are populated.\

                 raw_output = await DataDistributor.parameter_search(name=collection_name, query=parameter_query_dict, time=use_time, limit=limit, des_field=desired_fields)
                 ctx.logger().debug(f"Parameter search returned {len(raw_output) if raw_output else 0} results.", eventid="tag_parameter_search_results")

            else:
                ctx.logger().warning(f"Unknown tool name determined by LLM: {func_name}", eventid="tag_unknown_tool")
                state['context'] = "Could not identify relevant tool."
                state['raw_context_data'] = None
                state['final_answer'] = "I couldn't find the right tool to fetch data for that."
                return state # Exit early on unknown tool

            if raw_output is None:
                 ctx.logger().warning("Data fetching returned None.", eventid="tag_fetch_none")
                 state['context'] = "No data found for your query."
                 state['raw_context_data'] = None
                 # Don't set final_answer here, let subsequent nodes handle it

            elif viz_request and isinstance(raw_output, list) and len(raw_output) > 0 and isinstance(raw_output[0], dict):
                # If viz is requested and tool returned a list of dicts (structured data suitable for viz)
                state['raw_context_data'] = raw_output # Store raw data list of dicts
                state['context'] = "Data fetched for visualization."
                state['summarized_context'] = "" # Clear summarized context for viz path
                ctx.logger().debug(f"Stored {len(raw_output)} items in raw_context_data for visualization.", eventid="tag_stored_raw_viz")
            else:
                # If no viz request, or data is not list of dicts (e.g., vector search returns list of strings)
                # Convert raw output to a string for text workflow summarization
                # Ensure output is iterable for join
                if isinstance(raw_output, list):
                    context_string = "\n".join(map(str, raw_output)) # Join items into a string
                else:\
                     context_string = str(raw_output) # Convert single item or other types to string

                state['context'] = context_string
                state['raw_context_data'] = None # Ensure raw data is clear if not used for viz
                state['summarized_context'] = "" # Summarization will happen in the next node
                ctx.logger().debug("Formatted fetched data into context string for text workflow.", eventid="tag_formatted_context")


        except Exception as e:
            ctx.logger().error(f"Error in fetch_context: {e}", eventid="tag_fetch_error")
            state['context'] = f"Error fetching data: {e}"
            state['raw_context_data'] = None
            state['summarized_context'] = ""
            state['final_answer'] = "An error occurred while fetching data."
            state['visualization_error'] = f"Data fetching error: {e}" # Use viz error field for consistency
            # Optionally, re-raise the exception if you want the graph to stop
            # raise e

        return state

    async def summarize_context(self, state: State) -> State:
        """
        Summarizes the fetched context for the text workflow.
        This node is skipped if a visualization is requested or if there's no context.
        Uses the project's LLM agent for summarization.
        """
        print("Executing summarize_context")
        context = state.get('context')

        # Skip summarization if visualization is requested or no context to summarize
        if state.get('visualization_request') or not context or context == "Data fetched for visualization.":\
            print("Skipping summarization as visualization is requested or no context.")
            state['summarized_context'] = "" # Ensure empty if skipped
            return state

        ctx.logger().debug("Calling LLM for context summarization.", eventid="tag_summarize_llm")
        summary_prompt = f"""Summarize the following context for a chatbot response:
{context}
"""
        try:
            # Use the QueryHandler.summarize_llm method
            summary_response_content, metric = await self.query_handler.summarize_llm(question=summary_prompt)
            state['summarized_context'] = summary_response_content
            ctx.logger().debug("Context summarization complete.", eventid="tag_summarize_complete")
        except Exception as e:
            ctx.logger().error(f"Error in summarize_context: {e}", eventid="tag_summarize_error")
            state['summarized_context'] = "Error summarizing context."
            state['final_answer'] = "An error occurred while summarizing the information."
            # Optionally, re-raise the exception
            # raise e

        return state

    async def generate_answer(self, state: State) -> State:
        """
        Generates the final text answer based on the summarized context.
        This node is skipped if a visualization was successfully generated.
        Uses the project's LLM agent for answer generation.
        """
        print("Executing generate_answer")
        question = state['question']
        summarized_context = state.get('summarized_context')
        history = state.get('history', '') # Use get for safety
        visualization_output = state.get('visualization_output')

        # Skip text answer generation if visualization output exists and is successful
        if visualization_output:
             print("Skipping generate_answer as visualization output exists.")
             # The final_answer might have been set by generate_visualization_node
             return state

        if not summarized_context:
            # If no summarized context and no viz output, set a default error message
            if not state.get('final_answer'): # Avoid overwriting a previous error message
                 state['final_answer'] = state.get('context', "Could not generate a response based on the available information.")
                 if state['final_answer'] == "Data fetched for visualization.":
                      state['final_answer'] = "Could not generate a response based on the available information."
            print("Skipping generate_answer due to missing summarized context.")
            return state

        ctx.logger().debug("Calling LLM for final answer generation.", eventid="tag_answer_llm")
        answer_prompt = f"""Based on the following summarized context and chat history, answer the user's question:

Question: {question}
Chat History: {history}
Summarized Context: {summarized_context}

Provide a concise and helpful answer.
"""
        try:
            # Use the QueryHandler.ask_llm method for the final answer
            # QueryHandler.ask_llm expects question and history
            # We'll pass the full prompt including summarized context as the question for now.
            # **Adaptation Required:** QueryHandler.ask_llm is designed for initial query processing.\
            # You might need a new method in QueryHandler (e.g., `generate_final_response`) that takes\
            # summarized context explicitly, or adapt how ask_llm uses the prompt/history.\

            # For this refactor, we'll simulate using ask_llm with the combined prompt.\
            # REPLACE THE FOLLOWING SIMULATION WITH ACTUAL LLM CALL:
            # answer_response_content, metric = await self.query_handler.ask_llm(question=answer_prompt, history=history)
            # state['final_answer'] = answer_response_content

            # --- Mock Simulation (REPLACE THIS) ---
            state['final_answer'] = f"Answer based on: {summarized_context[:100]}... (Actual answer generation via LLM)"
            # --- End Mock Simulation ---

            ctx.logger().debug("Final answer generation complete.\

", eventid="tag_answer_complete")

        except Exception as e:
            ctx.logger().error(f"Error in generate_answer: {e}", eventid="tag_answer_error")
            state['final_answer'] = "An error occurred while generating the answer."
            # Optionally, re-raise the exception
            # raise e

        return state

    async def generate_visualization_node(self, state: State) -> State:
        """
        Try to build a chart for the user's question with LIDA and record the outcome.

        Reads ``question`` and ``raw_context_data`` from ``state``. On success it
        writes ``visualization_output`` -- either
        ``{"type": "vega-lite", "spec": ...}`` or ``{"type": "image", "data": ...}`` --
        together with a short ``final_answer``. On every failure path it writes a
        user-facing ``final_answer`` and a diagnostic ``visualization_error``
        instead. Exceptions raised while talking to LIDA are logged and turned
        into those error fields; they are not re-raised.

        Args:
            state: Workflow state. A visualization is only attempted when
                ``raw_context_data`` is a non-empty list whose first element is
                a dict.

        Returns:
            The same ``state`` object, mutated in place.
        """
        print("Executing generate_visualization_node")
        question = state['question']
        raw_data = state.get('raw_context_data')

        if not self.lida:
            error_msg = "LIDA Manager was not initialized successfully. Visualization is disabled."
            ctx.logger().error(error_msg, eventid="tag_viz_lida_not_initialized")
            state['final_answer'] = error_msg
            state['visualization_error'] = error_msg
            return state

        # Only a non-empty list of dict records is accepted as chartable data.
        if not raw_data or not isinstance(raw_data, list) or not isinstance(raw_data[0], dict):
            error_msg = "I couldn't find enough structured data to create a visualization for that."
            ctx.logger().warning(error_msg, eventid="tag_viz_no_structured_data")
            state['final_answer'] = error_msg
            state['visualization_error'] = "No structured data available for visualization."
            return state

        try:
            # Transform data for LIDA (list of dicts to DataFrame).
            df = data_to_dataframe(raw_data)
            if df.empty:
                error_msg = "Failed to convert fetched data to DataFrame."
                ctx.logger().error(error_msg, eventid="tag_viz_dataframe_error")
                state['final_answer'] = error_msg
                state['visualization_error'] = "DataFrame conversion failed."
                return state

            # NOTE(review): the keyword names passed to `summarize`, and the
            # `generate_viz` / `execute_viz` / `text_gen_config` members used
            # below, should be confirmed against whatever `self.lida` is at
            # runtime. The published LIDA `Manager` documents `summarize`,
            # `visualize` and `execute` -- verify before relying on this path.
            ctx.logger().debug("Generating data summary using LIDA.", eventid="tag_lida_summarize")
            data_summary = self.lida.summarize(data_frame=df, data_config=DataConfig())
            ctx.logger().debug("LIDA Data Summary generated.", eventid="tag_lida_summary_complete")

            # The user's original question is used verbatim as the LIDA goal.
            visualization_goal = question

            ctx.logger().debug("Generating visualization code specs using LIDA.", eventid="tag_lida_generate_viz")
            viz_code_specs = self.lida.generate_viz(
                summary=data_summary,
                goal=visualization_goal,
                textgen_config=self.lida.text_gen_config,  # Reuse the manager's config
                library="altair"  # altair is common for vega-lite output
            )
            ctx.logger().debug(f"LIDA Generated {len(viz_code_specs)} Viz Code Specs.", eventid="tag_lida_viz_specs_complete")

            if not viz_code_specs:
                error_msg = "I understood you wanted a visualization, but I couldn't generate a suitable one based on the data and your request."
                ctx.logger().warning(error_msg, eventid="tag_viz_generate_failed")
                state['final_answer'] = error_msg
                state['visualization_error'] = "LIDA generate_viz failed."
                return state

            # Execute the visualization code - only the first spec is tried.
            ctx.logger().debug("Executing visualization code using LIDA.", eventid="tag_lida_execute_viz")
            charts = self.lida.execute_viz(
                code_specs=viz_code_specs[0].code,  # Assumes the first spec has a 'code' field
                data=df,
                summary=data_summary,
                library="altair"
            )
            ctx.logger().debug(f"LIDA Executed {len(charts) if charts else 0} charts.", eventid="tag_lida_execute_complete")

            if not charts:
                error_msg = "I was able to generate visualization code, but executing it failed."
                ctx.logger().error(error_msg, eventid="tag_viz_execute_failed")
                state['final_answer'] = error_msg
                state['visualization_error'] = "LIDA execute_viz failed."
                return state

            first_chart = charts[0]

            def chart_field(name):
                # Chart results may be dicts or objects exposing attributes
                # (TODO confirm which one LIDA returns). A bare `name in chart`
                # test raises on a non-iterable object, so handle both shapes.
                if isinstance(first_chart, dict):
                    return first_chart.get(name)
                return getattr(first_chart, name, None)

            spec = chart_field('spec')
            raster = chart_field('raster')

            if spec is not None:
                state['visualization_output'] = {
                    "type": "vega-lite",
                    "spec": spec  # Vega-Lite JSON specification
                }
                state['final_answer'] = "Here is the visualization you requested:"
                ctx.logger().info("Successfully generated Vega-Lite visualization.", eventid="tag_viz_success_vega")
            elif raster is not None:
                # No spec but an image is available. The image is assumed to be
                # base64 data rather than a path -- check LIDA docs/output.
                state['visualization_output'] = {
                    "type": "image",
                    "data": raster
                }
                state['final_answer'] = "Here is the visualization you requested:"
                ctx.logger().info("Successfully generated visualization image.", eventid="tag_viz_success_image")
            else:
                error_msg = "I generated a visualization, but I couldn't format it correctly."
                ctx.logger().error(error_msg, eventid="tag_viz_format_error")
                state['final_answer'] = error_msg
                state['visualization_error'] = "LIDA execution returned unexpected format."

        except ImportError as e:
            error_msg = f"Missing required library for visualization: {e}. Please install it (e.g., pip install lida pandas altair)."
            ctx.logger().error(error_msg, eventid="tag_viz_import_missing")
            state['final_answer'] = error_msg
            state['visualization_error'] = f"Import error: {e}"
        except Exception as e:
            # Boundary handler: a failed chart must not abort the whole graph run.
            ctx.logger().error(f"Error during LIDA visualization generation: {e}", eventid="tag_viz_general_error")
            state['final_answer'] = "An error occurred while trying to generate the visualization."
            state['visualization_error'] = f"LIDA generation error: {e}"

        return state

## Workflow Definition

In [None]:
def create_workflow(self):
        """
        Define the LangGraph workflow, including the visualization path.

        Graph shape:
            START -> decide_next_step -> fetch_context -> (route_request)
                "generate_visualization" -> generate_visualization -> END
                "continue_text_workflow" -> summarize -> generate_answer -> END

        Returns:
            The uncompiled ``StateGraph``; the caller is expected to call
            ``.compile()`` on it before running.
        """
        workflow = StateGraph(State)

        # Add nodes
        workflow.add_node("decide_next_step", self.decide_next_step)
        workflow.add_node("fetch_context", self.fetch_context)
        workflow.add_node("summarize", self.summarize_context)
        workflow.add_node("generate_answer", self.generate_answer)
        workflow.add_node("generate_visualization", self.generate_visualization_node)

        # Start by deciding text vs viz.
        workflow.add_edge(START, "decide_next_step")

        # Context is fetched on both paths, since both need data.
        workflow.add_edge("decide_next_step", "fetch_context")

        # After fetching data, branch on the key returned by route_request.
        # The StateGraph method is `add_conditional_edges` (plural); the
        # singular spelling does not exist and fails with AttributeError.
        workflow.add_conditional_edges(
            "fetch_context",
            self.route_request,
            {
                "generate_visualization": "generate_visualization",  # Visualization requested
                "continue_text_workflow": "summarize",  # Standard text answer
            },
        )

        # Visualization path goes directly to END after the generation attempt.
        workflow.add_edge("generate_visualization", END)

        # Text path: summarize, then answer, then END.
        workflow.add_edge("summarize", "generate_answer")
        workflow.add_edge("generate_answer", END)

        return workflow

## Usage in Jupyter Notebook:

In [None]:
# --- How to use in your Jupyter Notebook ---

# Steps 1-3: construct the builder, define the workflow, compile it.
# Environment variables and Constants must be set up correctly before the
# builder is constructed.
graph_builder = GraphBuilder()
workflow = graph_builder.create_workflow()
app = workflow.compile()


# Step 4: prepare an initial state for a run.
# 'question' and 'history' are the primary inputs; every other field starts
# blank and is populated by the graph nodes during the workflow.
def _make_initial_state(question, history):
    """Return a fresh initial state dict for a single graph run.

    A new dict (with new list objects) is built on every call so that two
    states never share mutable values.
    """
    return {
        'messages': [],  # Assuming messages are tracked here
        'history': history,
        'question': question,
        'should_split': False,
        'split_questions': [],
        'needs_history': False,
        'needs_context': False,
        'context': '',
        'raw_context_data': None,
        'summarized_context': '',
        'final_answer': '',
        'visualization_request': None,  # Set by decide_next_step
        'visualization_output': None,  # Populated by generate_visualization
        'visualization_error': None,
    }


# Example state for a text query (with an example history format):
initial_state_text = _make_initial_state(
    question='Tell me about the organizations mentioned in the data.',
    history='User: What is the purpose of the project? AI: The project is a chatbot.',
)

# Example state for a visualization query:
initial_state_viz = _make_initial_state(
    question='Show me a bar chart of organizations by category.',
    history='',
)

# To run the graph (use await in an async environment like certain parts of Jupyter or inside an async function)
# For a simple test in a synchronous notebook cell, you need to run it within an asyncio event loop.

async def run_graph_async(state):
    """Run the compiled graph on ``state`` inside a RequestContext.

    The context wraps the whole invocation and is imported for context-aware
    logging. The identifiers below are placeholders for notebook testing;
    replace them with the real user/correlation/session/client ids your
    project generates.

    Args:
        state: Initial state dict; must contain 'question' and 'history'.

    Returns:
        Whatever ``app.ainvoke(state)`` returns once the graph reaches END.
    """
    request_body = {"question": state['question'], "history": state['history']}
    request_scope = ctx(
        user_id="test_user",
        correlation_id="test_corr",
        session_id="test_session",
        client_id="test_client",
        body=request_body,
    )
    with request_scope:
        print("Running graph...")
        final_state = await app.ainvoke(state)
        print("Graph execution finished.")
        return final_state

# Example usage (run one at a time by uncommenting):

# print("Running text query...")
# text_result = await run_graph_async(initial_state_text)
# print("\n--- Text Workflow Result ---")
# print(f"Final Answer: {text_result.get('final_answer')}")
# print(f"Context: {text_result.get('context')[:200]}...")
# print(f"Summarized Context: {text_result.get('summarized_context')[:200]}...")
# print(f"Visualization Request: {text_result.get('visualization_request')}")
# print(f"Visualization Output: {text_result.get('visualization_output')}")
# print(f"Visualization Error: {text_result.get('visualization_error')}")


print("Running visualization query...")
# For visualization, ensure your MongoDB has data that DataDistributor can fetch
# and that LIDA can process (list of dicts format is crucial).
# You might need to set up a mock MongoDB or ensure connection details are correct.
viz_result = await run_graph_async(initial_state_viz)

print("\n--- Visualization Workflow Result ---")
print(f"Final Answer (Text Intro): {viz_result.get('final_answer')}")
# The actual visualization output will be in viz_result.get('visualization_output')
# This will likely be a dictionary containing the Vega-Lite spec or image data.
# You'll need to render this output in your frontend/UI.
viz_output = viz_result.get('visualization_output')
if viz_output:
    print(f"Visualization Output Type: {viz_output.get('type')}")
    if viz_output.get('type') == 'vega-lite':
        print("Vega-Lite Spec (truncated):", json.dumps(viz_output.get('spec'), indent=2)[:500], "...")
        # In a real notebook, you might display this using altair or ipyvega
        # import altair as alt
        # alt.Chart.from_dict(viz_output['spec']).display()
    elif viz_output.get('type') == 'image':
         print("Image Data (truncated):", viz_output.get('data', '')[:100], "...")
         # In a real notebook, you might display this using IPython.display.Image
         # from IPython.display import Image
         # Image(base64.b64decode(viz_output['data'])) # Requires base64 import
else:
    print("Visualization Output: None")

print(f"Visualization Request: {viz_result.get('visualization_request')}")
print(f"Raw Context Data (truncated): {str(viz_result.get('raw_context_data', ''))[:200]}...\

")
print(f"Visualization Error: {viz_result.get('visualization_error')}")
print(f"Context (for text path, likely empty here): {viz_result.get('context')}")
print(f"Summarized Context (for text path, likely empty here): {viz_result.get('summarized_context')}")


##### Notes for reintegration:
##### 1. This notebook provides the core LangGraph workflow logic.
##### 2. Integrate the `GraphBuilder` class and its methods into your project's structure (e.g., a new module or within an existing one).
##### 3. Ensure all necessary project imports (`DataDistributor`, `Constants`, `RequestContext`, `Chain`, etc.) are correctly resolved in the final location.
##### 4. The `RequestContext` usage in `run_graph_async` demonstrates how to wrap the graph execution to provide context for logging within the graph nodes. Adapt this to how your server/entry point handles incoming requests.
##### 5. The handling of `visualization_output` needs to be integrated into your frontend or API response structure to render the chart or image.
##### 6. Refine the LLM prompts and the parsing of LLM responses (especially for tool calling and decision making) based on the capabilities and expected output format of your actual LangChainAgent instances or QueryHandler methods.
##### 7. Ensure robust error handling is propagated correctly through the graph and reported using your project's logging.
##### 8. Review the data fetching logic in `fetch_context` to ensure it correctly calls `DataDistributor.vector_search` or `DataDistributor.parameter_search` with the right arguments derived from the LLM's tool call decision.
##### 9. Verify that `data_to_dataframe` correctly handles the format of data returned by your DataDistributor methods when used for visualization.
##### 10. Confirm LIDA's compatibility with your chosen LLM model and ensure the API key is securely managed.