In [1]:
# warnings
import warnings
warnings.filterwarnings("ignore")

from langgraph.graph import StateGraph, END
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from typing_extensions import TypedDict
from typing import List, Dict, Any
import re
import logging
from Tools.Logger import setup_logger
from transformers import AutoTokenizer
from huggingface_hub import login
from db_create import CargaDeArchivos

#tools
from Tools.Tool import run_sql_workflow, run_think_task, remove_think_tags

#Prompts
from Tools.Prompts import  plan_prompt, final_ans_prompt



In [2]:
# === Logger instantiation ===
# Configure logging through the project helper; "log_S.log" is presumably the
# log file path — confirm against Tools.Logger.setup_logger.
setup_logger("log_S.log")
# Module-level logger shared by the node, router and workflow functions below.
logger = logging.getLogger(__name__)

In [3]:
# === Tokenizer login ===
# SECURITY: the Hugging Face access token must never be hardcoded in the
# notebook (anyone who can read the file can use it). It is read from the
# HF_TOKEN environment variable instead. Any token that was previously
# committed in this cell should be revoked.
import os

try:
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
    else:
        # No explicit token: fall back to whatever credentials are already
        # cached on this machine. from_pretrained below will fail (and be
        # logged) if the model cannot be accessed without one.
        logger.warning("HF_TOKEN is not set; relying on cached Hugging Face credentials.")
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
except Exception as e:
    logger.error(f"An error occurred during tokenizer setup: {e}", exc_info=True)
    raise

# === Database population and connection ===
# Run the project loader and keep its connection object; `db_conn` is later
# injected into the graph state by planner_node.
try:
    db_manager = CargaDeArchivos()
    db_manager.run()
    db_conn = db_manager.conn
except Exception as e:
    logger.error(f"An error occurred during database population and connection: {e}", exc_info=True)
    raise


In [4]:
# == Orchestrator state ==
class AgentState(TypedDict):
    """State dictionary shared by the orchestrator graph nodes.

    Nodes return partial or full dictionaries using these keys.
    """
    # Original user question driving the whole run.
    question: str
    # Parsed plan: one dict per ACTIVITY block (id, type, description, reason, steps).
    plan: List[dict]
    # Zero-based index of the next plan step to execute.
    current_step: int
    # Answer of each executed step, keyed by str(step index).
    results: Dict[str, Any]
    # Raw result of each executed step, in execution order.
    query_results: List[str]
    # Database connection object handed to the SQL workflow.
    # (Was annotated `None`, which declares the type NoneType.)
    db_conn: Any
    # Tokenizer handed to the SQL workflow.
    # (Was annotated with the builtin function `any` instead of typing.Any.)
    tokenizer: Any
    # Not written by any node visible in this notebook.
    use_case: str
    # Zero-based indices of the steps whose results feed the final answer.
    ans_req: List[int]
    # Text produced by the final answer node.
    final_answer: str

## 1

In [5]:
# == Orchestrator nodes ==
def _parse_step_refs(line):
    """Extract zero-based step indices from a `"steps": [...]` entry.

    Returns None when `line` has no such entry. Every integer found inside the
    brackets is used, so both `[1, 2]` and `["ACTIVITY1", "2"]` are accepted;
    the plan numbers steps from 1, hence the `- 1`.
    """
    found = re.search(r'"steps"\s*:\s*\[([^\]]*)\]', line)
    if found is None:
        return None
    return [int(number) - 1 for number in re.findall(r'\d+', found.group(1))]


def _parse_plan(raw_plan):
    """Parse the planner's JSON-like text into (steps, final_steps).

    `steps` is a list of dicts with keys id/type/description/reason/steps, one
    per ACTIVITY block, in order of appearance. `final_steps` is the list of
    zero-based step indices read from the FINAL block, or None when no FINAL
    block with a `"steps"` entry was found.

    The text is scanned line by line with regular expressions rather than
    json.loads, so incomplete or slightly malformed output still yields the
    blocks that could be read.
    """
    header_re = re.compile(r'"?(ACTIVITY\d+)"?\s*:\s*{')
    final_re = re.compile(r'"?(FINAL)"?\s*:\s*{')
    # Value pattern tolerates escaped characters (e.g. \" inside a description).
    field_re = re.compile(r'"(type|description|reason)"\s*:\s*"((?:[^"\\]|\\.)*)"')

    steps = []
    current = None
    final_steps = None
    inside_final = False

    for raw_line in raw_plan.strip().splitlines():
        line = raw_line.strip()

        header = header_re.match(line)
        if header:
            # A new ACTIVITY block starts: store the previous one.
            if current is not None:
                steps.append(current)
            current = {
                "id": header.group(1),
                "type": "[THINK]",
                "description": "",
                "reason": "",
                "steps": [],
            }
            # Leaving FINAL (if we were in it) so this block's "steps" entry
            # cannot overwrite the final-answer requirements.
            inside_final = False
        elif final_re.match(line):
            # FINAL block starts: store the last ACTIVITY block.
            if current is not None:
                steps.append(current)
                current = None
            inside_final = True

        if current is not None:
            # Inside an ACTIVITY block: pick up any fields present on the line.
            for field in field_re.finditer(line):
                value = field.group(2).replace('\\"', '"')
                if value:  # keep the defaults when the value is empty
                    current[field.group(1)] = value
            refs = _parse_step_refs(line)
            if refs is not None:
                current["steps"] = refs
        elif inside_final:
            # Inside the FINAL block: only the required steps matter.
            refs = _parse_step_refs(line)
            if refs is not None:
                final_steps = refs

    # An ACTIVITY block that was still open when the text ended.
    if current is not None:
        steps.append(current)

    return steps, final_steps


def planner_node(state: AgentState) -> AgentState:
    """Ask the LLM for a plan and turn it into the initial graph state.

    Reads `state["question"]`, invokes the planner prompt on the qwen3:8b
    Ollama model, strips think tags, prints the raw plan, and parses it.

    Returns a state update with the parsed `plan`, `current_step` reset to 0,
    empty `results`/`query_results`, the module-level `db_conn` and
    `tokenizer`, the `question`, and `ans_req` (zero-based indices of the
    steps the final answer needs). When the plan has no usable FINAL block,
    `ans_req` defaults to every step of the plan.

    Any exception is logged and re-raised.
    """
    try:
        user_question = state["question"]

        llm = OllamaLLM(model="qwen3:8b", temperature=0.0, enable_thinking=False)
        planner = ChatPromptTemplate.from_messages([
            ("system", plan_prompt),
            ("human", "user question: {task}"),
        ]) | llm | StrOutputParser()

        raw_plan = planner.invoke({"task": user_question})
        raw_plan = remove_think_tags(raw_plan)
        print(raw_plan)

        steps, final_steps = _parse_plan(raw_plan)
        if not steps:
            logger.warning("Planner output contained no ACTIVITY blocks.")
        if final_steps is None:
            # Without a FINAL block the downstream node would receive None;
            # fall back to using every step's result.
            logger.warning("Planner output contained no FINAL steps; using all steps.")
            final_steps = list(range(len(steps)))

        return {
            "plan": steps,
            "current_step": 0,
            "results": {},
            "query_results": [],
            "db_conn": db_conn,
            "tokenizer": tokenizer,
            "question": user_question,
            "ans_req": final_steps
        }

    except Exception as e:
        logger.exception(f"Error in planner_node: {e}")
        raise


def execute_task_node(state: AgentState) -> AgentState:
    """Execute the plan step at `state["current_step"]` and record its result.

    Steps whose type contains "SQL" are sent to `run_sql_workflow` together
    with the DB connection, tokenizer and dependency context; every other step
    goes to `run_think_task` with the user question plus dependency context.
    The dependency context is built from the results of the steps listed in
    the current step's `"steps"` entry, in step order.

    Returns a state update where the answer is stored under
    `str(current_step)` in `results`, the raw result is appended to
    `query_results`, and `current_step` is advanced by one. If there is no
    step left to run (empty or exhausted plan), nothing is executed and
    `current_step` is returned unchanged so the router can move on.

    Any exception is logged and re-raised.
    """
    try:
        plan = state["plan"]
        step_index = state["current_step"]

        # Guard against an empty/exhausted plan instead of raising IndexError.
        if step_index >= len(plan):
            logger.warning(f"No plan step at index {step_index}; nothing to execute.")
            return {"current_step": step_index}

        step = plan[step_index]
        task = step["description"]
        dependencies = step["steps"]
        logger.info(f"Previous steps: {dependencies}")
        task_type = step["type"]

        # Results of the steps this one depends on, ordered by step index.
        context = "\n".join(
            f"- {state['results'][key]}"
            for key in sorted(state["results"], key=int)
            if int(key) in dependencies
        )
        logger.info(f"Context: {context}")
        question_note = "The user original question was: " + state["question"]
        print(f"\n[Task {step_index + 1}] {task}")

        if "SQL" in task_type:
            answer, raw_result = run_sql_workflow(
                task, state["db_conn"], state["tokenizer"], context
            )
        else:
            # Separate the question from the context with a newline; they used
            # to be glued together on the same line.
            think_context = f"{question_note}\n{context}" if context else question_note
            answer = run_think_task(task, think_context)
            raw_result = answer

        return {
            "plan": plan,
            # Stored under the index of the step just executed, i.e. before
            # current_step is advanced.
            "results": {**state["results"], str(step_index): answer},
            "current_step": step_index + 1,
            "query_results": state["query_results"] + [raw_result],
            "db_conn": state["db_conn"],
            "tokenizer": state["tokenizer"],
            "question": state["question"]
        }

    except Exception as e:
        logger.exception(f"Error in execute_task_node: {e}")
        raise

def final_answer_node(state: AgentState) -> AgentState:
    """Compose the final answer from the results of the required steps.

    The context is built from `state["results"]` for the step indices listed
    in `state["ans_req"]`, in step order; when `ans_req` is missing or None,
    every result is used. The context is inserted into `final_ans_prompt`,
    the qwen3:8b Ollama model is invoked with the user question, and think
    tags are stripped from its output.

    Returns the same `state` object with `final_answer` set. On any error the
    exception is logged and `final_answer` is set to a fixed error message
    instead of raising.
    """
    try:
        user_question = state["question"]
        results = state["results"]
        final_steps = state.get("ans_req")
        logger.info(f"Final answer required steps: {final_steps}")

        ordered_keys = sorted(results, key=int)
        if final_steps is None:
            # No requirement list available: use every step result rather than
            # failing on `in None`.
            selected_keys = ordered_keys
        else:
            selected_keys = [key for key in ordered_keys if int(key) in final_steps]
        final_context = "\n".join(f" - {results[key]}" for key in selected_keys)
        logger.info(f"Final context: {final_context}")

        # The formatted system text is itself parsed as a prompt template, so
        # literal braces coming from step results (JSON, SQL, ...) must be
        # doubled or they would be read as template variables.
        escaped_context = final_context.replace("{", "{{").replace("}", "}}")

        llm = OllamaLLM(model="qwen3:8b", temperature=0.0, enable_thinking=False)
        model = ChatPromptTemplate.from_messages([
            ("system", final_ans_prompt.format(context=escaped_context)),
            ("human", "user question: {task}"),
        ]) | llm | StrOutputParser()

        final_answer = model.invoke({"task": user_question})

        state["final_answer"] = remove_think_tags(final_answer)
        return state

    except Exception as e:
        logger.exception(f"Error in final_answer_node: {e}")
        state["final_answer"] = "Error generating final answer."
        return state

In [6]:
# === Orchestrator routers ===
def node_router(state: AgentState) -> str:
    """Choose the node that runs after `execute_task`.

    Returns "generate_final_answer" once `current_step` has reached the end of
    the plan, otherwise "execute_task". If the state cannot be read, the error
    is logged and the router falls back to "generate_final_answer" — the name
    under which the final node is registered in the graph (the previous
    fallback, "final_answer", is not a registered node).
    """
    try:
        plan_exhausted = state["current_step"] >= len(state["plan"])
        next_node = "generate_final_answer" if plan_exhausted else "execute_task"
    except Exception as e:
        logger.exception(f"Error in node_router: {e}")
        next_node = "generate_final_answer"
    return next_node

In [7]:
# === Orchestrator workflow ===
def build_orchestrator_workflow():
    """Wire the orchestrator nodes into a StateGraph and return it compiled.

    Flow: planner -> execute_task, then `node_router` decides after each
    execute_task run which node comes next; generate_final_answer is the
    finish point. Any exception while building is logged and re-raised.
    """
    try:
        # (name, callable) pairs, registered in this order.
        node_table = (
            ("planner", planner_node),
            ("execute_task", execute_task_node),
            ("generate_final_answer", final_answer_node),
        )

        workflow_graph = StateGraph(AgentState)
        for node_name, node_fn in node_table:
            workflow_graph.add_node(node_name, node_fn)

        workflow_graph.set_entry_point("planner")
        workflow_graph.add_edge("planner", "execute_task")
        workflow_graph.add_conditional_edges("execute_task", node_router)
        workflow_graph.set_finish_point("generate_final_answer")

        compiled = workflow_graph.compile()
        return compiled
    except Exception as e:
        logger.exception(f" Error building orchestrator workflow: {e}")
        raise

In [8]:
# Build the compiled graph and run it end to end on a sample question.
# Only "question" is supplied here; planner_node returns the other state keys
# (plan, current_step, results, query_results, db_conn, tokenizer, ans_req).
workflow = build_orchestrator_workflow()
output = workflow.invoke({"question":"Which suppliers involved in duplicate invoices also have poor delivery performance?"})

{
    "ACTIVITY1": {
        "type": "[SQL]",
        "description": "Identify all duplicate invoices by matching invoice numbers or patterns.",
        "reason": "To find which invoices are considered duplicates, we need to query the database for invoice records that match based on specific criteria.",
        "steps": []
    },
    "ACTIVITY2": {
        "type": "[THINK]",
        "description": "Analyze the results of the duplicate invoice query to extract the list of supplier IDs associated with these duplicate invoices.",
        "reason": "This step is necessary to determine which suppliers are linked to the duplicate invoices.",
        "steps": [1]
    },
    "ACTIVITY3": {
        "type": "[SQL]",
        "description": "Retrieve delivery performance data for all suppliers.",
        "reason": "To assess whether a supplier has poor delivery performance, we need to query the database for their performance metrics.",
        "steps": []
    },
    "ACTIVITY4": {
        " "type"

In [9]:
# Per-step answers, keyed by the zero-based step index stored as a string.
output["results"]

{'0': 'The SQL query results indicate that there are 86 invoices with the "Exact Match" pattern, which is defined as the same as a duplicate invoice. However, the results do not explicitly provide invoice numbers or further details to identify specific duplicates. Therefore, based on the query results, we can confirm that there are 86 duplicate invoices identified by the "Exact Match" pattern.',
 '1': 'The SQL query results confirm the existence of 86 duplicate invoices identified by the "Exact Match" pattern, but they do not provide specific invoice numbers or details to identify individual duplicates. As a result, it is not possible to extract a list of supplier IDs associated with these duplicate invoices from the provided query results. Further analysis or additional data fields (such as supplier ID linkage to invoices) would be required to achieve this task.',
 '2': 'The SQL query results provide delivery performance data for all suppliers. There are 39 rows of data, with each row

In [10]:
# Final answer shown as a raw string (line breaks appear as \n).
output["final_answer"]

'The provided data does not include a direct list of suppliers involved in duplicate invoices, as the SQL query results only confirm the existence of 86 duplicate invoices without linking them to specific supplier IDs. Therefore, it is not possible to directly compare the list of suppliers involved in duplicate invoices with the list of suppliers with poor delivery performance.\n\n**Recommendation:** To perform this comparison, additional data fields such as supplier ID linkage to invoices would be required. Once available, the duplicate invoice suppliers could be cross-referenced with the poor delivery performance list to identify any overlaps.'

In [11]:
# Same final answer, printed so that its line breaks are rendered.
print(output["final_answer"])

The provided data does not include a direct list of suppliers involved in duplicate invoices, as the SQL query results only confirm the existence of 86 duplicate invoices without linking them to specific supplier IDs. Therefore, it is not possible to directly compare the list of suppliers involved in duplicate invoices with the list of suppliers with poor delivery performance.

**Recommendation:** To perform this comparison, additional data fields such as supplier ID linkage to invoices would be required. Once available, the duplicate invoice suppliers could be cross-referenced with the poor delivery performance list to identify any overlaps.


In [12]:
# Parsed plan: one dict per ACTIVITY block; "steps" holds zero-based indices
# (the planner's 1-based references minus one).
output["plan"]

[{'id': 'ACTIVITY1',
  'type': '[SQL]',
  'description': 'Identify all duplicate invoices by matching invoice numbers or patterns.',
  'reason': 'To find which invoices are considered duplicates, we need to query the database for invoice records that match based on specific criteria.',
  'steps': []},
 {'id': 'ACTIVITY2',
  'type': '[THINK]',
  'description': 'Analyze the results of the duplicate invoice query to extract the list of supplier IDs associated with these duplicate invoices.',
  'reason': 'This step is necessary to determine which suppliers are linked to the duplicate invoices.',
  'steps': [0]},
 {'id': 'ACTIVITY3',
  'type': '[SQL]',
  'description': 'Retrieve delivery performance data for all suppliers.',
  'reason': 'To assess whether a supplier has poor delivery performance, we need to query the database for their performance metrics.',
  'steps': []},
 {'id': 'ACTIVITY4',
  'type': '[THINK]',
  'description': 'Analyze the delivery performance data to identify supplier

In [13]:
# Zero-based indices of the steps whose results feed the final answer.
output["ans_req"]

[4]