In [1]:
import json5
from qwen_agent.agents import Assistant
from qwen_agent.tools.base import BaseTool, register_tool
from qwen_agent.utils.output_beautify import typewriter_print
import re
from sql import run_sql_workflow

In [2]:
def remove_think_blocks(text: str) -> str:
    """Return *text* with every ``<think>...</think>`` span removed.

    Matching is non-greedy and spans newlines, so several separate blocks
    are each removed individually. Leading and trailing whitespace of the
    remaining text is stripped.
    """
    think_span = re.compile(r"<think>.*?</think>", re.DOTALL)
    without_thoughts = think_span.sub("", text)
    return without_thoughts.strip()

In [None]:
# Step 1: Register custom tools that return the schema of each SQL table (`cases`, `activities`, `variants`).
@register_tool('get_cases_schema')
class GetCasesSchema(BaseTool):
    """Tool that hands the agent a textual description of the `cases` table."""

    description = 'Returns the schema and usage notes for the `cases` SQL table.'
    parameters = []

    def call(self, params: str, **kwargs) -> str:
        """Return a JSON string whose single `schema` key holds the table notes.

        The tool takes no arguments; a non-empty `params` payload is still
        parsed so that malformed input raises just as it does for other tools.
        """
        if params:
            json5.loads(params)

        payload = {
            'schema': """
=== CASES TABLE ===

You are a SQL assistant with access to the `cases` table. Below is the structure and brief description of each column.

Structure:
    id                     VARCHAR       -- Unique identifier for each case
    order_date             TIMESTAMP_NS  -- Date the order was placed
    employee_id            VARCHAR       -- ID of the employee handling the case
    branch                 VARCHAR       -- Branch responsible for the case
    supplier               VARCHAR       -- Supplier associated with the case
    avg_time               DOUBLE        -- Average time to complete the case
    estimated_delivery     TIMESTAMP_NS  -- Estimated delivery date
    delivery               TIMESTAMP_NS  -- Actual delivery date
    on_time                BOOLEAN       -- Whether the delivery late, False means late delivery
    in_full                BOOLEAN       -- Whether the delivery was complete
    number_of_items        INTEGER       -- Total items in the case
    ft_items               INTEGER       -- Fast-track items
    total_price            DOUBLE        -- Total price of the case
    total_activities       INTEGER       -- Total activities in the process
    rework_activities      INTEGER       -- Count of rework activities
    automatic_activities   INTEGER       -- Count of automated activities

Instructions:
- Use standard SQL (SELECT, WHERE, GROUP BY, etc.).
- Use `order_date` for filtering by time.
- Use aggregations (COUNT, SUM, AVG) for metrics.
"""
        }
        return json5.dumps(payload, ensure_ascii=False)

@register_tool('get_activities_schema')
class GetActivitiesSchema(BaseTool):
    """Tool that hands the agent a textual description of the `activities` table."""

    description = 'Returns the schema and usage notes for the `activities` SQL table.'
    parameters = []

    def call(self, params: str, **kwargs) -> str:
        """Return a JSON string whose single `schema` key holds the table notes.

        The tool takes no arguments; a non-empty `params` payload is still
        parsed so that malformed input raises just as it does for other tools.
        """
        if params:
            json5.loads(params)

        payload = {
            'schema': """
=== ACTIVITIES TABLE ===

You are a SQL assistant with access to the `activities` table. Below is the structure and brief description of each column.

Structure:
    id                       INTEGER     -- Unique ID for each activity
    timestamp                TIMESTAMP   -- Time the activity occurred
    name                     VARCHAR     -- Name/type of activity
    tpt                      DOUBLE      -- Time per task
    user                     VARCHAR     -- User who performed the activity
    user_type                VARCHAR     -- Type of user (e.g., Human, Bot)
    automatic                BOOLEAN     -- Whether the activity was automated
    rework                   BOOLEAN     -- Whether the activity was a rework
    case_index               INTEGER     -- Index of the activity in the case
    case_id                  VARCHAR     -- ID of the related case

Case metadata (prefixed with `case_`):
    case_order_date          TIMESTAMP   -- Order date of the case
    case_employee_id         VARCHAR     -- Employee responsible
    case_branch              VARCHAR     -- Responsible branch
    case_supplier            VARCHAR     -- Supplier involved
    case_avg_time            DOUBLE      -- Average processing time
    case_estimated_delivery  TIMESTAMP   -- Expected delivery date
    case_delivery            TIMESTAMP   -- Actual delivery date
    case_on_time             BOOLEAN     -- Whether the case was on time
    case_in_full             BOOLEAN     -- Whether the delivery was complete
    case_number_of_items     INTEGER     -- Number of items in the case
    case_ft_items            INTEGER     -- Fast-track items
    case_total_price         DOUBLE      -- Total case price

Instructions:
- Use standard SQL syntax (WHERE, GROUP BY, etc.).
- Use `automatic` and `rework` to analyze activities' automation and rework status.
- Use `timestamp`, `name`, or `user_type` for filtering or grouping activities.
- You may aggregate case-related columns, but avoid referencing other tables.
"""
        }
        return json5.dumps(payload, ensure_ascii=False)

@register_tool('get_variants_schema')
class GetVariantsSchema(BaseTool):
    """Tool that hands the agent a textual description of the `variants` table."""

    description = 'Returns the schema and usage notes for the `variants` SQL table.'
    parameters = []

    def call(self, params: str, **kwargs) -> str:
        """Return a JSON string whose single `schema` key holds the table notes.

        The tool takes no arguments; a non-empty `params` payload is still
        parsed so that malformed input raises just as it does for other tools.
        """
        if params:
            json5.loads(params)

        payload = {
            'schema': """
=== VARIANTS TABLE ===

You are a SQL assistant with access to the `variants` table. Below is the structure and brief description of each column.

Structure:
    id              BIGINT       -- Unique ID for each variant
    activities      VARCHAR[]    -- Ordered list of activity names in this variant
    cases           VARCHAR[]    -- Array of case IDs following this variant
    number_cases    BIGINT       -- Number of cases following this variant
    percentage      DOUBLE       -- Share of total cases for this variant
    avg_time        DOUBLE       -- Average processing time for this variant

Instructions:
- Each row represents a unique process path ("variant") followed by one or more cases.
- Use `number_cases`, `percentage`, or `avg_time` to rank, filter, or compare variants.
- Use array functions (e.g., `ANY`, `UNNEST`, `array_length`) to inspect activities or case IDs.
- Standard SQL syntax is allowed (WHERE, ORDER BY, LIMIT, etc.).
- Deviations are variants that differ from the most common one (highest `number_cases`).
- Do not reference other tables.
"""
        }
        return json5.dumps(payload, ensure_ascii=False)


# Shared LLM settings passed as `llm=` to every Assistant created in this notebook.
llm_cfg = dict(
    model='Qwen3:8b',
    model_server='http://localhost:11434/v1',
    generate_cfg=dict(top_p=0.8),
)


In [4]:

# System message for the prompt-generation agent: it looks up the relevant
# table schemas and rewrites the user's request as a schema-aware prompt.
prompt_instruction = '''/no_think
After receiving the user's request, you should:
1. Identify the relevant SQL tables based on the user's query.
2. Retrieve the schema for those tables by calling the relevant schema-fetching tools (e.g., `get_cases_schema`, `get_activities_schema`, `get_variants_schema`).
3. Analyze the user's query and use the schema information to generate a prompt.
4. Provide a brief instruction about how to query the relevant tables based on the schema.
5. Return the table schemas where the query should be executed as well as its relevant columns with datatypes and descriptions. Do not Include any additional information.

The goal is to make the user request more specific by formulating a SQL query and instructions based on the relevant schemas of the tables.
'''

# The three schema fetchers registered above, plus the `code_interpreter` tool.
tools = [
    'get_cases_schema',
    'get_activities_schema',
    'get_variants_schema',
    'code_interpreter',
]
# files = ['./doc.pdf']  # You can provide a PDF file if necessary
prompt_agent = Assistant(
    function_list=tools,
    system_message=prompt_instruction,
    llm=llm_cfg,
    # files=files
)
""" 
messages= []
query = input('\nuser query: ')
# Append the user query to the chat history.
messages.append({'role': 'user', 'content': query})
response = []
response_plain_text = ''

for response in prompt_agent.run(messages=messages):
        response_plain_text = typewriter_print(response, response_plain_text)
response_plain_text = remove_think_blocks(response_plain_text)
messages.append({'role': 'assistant', 'content': response_plain_text})
"""

" \nmessages= []\nquery = input('\nuser query: ')\n# Append the user query to the chat history.\nmessages.append({'role': 'user', 'content': query})\nresponse = []\nresponse_plain_text = ''\n\nfor response in prompt_agent.run(messages=messages):\n        response_plain_text = typewriter_print(response, response_plain_text)\nresponse_plain_text = remove_think_blocks(response_plain_text)\nmessages.append({'role': 'assistant', 'content': response_plain_text})\n"

In [5]:
@register_tool('execute_sql_with_prompt')
class ExecuteSQLWithPrompt(BaseTool):
    """Tool that passes a question and an SQL prompt on to `run_sql_workflow`."""

    description = 'Generates and executes a SQL query using a provided prompt and original user question. The prompt should describe what SQL to run.'

    parameters = [
        {
            'name': 'question',
            'type': 'string',
            'description': 'The original user question for context.',
            'required': True
        },
        {
            'name': 'prompt',
            'type': 'string',
            'description': 'The SQL prompt provided by another agent. It should describe what SQL to generate and run.',
            'required': True
        }
    ]

    def call(self, params: str, **kwargs) -> str:
        """Prefix the prompt with fixed DuckDB rules and run the SQL workflow.

        `params` is a JSON/JSON5 object carrying `question` and `prompt`;
        a missing key raises `KeyError`. Whatever `run_sql_workflow`
        returns is passed back unchanged.
        """
        parsed = json5.loads(params)
        question, prompt = parsed['question'], parsed['prompt']

        # Fixed preamble placed in front of every agent-supplied prompt.
        sql_rules = """
        You are an SQL assistant specialized in DuckDB. Your task is to generate accurate SQL queries based on natural language questions/tasks, following the schema and rules below.

        ### MAIN RULES:
        - Generate only one SQL Query.
        - The result must be executable as it is, so do not include any instructions, just the SQL code.
        - Only use the provided schemas to generate the SQL query, and do not reference any other tables or schemas.
        - You can perform JOINs between the tables, but you should not reference any other tables or schemas.
        - If the query is already given in this prompt you should just return it as it is.
        """

        return run_sql_workflow(question, sql_rules + prompt)

In [6]:
# System message for the SQL agent: call `execute_sql_with_prompt` and relay
# its result without commentary.
sql_instruction = '''/no_think
You will receive a query and a prompt for SQL generation. Your need to::

1. Use the tool `execute_sql_with_prompt` which will generate and execute the SQL query based on the provided prompt and question.
2. Return the result of the SQL query execution as it is, without any additional instructions or comments.
'''

# The SQL agent gets exactly one tool: the SQL execution wrapper.
tools2 = ['execute_sql_with_prompt']

sql_bot = Assistant(
    function_list=tools2,
    system_message=sql_instruction,
    llm=llm_cfg,
    # files=files
)


In [7]:
@register_tool('handoff_to_prompt_generator')
class HandoffToPromptAgent(BaseTool):
    """Tool that delegates a single sub-task to `prompt_agent`."""

    description = 'Generates a prompt for the sub task that needs to be answered with a SQL query.'

    parameters = [
        {
            'name': 'task',
            'type': 'string',
            'description': 'The individual task that needs to be answered with a SQL query, no composed questions.',
            'required': True
        }
    ]

    def call(self, params: str, **kwargs) -> str:
        """Run `prompt_agent` on the task and return its final message text.

        Args:
            params: JSON/JSON5 object with a `task` string.

        Returns:
            The content of the last message of the last non-empty batch
            yielded by `prompt_agent.run`, with `<think>` blocks removed.
            An empty string if the agent yields nothing.

        Raises:
            KeyError: if `task` is missing from `params`.
        """
        args = json5.loads(params)
        task = args['task']
        sup_message = {'role': 'user', 'content': task}

        # Start from an empty answer so that a run which yields nothing (or
        # only empty batches) returns '' instead of raising
        # UnboundLocalError / IndexError.
        response_plain_text = ''
        for response in prompt_agent.run(messages=[sup_message]):
            if response:
                # Each iteration overwrites the previous value; only the
                # last non-empty batch survives the loop.
                response_plain_text = response[-1]["content"]
        return remove_think_blocks(response_plain_text)

@register_tool('handoff_to_sql_generator')
class HandoffToSQLAgent(BaseTool):
    """Tool that delegates a task and its generated prompt to `sql_bot`."""

    description = 'Generates and executes SQL queries for the given task based on the prompt.'

    parameters = [
        {
            'name': 'task',
            'type': 'string',
            'description': 'The individual task that needs to be answered with a SQL query, no composed questions, It needs to include the relevant context.',
            'required': True
        },
        {
            'name': 'prompt',
            'type': 'string',
            'description': 'The full exact prompt provided by a previous tool call `handoff_to_prompt_generator`.',
            'required': True
        }
    ]

    def call(self, params: str, **kwargs) -> str:
        """Run `sql_bot` on the task plus prompt and return its final message text.

        Args:
            params: JSON/JSON5 object with `task` and `prompt` strings.

        Returns:
            The content of the last message of the last non-empty batch
            yielded by `sql_bot.run`, with `<think>` blocks removed.
            An empty string if the agent yields nothing.

        Raises:
            KeyError: if `task` or `prompt` is missing from `params`.
        """
        args = json5.loads(params)
        task = args['task']
        prompt = args['prompt']

        # Separate the task from the prompt with a blank line; plain
        # concatenation glued the last word of the task to the first word
        # of the prompt.
        sup_message = {'role': 'user', 'content': f"{task}\n\n{prompt}"}

        # Start from an empty answer so that a run which yields nothing (or
        # only empty batches) returns '' instead of raising
        # UnboundLocalError / IndexError.
        response_plain_text = ''
        for response in sql_bot.run(messages=[sup_message]):
            if response:
                # Each iteration overwrites the previous value; only the
                # last non-empty batch survives the loop.
                response_plain_text = response[-1]["content"]
        return remove_think_blocks(response_plain_text)


In [8]:
# System message for the supervisor agent, which splits the user's request
# into sub-tasks and routes each through the two handoff tools.
supervisor_instruction = '''
/no_think
You are the supervisor of the interaction between the user and two specialized agents.

Your goal is to answer the user's question, even if it requires multiple steps or SQL queries.

When you receive a user query:
1. Analyze whether it can be answered directly or if it needs to be broken down into multiple steps.
2. If multiple steps are required:
   - Break the query into clear subtasks.
   - For each subtask:
     a. Call the `handoff_to_prompt_generator` tool to generate a prompt.
     b. Then call the `handoff_to_sql_generator` tool generate and execute an SQL query based on the previous prompt.
3. If a single step is needed:
   - Do the same (generate prompt → generate and execute SQL).
4. Optionally analyze or summarize the results.
5. Combine the results from all subtasks and generate a final answer for the user.

Always return a final concise and insightful summary based on the results.

Execution Example:

- If the user asks "Identify late deliveries and the most common variant of the process," you would:
   1. Identify two subtasks: "Find late deliveries" and "Identify the most common variant."
   2. For each subtask, call the `handoff_to_prompt_generator` and `handoff_to_sql_generator` tools.
   3. Combine the results and return a final answer.

'''

# The supervisor only sees the two handoff tools; the schema and SQL tools
# are used by the agents those handoff tools call.
supervisor = Assistant(
    function_list=[
        'handoff_to_prompt_generator',
        'handoff_to_sql_generator',
    ],
    system_message=supervisor_instruction,
    llm=llm_cfg,
)

In [9]:
# Start the chat history with the user's question.
query = 'Identify late deliveries and the most common variant.'
messages = [{'role': 'user', 'content': query}]

# Feed each streamed response to typewriter_print together with the text
# it returned on the previous iteration.
response_plain_text = ''
for response in supervisor.run(messages=messages):
    response_plain_text = typewriter_print(response, response_plain_text)

# Store the assistant turn without its <think> blocks.
response_plain_text = remove_think_blocks(response_plain_text)
messages.append({
    'role': 'assistant',
    'content': response_plain_text,
})

<think>

</think>


[TOOL_CALL] handoff_to_prompt_generator
{"task": "Identify late deliveries"}
[TOOL_CALL] handoff_to_prompt_generator
{"task": "Identify the most common variant"}
[TOOL_RESPONSE] handoff_to_prompt_generator
To identify late deliveries, we can focus on the `cases` table, which contains a column `on_time` that indicates whether a delivery was on time or late. A value of `False` in the `on_time` column means the delivery was late.

### Instructions for Querying:
- Filter the `cases` table where `on_time = False` to identify late deliveries.
- You can further enrich the query by including additional details such as the actual delivery date (`delivery`), estimated delivery date (`estimated_delivery`), and the order date (`order_date`).

### Relevant Columns in the `cases` Table:
| Column Name             | Data Type         | Description                                      |
|-------------------------|-------------------|--------------------------------------------------

KeyboardInterrupt: 