In [None]:
import json
from pathlib import Path

# Input data (knowledge.jsonl) is read from RESOURCE_DIR; generated YAML flows
# are written to YAML_DIR.
RESOURCE_DIR = Path("../resources")
YAML_DIR = RESOURCE_DIR/"yaml_flow"
# parents=True: do not raise FileNotFoundError when ../resources does not exist yet.
YAML_DIR.mkdir(parents=True, exist_ok=True)

Knowledge data comes from FlowBench https://github.com/Justherozen/FlowBench

In [None]:
# knowledge.jsonl holds one JSON object per line. Iterate the file line by line
# and skip blank / whitespace-only lines, which would otherwise make json.loads fail.
# The encoding is explicit so the result does not depend on the platform default.
with open(RESOURCE_DIR / "knowledge.jsonl", "r", encoding="utf-8") as f:
    knowledge_data = [
        json.loads(json_data)
        for json_data in f
        if json_data.strip()
    ]

In [None]:
# print(knowledge_data[0]["contents"]["flowchart"])

In [None]:
# k_content = 0
# k_flowchart = 0
# k_workflow_pipeline = 0
# k_api_tool = 0
# for data in knowledge_data:
#     if 'contents' in data:
#         k_content+=1
#         if "flowchart" in data["contents"]:
#             k_flowchart+=1
#             text = data["contents"]["flowchart"]
#             if "The workflow pipeline" in text:
#                 k_workflow_pipeline+=1
#             if "The API tool information" in text:
#                 k_api_tool+=1
#             else:
#                 print(text)
#                 break
# print(k_content/len(knowledge_data))
# print(k_flowchart/len(knowledge_data))
# print(k_workflow_pipeline/len(knowledge_data))
# print(k_api_tool/len(knowledge_data))

The data is simple to parse: we only need to extract the part that holds the Mermaid flowchart and the tool information.

We split the text on triple line breaks (`\n\n\n`), then keep the single block that contains "The workflow pipeline" (expected at the beginning of that block).

In [None]:
def parse_data(knowledge_snippet: dict[str, dict[str, str]])->str:
    """Extract the workflow block from one knowledge entry.

    The text stored under ``knowledge_snippet["contents"]["flowchart"]`` is split
    on triple newlines, and the single chunk containing the marker
    "The workflow pipeline" is returned with surrounding whitespace removed.

    Args:
        knowledge_snippet: One parsed entry; must provide
            ``["contents"]["flowchart"]`` as a string.

    Returns:
        The stripped text of the chunk that contains the marker.

    Raises:
        KeyError: If ``"contents"`` or ``"flowchart"`` is missing.
        ValueError: If zero or more than one chunk contains the marker.
    """
    text = knowledge_snippet["contents"]["flowchart"]
    blocks = [
        elem.strip()
        for elem in text.split("\n\n\n")
        if "The workflow pipeline" in elem
    ]
    if len(blocks) != 1:
        # ValueError is still an Exception, so existing broad handlers keep working,
        # but the message now says what actually went wrong.
        raise ValueError(
            "Parsing error: expected exactly one block containing "
            f"'The workflow pipeline', found {len(blocks)}"
        )
    return blocks[0]

Now we transform the raw text into YAML (we have chosen YAML as our base format, as Ansible does, for example).

Also, in the context of LLMs, YAML is more token-efficient than JSON:
https://medium.com/better-programming/yaml-vs-json-which-is-more-efficient-for-language-models-5bc11dd0f6df

That said, we are not going to feed the YAML directly to the LLM (except when testing against our method).

In [None]:
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os

# load_dotenv was imported but never called, so a GEMINI_API_KEY stored in a
# .env file was never loaded into the environment. Load it before reading the key.
load_dotenv()
api_key = os.environ.get("GEMINI_API_KEY")

client = genai.Client(api_key=api_key)
# Model used for every YAML conversion request in this notebook.
model_name="gemini-2.5-flash"
def transform_to_yaml(block_text:str, client:genai.Client)->str:
    """Ask the generative model to convert a raw workflow block into YAML.

    Builds a fixed instruction prompt (schema description plus one worked
    example), appends ``block_text`` as the input, and sends it with
    ``client.models.generate_content`` using the module-level ``model_name``.

    Args:
        block_text: Raw workflow/API text to convert.
        client: Client used to issue the ``generate_content`` request.

    Returns:
        The text of the model response, unmodified (it may still contain
        Markdown code fences; the caller strips them).

    Raises:
        ValueError: If the response carries no text, so callers never
            receive ``None`` or an empty string in place of YAML.
    """
    # NOTE(review): the ```yaml fence opened in the schema section of the prompt
    # is never closed before the example starts; the prompt text is kept as-is
    # here to avoid changing model behaviour — confirm whether that is intended.
    prompt = """
    You are an AI specialized in converting a conversational workflow and API documentation into a structured YAML format. This YAML is for configuring an AI agent's behavior. The conversion must be precise.

**Schema & Example:**

The YAML should follow this exact structure, using indentation, key names, and data types as shown below.

```yaml
flow:
  - id: <node_id>
    prompt: <"agent_prompt_text">
    actions:
      - condition: <"condition_string_to_check_tool_output_or_user_input">
        next_step: <target_node_id>
      - condition: <"another_condition">
        next_step: <another_target_node_id>
  
  - id: <node_id_with_tool_call>
    tool_call:
      name: <api_name>
      description: <"api_description">
      parameters:
        <parameter_name_1>:
          type: <parameter_type>
          description: <"parameter_description">
          required: <true/false>
        <parameter_name_2>:
          ...
    response: <"agent_response_using_{variables}">
    next_step: <target_node_id>

  - id: <node_id_for_end_or_linear_flow>
    prompt: <"final_prompt_or_intermediate_prompt">
    next_step: <target_node_id_or_null>
    
    **Example 1: Simple Weather Check**
**Input:**
Workflow:
Start --> Ask for city --> Call weather_api --> Show weather --> End
API: weather_api
Description: Get current weather for a city.
Input: {"city": {"type": "string"}}
Output: {"temperature": {"type": "string"}, "condition": {"type": "string"}}

**Output:**
flow:
  - id: ask_city
    prompt: "What city's weather would you like to know?"
    next_step: get_weather
  
  - id: get_weather
    tool_call:
      name: weather_api
      description: "Get current weather for a city."
      parameters:
        city:
          type: string
          description: "Name of the city."
          required: true
      output:
        temperature:
          type: string
          description: temperature of the city
        condition:
          type: string
          description: weather condition of the city
    response: "The weather in {city} is {temperature} with a condition of {condition}."
    next_step: end
  
  - id: end
    prompt: "Is there anything else I can help with?"
    next_step: null
    
    **Task**:
Now, convert the following text into the same YAML format, ensuring all rules of the schema are followed and Jinja2 templating is used where appropriate."""
    prompt+=f"\n\n **Input:**\n {block_text} \n\n **Output:** \n"
    response = client.models.generate_content(
                model=model_name,  # Use the appropriate model
                contents=prompt.strip(),
            )
    yaml_text = response.text
    # response.text can be None (no text parts in the response); fail loudly here
    # instead of letting the caller crash later on `.replace`.
    if not yaml_text:
        raise ValueError("The model returned no text for this workflow block")
    return yaml_text

In [None]:
import yaml  # hoisted: was re-executed inside the loop body on every iteration

# Convert every knowledge entry to YAML, one file per entry index.
for k, k_data in enumerate(knowledge_data):
    file_path = YAML_DIR/f"{k}.yaml"
    # Existing files act as a cache so a re-run does not query the model again.
    if file_path.exists():
        continue
    yaml_text = transform_to_yaml(parse_data(k_data),client)
    # Drop the Markdown code fences the model may wrap around its answer.
    yaml_text=yaml_text.replace("```yaml","").replace("```","").strip()
    # Parsed only as a sanity check: safe_load raises before anything is written
    # if the model output is not valid YAML.
    test_yaml = yaml.safe_load(yaml_text)
    # Explicit encoding: model output may contain non-ASCII characters.
    with open(file_path,"w", encoding="utf-8") as f:
        f.write(yaml_text)

TODO:
1. Ensure the graph is working (no dead node)
2. Make the 43 graphs into YAML

In [None]:
def validate_graph_no_dead_nodes(yaml_data: dict) -> tuple[bool, list[str]]:
    """
    Validate that a flow graph has no dead nodes (nodes without parents).

    A node counts as referenced when another node points to it through a
    top-level ``next_step`` or through the ``next_step`` of one of its
    ``actions``. A node pointing at itself does not count as its own parent.
    At most one unreferenced node is tolerated: it is taken to be the start node.

    Args:
        yaml_data: Parsed YAML data containing flow structure

    Returns:
        tuple[bool, list[str]]: (is_valid, list_of_dead_nodes)
            - ``(True, [])`` when zero or one node is unreferenced.
            - ``(False, nodes)`` when several nodes are unreferenced; ``nodes``
              lists all of them (the real start node included) in flow order.
            - ``(False, [message])`` when the input has no usable ``flow`` list.
    """
    # An empty YAML document parses to None and a malformed one may parse to a
    # list or scalar; none of these can carry a 'flow' key.
    if not isinstance(yaml_data, dict) or 'flow' not in yaml_data:
        return False, ["No 'flow' key found in YAML data"]

    flow = yaml_data['flow']
    if not isinstance(flow, list):
        return False, ["Flow must be a list of nodes"]

    node_ids = []  # kept in flow order so the reported list is deterministic
    edges = []     # (source_id, target_id) pairs

    for node in flow:
        # Entries that are not mappings or have no id cannot be part of the graph
        if not isinstance(node, dict) or 'id' not in node:
            continue

        node_id = node['id']
        node_ids.append(node_id)

        # Direct transition
        if node.get('next_step') is not None:
            edges.append((node_id, node['next_step']))

        # Conditional transitions; tolerate `actions: null` and malformed entries
        actions = node.get('actions')
        if isinstance(actions, list):
            for action in actions:
                if isinstance(action, dict) and action.get('next_step') is not None:
                    edges.append((node_id, action['next_step']))

    known_ids = set(node_ids)
    # Only references to existing nodes count, and a self-loop is not a parent:
    # otherwise an orphan that loops on itself would look reachable.
    referenced = {
        target for source, target in edges
        if target in known_ids and target != source
    }

    # dict.fromkeys de-duplicates repeated ids while preserving flow order
    unreferenced = [nid for nid in dict.fromkeys(node_ids) if nid not in referenced]

    # Zero or one unreferenced node: the single one (if any) is the start node
    if len(unreferenced) <= 1:
        return True, []

    # Several unreferenced nodes: only one can be the start, the rest are dead
    return False, unreferenced

In [None]:
# Test the validation function on existing YAML files
import os
import yaml

def test_yaml_files(yaml_dir="../resources/yaml_flow/"):
    """Validate every ``.yaml`` file found directly in ``yaml_dir``.

    Each file is parsed with ``yaml.safe_load`` and passed to
    ``validate_graph_no_dead_nodes``. Any exception raised while reading,
    parsing or validating a file is recorded for that file rather than raised.

    Args:
        yaml_dir: Directory to scan. Defaults to the notebook's
            resources/yaml_flow directory, so existing no-argument calls
            behave as before.

    Returns:
        dict: filename -> ``{'valid': bool, 'dead_nodes': list}`` on success,
        or ``{'valid': False, 'error': str}`` when the file could not be processed.
    """
    results = {}

    # sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
    # report stable between runs.
    for filename in sorted(os.listdir(yaml_dir)):
        if not filename.endswith('.yaml'):
            continue

        filepath = os.path.join(yaml_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                yaml_data = yaml.safe_load(f)

            is_valid, dead_nodes = validate_graph_no_dead_nodes(yaml_data)
            results[filename] = {
                'valid': is_valid,
                'dead_nodes': dead_nodes
            }

        except Exception as e:
            # Deliberately broad: one unreadable or malformed file must not stop
            # the report for the remaining files.
            results[filename] = {
                'valid': False,
                'error': str(e)
            }

    return results

# Run the test
# Validate every generated flow, then print one status line per file
# followed by the totals.
validation_results = test_yaml_files()

print("=== YAML Flow Validation Results ===\n")

valid_count = 0
invalid_count = 0

for filename, result in validation_results.items():
    # Valid files: count and report, then move on to the next file
    if result.get('valid', False):
        valid_count += 1
        print(f"✅ {filename}: VALID")
        continue

    # Everything else is invalid: either processing failed or dead nodes were found
    invalid_count += 1
    print(
        f"❌ {filename}: ERROR - {result['error']}"
        if 'error' in result
        else f"⚠️  {filename}: INVALID - Dead nodes: {result['dead_nodes']}"
    )

print("\n=== Summary ===")
print(f"Valid files: {valid_count}")
print(f"Invalid files: {invalid_count}")
print(f"Total files: {valid_count + invalid_count}")