In [10]:
import yaml
from pprint import pprint
from collections import defaultdict

# Define the path to your YAML file
yaml_file_path = 'pipeline_flow.yml'

# Function to extract DAG from a section of the pipeline
def extract_dag_for_section(section_data):
    dag = defaultdict(list)  # Adjacency list representation of the DAG
    
    # Maps outputs to step names to identify dependencies
    output_to_step = {}
    
    for step in section_data:
        step_name = step.get('step')
        inputs = step.get('inputs', [])
        outputs = step.get('outputs', [])
        
        # Assume dependencies based on matching inputs to previously mapped outputs
        for input_item in inputs:
            for input_key, input_value in input_item.items():
                if input_key in output_to_step:
                    # Add dependency from output provider to current step
                    dag[output_to_step[input_key]].append(step_name)
        
        # Map current step's outputs for future dependency checks
        for output_item in outputs:
            for output_key, output_value in output_item.items():
                output_to_step[output_key] = step_name
                
    return dag

# Open the YAML file and load its contents
with open(yaml_file_path, 'r') as file:
    config = yaml.safe_load(file)

# Assuming the config has top-level keys corresponding to sections
for section_name, section_data in config.items():
    print(f"Processing section: {section_name}")
    dag = extract_dag_for_section(section_data)
    pprint(dict(dag))
    print("\n")


Processing section: series_extraction
{'ExcelCleaner.clean_excel': ['TableFinder.find_tables',
                              'SeriesExtractor.extract_series'],
 'ExcelCompatibilityChecker.check_file': ['SeriesExtractor.extract_series'],
 'ExcelLoader.load_file': ['ExcelCompatibilityChecker.check_file',
                           'ExcelValidator.validate_excel',
                           'ExcelCleaner.clean_excel'],
 'ExcelValidator.validate_excel': ['ExcelCleaner.clean_excel'],
 'SeriesExtractor.extract_series': ['SeriesIterator.iterate_series'],
 'TableFinder.find_tables': ['SeriesExtractor.extract_series']}


Processing section: ast_building
{'FormulaParser.parse_formula': ['SeriesImplementer.implement_series',
                                 'SeriesImplementer.implement_series']}


Processing section: ast_transformation
{'FormulaGenerator.get_ast_generator': ['FunctionReplacer.replace_functions'],
 'FunctionReplacer.replace_functions': ['ASTGeneratorCollector.get_collection']}


P

In [11]:
config

{'series_extraction': [{'step': 'ExcelLoader.load_file',
   'inputs': [{'excel_raw_file_path': {'type': 'str',
      'description': 'Path to raw Excel'}},
    {'excel_reduced_file_path': {'type': 'str',
      'description': 'Path to Excel with only two rows per table'}}],
   'outputs': [{'excel_raw': {'type': 'ExcelFile',
      'description': 'Loaded raw Excel'}},
    {'excel_reduced': {'type': 'ExcelFile',
      'description': 'Loaded raw Excel file'}}]},
  {'step': 'ExcelCompatibilityChecker.check_file',
   'inputs': [{'excel_raw': {'type': 'ExcelFile',
      'description': 'Loaded raw Excel'}},
    {'extracted_tables': {'type': 'dict[Worksheet, list[Table]]',
      'description': 'Located tables inside an Excel with table metadata'}}],
   'outputs': [{'is_compatible': {'type': 'bool',
      'description': 'True if the files are compatible'}}],
   'decision_point': 'is_compatible',
   'actions': [{'result': True, 'next_step': 'SeriesExtractor.extract_series'},
    {'result': False, '

In [12]:
from graphviz import Digraph

# Function to create visualization for a given section of the pipeline
def create_visualization(section_name, steps):
    dot = Digraph(comment=section_name)
    dot.attr('node', shape='box')  # Set nodes to be boxes

    # Keep track of steps and their outputs for matching inputs to outputs
    output_to_step = {}

    for step in steps:
        step_name = step['step']
        step_label = f"{step_name}"

        # Collect outputs for this step
        outputs = [output for output_dict in step.get('outputs', []) for output in output_dict]
        if outputs:
            step_label += "\nOutputs: " + ", ".join(outputs)
            for output in outputs:
                output_to_step[output] = step_name

        # Create node with step and its outputs as label
        dot.node(step_name, label=step_label)

    # Add edges based on inputs matching to outputs from other steps
    for step in steps:
        step_name = step['step']
        inputs = [input_key for input_dict in step.get('inputs', []) for input_key in input_dict]
        
        for input_key in inputs:
            # If input matches an output from another step, draw an edge
            if input_key in output_to_step:
                dot.edge(output_to_step[input_key], step_name, label=input_key)

    return dot

# For each section in the YAML, create and render a visualization
for section_name, steps in config.items():
    dot = create_visualization(section_name, steps)
    dot.format = 'jpg'
    dot.render(f'{section_name}.gv', view=False)  # This will save and optionally open the graph visualization
