In [3]:
import yaml
from pprint import pprint
from collections import defaultdict

# Define the path to your YAML file
yaml_file_path = 'pipeline_flow.yml'

# Function to extract DAG from a section of the pipeline
def extract_dag_for_section(section_data):
    dag = defaultdict(list)  # Adjacency list representation of the DAG
    
    # Maps outputs to step names to identify dependencies
    output_to_step = {}
    
    for step in section_data:
        step_name = step.get('step')
        inputs = step.get('inputs', [])
        outputs = step.get('outputs', [])
        
        # Assume dependencies based on matching inputs to previously mapped outputs
        for input_item in inputs:
            for input_key, input_value in input_item.items():
                if input_key in output_to_step:
                    # Add dependency from output provider to current step
                    dag[output_to_step[input_key]].append(step_name)
        
        # Map current step's outputs for future dependency checks
        for output_item in outputs:
            for output_key, output_value in output_item.items():
                output_to_step[output_key] = step_name
                
    return dag

# Open the YAML file and load its contents
with open(yaml_file_path, 'r') as file:
    config = yaml.safe_load(file)

# Assuming the config has top-level keys corresponding to sections
for section_name, section_data in config.items():
    print(f"Processing section: {section_name}")
    dag = extract_dag_for_section(section_data)
    pprint(dict(dag))
    print("\n")


Processing section: series_extraction
{'ExcelCleaner.clean_excel': ['TableFinder.find_tables',
                              'SeriesExtractor.extract_series'],
 'ExcelCompatibilityChecker.check_file': ['SeriesExtractor.extract_series'],
 'ExcelLoader.load_file': ['ExcelCompatibilityChecker.check_file',
                           'ExcelValidator.validate_excel',
                           'ExcelCleaner.clean_excel'],
 'ExcelValidator.validate_excel': ['ExcelCleaner.clean_excel'],
 'SeriesExtractor.extract_series': ['SeriesIterator.iterate_series'],
 'TableFinder.find_tables': ['SeriesExtractor.extract_series']}


Processing section: ast_building
{'FormulaParser.parse_formula': ['SeriesImplementer.implement_series',
                                 'SeriesImplementer.implement_series']}


Processing section: ast_transformation
{'FormulaGenerator.get_ast_generator': ['FunctionReplacer.replace_functions'],
 'FunctionReplacer.replace_functions': ['ASTGeneratorCollector.get_collection']}


P