### Manage parallel notebook runs

This notebook controls multiple notebook runs. It does this by using a DAG definition to set each notebook's parameters. This way, the table ingestion is divided across multiple parallel runs instead of being processed serially.

In [8]:
# Import all necessary libs
import json

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 12, Finished, Available)

In [4]:
# Parameters cell.
# This cell is marked as "Parameters": when the notebook runs inside a
# pipeline, the pipeline can override any value defined here. In this case
# it supplies `protheus_tables`.
# The value is later parsed as JSON text: an array of objects that each
# carry a "TABLE_NAME" key.

protheus_tables = ''

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 8, Finished, Available)

In [14]:
def split_list_into_n_parts(lst, n):
    '''
    Split a list into ``n`` consecutive parts whose sizes differ by at most one.

    The first ``len(lst) % n`` parts receive one extra element. When ``n`` is
    larger than ``len(lst)`` the trailing parts are empty lists, so the result
    always contains exactly ``n`` parts.

    Args:
        lst: Sequence to split (anything supporting ``len`` and slicing).
        n: Number of parts to produce; must be a positive integer.

    Returns:
        A list with ``n`` slices of ``lst``, in the original order.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    '''
    # Fail with a clear message instead of a ZeroDivisionError (n == 0) or a
    # silently empty result (n < 0) that would drop every element.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    part_len, remainder = divmod(len(lst), n)

    parts = []
    start = 0
    for i in range(n):
        # The first `remainder` parts absorb the leftover elements, one each.
        end = start + part_len + (1 if i < remainder else 0)
        parts.append(lst[start:end])
        start = end

    return parts

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 18, Finished, Available)

In [18]:
# 
def parameters_to_list(protheus_tables):
    '''
    Parse the JSON notebook parameter into a list of table names.

    Args:
        protheus_tables: JSON text holding an array of objects, each with a
            "TABLE_NAME" key, e.g. '[{"TABLE_NAME": "SA1010"}]'.

    Returns:
        The "TABLE_NAME" values, in the order they appear in the payload.

    Raises:
        ValueError: If ``protheus_tables`` is an empty or blank string.
            Other malformed JSON still raises ``json.JSONDecodeError``
            (a ``ValueError`` subclass).
        KeyError: If an item has no "TABLE_NAME" key.
    '''
    # The parameter defaults to an empty string; without an override,
    # json.loads would fail with an unhelpful "Expecting value" message.
    if isinstance(protheus_tables, str) and not protheus_tables.strip():
        raise ValueError(
            "protheus_tables is empty: pass a JSON array of objects with a "
            '"TABLE_NAME" key, e.g. \'[{"TABLE_NAME": "SA1010"}]\''
        )

    parsed_json = json.loads(protheus_tables)

    return [item["TABLE_NAME"] for item in parsed_json]

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 21, Finished, Available)

In [20]:
# Turn the JSON parameter into a flat list of table names.
table_names = parameters_to_list(protheus_tables=protheus_tables)
table_names  # last expression of the cell: shown as output for inspection

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 23, Finished, Available)

['FPA010',
 'STJ010',
 'P10010',
 'SA1010',
 'SA2010',
 'SE5010',
 'SD1010',
 'SC7010']

In [25]:
# Split the tables into 10 chunks, i.e. at most 10 parallel ingestion runs.
# With fewer tables than chunks, the trailing chunks are empty lists.
splited_lists = split_list_into_n_parts(lst=table_names, n=10)
splited_lists  # last expression of the cell: shown as output for inspection

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 28, Finished, Available)

[['FPA010', 'STJ010'],
 ['P10010', 'SA1010'],
 ['SA2010', 'SE5010'],
 ['SD1010'],
 ['SC7010']]

In [26]:
# Main logic of the notebook.
# This cell defines which ingestion notebooks run in parallel within the
# session of this notebook, and with which parameters. It distributes the
# Protheus table ingestion across several parallel notebook runs instead of
# processing one table at a time.
#
# The activities are generated from `splited_lists` (one activity per
# non-empty chunk) rather than written out by hand, so the number of chunks
# can change without editing this cell and no notebook is launched for an
# empty table list.

INGESTION_NOTEBOOK = "full-protheus-ssh-ingestion-dag"  # Path of the notebook every activity runs
CELL_TIMEOUT_SECONDS = 700   # Max timeout for each cell of an activity
DAG_TIMEOUT_SECONDS = 7000   # Total execution timeout for the DAG
MAX_CONCURRENCY = 10         # Max number of concurrent activities

# Chunks are empty when there are fewer tables than chunks; skip those.
non_empty_chunks = [chunk for chunk in splited_lists if chunk]

# Define the DAG (Directed Acyclic Graph) for running multiple notebook tasks
DAG = {
    "activities": [  # List of activities to be run
        {
            "name": f"{position}-full-ssh-ingestion",  # Activity name, must be unique
            "path": INGESTION_NOTEBOOK,
            "timeoutPerCellInSeconds": CELL_TIMEOUT_SECONDS,
            # The chunk is passed as the string form of the Python list.
            "args": {"table_names_dag": f"{chunk}"},  # Notebook parameters
        }
        for position, chunk in enumerate(non_empty_chunks, start=1)
    ],
    "timeoutInSeconds": DAG_TIMEOUT_SECONDS,
    "concurrency": MAX_CONCURRENCY,
}

if DAG["activities"]:
    # Execute the DAG and visualize it using the specified layout and size
    log_mensage = mssparkutils.notebook.runMultiple(DAG, {
        "displayDAGViaGraphviz": True,  # Enable DAG visualization
        "DAGLayout": "spectral",  # Layout style for the visualization
        "DAGSize": 15  # Size of the visualization
    })
else:
    # Nothing to ingest: keep `log_mensage` defined for the exit cell
    # without submitting an empty DAG.
    log_mensage = {}

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 29, Finished, Available)

VBox(children=(HBox(children=(HTML(value='Status: Pending', description='1-full-ssh-ingestion'), FloatProgress…

StatementMeta(, d2cc5d38-bba3-4c55-8f42-a22731ba1480, 30, Finished, Available)

In [9]:
# Pass the runMultiple result to the notebook exit value so the calling
# pipeline can access it. The result is stringified as a whole, so the
# 'successes' entry holds each activity's exitVal and exception fields.
output = {'successes': f'{log_mensage}'}

mssparkutils.notebook.exit(output)
# NOTE(review): exit() is documented to end the notebook run; confirm whether
# the call below is ever reached or is redundant.
mssparkutils.session.stop()

StatementMeta(, 67db0a00-a0ec-4871-8ddf-fb1d5296f652, 13, Finished, Available)

ExitValue: {'successes': "{'1-full-ssh-ingestion': {'exitVal': '', 'exception': None}, '2-full-ssh-ingestion': {'exitVal': '', 'exception': None}, '3-full-ssh-ingestion': {'exitVal': '', 'exception': None}, '4-full-ssh-ingestion': {'exitVal': '', 'exception': None}, '5-full-ssh-ingestion': {'exitVal': '', 'exception': None}}"}

In [None]:
# Debugging aid: list every item of type 'Notebook' in the workspace.
# Not part of the ingestion flow.

import sempy.fabric as fabric

notebooks = (
    fabric.list_items()
    .query("Type == 'Notebook'")
)
notebooks

StatementMeta(, , , Cancelled, )