In [0]:
%run ./ProcessConfigHandler

In [0]:
%run ./LakeflowAPIHandler

In [0]:
import json
import multiprocessing
import pprint
import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone

In [0]:
class SqlIngestionCommon:
    """Shared gateway / ingestion pipeline orchestration helpers.

    The methods poll pipeline state through an API object that exposes
    ``get_pipeline`` (and ``stop_pipeline`` for the gateway shutdown).

    NOTE: ``stop_gateway_pipeline`` calls ``self.set_pipeline_process_status``,
    which is not defined on this class; a subclass has to provide it.
    """

    def __init__(self):
        pass

    def _poll_pipeline_until(self, api_obj, p_pipeline_id, p_pipeline_dict, p_is_done,
                             p_retry_max, p_retry_delay_seconds):
        """Re-fetch a pipeline until ``p_is_done`` is satisfied or retries run out.

        Args:
            api_obj: Object exposing ``get_pipeline(pipeline_id)``.
            p_pipeline_id: Id of the pipeline to re-fetch.
            p_pipeline_dict: Most recent ``get_pipeline`` result; it is checked
                first, so no sleep happens when it already satisfies ``p_is_done``.
            p_is_done: Callable taking a pipeline dict and returning a bool.
            p_retry_max: Maximum number of re-fetches.
            p_retry_delay_seconds: Sleep before each re-fetch.

        Returns:
            The last pipeline dict fetched (or the one passed in).
        """
        v_retry_count = 0
        while not p_is_done(p_pipeline_dict) and v_retry_count < p_retry_max:
            time.sleep(p_retry_delay_seconds)
            p_pipeline_dict = api_obj.get_pipeline(p_pipeline_id)
            v_retry_count += 1
        return p_pipeline_dict

    def _wait_for_ingestion_idle(self, p_row_dict, api_obj, p_retry_max, p_retry_delay_seconds):
        """Wait for the ingestion pipeline's ``state`` to become ``IDLE``.

        Sleeps 20 seconds before the first check. Polling only happens when the
        first ``get_pipeline`` call reports ``status == 'ok'``.

        Returns:
            Tuple ``(state, latest_update_state)``; both are None when the first
            ``get_pipeline`` call did not report ``status == 'ok'``.
        """
        v_state = None
        v_ingestion_last_update_state = None

        time.sleep(20)
        v_ingestion_id = p_row_dict['IngestionPipelineId']
        v_pipeline_dict = api_obj.get_pipeline(v_ingestion_id)

        if v_pipeline_dict['status'] == 'ok':
            v_pipeline_dict = self._poll_pipeline_until(
                api_obj,
                v_ingestion_id,
                v_pipeline_dict,
                lambda d: d['state'] == 'IDLE',
                p_retry_max,
                p_retry_delay_seconds,
            )
            v_state = v_pipeline_dict['state']
            v_ingestion_last_update_state = v_pipeline_dict['latest_update_state']

        return v_state, v_ingestion_last_update_state

    def start_gateway_pipeline(self, p_row_dict, api_obj=None):
        """Make sure the gateway pipeline's latest update is ``RUNNING``.

        When it is not, the cluster specs are pushed through
        ``UpdateSqlGatewayPipeline.update_pipeline_clusters`` and the gateway is
        polled (up to 20 times, 60 seconds apart) until it reports ``RUNNING``.

        Args:
            p_row_dict: Config row; reads ``GatewayPipelineName``,
                ``GatewayPipelineId`` and, when a start is needed,
                ``IsHistoricalGatewayLevel`` plus ``ClusterJSONSpecsLarge`` /
                ``ClusterJSONSpecsSmall``.
            api_obj: API object; a ``LakeflowAPI`` is created when omitted.

        Returns:
            The string ``'RUNNING'``.

        Raises:
            Exception: When the gateway is not ``RUNNING`` at the end, including
                when the first ``get_pipeline`` call is not ``status == 'ok'``.
        """
        v_state = None
        v_retry_max = 20
        v_retry_delay_seconds = 60

        print(f"Starting Gateway Pipeline {p_row_dict['GatewayPipelineName']}....")
        # Retrieve Gateway Current Information
        api_obj = LakeflowAPI() if api_obj is None else api_obj
        v_gateway_id = p_row_dict['GatewayPipelineId']
        v_pipeline_dict = api_obj.get_pipeline(v_gateway_id)

        if v_pipeline_dict['status'] == 'ok':
            v_state = v_pipeline_dict['latest_update_state']
            if v_state != 'RUNNING':
                # Large cluster specs for historical loads, small ones otherwise
                v_specs_key = 'ClusterJSONSpecsLarge' if p_row_dict['IsHistoricalGatewayLevel'] == 1 else 'ClusterJSONSpecsSmall'
                v_cluster_specs = json.loads(p_row_dict[v_specs_key])
                # Update Cluster Specs and Start Gateway
                # (presumably the update call also triggers the start - confirm in UpdateSqlGatewayPipeline)
                gw_mng_obj = UpdateSqlGatewayPipeline(v_gateway_id)
                gw_mng_obj.update_pipeline_clusters(v_cluster_specs["clusters"])
                # Wait for Gateway to be in Running State
                v_pipeline_dict = self._poll_pipeline_until(
                    api_obj,
                    v_gateway_id,
                    v_pipeline_dict,
                    lambda d: d['latest_update_state'] == 'RUNNING',
                    v_retry_max,
                    v_retry_delay_seconds,
                )
                v_state = v_pipeline_dict['latest_update_state']

        if v_state != "RUNNING":
            raise Exception (f"Error: Gateway Pipeline {p_row_dict['GatewayPipelineName']} failed to start - State: {v_state}")

        print(f"Gateway Pipeline {p_row_dict['GatewayPipelineName']} in RUNNING state....")
        return v_state

    def stop_gateway_pipeline(self, p_row_dict, api_obj=None):
        """Wait for the ingestion pipeline to go ``IDLE``, then stop the gateway.

        The process status is set to ``'C'`` when the ingestion pipeline's latest
        update is ``COMPLETED`` and to ``'F'`` otherwise. The gateway is only
        stopped when the ingestion pipeline reached ``IDLE`` and the gateway's
        latest update is ``RUNNING``.

        Args:
            p_row_dict: Config row; reads ``GatewayPipelineName``,
                ``IngestionPipelineName``, ``IngestionPipelineId`` and
                ``GatewayPipelineId``.
            api_obj: API object; a ``LakeflowAPI`` is created when omitted.

        Returns:
            Dict with keys ``GatewayState`` and ``IngestionState``.
        """
        v_retry_max = 120
        v_retry_delay_seconds = 60

        print(f"Shutting down Gateway {p_row_dict['GatewayPipelineName']} when Ingestion {p_row_dict['IngestionPipelineName']} is IDLE....")
        api_obj = LakeflowAPI() if api_obj is None else api_obj

        # Check that the Ingestion Pipeline is no longer running before stopping the gateway
        v_state, v_ingestion_last_update_state = self._wait_for_ingestion_idle(
            p_row_dict, api_obj, v_retry_max, v_retry_delay_seconds
        )

        # Set Process Status to Fail/Success
        v_process_status = 'C' if v_ingestion_last_update_state == 'COMPLETED' else 'F'
        self.set_pipeline_process_status(p_row_dict, v_process_status)

        # Stop Gateway Pipeline
        # NOTE(review): v_state still holds the ingestion pipeline's state unless the
        # gateway lookup below succeeds, so 'GatewayState' in the return value can
        # carry the ingestion state. Kept as-is because callers may rely on it.
        if v_state == 'IDLE':
            v_gateway_id = p_row_dict['GatewayPipelineId']
            v_pipeline_dict = api_obj.get_pipeline(v_gateway_id)
            if v_pipeline_dict['status'] == 'ok':
                v_state = v_pipeline_dict['latest_update_state']
                if v_state == 'RUNNING':
                    api_obj.stop_pipeline(v_gateway_id)
                    v_pipeline_dict = self._poll_pipeline_until(
                        api_obj,
                        v_gateway_id,
                        v_pipeline_dict,
                        lambda d: d['latest_update_state'] != 'RUNNING',
                        v_retry_max,
                        v_retry_delay_seconds,
                    )
                    v_state = v_pipeline_dict['latest_update_state']

        return {'GatewayState': v_state, 'IngestionState': v_ingestion_last_update_state}

    def wait_for_ingestion_pipeline_idle(self, p_row_dict, api_obj=None):
        """Wait for the ingestion pipeline to go ``IDLE``.

        Sleeps 20 seconds, then polls up to 120 times, 60 seconds apart.

        Args:
            p_row_dict: Config row; reads ``IngestionPipelineId``.
            api_obj: API object; a ``LakeflowAPI`` is created when omitted.

        Returns:
            The ingestion pipeline's ``latest_update_state``, or None when the
            first ``get_pipeline`` call did not report ``status == 'ok'``.
        """
        v_retry_max = 120
        v_retry_delay_seconds = 60

        # The default used to be dereferenced as None; build the API object like the sibling methods do
        api_obj = LakeflowAPI() if api_obj is None else api_obj

        _, v_ingestion_last_update_state = self._wait_for_ingestion_idle(
            p_row_dict, api_obj, v_retry_max, v_retry_delay_seconds
        )
        return v_ingestion_last_update_state

    def delay_pipeline_start(self, p_row_dict, api_obj=None):
        """Sleep for the row's ``IngestionDelayStartinSeconds``.

        Args:
            p_row_dict: Config row; reads ``IngestionDelayStartinSeconds`` and
                ``IngestionPipelineName``.
            api_obj: Not used by this method.
        """
        # Delay before Starting Pipeline based on Configuration
        v_delay_seconds = p_row_dict['IngestionDelayStartinSeconds']
        print(f"Waiting {v_delay_seconds} seconds before starting Ingestion Pipeline {p_row_dict['IngestionPipelineName']}....")
        time.sleep(v_delay_seconds)

    def get_snapshot_startime(self):
        """Return the UTC time five minutes ago as ``YYYY-MM-DDTHH:MM:SS.mmmZ``."""
        # Current time in UTC, moved back 5 minutes
        v_start_utc = datetime.now(timezone.utc) - timedelta(minutes=5)
        # %f yields microseconds; drop the last three digits to keep milliseconds
        return v_start_utc.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'

    def wait_for_snapshots_complete(self, p_config_row_dict, p_expected_table_count=None, p_snapshots_start_time=None, api_obj=None):
        """Wait for the pipeline's snapshots using three fallbacks in order.

        First ``LakeflowAPIExtension.wait_for_snapshots_complete``, then
        ``wait_for_snapshots_complete_via_api`` if that returns a falsy value,
        and finally the configured start delay if both do.

        Args:
            p_config_row_dict: Config row; reads ``GatewayPipelineId``,
                ``IngestionPipelineId`` and, as a fallback, ``TableCount``.
            p_expected_table_count: Falsy values (including 0) fall back to the
                row's ``TableCount``.
            p_snapshots_start_time: Falsy values fall back to
                ``'2000-01-01T00:00:00.000Z'``.
            api_obj: API object; a ``LakeflowAPI`` is created when omitted. It
                is only forwarded to ``delay_pipeline_start``.
        """
        # Initialize Parameters if not passed
        if not p_expected_table_count:
            p_expected_table_count = p_config_row_dict['TableCount']
        if not p_snapshots_start_time:
            p_snapshots_start_time = '2000-01-01T00:00:00.000Z'
        api_obj = LakeflowAPI() if api_obj is None else api_obj

        # Instantiate API Extension
        api_ext_obj = LakeflowAPIExtension()

        v_wait_args = (
            p_config_row_dict['GatewayPipelineId'],
            p_config_row_dict['IngestionPipelineId'],
            p_expected_table_count,
            p_snapshots_start_time,
        )

        # First use SQL Query
        if api_ext_obj.wait_for_snapshots_complete(*v_wait_args):
            return
        # If Failed, use API Events
        if api_ext_obj.wait_for_snapshots_complete_via_api(*v_wait_args):
            return
        # If Failed, use Config Delay
        self.delay_pipeline_start(p_config_row_dict, api_obj)
                

In [0]:
class CreateSqlIngestionPipeline(SqlIngestionCommon):
    """Create ingestion pipelines for configuration rows that have no pipeline id yet.

    ``process`` loads the configuration rows, validates them, and runs
    ``process_config_row`` for each valid row on a thread pool.
    """

    def __init__(self, p_environment,  p_internal_product_id, p_source_server_name, p_source_database_name=None, p_internal_client_id=None, p_internal_facility_id=None):
        """Store the selection parameters and the default run settings."""
        super().__init__()
        self.params_dict = {
            'p_environment': p_environment,
            'p_internal_product_id': p_internal_product_id,
            'p_source_server_name': p_source_server_name,
            'p_source_database_name': p_source_database_name,
            'p_internal_client_id': p_internal_client_id,
            'p_internal_facility_id': p_internal_facility_id
        }

        self.process_max_workers = 10
        # NOTE: the attribute name is misspelled; it is kept because code outside
        # this class may read it directly.
        self.trackeback_length = 1000
        # Rows are processed on threads, so an in-process lock is enough. The
        # previous multiprocessing.Manager() started a helper process per
        # instance that was never shut down.
        self.lock = threading.Lock()

    def set_process_max_workers(self, process_max_workers):
        """Set the thread pool size used by ``process``."""
        self.process_max_workers = process_max_workers

    def get_process_max_workers(self):
        """Return the thread pool size used by ``process``."""
        return self.process_max_workers

    def set_traceback_length(self, traceback_length):
        """Set the maximum length of the message and traceback kept per failure."""
        self.trackeback_length = traceback_length

    def get_traceback_length(self):
        """Return the maximum length of the message and traceback kept per failure."""
        return self.trackeback_length

    def process(self):
        """Create pipelines for every valid, not-yet-created configuration row.

        Returns:
            Summary dict with the keys ``Total``, ``Success``, ``SuccessResults``,
            ``Skipped``, ``SkippedResults``, ``ValidationFailed``,
            ``ValidationFailures``, ``ExecutionFailed`` and ``ExecutionFailures``.

        Raises:
            Exception: When no configuration rows are found, when any validation
                or execution error occurred, or when no row was left to process
                (for example because every row was skipped).
        """
        v_thread_errors = []        # To collect all exceptions
        v_thread_results = []       # To collect successful results
        v_validation_errors = []    # To collect validation errors
        v_skipped_results = []      # To collect skipped results
        v_config_row_array = []     # To collect valid config rows to process
        # Initialize Return Dictionary - will display results in that order using pprint
        v_return_dict = {'Total': 0, 'Success': 0, 'SuccessResults': [], 'Skipped': 0, 'SkippedResults': [], 'ValidationFailed' : 0, 'ValidationFailures': [], 'ExecutionFailed' : 0, 'ExecutionFailures': []}
        # Retrieve Unity Catalog from Configuration
        process_config = ProcessConfigData(self.params_dict['p_environment'])
        v_unity_catalog = process_config.get_config_attribute_value('AnalyticsUnityCatalog')
        # Retrieve new pipelines Configuration
        df_config_data_rows = process_config.get_table_list_aggregate(
            self.params_dict['p_internal_product_id'],
            self.params_dict['p_source_server_name'],
            self.params_dict['p_source_database_name'],
            self.params_dict['p_internal_client_id'],
            self.params_dict['p_internal_facility_id'],
        )
        # count() is evaluated once and reused instead of being re-run for every use
        v_row_count = 0 if df_config_data_rows is None else df_config_data_rows.count()
        # Raise exception if no configurations found
        if v_row_count == 0:
            raise Exception(f"Unable to find implementation configuration for {self.params_dict}")
        v_return_dict['Total'] = v_row_count
        print("Pipeline Configuration Rows:", v_row_count)

        # Loop through new Configurations - Skip if Pipeline already created...
        for row in df_config_data_rows.collect():
            v_row_label = f"{row['IngestionPipelineName']}, Server={row['SourceServerName1']}, Database={row['SourceDatabaseName1']}"

            if row['IngestionPipelineId']:
                v_return_dict['Skipped'] += 1
                v_skipped_message = f"Warning: SKipping - PipelineId ({row['IngestionPipelineId']}) is filled in for this implementation: Pipeline={v_row_label}"
                v_skipped_results.append(v_skipped_message)
                print(v_skipped_message)
                continue

            # Get row as Dictionary and Validate Attributes
            v_config_row_dict = row.asDict()
            v_validation_message = self.validate_config_row(v_config_row_dict)

            if v_validation_message is not None:
                # Collect Validation Failure Details to display at process end.
                v_return_dict['ValidationFailed'] += 1
                v_validation_message = f"Implementation will NOT proceed for {v_row_label}: {v_validation_message}"
                v_validation_errors.append(v_validation_message)
                print(v_validation_message)
                continue

            # Valid row: add the attributes the worker needs and queue it
            v_config_row_dict['Status'] = 'Pending'
            v_config_row_dict['DestinationCatalog'] = v_unity_catalog
            v_config_row_dict['process_config'] = process_config
            v_config_row_dict['params_dict'] = self.params_dict
            v_config_row_dict['ProcessIdentifier'] = f"Pipeline={v_config_row_dict['IngestionPipelineName']} Server={v_config_row_dict['SourceServerName1']} Database={v_config_row_dict['SourceDatabaseName1']}"
            v_config_row_array.append(v_config_row_dict)
            print(f"Implementation will proceed for {v_row_label}")

        if v_config_row_array:
            # Launch New Pipelines to Create based on Configuration in parallel...
            # Collect Thread Results and Errors
            v_max_length = self.get_traceback_length()
            with ThreadPoolExecutor(max_workers=self.get_process_max_workers()) as executor:
                futures = {executor.submit(self.process_config_row, obj): obj for obj in v_config_row_array}
                for future in as_completed(futures):
                    obj = futures[future]
                    try:
                        result = future.result()
                        v_pipeline_id = result['pipeline_id'] if result and 'pipeline_id' in result else None
                        v_thread_results.append(str(obj['ProcessIdentifier']) + ' \nMessage=Ingestion Pipeline Successfully Created - ID: ' + str(v_pipeline_id))
                    except Exception as e:
                        v_thread_errors.append(str(obj['ProcessIdentifier']) + ' \nMessage=' + str(e).replace('\n','')[0:v_max_length] + ' \nTraceback=' + traceback.format_exc().replace('\n','')[0:v_max_length] )

        # Collect Run Statistics
        v_return_dict['Success'] = len(v_thread_results)
        v_return_dict['ExecutionFailed'] = len(v_thread_errors)
        v_return_dict['ExecutionFailures'] = v_thread_errors
        v_return_dict['ValidationFailures'] = v_validation_errors
        v_return_dict['SuccessResults'] = v_thread_results
        v_return_dict['SkippedResults'] = v_skipped_results
        # Display Summary
        print("\n\nSummary:\n")
        pprint.pprint(v_return_dict, indent=4, compact=True, sort_dicts=False, width=10000)
        # Raise Exception if any errors were encountered or no Configurations to Process
        if v_thread_errors or v_validation_errors:
            raise Exception(f"{len(v_thread_errors)} Execution errors and {len(v_validation_errors)} Validation Errors were encountered during processing. See exception for details.")
        if not v_config_row_array:
            raise Exception(f"Unable to find valid implementation configuration(s) for {self.params_dict}")

        return v_return_dict

    def validate_config_row(self, p_config_row_dict):
        """Check that the required attributes are present and ``TableList`` parses as JSON.

        Returns:
            None when the row is valid, otherwise one newline-terminated message
            per problem, concatenated into a single string.
        """
        v_required_messages = (
            ('SourceServerName1', 'SourceServerName1 is required\n'),
            ('SourceDatabaseName1', 'SourceDatabaseName1 is required\n'),
            ('IngestionPipelineName', 'IngestionPipelineName is required\n'),
            ('DestinationSchema', 'DestinationSchema is required\n'),
            ('GatewayPipelineId', 'GatewayPipelineId is required\n'),
        )
        validation_message = ''.join(
            v_message for v_key, v_message in v_required_messages if p_config_row_dict[v_key] is None
        )

        if p_config_row_dict['TableList'] is None:
            validation_message += 'TableList is required\n'
        else:
            try:
                json.loads(p_config_row_dict['TableList'])
            except (TypeError, ValueError):
                # ValueError covers json.JSONDecodeError; TypeError covers non-text input
                validation_message += 'TableList is not valid JSON\n'

        return validation_message if validation_message != '' else None

    def build_pipeline_json_dict(self, p_config_row_dict):
        """Build the create-pipeline payload for one configuration row.

        One ``objects`` entry is produced per element of the ``data`` list in
        the row's ``TableList`` JSON; each element supplies ``SourceSchema``,
        ``SourceTable`` and ``DestinationTable``.
        """
        v_table_objects = [
            {
                "table": {
                    "source_catalog": p_config_row_dict['SourceDatabaseName1'],
                    "source_schema": table["SourceSchema"],
                    "source_table": table["SourceTable"],
                    "destination_catalog": p_config_row_dict['DestinationCatalog'],
                    "destination_schema": p_config_row_dict['DestinationSchema'],
                    "destination_table": table["DestinationTable"]
                }
            }
            for table in json.loads(p_config_row_dict['TableList'])["data"]
        ]

        return {
            "name": p_config_row_dict['IngestionPipelineName'],
            "ingestion_definition": {
                "ingestion_gateway_id": p_config_row_dict['GatewayPipelineId'],
                "source_type": "SQLSERVER",
                "objects": v_table_objects
            }
        }

    def process_config_row(self, p_config_row_dict):
        """Create, register, permission and initially run one ingestion pipeline.

        Args:
            p_config_row_dict: Config row prepared by ``process``. When it has no
                ``process_config`` entry, one is created here.

        Returns:
            The response dict of ``create_ingestion_pipeline``.

        Raises:
            Exception: Any failure is printed and re-raised as an ``Exception``
                whose message is prefixed with the row's ``ProcessIdentifier``.
        """
        print("Processing: ", p_config_row_dict['ProcessIdentifier'])
        v_response_dict = {}

        try:
            # This function can be called outside of the main process function, so we need to make sure we have the process_config object instantiated
            if 'process_config' not in p_config_row_dict:
                p_config_row_dict['process_config'] = ProcessConfigData(self.params_dict['p_environment'])

            v_api_json_data_dict = self.build_pipeline_json_dict(p_config_row_dict)

            # Ensure Gateway is running before Pipeline Creation
            self.start_gateway_pipeline(p_config_row_dict)

            v_lakeflow_api = LakeflowIngestionAPI()
            v_response_dict = v_lakeflow_api.create_ingestion_pipeline(v_api_json_data_dict)

            if v_response_dict['status'] != 'ok':
                raise Exception (f"Error: Create Ingestion Pipeline Failed for {p_config_row_dict['ProcessIdentifier']}: {v_response_dict}")

            p_config_row_dict['IngestionPipelineId'] = v_response_dict['pipeline_id']
            self.process_config_update_pipeline_id(p_config_row_dict)

            # Set Permissions if defined
            gb_vars_obj = GlobalVars(self.params_dict['p_environment'])
            v_permissions_list = gb_vars_obj.get_pipeline_permissions_list()
            if v_permissions_list is not None and len(v_permissions_list) > 0:
                v_lakeflow_api.update_pipeline_permissions(p_config_row_dict['IngestionPipelineId'], v_permissions_list)

            # Initial Start of Ingestion Pipeline
            self.start_initial_ingestion_pipeline(p_config_row_dict, v_lakeflow_api)

            # Potentially Shut Down Gateway Pipeline - TBD based on Observation
            # self.stop_gateway_pipeline(p_config_row_dict)
        except Exception as e:
            print(f"Error: {p_config_row_dict['ProcessIdentifier']} {e}")
            raise Exception(f"Error: {p_config_row_dict['ProcessIdentifier']} {e}") from e
        return v_response_dict

    def start_initial_ingestion_pipeline(self, p_config_row_dict, api_obj):
        """Run the first ingestion: start, wait for snapshots, start again, wait for IDLE.

        The process status is set to ``'P'`` at the start and to ``'C'`` or
        ``'F'`` depending on whether the latest update ends as ``COMPLETED``.

        Returns:
            The last ``get_pipeline`` result for the ingestion pipeline.

        Raises:
            Exception: When the latest update state is not ``COMPLETED`` after
                waiting (up to 20 polls, 60 seconds apart).
        """
        v_ingestion_id = p_config_row_dict['IngestionPipelineId']

        # Set Pipeline Status on Process Table
        self.set_pipeline_process_status(p_config_row_dict, 'P')

        # Snapshots Start Time in UTC to watch for Ingestion Pipeline
        v_snapshot_start_time = self.get_snapshot_startime()
        print(f"v_snapshot_start_time: {v_snapshot_start_time} Id={p_config_row_dict['GatewayPipelineId']}")

        # Start Initial Ingestion Pipeline
        api_obj.start_pipeline(v_ingestion_id)

        # Wait for All Snapshots to complete for current Pipeline
        self.wait_for_snapshots_complete(p_config_row_dict, p_config_row_dict['TableCount'], v_snapshot_start_time, api_obj)

        # Start Pipeline to process Snapshots
        api_obj.start_pipeline(v_ingestion_id)

        v_retry_max = 20
        v_retry_delay_seconds = 60
        v_retry_count = 0
        v_ingestion_last_update_state = None

        # Wait for Ingestion Pipeline to be IDLE
        v_pipeline_dict = api_obj.get_pipeline(v_ingestion_id)
        if v_pipeline_dict['status'] == 'ok':
            v_state = v_pipeline_dict['state']
            v_ingestion_last_update_state = v_pipeline_dict['latest_update_state']
            while v_state != 'IDLE' and v_retry_count < v_retry_max:
                time.sleep(v_retry_delay_seconds)
                v_pipeline_dict = api_obj.get_pipeline(v_ingestion_id)
                v_state = v_pipeline_dict['state']
                v_ingestion_last_update_state = v_pipeline_dict['latest_update_state']
                v_retry_count += 1

        if v_ingestion_last_update_state != 'COMPLETED':
            # Set Pipeline Failed Status on Process Table
            self.set_pipeline_process_status(p_config_row_dict, 'F')
            raise Exception(f"Error: Ingestion Pipeline Initial Run Failed for {p_config_row_dict['ProcessIdentifier']}: {v_ingestion_last_update_state} - {v_pipeline_dict}")

        # Set Pipeline Success Status on Process Table
        self.set_pipeline_process_status(p_config_row_dict, 'C')
        return v_pipeline_dict

    def process_config_update_pipeline_id(self, p_config_row_dict):
        """Pass the new ingestion pipeline id to the row's ``process_config.set_pipeline_id``."""
        v_stored_procedure_params_dict = {
            'PipelineType': 'Ingestion',
            'InternalProductId': p_config_row_dict['InternalProductId'],
            'SourceServerName1': p_config_row_dict['SourceServerName1'],
            'SourceDatabaseName1': p_config_row_dict['SourceDatabaseName1'],
            'SourceConfigTable': p_config_row_dict['SourceConfigTable'],
            'PipelineId': p_config_row_dict['IngestionPipelineId'],
        }

        p_config_row_dict['process_config'].set_pipeline_id(v_stored_procedure_params_dict)

    def get_ingestion_api(self):
        """Return a new ``LakeflowIngestionAPI`` instance."""
        return LakeflowIngestionAPI()

    def set_pipeline_process_status(self, p_config_row_dict, p_status):
        """Record ``p_status`` for the row's pipeline with ``StepType`` ``'Extract'``.

        The values are passed to the row's
        ``process_config.set_pipeline_process_status``.
        """
        v_stored_procedure_params_dict = {
            'PipelineName': p_config_row_dict['IngestionPipelineName'],
            'InternalProductId' : p_config_row_dict['InternalProductId'],
            'DataSourceId' : p_config_row_dict['DataSourceId'],
            'InternalClientId' : p_config_row_dict['InternalClientId'],
            'InternalFacilityId' : p_config_row_dict['InternalFacilityId'],
            'StepType' : 'Extract',
            'Status': p_status
        }
        p_config_row_dict['process_config'].set_pipeline_process_status(v_stored_procedure_params_dict)
    