In [0]:
import json
import re
import threading
import time
import traceback
from datetime import datetime
from urllib.parse import quote, urlencode

import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines

In [0]:
class LakeflowAPI:
    """Small REST client for the workspace pipeline and pipeline-permission endpoints.

    Credentials and the API URL are read from the notebook context in ``__init__``.

    Every request method returns a plain dict containing at least:
      * ``'status'``           - ``'ok'`` when the HTTP status code is 200, else ``'error'``
      * ``'status_code'``      - the HTTP status code of the response
      * ``'pipeline_api_url'`` - the URL that was called
      * ``'headers_dict'``     - the headers that were sent
      * ``'response'``         - the decoded JSON body, or ``{'raw_text': <body>}``
                                 when the body is not valid JSON

    NOTE: ``'headers_dict'`` carries the bearer token, so avoid printing or
    logging a whole result dict.
    """

    # Per-request timeout in seconds; without one a stalled connection would
    # block the notebook indefinitely. Override on the instance if needed.
    request_timeout_seconds = 120

    def __init__(self):
        """Capture the API token and API URL from the current notebook context."""
        notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
        self.api_token = notebook_context.apiToken().get()
        self.databricks_url = notebook_context.apiUrl().get()
        self.api_base_headers_dict = {'Authorization': f'Bearer {self.api_token}', "Content-Type": "application/json"}

    # ------------------------------------------------------------------ #
    # URL / header helpers
    # ------------------------------------------------------------------ #
    def get_workspace_api_url(self):
        """Return the workspace API base URL captured at construction time."""
        return self.databricks_url

    def build_api_header_dict(self, p_headers_dict=None):
        """Return the base headers merged with ``p_headers_dict`` (caller values win)."""
        return self.api_base_headers_dict | (p_headers_dict or {})

    def get_pipeline_api_url(self):
        """Return the base URL of the pipelines endpoint."""
        return self.get_workspace_api_url() + "/api/2.0/pipelines"

    def get_catalog_api_url(self):
        """Return the base URL of the Unity Catalog catalogs endpoint."""
        return self.get_workspace_api_url() + "/api/2.0/unity-catalog/catalogs"

    def build_query_string (self, p_dict):
        """Build a URL-encoded query string from ``p_dict``, skipping ``None`` values.

        Keys and values are percent-encoded so characters such as ``&``, ``=``
        and ``+`` inside a filter or page token cannot corrupt the query.

        Returns:
            The encoded query string, or ``None`` when no parameter is set.
        """
        params = {k: v for k, v in p_dict.items() if v is not None}
        if not params:
            return None
        return urlencode(params, quote_via=quote)

    def _pipeline_url(self, p_pipeline_id, p_suffix=""):
        """Return the URL of one pipeline, optionally followed by a sub-resource suffix."""
        return self.get_pipeline_api_url() + "/" + str(p_pipeline_id) + p_suffix

    def _permissions_url(self, p_pipeline_id):
        """Return the permissions URL of one pipeline."""
        return self.get_workspace_api_url() + "/api/2.0/permissions/pipelines/" + str(p_pipeline_id)

    def _send_request(self, p_method, p_api_url, p_headers_dict=None, p_json=None, p_record_json=False):
        """Send one HTTP request and wrap the outcome in the common result dict.

        Args:
            p_method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            p_api_url: Full URL to call.
            p_headers_dict: Extra headers merged over the base headers.
            p_json: JSON-serialisable payload; not sent when ``None``.
            p_record_json: When true, echo ``p_json`` back under ``'pipeline_json'``.

        Returns:
            The result dict described in the class docstring.
        """
        headers_dict = self.build_api_header_dict(p_headers_dict)
        return_dict = {
            'status': 'error',
            'pipeline_api_url': p_api_url,
            'headers_dict': headers_dict,
        }
        if p_record_json:
            return_dict['pipeline_json'] = p_json

        request_kwargs = {'headers': headers_dict, 'timeout': self.request_timeout_seconds}
        if p_json is not None:
            request_kwargs['json'] = p_json

        response = requests.request(p_method, p_api_url, **request_kwargs)
        return_dict['status_code'] = response.status_code

        # Error responses (or proxies in front of the workspace) may answer with
        # an empty or non-JSON body; keep the raw text instead of raising.
        try:
            return_dict['response'] = response.json()
        except ValueError:
            return_dict['response'] = {'raw_text': response.text}

        if response.status_code == 200:
            return_dict['status'] = 'ok'

        return return_dict

    # ------------------------------------------------------------------ #
    # Pipeline lifecycle
    # ------------------------------------------------------------------ #
    def create_pipeline(self, p_pipeline_json, p_headers_dict=None):
        """POST a new pipeline definition; on success adds ``'pipeline_id'`` to the result."""
        print("\nCreating pipeline...\n")
        return_dict = self._send_request("POST", self.get_pipeline_api_url(), p_headers_dict,
                                         p_json=p_pipeline_json, p_record_json=True)
        if return_dict['status'] == 'ok':
            return_dict['pipeline_id'] = return_dict['response']["pipeline_id"]
        return return_dict

    def update_pipeline(self, p_pipeline_id, p_pipeline_json, p_headers_dict=None):
        """PUT a pipeline definition; on success adds ``'pipeline_id'`` to the result."""
        print(f"\nUpdating pipeline...{p_pipeline_id}\n")
        return_dict = self._send_request("PUT", self._pipeline_url(p_pipeline_id), p_headers_dict,
                                         p_json=p_pipeline_json, p_record_json=True)
        if return_dict['status'] == 'ok':
            return_dict['pipeline_id'] = p_pipeline_id
        return return_dict

    def start_pipeline(self, p_pipeline_id, p_full_refresh=False, p_refresh_table_list=None, p_full_refresh_table_list=None, p_cause="No Reason passed",  p_headers_dict=None):
        """Start a pipeline update; on success adds ``'pipeline_update_id'`` to the result.

        The table selections are only sent when ``p_full_refresh`` is false and
        the corresponding list is non-empty.
        """
        print(f"\nStarting pipeline...{p_pipeline_id}\n")

        v_pipeline_json = {'full_refresh': p_full_refresh, 'cause': p_cause}
        if not p_full_refresh:
            if p_refresh_table_list:
                v_pipeline_json['refresh_selection'] = p_refresh_table_list
            if p_full_refresh_table_list:
                v_pipeline_json['full_refresh_selection'] = p_full_refresh_table_list

        return_dict = self._send_request("POST", self._pipeline_url(p_pipeline_id, "/updates"), p_headers_dict,
                                         p_json=v_pipeline_json, p_record_json=True)
        if return_dict['status'] == 'ok':
            return_dict['pipeline_update_id'] = return_dict['response']["update_id"]
        return return_dict

    def stop_pipeline(self, p_pipeline_id, p_headers_dict=None):
        """POST to the pipeline's ``/stop`` sub-resource with an empty JSON body."""
        print(f"\nStopping pipeline...{p_pipeline_id}\n")
        return self._send_request("POST", self._pipeline_url(p_pipeline_id, "/stop"), p_headers_dict,
                                  p_json={}, p_record_json=True)

    def get_pipeline(self, p_pipeline_id, p_headers_dict=None):
        """Fetch a pipeline definition.

        On success the result also carries ``'name'``, ``'state'`` and
        ``'latest_update_state'``; the latter falls back to the pipeline state
        when the response has no (or an empty) ``latest_updates`` list.
        ``'pipeline_id'`` is always set.
        """
        print(f"\nGetting pipeline definition...{p_pipeline_id}\n")
        return_dict = self._send_request("GET", self._pipeline_url(p_pipeline_id), p_headers_dict)
        return_dict['pipeline_id'] = p_pipeline_id

        if return_dict['status'] == 'ok':
            pipeline_dict = return_dict['response']
            return_dict['name'] = pipeline_dict['name']
            return_dict['state'] = pipeline_dict['state']

            # A pipeline that has never run may report no updates at all.
            latest_updates = pipeline_dict.get('latest_updates') or []
            return_dict['latest_update_state'] = latest_updates[0]['state'] if latest_updates else return_dict['state']

        return return_dict

    def _get_pipeline_listing(self, p_pipeline_id, p_suffix, p_query_params_dict, p_headers_dict=None):
        """GET a pipeline sub-resource with optional query parameters; adds ``'pipeline_id'`` on success."""
        pipeline_api_url = self._pipeline_url(p_pipeline_id, p_suffix)
        query_string = self.build_query_string(p_query_params_dict)
        if query_string:
            pipeline_api_url += "?" + query_string

        return_dict = self._send_request("GET", pipeline_api_url, p_headers_dict)
        if return_dict['status'] == 'ok':
            return_dict['pipeline_id'] = p_pipeline_id
        return return_dict

    def get_pipeline_updates(self, p_pipeline_id, p_until_update_id=None, p_page_token=None, p_max_results=None, p_headers_dict=None):
        """List updates of a pipeline; parameters left as ``None`` are omitted from the query."""
        print(f"\nGetting pipeline updates...{p_pipeline_id} - {p_until_update_id}\n")
        query_params_dict = {'until_update_id' : p_until_update_id, 'page_token' : p_page_token, 'max_results' : p_max_results}
        return self._get_pipeline_listing(p_pipeline_id, "/updates", query_params_dict, p_headers_dict)

    def get_pipeline_events(self, p_pipeline_id, p_filter=None, p_page_token=None, p_max_results=250, p_order_by=None, p_headers_dict=None):
        """List events of a pipeline; parameters left as ``None`` are omitted from the query."""
        query_params_dict = {'filter' : p_filter, 'page_token' : p_page_token, 'max_results' : p_max_results, 'order_by' : p_order_by}
        return self._get_pipeline_listing(p_pipeline_id, "/events", query_params_dict, p_headers_dict)

    # ------------------------------------------------------------------ #
    # Permissions
    # ------------------------------------------------------------------ #
    def get_pipeline_permissions(self, p_pipeline_id, p_headers_dict=None):
        """Fetch the permissions object of a pipeline."""
        print(f"\nGet pipeline permissions...{p_pipeline_id}\n")
        return self._send_request("GET", self._permissions_url(p_pipeline_id), p_headers_dict)

    def update_pipeline_permissions(self, p_pipeline_id, p_permissions_list, p_headers_dict=None):
        """PATCH ``p_permissions_list`` as the ``access_control_list`` of the pipeline.

        Adds to the existing permissions - does not require IS_OWNER to be set.
        """
        print(f"\nUpdating pipeline permissions...{p_pipeline_id}\n")
        return self._send_request("PATCH", self._permissions_url(p_pipeline_id), p_headers_dict,
                                  p_json={'access_control_list': p_permissions_list})

    def set_pipeline_permissions(self, p_pipeline_id, p_permissions_list, p_headers_dict=None):
        """PUT ``p_permissions_list`` as the ``access_control_list`` of the pipeline.

        Requires IS_OWNER to be set.
        """
        print(f"\nSetting pipeline permissions...{p_pipeline_id}\n")
        return self._send_request("PUT", self._permissions_url(p_pipeline_id), p_headers_dict,
                                  p_json={'access_control_list': p_permissions_list})

    def delete_pipeline(self, p_pipeline_id, p_headers_dict=None):
        """DELETE a pipeline."""
        print(f"\nDeleting pipeline...{p_pipeline_id}\n")
        return self._send_request("DELETE", self._pipeline_url(p_pipeline_id), p_headers_dict)


In [0]:
class LakeflowAPIExtension():
    """Blocking helpers that poll a gateway / ingestion pipeline until a condition is met.

    Pipeline state is read through ``self.workspace_client``; snapshot progress
    is read either from the ``event_log`` SQL function (``spark``) or from the
    pipeline events REST endpoint (``LakeflowAPI``). All ``wait_for_*`` methods
    report progress with ``print`` and return a plain ``bool``.
    """

    def __init__(self):
        super().__init__()
        self.workspace_client = WorkspaceClient()

    @staticmethod
    def _print_snapshot_progress(elapsed, completed_count, expected_table_count):
        """Print the percentage, the completed/expected counts and a 50-character progress bar."""
        progress_pct = (completed_count / expected_table_count * 100) if expected_table_count > 0 else 0
        print(f"\n⏱  [{elapsed}s] Snapshot Progress: {progress_pct:.1f}%")
        print(f"   Completed: {completed_count}/{expected_table_count} tables")

        bar_length = 50
        # Clamp so a count above the expected total cannot overflow the bar.
        filled = min(bar_length, int(bar_length * completed_count / expected_table_count)) if expected_table_count > 0 else 0
        bar = '█' * filled + '░' * (bar_length - filled)
        print(f"   [{bar}]")

    @staticmethod
    def _is_snapshot_completed_event(event):
        """Return True when ``event`` is a ``flow_progress`` event whose JSON message has eventType SNAPSHOT_COMPLETED.

        Events whose message is missing, not JSON, or not a JSON object are
        treated as "not a snapshot completion" rather than raising.
        """
        if event.get('event_type') != 'flow_progress':
            return False
        try:
            event_message = json.loads(event.get('message'))
        except (TypeError, ValueError):
            return False
        return isinstance(event_message, dict) and event_message.get('eventType') == 'SNAPSHOT_COMPLETED'

    def wait_for_gateway_running(self, gateway_pipeline_id: str, max_wait_minutes: int = 30) -> bool:
        """
        Wait for gateway pipeline to reach RUNNING state.
        This is required before we can query event logs.

        The pipeline is polled every 15 seconds. When it is RUNNING and has
        updates, the latest update must be COMPLETED or RUNNING to count as
        success; a RUNNING pipeline without updates counts as success too.

        Returns:
            bool: True if gateway is running, False on timeout, on a
            FAILED/DELETED pipeline state, or when the status check raises.
        """
        print("\n" + "=" * 80)
        print("WAITING FOR GATEWAY TO REACH RUNNING STATE")
        print("=" * 80)
        print(f"Gateway ID: {gateway_pipeline_id}")
        print(f"Max wait: {max_wait_minutes} minutes")
        print("Note: Event logs cannot be queried until gateway is RUNNING")

        start_time = time.time()
        max_wait_seconds = max_wait_minutes * 60
        check_interval = 15  # Check every 15 seconds

        try:
            while (time.time() - start_time) < max_wait_seconds:
                pipeline = self.workspace_client.pipelines.get(gateway_pipeline_id)
                pipeline_state = pipeline.state.value if pipeline.state else 'UNKNOWN'

                elapsed = int(time.time() - start_time)
                print(f"   [{elapsed}s] Gateway ({gateway_pipeline_id}) state: {pipeline_state}")

                if pipeline_state == 'RUNNING':
                    if not pipeline.latest_updates:
                        # No updates yet but pipeline is RUNNING
                        print(f"\n✅ SUCCESS: Gateway ({gateway_pipeline_id}) is RUNNING (no updates yet)!")
                        return True

                    latest_update = pipeline.latest_updates[0]
                    update_state = latest_update.state.value if latest_update.state else 'UNKNOWN'
                    print(f" Latest update state ({gateway_pipeline_id}): {update_state}")

                    if update_state in ['COMPLETED', 'RUNNING']:
                        print(f"\n✅ SUCCESS: Gateway ({gateway_pipeline_id}) is RUNNING with update state {update_state}!")
                        return True
                    print(f"   Waiting for update to complete (current: {update_state})...")

                # Terminal pipeline states: no point in waiting any longer.
                if pipeline_state in ['FAILED', 'DELETED']:
                    print(f"\n❌ ERROR: Gateway ({gateway_pipeline_id}) is in {pipeline_state} state")
                    return False

                time.sleep(check_interval)

            print(f"\n⏰ TIMEOUT: Gateway ({gateway_pipeline_id}) did not reach RUNNING state within {max_wait_minutes} minutes")
            return False

        except Exception as e:
            print(f"\n❌ ERROR: Failed to check gateway ({gateway_pipeline_id}) status: {e}")
            traceback.print_exc()
            return False

    def wait_for_snapshots_complete(
        self,
        gateway_pipeline_id: str,
        ingestion_id: str,
        expected_table_count: int,
        snapshot_start_time: "datetime | str" = '2000-01-01T00:00:00.000Z',
        start_ingestion_flag: bool = False,
        max_wait_minutes: int = 120
    ) -> bool:
        """
        Wait for all table snapshots to complete using the event log SQL function.

        The method first waits (up to 20 minutes) for the gateway to be RUNNING,
        optionally starts the ingestion pipeline, then polls every 15 seconds:

            SELECT COUNT(*) FROM event_log(gateway_id)
            WHERE event_type = 'flow_progress'
              AND timestamp >= snapshot_start_time
              AND CAST(PARSE_JSON(message):eventType AS STRING) = 'SNAPSHOT_COMPLETED'

        Args:
            gateway_pipeline_id: Gateway pipeline whose event log is queried.
            ingestion_id: Ingestion pipeline; started only when ``start_ingestion_flag`` is true.
            expected_table_count: Number of SNAPSHOT_COMPLETED events that counts as done.
            snapshot_start_time: Lower timestamp bound, interpolated into the query text.
            start_ingestion_flag: Whether to trigger the ingestion pipeline before polling.
            max_wait_minutes: Polling budget for the snapshot phase.

        Returns:
            bool: True if all snapshots completed, False on timeout, when the
            gateway never reaches RUNNING, or when the event log query fails.
        """
        print("\n" + "=" * 80)
        print("WAITING FOR GATEWAY SNAPSHOTS TO COMPLETE")
        print("=" * 80)
        print(f"Gateway ID: {gateway_pipeline_id}")
        print(f"Expected tables: {expected_table_count}")
        print(f"Max wait: {max_wait_minutes} minutes")

        # STEP 1: Ensure gateway is in RUNNING state before querying event logs
        print(f"\n[STEP 1] Ensuring gateway is in RUNNING state.. IDs {gateway_pipeline_id}/{ingestion_id}")
        gateway_running = self.wait_for_gateway_running(gateway_pipeline_id, max_wait_minutes=20)

        if not gateway_running:
            print("\n❌ ERROR: Gateway did not reach RUNNING state")
            print("   Cannot query event logs until gateway is RUNNING")
            return False

        # Only announce the trigger when an ingestion run is actually started.
        if start_ingestion_flag:
            print("\n" + "=" * 80)
            print("TRIGGERING INITIAL INGESTION (TO START GATEWAY SNAPSHOTS)")
            print("=" * 80)
            print("Note: The first ingestion run triggers the gateway to start taking snapshots")
            print("      We'll wait for snapshots to complete, then run ingestion again")
            print("      Pipeline already has table configuration - no need to specify tables")

            api_obj = LakeflowAPI()
            response_dict = api_obj.start_pipeline(ingestion_id)
            initial_update_id = response_dict['pipeline_update_id'] if response_dict['status'] == 'ok' else 'not_found'

            print(f"\n✓ Initial ingestion triggered - Update ID: {initial_update_id}")

        print(f"\n[STEP 2] Gateway is RUNNING - monitoring snapshot progress via event logs - IDs {gateway_pipeline_id}/{ingestion_id}...")

        # NOTE(review): both values are interpolated into the SQL text unescaped;
        # they are assumed to come from trusted callers - confirm.
        snapshot_query = f"""
        SELECT COUNT(*) as completed_count
        FROM event_log('{gateway_pipeline_id}')
        WHERE event_type = 'flow_progress'
          AND timestamp >= '{snapshot_start_time}'
          AND CAST(PARSE_JSON(message):eventType AS STRING) = 'SNAPSHOT_COMPLETED'
        """

        start_time = time.time()
        max_wait_seconds = max_wait_minutes * 60
        check_interval = 15  # Check every 15 seconds
        elapsed_time = 0
        last_count = 0

        try:
            while elapsed_time < max_wait_seconds:

                try:
                    result = spark.sql(snapshot_query).collect()

                    if result:
                        completed_count = result[0]['completed_count']

                        elapsed = int(time.time() - start_time)
                        if completed_count != last_count or elapsed % 30 == 0:
                            self._print_snapshot_progress(elapsed, completed_count, expected_table_count)
                            last_count = completed_count

                        if completed_count >= expected_table_count:
                            total_time = int(time.time() - start_time)
                            print(f"\n{'=' * 80}")
                            print(f"✅ SUCCESS: All {expected_table_count} table snapshots completed!")
                            print(f"{'=' * 80}")
                            print(f"   Total time: {int(total_time / 60)} min {total_time % 60} sec")
                            print(f"   Gateway SqlEvent ({gateway_pipeline_id}) is ready!")
                            return True

                except Exception as e:
                    # A failing event-log query ends the wait; report an explicit
                    # False so callers get the documented bool, never None.
                    elapsed = int(time.time() - start_time)
                    print(f"\n   [{elapsed}s] ⚠ IDs {gateway_pipeline_id}/{ingestion_id} - Could not query event log: {e}")
                    return False

                time.sleep(check_interval)
                elapsed_time = int(time.time() - start_time)

            print(f"\n{'=' * 80}")
            print(f"⏰ TIMEOUT: Snapshots did not complete within {max_wait_minutes} minutes IDs {gateway_pipeline_id}/{ingestion_id}")
            print(f"{'=' * 80}")
            print(f"   Completed: {last_count}/{expected_table_count} tables")
            print(f"   You may need to:")
            print(f"   1. Increase max_wait_minutes")
            print(f"   2. Check gateway status in Databricks UI")
            print(f"   3. Split into smaller pipelines (50+ tables)")
            return False

        except Exception as e:
            print(f"\n❌ ERROR: Failed to monitor snapshots: {e}")
            traceback.print_exc()
            return False

    def wait_for_pipeline_idle(self, pipeline_id: str, max_wait_minutes: int = 30) -> bool:
        """
        Wait for pipeline to reach IDLE state (not running).

        The pipeline is polled every 10 seconds.

        Args:
            pipeline_id: Pipeline ID to check
            max_wait_minutes: Maximum minutes to wait

        Returns:
            bool: True if pipeline is idle, False on timeout, on a
            FAILED/DELETED pipeline state, or when the status check raises.
        """
        print(f"\n{'=' * 80}")
        print(f"CHECKING PIPELINE STATE")
        print(f"{'=' * 80}")
        print(f"Pipeline ID: {pipeline_id}")

        start_time = time.time()
        max_wait_seconds = max_wait_minutes * 60
        check_interval = 10  # Check every 10 seconds

        try:
            while (time.time() - start_time) < max_wait_seconds:
                pipeline = self.workspace_client.pipelines.get(pipeline_id)
                pipeline_state = pipeline.state.value if pipeline.state else 'UNKNOWN'

                elapsed = int(time.time() - start_time)

                if pipeline_state == 'IDLE':
                    print(f"✓ Pipeline is IDLE - ready for next run")
                    return True
                elif pipeline_state == 'RUNNING':
                    if pipeline.latest_updates:
                        latest_update = pipeline.latest_updates[0]
                        update_state = latest_update.state.value if latest_update.state else 'UNKNOWN'
                        print(f"   [{elapsed}s] Pipeline state: {pipeline_state}, Update state: {update_state}")

                        # A terminal update state means IDLE should follow shortly.
                        if update_state in ['COMPLETED', 'FAILED', 'CANCELED']:
                            print(f"   Update {update_state} - waiting for pipeline to become IDLE...")
                        else:
                            print(f"   Pipeline is running (update: {update_state})...")
                    else:
                        print(f"   [{elapsed}s] Pipeline state: {pipeline_state} (no updates)")
                elif pipeline_state in ['FAILED', 'DELETED']:
                    print(f"⚠ WARNING: Pipeline is in {pipeline_state} state")
                    return False
                else:
                    print(f"   [{elapsed}s] Pipeline state: {pipeline_state}")

                time.sleep(check_interval)

            print(f"\n⏰ TIMEOUT: Pipeline did not become IDLE within {max_wait_minutes} minutes")
            print(f"   You may continue anyway or wait manually")
            return False

        except Exception as e:
            print(f"⚠ WARNING: Failed to check pipeline status: {e}")
            return False


    def wait_for_snapshots_complete_via_api(self,
        gateway_pipeline_id: str,
        ingestion_id: str,
        expected_table_count: int,
        start_timestamp: "datetime | str" = '2000-01-01T00:00:00.000Z',
        start_ingestion_flag: bool = False,
        max_wait_minutes: int = 120
        ) -> bool:
        """
        Wait for all snapshots to complete by polling the pipeline events REST endpoint.

        Every 15 seconds the gateway's INFO events with
        ``timestamp >= <timestamp of the last event seen>`` are fetched in
        ascending order and SNAPSHOT_COMPLETED ``flow_progress`` events are
        counted. Because the filter is inclusive, events at the boundary
        timestamp come back on every poll; each event is therefore counted only
        once (keyed by its ``id``, falling back to timestamp/type/message).

        Polling stops early when a request fails or after 30 polls that
        returned no events at all.

        Args:
            gateway_pipeline_id: Gateway pipeline whose events are read.
            ingestion_id: Only used in log output.
            expected_table_count: Number of SNAPSHOT_COMPLETED events that counts as done.
            start_timestamp: Initial lower timestamp bound for the event filter.
            start_ingestion_flag: Accepted for signature parity; not used by this method.
            max_wait_minutes: Overall polling budget.

        Returns:
            bool: True if all snapshots completed, False if timeout or failure
        """
        print("\n" + "=" * 80)
        print("WAITING FOR GATEWAY SNAPSHOTS TO COMPLETE")
        print("=" * 80 + "\n")
        print(f"Waiting for snapshots to complete for ingestion {ingestion_id}...")
        print(f"Gateway ID: {gateway_pipeline_id}")
        print(f"Ingestion ID: {ingestion_id}")
        print(f"Max wait: {max_wait_minutes} minutes")
        print("Note: Event logs cannot be queried until gateway is RUNNING")
        print("Note: If your pipeline is not in RUNNING state, this method will fail")

        # The gateway state is NOT verified here (the wait_for_gateway_running
        # call is intentionally disabled); the caller must ensure it is RUNNING.
        print(f"\n[STEP 1] Ensuring gateway is in RUNNING state.. IDs {gateway_pipeline_id}/{ingestion_id}")

        api_obj = LakeflowAPI()

        start_time = time.time()
        max_wait_seconds = max_wait_minutes * 60
        check_interval = 15  # Check every 15 seconds
        max_zero_count = 30  # Maximum number of empty polls before giving up
        zero_count = 0
        elapsed_time = 0
        last_count = 0

        last_timestamp = start_timestamp
        snapshot_completed_total = 0
        seen_event_keys = set()  # events already counted, see docstring

        while elapsed_time < max_wait_seconds:

            event_filter = f"level='INFO' AND timestamp >= '{last_timestamp}'"
            return_events = api_obj.get_pipeline_events(
                p_pipeline_id=gateway_pipeline_id,
                p_filter=event_filter,
                p_page_token=None,
                p_max_results=250,
                p_order_by='timestamp asc')

            response_body = return_events.get('response')
            if return_events['status'] == 'ok' and isinstance(response_body, dict):
                events = response_body.get('events') or []
            else:
                events = []
            events_count = len(events)
            print(f"Event Response: Status={return_events['status']} Count={events_count} Filter={event_filter} Id={gateway_pipeline_id} ZeroCount={zero_count} Elapsed={elapsed_time} Max={max_wait_seconds}")

            if return_events['status'] != 'ok':
                break

            if events_count > 0:
                last_timestamp = events[-1]['timestamp']

                snapshot_completed_count = 0
                for event in events:
                    event_key = event.get('id') or (event.get('timestamp'), event.get('event_type'), event.get('message'))
                    if event_key in seen_event_keys:
                        continue
                    seen_event_keys.add(event_key)
                    if self._is_snapshot_completed_event(event):
                        snapshot_completed_count += 1

                snapshot_completed_total += snapshot_completed_count
                print(f"Snapshot Tracker: {snapshot_completed_count} / {snapshot_completed_total}")

                elapsed = int(time.time() - start_time)
                if snapshot_completed_total != last_count or elapsed % 30 == 0:
                    self._print_snapshot_progress(elapsed, snapshot_completed_total, expected_table_count)
                    last_count = snapshot_completed_total

                if snapshot_completed_total >= expected_table_count:
                    total_time = int(time.time() - start_time)
                    print(f"\n{'=' * 80}")
                    print(f"\n✅ SUCCESS: All {expected_table_count} snapshots completed!")
                    print(f"{'=' * 80}")
                    print(f"   Total time: {int(total_time / 60)} min {total_time % 60} sec")
                    print(f"   Gateway APIEvent  ({gateway_pipeline_id}) is ready!")
                    return True
            else:
                zero_count += 1
                if zero_count >= max_zero_count:
                    break

            time.sleep(check_interval)
            elapsed_time = int(time.time() - start_time)

        print(f"\n❌ ERROR: Not all snapshots completed after checking for {max_wait_seconds}. Expected: {expected_table_count}, Actual: {snapshot_completed_total} ZeroCount={zero_count}")
        return False





In [0]:
class LakeflowGatewayAPI(LakeflowAPI):
    """LakeflowAPI specialisation that creates INGESTION_GATEWAY pipelines from a template."""

    def __init__(self):
        super().__init__()

    def create_gateway_pipeline(self, p_pipeline_json, p_headers_dict=None):
        """Merge ``p_pipeline_json`` over the gateway template, validate it and create the pipeline.

        The merge is shallow: a caller-supplied ``gateway_definition`` replaces
        the template's one as a whole. Every template key whose default is
        ``None`` must therefore be provided by the caller.

        Args:
            p_pipeline_json: Dict of pipeline settings overriding the template.
            p_headers_dict: Extra HTTP headers passed through to ``create_pipeline``.

        Returns:
            The result dict of ``create_pipeline``.

        Raises:
            ValueError: If the input is empty, not a dict, or leaves a required
                template key unset.
        """
        print("\nCreating gateway pipeline...\n")

        # Check the raw input before merging: ``dict | None`` would otherwise
        # fail with a TypeError instead of the documented ValueError.
        if not p_pipeline_json:
            raise ValueError("Pipeline JSON is empty")
        if not isinstance(p_pipeline_json, dict):
            raise ValueError("Pipeline JSON is not a dictionary")

        v_template_pipeline_json = {
            "pipeline_type": "INGESTION_GATEWAY",
            "photon": False,
            "serverless": False,
            "continuous": True,
            "name": None,
            "catalog": None,
            "target": None,
            "clusters": None,
            # Gateway-specific configuration
            "gateway_definition": {
                "connection_name" : None,
                "gateway_storage_catalog": None,
                "gateway_storage_schema" : None
            }
        }
        v_pipeline_json = v_template_pipeline_json | p_pipeline_json
        self.validate_gateway_pipeline(v_template_pipeline_json, v_pipeline_json)
        return self.create_pipeline(v_pipeline_json, p_headers_dict)

    def validate_gateway_pipeline(self, p_template_pipeline_json, p_pipeline_json):
        """Check that ``p_pipeline_json`` sets every key named by the template.

        Args:
            p_template_pipeline_json: Template whose keys (and the keys of its
                ``gateway_definition``) are required.
            p_pipeline_json: Pipeline definition to validate.

        Returns:
            True when the definition is valid.

        Raises:
            ValueError: If the definition is empty, not a dict, misses a
                required key, has a required key set to ``None``, or carries a
                ``gateway_definition`` that is not a dict.
        """
        if not p_pipeline_json:
            raise ValueError("Pipeline JSON is empty")
        if not isinstance(p_pipeline_json, dict):
            raise ValueError("Pipeline JSON is not a dictionary")

        for key, template_value in p_template_pipeline_json.items():
            if p_pipeline_json.get(key) is None:
                raise ValueError(f"Invalid key: {key} or Value not set")

            if key == "gateway_definition":
                supplied_definition = p_pipeline_json[key]
                # Check the supplied value as well as the template: a string or
                # list here would otherwise pass the ``in`` test by accident
                # and fail later with a TypeError.
                if not isinstance(template_value, dict) or not isinstance(supplied_definition, dict):
                    raise ValueError("Gateway definition must be a dictionary")
                for key2 in template_value:
                    if supplied_definition.get(key2) is None:
                        raise ValueError(f"Invalid key: {key2} or Value not set")
        return True
        

In [0]:
class LakeflowIngestionAPI(LakeflowAPI):
    """Builds, validates and submits MANAGED_INGESTION pipeline specs."""

    def __init__(self):
        super().__init__()

    def create_ingestion_pipeline(self, p_pipeline_json, p_headers_dict=None):
        """Merge the caller's spec over the ingestion template, validate it and create the pipeline.

        The merge is shallow (dict ``|``): a caller-supplied
        "ingestion_definition" replaces the template's one wholesale, so it
        must itself carry "source_type", "ingestion_gateway_id" and "objects".

        Args:
            p_pipeline_json: Dict of pipeline settings; overrides template keys.
            p_headers_dict: Optional extra headers, passed through to
                ``create_pipeline``.

        Returns:
            Whatever ``create_pipeline`` returns for the merged spec.

        Raises:
            ValueError: If ``p_pipeline_json`` is not a dict or the merged
                spec fails ``validate_ingestion_pipeline``.
        """
        # Reject None / non-dict input up front; otherwise the merge below
        # raises a TypeError before validation can report the problem.
        if not isinstance(p_pipeline_json, dict):
            raise ValueError("Pipeline JSON is not a dictionary")

        v_template_pipeline_json = {
            "pipeline_type": "MANAGED_INGESTION",
            "name": None,
            "photon": True,
            "serverless": True,
            "continuous": False,
            # Ingestion-specific configuration
            "ingestion_definition": {
                "source_type": "SQLSERVER",
                "ingestion_gateway_id": None,
                "objects": [
                    {
                        "table": {
                            "source_catalog": None,
                            "source_schema": None,
                            "source_table": None,
                            "destination_catalog": None,
                            "destination_schema": None,
                            "destination_table": None,
                        }
                    }
                ],
            },
        }
        v_pipeline_json = v_template_pipeline_json | p_pipeline_json
        self.validate_ingestion_pipeline(v_template_pipeline_json, v_pipeline_json)
        return self.create_pipeline(v_pipeline_json, p_headers_dict)

    def validate_ingestion_pipeline(self, p_template_pipeline_json, p_pipeline_json):
        """Check that an ingestion pipeline spec fills in every field the template names.

        Top-level template keys and the keys of the template's
        "ingestion_definition" are required to be present and not None in the
        spec. "objects" must be a list of dicts; the fields inside each object
        are not checked here.

        Args:
            p_template_pipeline_json: Dict naming the required fields.
            p_pipeline_json: The pipeline spec to validate.

        Returns:
            True when the spec passes every check.

        Raises:
            ValueError: If the spec is empty, is not a dict, is missing a
                required field (or has it set to None), or has an
                "ingestion_definition" / "objects" value of the wrong type.
        """
        if not p_pipeline_json:
            raise ValueError("Pipeline JSON is empty")
        if not isinstance(p_pipeline_json, dict):
            raise ValueError("Pipeline JSON is not a dictionary")

        for key, value in p_template_pipeline_json.items():
            # .get() folds "key absent" and "value is None" into one test.
            if p_pipeline_json.get(key) is None:
                raise ValueError(f"Invalid key: {key} or Value not set")
            if key != "ingestion_definition":
                continue

            v_definition = p_pipeline_json[key]
            # Type-check the supplied definition, not only the template's.
            if not isinstance(value, dict) or not isinstance(v_definition, dict):
                raise ValueError("Ingestion definition must be a dictionary")

            for key2, value2 in value.items():
                if v_definition.get(key2) is None:
                    raise ValueError(f"Invalid key: {key2} or Value not set")
                if key2 == "objects":
                    v_objects = v_definition[key2]
                    # The supplied objects are what gets posted, so they are
                    # the ones that must be a list of dict entries.
                    if not isinstance(value2, list) or not isinstance(v_objects, list):
                        raise ValueError("Objects must be a list")
                    if not all(isinstance(v_object, dict) for v_object in v_objects):
                        raise ValueError("Object must be a dictionary")
        return True

In [0]:
# from datetime import datetime, timezone, timedelta
# #Current time in UTC
# now_utc = datetime.now(timezone.utc)
# # subtract 5 minutes
# now_utc = now_utc - timedelta(minutes=5)
# # Format with timezone offset and 'Z'
# formatted = now_utc.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'  # ISO 8601 with milliseconds

# print(formatted)

# # If you want explicit offset instead of 'Z'
# formatted_with_offset = now_utc.strftime('%Y-%m-%dT%H:%M:%S.%f%z')[:-3]
# print(formatted_with_offset)
