# DAG monitoring

This notebook monitors the DAG of tasks that processes a specific data file.

In [1]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(sp_session == None):
    raise Exception(f'Unable to connect to snowflake. Validate connection information ')

sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,SOLNDEMOUSR,PUBLIC,INDSOL_CMSGOV_PRICING,PUBLIC


In [2]:
# variable/parameter initialization

import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time ,math
import re

# Do not truncate long cell values (e.g. task names) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

def get_basename_of_datafile(p_datafile:str) -> str:
    """Return the file name of ``p_datafile`` without directories or its last extension.

    Only the final extension is removed, so ``'dir/rates.json.gz'`` yields
    ``'rates.json'``.
    """
    file_name = os.path.basename(p_datafile)
    stem, _extension = os.path.splitext(file_name)
    return stem

def get_cleansed_file_basename(p_datafile):
    """Return the data file's base name with every run of non-alphanumerics collapsed to ``_``.

    The base name comes from ``get_basename_of_datafile``; each run of one or more
    characters outside ``0-9a-zA-Z`` becomes a single underscore.
    """
    non_alphanumeric_run = '[^0-9a-zA-Z]+'
    return re.sub(non_alphanumeric_run, '_', get_basename_of_datafile(p_datafile))

# Stage and folder holding the input data file. The stage name is hard-coded here;
# the config-driven alternative is kept commented out for switching back.
# INPUT_DATA_STAGE = config['APP_DB']['ext_stage']
INPUT_DATA_STAGE = 'data_stg'
DATA_STAGE_FOLDER = config['APP_DB']['folder_data']

# Data file being monitored; alternative files are kept commented out below.
DATA_FILE = '2022_10_01_priority_health_HMO_in-network-rates.zip'
# DATA_FILE = 'reduced_sample_data.json'
# DATA_FILE = '2022-12-01_cigna-health-life-insurance-company_national-ppo_in-network-rates.json.gz'

# Derived names: file name without its last extension, and the same with
# non-alphanumeric runs replaced by '_' (used later to match task names).
DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)
DATA_FILE_BASENAME_CLEANSED = get_cleansed_file_basename(DATA_FILE)

# Stage and folder for the parsed output, both taken from the config.
TARGET_DATA_STAGE = config['APP_DB']['ext_stage']
TARGET_FOLDER = config['APP_DB']['folder_parsed']


# Echo the resolved locations so the reader can confirm what is being monitored.
print(f'Input DataFile: @{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}/{DATA_FILE}')
print(f'Target: @{TARGET_DATA_STAGE}/{TARGET_FOLDER}')

Input DataFile: @data_stg/data/2022_10_01_priority_health_HMO_in-network-rates.zip
Target: @ext_data_stg/raw_parsed


---
## Inspection

In [11]:

print('List of tasks running currently ...')

# Query the view once. The task-name columns are dropped, and the minutes elapsed
# since each task's start_time are added, ordered by bucket.
sql_stmt = '''
select 
    * exclude(task_name ,assigned_task_name)
    ,timestampdiff('minute' ,start_time ,current_timestamp()) as elapsed_minutes
from current_segment_parsing_tasks_v
order by bucket
'''
df_running = sp_session.sql(sql_stmt).to_pandas()
display(df_running)


List of tasks running currently ...


Unnamed: 0,START_TIME,BUCKET,FROM_IDX,TO_IDX,SEGMENTS_RECORD_COUNT,ELAPSED_MINUTES
0,2023-01-10 08:48:34.820,3,3001,4000,999,12
1,2023-01-10 08:48:34.724,4,4001,5000,999,12
2,2023-01-10 08:54:59.174,5,5001,6000,999,6
3,2023-01-10 08:56:55.396,6,6001,7000,999,4
4,2023-01-10 08:59:22.865,7,7001,8000,999,1


In [4]:

print('Tasks ,warehouses and state')
# Step 1: run SHOW TASKS for the application database. Its output is not used
# directly; it is read back by the next query through result_scan(last_query_id()).
# No other statement may run on this session between the two calls.
sp_session.sql(f''' SHOW TASKS IN  DATABASE {config['APP_DB']['database']}; ''').collect()
# Step 2: keep only the tasks whose name contains the cleansed, upper-cased data
# file base name, and project name / warehouse / state.
# NOTE(review): '_' is a single-character wildcard in LIKE, so the match is slightly
# looser than a literal substring match.
stmt = f'''
    select "name" as task_name
        ,"warehouse" as warehouse
        ,"state" as state
    from table(result_scan(last_query_id()))
    where "name" like '%{DATA_FILE_BASENAME_CLEANSED.upper()}%'
    order by state;
'''

# Alternative query (disabled): all SHOW TASKS columns, restricted to tasks whose
# state is neither 'suspended' nor 'started'.
# stmt = f'''
#     select *
#     from table(result_scan(last_query_id()))
#     where "name" like '%{DATA_FILE_BASENAME_CLEANSED.upper()}%'
#     and "state" not in ('suspended' ,'started')
#     ;
# '''

# Execute whichever statement is active above and render the result.
df = sp_session.sql(stmt).to_pandas()
display(df)


Tasks ,warehouses and state


Unnamed: 0,TASK_NAME,WAREHOUSE,STATE
0,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_17001_18000,DEMO_BUILD_WH,started
1,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_0_1000,DEMO_BUILD_WH,started
2,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_10001_11000,DEMO_BUILD_WH,started
3,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_1001_2000,DEMO_BUILD_WH,started
4,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_11001_12000,DEMO_BUILD_WH,started
...,...,...,...
74,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_8001_9000,DEMO_BUILD_WH,started
75,T_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES_9001_10000,DEMO_BUILD_WH,started
76,T_FH_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES,DEMO_BUILD_WH,started
77,T_SEGH_2022_10_01_PRIORITY_HEALTH_HMO_IN_NETWORK_RATES,DEMO_BUILD_WH,started


In [5]:

print('Tasks that had executed ...')
# One row per task of this data file that has recorded a return status, with the
# stored segment count taken from the status payload and the run time in minutes.
sql_stmt = f'''
select 
    split_part(task_name ,'.' ,3) as task_name_shortened
    ,* exclude(data_file ,task_name ,inserted_at ,start_time ,end_time ,task_ret_status)
    ,task_ret_status:stored_segment_count::int as stored_segment_count
    ,timestampdiff('minute' ,start_time ,end_time) as elapsed_minutes
from segment_task_execution_status
where 
    data_file = '{DATA_FILE}'
    and task_ret_status is not null
'''
executed_tasks_df = sp_session.sql(sql_stmt).to_pandas()

# Defaults reported when no task has completed yet.
total_segment_count_stored ,avg_load_time = (0,0)

total_rows = len(executed_tasks_df)
print(f'No of tasks completed: {total_rows}')

if total_rows > 0:
    # Sum the column directly. Indexing the summed Series with [0] treats an integer
    # key as a position on a label-indexed Series, which pandas has deprecated.
    total_segment_count_stored = executed_tasks_df['STORED_SEGMENT_COUNT'].sum()
    print(f'Total segment counts stored so far: {total_segment_count_stored}')

    # mean() is NaN when every ELAPSED_MINUTES value is null, and math.ceil(NaN)
    # raises ValueError, so only round up when a real average exists.
    mean_elapsed_minutes = executed_tasks_df['ELAPSED_MINUTES'].mean()
    if pd.notna(mean_elapsed_minutes):
        avg_load_time = math.ceil(mean_elapsed_minutes)
        print(f'Average load time per task: {avg_load_time} minutes')
    else:
        print('Average load time per task: not available (no elapsed times recorded)')

print('sample 5 rows')
display(executed_tasks_df.head(5))


Tasks that had executed ...
No of tasks completed: 0
sample 5 rows


Unnamed: 0,TASK_NAME_SHORTENED,STORED_SEGMENT_COUNT,ELAPSED_MINUTES


In [6]:

print('File ingestion statuses')

# Rows of the elapsed-time view that belong to the monitored data file.
ingestion_rows = sp_session.table('file_ingestion_elapsed_v').filter(
    F.col('DATA_FILE') == F.lit(DATA_FILE)
)

# Drop the file-name column and add the start-to-end duration in hours.
file_ingestion_df = ingestion_rows.select_expr(
    "* exclude(data_file)",
    "timestampdiff('hour' ,start_time ,end_time) as elapsed_hours",
).to_pandas()

display(file_ingestion_df)

File ingestion statuses


Unnamed: 0,START_TIME,END_TIME,ELAPSED_MINUTES,ELAPSED_HOURS
0,2023-01-10 08:48:21.230,2023-01-10 08:48:34.887000-08:00,0,0


In [7]:

print('File header')
# Snowpark DataFrame over the header table, restricted to the monitored data file.
# It stays a Snowpark DataFrame (to_pandas is disabled) and is printed with show().
df = (
    sp_session.table('in_network_rates_file_header')
    .filter(F.col('DATA_FILE') == F.lit(DATA_FILE))
    .select_expr(
        # Alternative projection (disabled): every column except the listed ones.
        # "* exclude(header ,inserted_at ,data_file_basename ,cleansed_data_file_basename ,data_file)"
        "header ,inserted_at"
        # Read total_segments out of the header value and cast it to an integer.
        ,"header:total_segments::int as total_segments_in_file"
        )
    # .to_pandas()
)
# display(df)
df.show()

File header
-------------------------------------------------------
|"HEADER"  |"INSERTED_AT"  |"TOTAL_SEGMENTS_IN_FILE"  |
-------------------------------------------------------
|          |               |                          |
-------------------------------------------------------



In [8]:

print('New files loaded in stage:')
# Refresh the target stage, using the stage name from the config (TARGET_DATA_STAGE)
# rather than a hard-coded one, so this cell follows the rest of the notebook.
# The rows returned by the refresh are what gets counted below.
df = sp_session.sql(f'alter stage {TARGET_DATA_STAGE} refresh;').collect()
# Refresh the external table as well so it picks up the stage's current files.
sp_session.sql('alter external table ext_negotiated_arrangments_staged refresh;').collect()
cnt = len(df)
print(f'Files count: {cnt}')

New files loaded in stage:
Files count: 0


--- 
### Closeout

    With that, we have finished this section of the demo setup.

In [9]:
# The session is left open: close() is disabled, presumably so cells can be re-run
# while monitoring. Uncomment to release the Snowflake connection.
# sp_session.close()
print('Finished!!!')

Finished!!!


In [10]:
# Optional cleanup (disabled): calls the `delete_dag_for_datafile` procedure with the
# cleansed data file base name.
# NOTE(review): the meaning of the second argument (False) is not visible here; confirm before enabling.
# sp_session.call('delete_dag_for_datafile',DATA_FILE_BASENAME_CLEANSED ,False);