# DAG monitoring

In [2]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

# Render a heading in the cell output so the initialization step is easy to spot.
display(Markdown("### Initialization"))
# Route log records to stdout (so they appear in the cell output) and only
# report records at ERROR level or above.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions: the IPython %run magic executes the script
# and makes the names it defines available in this notebook.
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

# Fail fast when no session was returned. `is None` is an identity check and,
# unlike `== None`, cannot be affected by a custom __eq__ on the session object.
if sp_session is None:
    raise Exception('Unable to connect to snowflake. Validate connection information ')

# Pin the session context (role, database.schema, warehouse) to the values
# from the configuration so every later query runs against the same objects.
sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

# Sanity check: show which user/role/database/schema the session ended up with.
df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,VSEKAR,PUBLIC,INDSOL_CMSGOV_PRICING,PUBLIC


In [8]:
# variable/parameter initialization

import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time
import re

pd.set_option('display.max_colwidth', None)

def get_basename_of_datafile(p_datafile:str) -> str:
    """Return the file name of ``p_datafile`` without directories and without its last extension.

    Only the final extension is removed, so ``'rates.json.gz'`` yields
    ``'rates.json'``.

    Args:
        p_datafile: Path or file name of the data file.

    Returns:
        The base file name with the trailing extension stripped.
    """
    file_name = os.path.basename(p_datafile)
    stem, _extension = os.path.splitext(file_name)
    return stem

def get_cleansed_file_basename(p_datafile):
    """Return the data file's base name with non-alphanumeric runs collapsed to ``_``.

    The base name is obtained from ``get_basename_of_datafile``; every run of
    one or more characters outside ``0-9``, ``a-z`` and ``A-Z`` is then
    replaced by a single underscore.

    Args:
        p_datafile: Path or file name of the data file.

    Returns:
        The cleansed base name.
    """
    # One or more consecutive non-alphanumeric characters form a single match,
    # so each run is replaced by exactly one underscore.
    non_alphanumeric_run = re.compile('[^0-9a-zA-Z]+')
    return non_alphanumeric_run.sub('_', get_basename_of_datafile(p_datafile))

# Stage and folder (both read from the project configuration) holding the input data file.
INPUT_DATA_STAGE = config['APP_DB']['ext_stage']
# INPUT_DATA_STAGE = 'data_stg'
DATA_STAGE_FOLDER = config['APP_DB']['folder_data']

# Data file to monitor. Keep exactly ONE assignment active; the commented
# entries are alternatives that can be swapped in. (Previously the first
# entry was assigned and then immediately overwritten, which hid the value
# actually in use.)
# DATA_FILE = '2022_10_01_priority_health_HMO_in-network-rates.zip'
# DATA_FILE = 'reduced_sample_data.json'
DATA_FILE = '2022-12-01_cigna-health-life-insurance-company_national-ppo_in-network-rates.json.gz'

# Derived names: base name without the last extension, and the same name with
# non-alphanumeric characters replaced by underscores.
DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)
DATA_FILE_BASENAME_CLEANSED = get_cleansed_file_basename(DATA_FILE)

# Stage and folder for the parsed output; the stage is the same config entry as the input stage.
TARGET_DATA_STAGE = config['APP_DB']['ext_stage']
TARGET_FOLDER = config['APP_DB']['folder_parsed']

# NOTE(review): not referenced by the cells visible in this notebook;
# presumably mirrors the bucket size used when the parsing tasks were created - confirm.
SEGMENTS_PER_TASK = 1500

# Echo the resolved locations so the cell output records what is being monitored.
print(f'Input DataFile: @{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}/{DATA_FILE}')
print(f'Target: @{TARGET_DATA_STAGE}/{TARGET_FOLDER}')

Input DataFile: @ext_data_stg/data/2022-12-01_cigna-health-life-insurance-company_national-ppo_in-network-rates.json.gz
Target: @ext_data_stg/raw_parsed


---
## Inspection

In [13]:

print('List of tasks running currently ...')

# Query the view exactly once. The earlier unconditional
# `sp_session.table(...).to_pandas()` pulled the whole view and was then
# overwritten by this result, costing an extra round trip for nothing.
# The statement has no placeholders, so a plain string is used instead of an f-string.
sql_stmt = '''
select 
    * exclude(task_name ,assigned_task_name)
    ,timestampdiff('minute' ,start_time ,current_timestamp()) as elapsed_minutes
from current_segment_parsing_tasks_v
order by bucket
'''
df_running = sp_session.sql(sql_stmt).to_pandas()
display(df_running)


List of tasks running currently ...


Unnamed: 0,START_TIME,BUCKET,FROM_IDX,TO_IDX,SEGMENTS_RECORD_COUNT,ELAPSED_MINUTES
0,2023-01-08 17:55:18.075,0,0,1500,1500,38
1,2023-01-08 17:53:54.269,1,1501,3000,1499,40
2,2023-01-08 17:55:18.655,2,3001,4500,1499,38
3,2023-01-08 17:53:55.655,3,4501,6000,1499,40
4,2023-01-08 17:55:18.613,4,6001,7500,1499,38


In [14]:
print('Tasks that had executed ...')

# Completed task executions for the monitored data file: shortened task name,
# the stored segment count taken from the returned status, and the run time.
# DATA_FILE is a notebook constant; do not interpolate untrusted input here.
sql_stmt = f'''
select 
    split_part(task_name ,'.' ,3) as task_name_shortened
    ,* exclude(data_file ,task_name ,inserted_at ,start_time ,end_time ,task_ret_status)
    ,task_ret_status:stored_segment_count::int as stored_segment_count
    ,timestampdiff('minute' ,start_time ,end_time) as elapsed_minutes
from segment_task_execution_status
where 
    data_file = '{DATA_FILE}'
    and task_ret_status is not null
'''
executed_tasks_df = sp_session.sql(sql_stmt).to_pandas()
display(executed_tasks_df)

# Sum the column directly. The previous `df[[col]].sum()[0]` looked up the
# integer key 0 positionally on a label-indexed Series, which pandas 2.x
# deprecates. The total is a count, so report it as an integer (0 when no
# task has finished yet) rather than as a float.
total_segment_count_stored = int(executed_tasks_df['STORED_SEGMENT_COUNT'].sum())
print(f'Total segment counts stored so far: {total_segment_count_stored}')

Tasks that had executed ...


Unnamed: 0,TASK_NAME_SHORTENED,STORED_SEGMENT_COUNT,ELAPSED_MINUTES


Total segment counts stored so far: 0.0


In [15]:
print('File ingestion statuses')

# Load the ingestion-elapsed view into pandas and render it in the cell output.
file_ingestion_df = (
    sp_session
    .table('file_ingestion_elapsed_v')
    .to_pandas()
)
display(file_ingestion_df)

File ingestion statuses


Unnamed: 0,DATA_FILE,START_TIME,END_TIME,ELAPSED_MINUTES
0,reduced_sample_data.json,2023-01-07 04:01:57.259,2023-01-08 18:33:48.100000-08:00,2312
1,2022-12-01_cigna-health-life-insurance-company_national-ppo_in-network-rates.json.gz,2023-01-08 17:53:33.246,2023-01-08 18:33:48.100000-08:00,40
2,2022_10_01_priority_health_HMO_in-network-rates.zip,2023-01-08 05:45:21.341,2023-01-08 18:33:48.100000-08:00,768


--- 
### Closeout

    With that, we have finished this section of the demo setup.

In [16]:
# The session close is intentionally left commented out, so the Snowflake
# session stays open after this cell runs.
# NOTE(review): presumably kept open so cells can be re-run while monitoring -
# confirm, and uncomment when the session is no longer needed.
# sp_session.close()
# Marker in the cell output that the notebook ran to the end.
print('Finished!!!')

Finished!!!
