# Load Demo with DAG

In this we demonstrate loading of a sample data file [reduced_sample_data.json](../../data/reduced_sample_data.json) and showcase all the 
various resources that gets populated. We now are demonstrating using DAG

#### Pre-requisite
It is assumed that the setup steps are completed successfully. These are:
 - creating the database, schemas, stages
 - defining the external stage
 - defining the functions and stored procedures
 - defining the external tables and views

If you had not done this, use the streamlit (./bin/run_app.sh) to create these as defined in the subpage 'Setup'

In [65]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(sp_session == None):
    raise Exception(f'Unable to connect to snowflake. Validate connection information ')

sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,VSEKAR,PUBLIC,INDSOL_CMSGOV_PRICING,PUBLIC


In [66]:
import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time
import re

pd.set_option('display.max_colwidth', None)

def get_basename_of_datafile(p_datafile:str) -> str:
    base = os.path.basename(p_datafile)
    fl_base = os.path.splitext(base)
    return fl_base[0]

def get_cleansed_file_basename(p_datafile):
    fl_basename = get_basename_of_datafile(p_datafile)
    # Replace all non alphanumeric characters with _
    fl_name = re.sub('[^0-9a-zA-Z]+', '_', fl_basename)
    return fl_name

# INPUT_DATA_STAGE = config['APP_DB']['ext_stage']
INPUT_DATA_STAGE = 'data_stg'
DATA_STAGE_FOLDER = config['APP_DB']['folder_data']

DATA_FILE = '2022_10_01_priority_health_HMO_in-network-rates.zip'
# DATA_FILE = 'reduced_sample_data.json'

DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)
DATA_FILE_BASENAME_CLEANSED = get_cleansed_file_basename(DATA_FILE)

TARGET_DATA_STAGE = config['APP_DB']['ext_stage']
TARGET_FOLDER = config['APP_DB']['folder_parsed']

SEGMENTS_PER_TASK = 200

In [59]:
display(Markdown("### Cleanup block"))
# We will cleanup specific resources and artifacts from possible previous runs.

stmts = [
    f''' delete from segment_task_execution_status where data_file = '{DATA_FILE}'; '''
    ,f''' delete from task_to_segmentids where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_file_header where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_segment_header where data_file = '{DATA_FILE}'; '''
    ,f''' alter stage {INPUT_DATA_STAGE} refresh; '''
]    
    
print(' truncating tables ...')
for stmt in stmts:
    sp_session.sql(stmt).collect()

print(f''' cleaning up files in external stage under path {TARGET_FOLDER}/{DATA_FILE_BASENAME}/ ...''')

stmt = f''' select relative_path from directory(@{TARGET_DATA_STAGE}) where relative_path like '%{DATA_STAGE_FOLDER}/{DATA_FILE_BASENAME}/%'; '''
files = sp_session.sql(stmt).collect()
for r in files:
    stmt = f''' remove @{TARGET_DATA_STAGE}/{r['RELATIVE_PATH']}; '''
    sp_session.sql(stmt).collect()

### Cleanup block

 truncating tables ...
 cleaning up files in external stage under path raw_parsed/2022_10_01_priority_health_HMO_in-network-rates/ ...


---
## Data loading
We will be loading the segments and file header using DAG. 

In [61]:
# we build out the DAG
df = sp_session.call('in_network_rates_dagbuilder' ,f'{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}' ,DATA_FILE 
    ,f"@{TARGET_DATA_STAGE}/{TARGET_FOLDER}" ,SEGMENTS_PER_TASK ,config['SNOW_CONN']['warehouse'])

sp_session.sql(f''' alter stage {TARGET_DATA_STAGE} refresh; ''').collect()
print(' Status of execution')
print(df)

 Status of execution
{
  "data_file": "2022_10_01_priority_health_HMO_in-network-rates.zip",
  "root_task": "DAG_ROOT_2022_10_01_priority_health_HMO_in_network_rates",
  "status": true,
  "task_matrix_shape": [
    5,
    15
  ],
  "term_task": "TERM_tsk_2022_10_01_priority_health_HMO_in_network_rates"
}


The above operation results in defining the DAG in Snowflake like here. The task names are specific to the data file being parsed.
![](../../doc/soln_images/task_dags.png)

In [63]:
# Next we invoke the DAG

start_time = time.time()
print(f'Started at: {datetime.now().strftime("%H:%M:%S")}')

sql_stmts = [
    # f''' alter warehouse {config['SNOW_CONN']['warehouse']} set max_concurrency_level = 8 '''
    # XSMALL | SMALL | MEDIUM | LARGE | XLARGE | XXLARGE | XXXLARGE | X4LARGE | X5LARGE | X6LARGE
    ,f''' alter warehouse {config['SNOW_CONN']['warehouse']} set warehouse_size = SMALL; '''
    ,f''' execute task DAG_ROOT_{DATA_FILE_BASENAME_CLEANSED}; '''
]
for stmt in sql_stmts:
    print(stmt)
    sp_session.sql(stmt).collect()

end_time = time.time()
print(f'Ended at: {datetime.now().strftime("%H:%M:%S")}')

elapsed_time = end_time - start_time
elapsed = str(timedelta(seconds=elapsed_time))
print(f'Elapsed: {elapsed}')

Started at: 19:06:36
 alter warehouse DEMO_BUILD_WH set max_concurrency_level = 8 
 alter warehouse DEMO_BUILD_WH set warehouse_size = SMALL; 
Ended at: 19:06:38
Elapsed: 0:00:01.469378


---
## Inspection (TODO)

In [None]:
import time

continue_sleeping = True

while continue_sleeping == True: 
    time.sleep(2*60)

    sql_stmt = f'''
        select *
        from current_segment_parsing_tasks_v
        where l.data_file = '{DATA_FILE}'
        ;
    '''
    df_running = sp_session.table('current_segment_parsing_tasks_v').to_pandas()
    
    len_running_count = len(df_running)
    print(f'Current running tasks [{len_running_count}]...')
    display(df_running)
    
    sql_stmt = f'''
        select 
            count(l.*) as l_count
            ,count(r.*) as r_count
            ,l_count - r_count as row_count_diff
            -- l.*
        from segment_task_execution_status as l
            full outer join segments_counts_for_datafile_v as r
                on r.task_name = l.task_name
        where not (l.task_name  like any ('DAG_%' ,'TERM_%' ,'%T_FH_%' ))
            and JSON_EXTRACT_PATH_TEXT(l.task_ret_status ,'task_ignored_parsing') is null
            and l.data_file = '{DATA_FILE}'
    '''
    df = sp_session.sql(sql_stmt).to_pandas()
    row_count = df['ROW_COUNT_DIFF'][0]
    finished_row_count = df['R_COUNT'][0]
    print(f'Finished tasks [{row_count}] ...')
    if row_count == 0:
        print('Finished sleeping ...')
        continue_sleeping = False
        break

    display(df)
    print('\n ------------------------------ ')

In [None]:
display(Markdown("The table in_network_rates_file_header holds the file header elements"))

df = (sp_session.table('in_network_rates_file_header')
        .filter(F.col('DATA_FILE') == F.lit(DATA_FILE))
        .to_pandas())

display(df)

In [None]:
print(' The HEADER has the following data ')
json.loads(df['HEADER'][0])

In [None]:
display(Markdown("The table in_network_rates_segment_header lists the various segments that were loaded"))

df = (sp_session.table('in_network_rates_segment_header')
        .filter(F.col('DATA_FILE') == F.lit(DATA_FILE))
        .sample(n=5)
        .to_pandas())

display(df)

In [None]:
print(' The NEGOTIATED_RATES_INFO has the following data ')
json.loads(df['NEGOTIATED_RATES_INFO'][0])

In [None]:
para = f''' 
The parsed data are stored as parquet files in the external stage at: @{TARGET_DATA_STAGE}/{TARGET_FOLDER}. The directory structure follows the format:

<ext stage>/<folder_parsed>/<data file basename (ex: reduced_sample_data)/<segment_id>/<segment type (negotiated_rates)>/data_<seq_no>_<chunk_no>.parquet.gz

Description:
- ext stage : external stage name
- folder_parsed : configured value of APP_DB.folder_parsed in config.ini
- data file basename : the data file basename, without the file extension
- segment_id : a unique identifier for the segment, this is a composite key of <negotiation_arrangement>::<billing_code_type>::<billing_code>::<billing_code_type_version>
- segment type : indicates the segment children type, this could be either of negotiated_rates/bundled_codes/covered_services
- The file which will contain the record, stored in parquet file.
'''
print(para)

stmt = f''' select relative_path from directory(@{TARGET_DATA_STAGE}) where relative_path like '%{TARGET_FOLDER}/{DATA_FILE_BASENAME}%' limit 5; '''
df = sp_session.sql(stmt).to_pandas()
display(df)

In [None]:
para = f''' 
The parquet file can be queried inside Snowflake via external table: ext_negotiated_arrangments_staged. Each of the folder structure is partitioned to seperate columns , which can help with pruning to specific segments.
The value contains the negotiated_rates and other children elements stored in JSON format
'''
print(para)

pd.set_option('display.max_colwidth', 25)
df = (sp_session.table('ext_negotiated_arrangments_staged')
        .filter(F.col('P_DATA_FL') == F.lit(DATA_FILE_BASENAME))
        .sample(n=5)
        .to_pandas())

display(df)


In [None]:
print(' a sample view of one of the records')
j = json.loads(df['VALUE'][0])
print(j)

In [None]:
display(Markdown("The table segment_task_execution_status contains the audit of execution for the above stored procedures"))

pd.set_option('display.max_colwidth', None)
df = (sp_session.table('segment_task_execution_status')
        .filter(F.col('DATA_FILE') == F.lit(DATA_FILE))
        .sample(n=5)
        .to_pandas())

display(df)

In [None]:
display(Markdown("The view segments_counts_for_datafile_v, based of table segment_task_execution_status, will help to identify the number of negotiated_arrangement segments for a specific data file. This view is populated once all the segments in a specific data files are parsed out"))

df = (sp_session.table('segments_counts_for_datafile_v')
        .filter(F.col('DATA_FILE') == F.lit(DATA_FILE))
        .sample(n=5)
        .to_pandas())

display(df)

--- 
### Closeout

    With that we are finished this section of the demo setup

In [None]:
# sp_session.close()
print('Finished!!!')