# Load Demo with DAG for CIGNA

In [1]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

if(sp_session == None):
    raise Exception(f'Unable to connect to snowflake. Validate connection information ')

sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,SOLNDEMOUSR,PUBLIC,INDSOL_CMSGOV_PRICING,PUBLIC


In [2]:

# Create warehouses for parallelism
# whs = []
# sp_session.sql('use role accountadmin;').collect()
# for idx in range(11):
#     wh_name = f'INDSOL_PRICE_TRANS_TASK_{idx}_WH'
#     sql_stmt = f'''
#         create or replace warehouse {wh_name} with
#             WAREHOUSE_SIZE = 'MEDIUM'
#             AUTO_RESUME = TRUE
#             AUTO_SUSPEND = 300
#             COMMENT = 'warehouse created as part of pricing transperancy industry solution usecase.'
#         ;
#     '''
#     sp_session.sql(sql_stmt).collect()
#     sp_session.sql(f'grant ALL PRIVILEGES on warehouse {wh_name} to role public;').collect()
#     whs.append(wh_name)
# print(','.join(whs))
# sp_session.use_role(f'''{config['APP_DB']['role']}''')

In [3]:
# Parameter initialization

import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time
import re

# Never truncate column contents when a frame is displayed.
pd.options.display.max_colwidth = None

def get_basename_of_datafile(p_datafile:str) -> str:
    """Return the file name of ``p_datafile`` without directories and without its last extension.

    Only the final extension is removed, so ``x.json.gz`` yields ``x.json``.
    """
    stem, _extension = os.path.splitext(os.path.basename(p_datafile))
    return stem

def get_cleansed_file_basename(p_datafile):
    """Return the data file's basename with every run of non-alphanumeric characters collapsed to one ``_``."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub('_', get_basename_of_datafile(p_datafile))

# All stage and folder names come from the APP_DB section of the config.
app_db_settings = config['APP_DB']

# Where the raw input file is read from.
INPUT_DATA_STAGE = app_db_settings['ext_stage']
# INPUT_DATA_STAGE = 'data_stg'
DATA_STAGE_FOLDER = app_db_settings['folder_data']

# Where the parsed output is written (same external stage entry as the input).
TARGET_DATA_STAGE = app_db_settings['ext_stage']
TARGET_FOLDER = app_db_settings['folder_parsed']

# The file to load; alternatives are kept commented out for quick switching.
# DATA_FILE = '2022_10_01_priority_health_HMO_in-network-rates.zip'
# DATA_FILE = 'reduced_sample_data.json'
DATA_FILE = '2022-12-01_cigna-health-life-insurance-company_national-ppo_in-network-rates.json.gz'

# Derived names: the plain basename, and a variant safe to embed in identifiers.
DATA_FILE_BASENAME_CLEANSED = get_cleansed_file_basename(DATA_FILE)
DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)

# Passed to the DAG builder procedure; presumably the number of segments each task handles — confirm.
SEGMENTS_PER_TASK = 15000

# Comma-separated warehouse names handed to the DAG builder. The names follow the
# INDSOL_PRICE_TRANS_TASK_<n>_WH pattern used by the commented-out warehouse
# creation cell (n = 0..10), so they are derived rather than typed out by hand.
# To use only the session warehouse instead, switch to:
# warehouses = config['SNOW_CONN']['warehouse']
TASK_WAREHOUSE_COUNT = 11
warehouses = ','.join(f'INDSOL_PRICE_TRANS_TASK_{idx}_WH' for idx in range(TASK_WAREHOUSE_COUNT))

# XSMALL | SMALL | MEDIUM | LARGE | XLARGE | XXLARGE | XXXLARGE | X4LARGE | X5LARGE | X6LARGE
warehouse_size = 'MEDIUM'


In [4]:

display(Markdown("### Cleanup block"))
# We will cleanup specific resources and artifacts from possible previous runs,
# so that the load for DATA_FILE can be repeated.

# Delete this file's rows from the tracking tables, then refresh the
# directory table of the input stage.
stmts = [
    f''' delete from segment_task_execution_status where data_file = '{DATA_FILE}'; '''
    ,f''' delete from task_to_segmentids where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_file_header where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_segment_header where data_file = '{DATA_FILE}'; '''
    ,f''' alter stage {INPUT_DATA_STAGE} refresh; '''
]

print(' truncating tables ...')
for stmt in stmts:
    sp_session.sql(stmt).collect()

print(f''' cleaning up files in external stage under path {TARGET_FOLDER}/{DATA_FILE_BASENAME}/ ...''')

# The parsed output is written under TARGET_FOLDER (the DAG builder is pointed at
# @{TARGET_DATA_STAGE}/{TARGET_FOLDER}), so that is the folder to purge. Filtering on
# DATA_STAGE_FOLDER would look in the input folder and leave the parsed files behind.
stmt = f''' select relative_path from directory(@{TARGET_DATA_STAGE}) where relative_path like '%{TARGET_FOLDER}/{DATA_FILE_BASENAME}/%'; '''
files = sp_session.sql(stmt).collect()
for r in files:
    stmt = f''' remove @{TARGET_DATA_STAGE}/{r['RELATIVE_PATH']}; '''
    sp_session.sql(stmt).collect()

### Cleanup block

 truncating tables ...


In [None]:
# Apply the concurrency level and the configured size to every listed warehouse.
wh_names = warehouses.split(',')
print(f'''No of warehouses: {len(wh_names)}''')

for wh_name in wh_names:
    alter_stmts = (
        f''' alter warehouse {wh_name} set max_concurrency_level = 8; ''',
        f''' alter warehouse {wh_name} set warehouse_size = {warehouse_size}; ''',
    )
    for alter_stmt in alter_stmts:
        sp_session.sql(alter_stmt).collect()


---
## Data loading
We will load the segments and the file header using a DAG of tasks.

In [None]:
# Build the DAG by calling the in_network_rates_dagbuilder_matrix procedure.
# Its arguments are collected first so each one can be read on its own line.
dagbuilder_args = (
    f'{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}'   # input stage path
    ,DATA_FILE
    ,f"@{TARGET_DATA_STAGE}/{TARGET_FOLDER}"    # output stage path
    ,SEGMENTS_PER_TASK
    ,warehouses
    ,10     # NOTE(review): meaning not visible in this notebook — confirm against the procedure signature
    ,8      # NOTE(review): meaning not visible in this notebook — confirm against the procedure signature
)
df = sp_session.call('in_network_rates_dagbuilder_matrix', *dagbuilder_args)

# Refresh the directory table of the target stage after the call.
sp_session.sql(f''' alter stage {TARGET_DATA_STAGE} refresh; ''').collect()
print(' Status of execution')
print(df)

The above operation defines the DAG in Snowflake, as shown below. The task names are specific to the data file being parsed.
![](../../doc/soln_images/task_dags.png)

In [None]:
# Kick off the root task of the DAG built for this data file and time the call.
# NOTE(review): the elapsed time covers the `execute task` statement itself; whether
# that waits for the whole DAG to finish is not visible here — confirm.
CLOCK_FORMAT = "%H:%M:%S"

start_time = time.time()
print(f'Started at: {datetime.now().strftime(CLOCK_FORMAT)}')

root_task_stmt = f''' execute task DAG_ROOT_{DATA_FILE_BASENAME_CLEANSED}; '''
print(root_task_stmt)
sp_session.sql(root_task_stmt).collect()

end_time = time.time()
print(f'Ended at: {datetime.now().strftime(CLOCK_FORMAT)}')

# timedelta renders the float seconds as H:MM:SS.ffffff
print(f'Elapsed: {timedelta(seconds=end_time - start_time)}')

---
## Inspection

In [None]:

print('Tasks to segments')
# Show the TASK_TO_SEGMENTIDS rows recorded for the current data file.
task_segments_tbl = sp_session.table('TASK_TO_SEGMENTIDS')
matches_data_file = F.col('DATA_FILE') == F.lit(DATA_FILE)
file_ingestion_df = task_segments_tbl.filter(matches_data_file).to_pandas()
display(file_ingestion_df)

In [None]:

print('Tasks ,warehouses and state')
# List the tasks in the application database; the select below then reads that
# listing through result_scan(last_query_id()) and keeps the tasks whose name
# contains the cleansed data file basename.
app_database = config['APP_DB']['database']
sp_session.sql(f''' SHOW TASKS IN  DATABASE {app_database}; ''').collect()

task_name_fragment = DATA_FILE_BASENAME_CLEANSED.upper()
stmt = f'''
    select "name" as task_name
        ,"warehouse" as warehouse
        ,"state" as state
    from table(result_scan(last_query_id()))
    where "name" like '%{task_name_fragment}%'
       -- and state != 'suspended'
    order by state;
'''
df = sp_session.sql(stmt).to_pandas()
display(df)

--- 
### Closeout

    With that, we have finished this section of the demo setup.

In [None]:
# Close the Snowflake session opened in the initialization cell; cells that
# use sp_session cannot be run after this point without re-initializing.
sp_session.close()
print('Finished!!!')