# Load Demo

In [28]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

# Render a heading in the cell output so the initialization results are easy to spot.
display(Markdown("### Initialization"))
# Send log records to stdout (so they appear in the cell output) and only surface ERROR and above.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
# NOTE(review): a later cell calls list_stage(), which is not defined in this notebook —
# presumably it is provided by this script; confirm.
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

# Fail fast when no session came back. `is None` is an identity check, so it cannot be
# affected by an object that overrides equality.
if sp_session is None:
    raise Exception('Unable to connect to snowflake. Validate connection information ')

# Switch the session to the role, schema and warehouse named in the config.
sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

# Display the resulting session context as a sanity check.
df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,SOLNDEMOUSR,PUBLIC,INDSOL_CMSGOV_PRICING_EJ,PUBLIC


In [30]:
# Parameter initialization

import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time
import re

pd.set_option('display.max_colwidth', None)

def get_file_from_download_url(p_fl_url):
    """Return the last '/'-separated segment of ``p_fl_url``.

    A value containing no '/' is returned unchanged; a value ending in '/'
    yields an empty string.
    """
    # str.split always yields at least one element, so the starred unpack is safe.
    *_, file_name = p_fl_url.split('/')
    return file_name

def get_basename_of_datafile(p_datafile:str) -> str:
    """Return the file name of ``p_datafile`` without directories and without its last extension."""
    stem, _extension = os.path.splitext(os.path.basename(p_datafile))
    return stem

def get_cleansed_file_basename(p_datafile):
    """Return the extension-less base name of ``p_datafile`` with every run of
    non-alphanumeric characters collapsed into a single '_'.
    """
    return re.sub('[^0-9a-zA-Z]+', '_', get_basename_of_datafile(p_datafile))

# URL of the in-network rates archive to load.
DATA_FILE_URL = 'https://priorityhealthtransparencymrfs.s3.amazonaws.com/2023_03_01_priority_health_HMO_in-network-rates.zip'
# File name: the last path segment of the URL.
DATA_FILE = get_file_from_download_url(DATA_FILE_URL)
# DATA_FILE without its extension.
DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)
# Base name with each run of non-alphanumeric characters replaced by '_';
# later cells use it to build and look up task names (e.g. DAG_ROOT_<name>).
DATA_FILE_BASENAME_CLEANSED = get_cleansed_file_basename(DATA_FILE)

# INPUT_DATA_STAGE = config['APP_DB']['ext_stage']
# NOTE(review): the input stage is hard-coded here instead of being read from the config
# (see the commented-out line above) — confirm this is intentional.
INPUT_DATA_STAGE = 'data_stg'
# Folder inside the input stage that holds the data file.
DATA_STAGE_FOLDER = config['APP_DB']['folder_data']

# Stage and folder handed to the DAG builder as the output location.
TARGET_DATA_STAGE = config['APP_DB']['ext_stage']
TARGET_FOLDER = config['APP_DB']['folder_parsed']

# This will need to be updated based on provider
# Priority Health ~ 500
# CIGNA ~ 15000
# Passed to the in_network_rates_dagbuilder_matrix procedure when the DAG is built.
SEGMENTS_PER_TASK = 500

# Set to False to skip (re)creating the task warehouses in the warehouse-creation cell.
create_warehouses = True

# Number of dedicated task warehouses to spread the DAG tasks across.
# Raise it (e.g. to 11) for more parallelism; the names are generated from it.
WAREHOUSE_COUNT = 6

# Comma-separated warehouse names, INDSOL_PRICE_TRANS_TASK_0_WH .. INDSOL_PRICE_TRANS_TASK_<n-1>_WH.
# Kept as a single string because later cells split it on ',' and pass it as-is to the DAG builder.
# (The previous assignment from config['SNOW_CONN']['warehouse'] was dead code: it was
# overwritten by this list before ever being read.)
warehouses = ','.join(f'INDSOL_PRICE_TRANS_TASK_{i}_WH' for i in range(WAREHOUSE_COUNT))

# XSMALL | SMALL | MEDIUM | LARGE | XLARGE | XXLARGE | XXXLARGE | X4LARGE | X5LARGE | X6LARGE
warehouse_size = 'MEDIUM'


In [24]:
# Download file and upload to stage

from urllib import request

# Create local download folder
# os.path.join avoids the doubled separator that string concatenation produced
# (PROJECT_HOME_DIR already ends with '/').
DOWNLOAD_FOLDER = os.path.join(PROJECT_HOME_DIR, 'temp_t')
os.makedirs(DOWNLOAD_FOLDER ,exist_ok=True)
download_file_path = os.path.join(DOWNLOAD_FOLDER, DATA_FILE)

# Download only when the file itself is missing. The check must be on the file, not on
# its parent directory: the directory always exists after makedirs above, so testing it
# would skip the download on every run.
if not os.path.exists(download_file_path):
    print(f'Downloading file to local: {download_file_path}')
    request.urlretrieve(DATA_FILE_URL, download_file_path)
else:
    print(f'Using previously downloaded file: {download_file_path}')

Downloading file to local: ../..//temp_t/2023_03_01_priority_health_HMO_in-network-rates.zip


('../..//temp_t/2023_03_01_priority_health_HMO_in-network-rates.zip',
 <http.client.HTTPMessage at 0x7f915aef7af0>)

In [40]:
# Upload data file to stage, if not present

# Look the file up in the stage's directory table.
# The pattern deliberately has no trailing '_': in LIKE, '_' matches exactly one character,
# so '%<file>_%' can never match a staged path that ends with <file> — which is what
# put(auto_compress=False) produces — and the upload would run on every execution.
sql_stmt = f'''select relative_path
from directory(@{INPUT_DATA_STAGE})
where relative_path like '%{DATA_FILE}%'
;'''
print(sql_stmt)

df = sp_session.sql(sql_stmt).to_pandas()
if (len(df) < 1):
    print(f'Uploading to stage {INPUT_DATA_STAGE} ...')
    sp_session.file.put(
        local_file_name = download_file_path
        ,stage_location = f'{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}'
        ,auto_compress=False ,overwrite=True)

    # Refresh the directory table so the file just uploaded is visible to the re-query below.
    sp_session.sql(f''' alter stage {INPUT_DATA_STAGE} refresh; ''').collect()
    df = sp_session.sql(sql_stmt).to_pandas()

display(df)

Uploading to stage data_stg ...


In [25]:


# List the input stage so the uploaded file can be checked visually.
# NOTE(review): list_stage is not defined in this notebook — presumably it is provided by
# ./scripts/notebook_helpers.py (sourced in the initialization cell); confirm.
stage_df = list_stage(sp_session ,INPUT_DATA_STAGE)
display(stage_df)

Uploading to stage data_stg ...


SnowparkFetchDataException: (1406): Failed to fetch a Pandas Dataframe. The error is: to_pandas() did not return a Pandas DataFrame. If you use session.sql(...).to_pandas(), the input query can only be a SELECT statement. Or you can use session.sql(...).collect() to get a list of Row objects for a non-SELECT statement, then convert it to a Pandas DataFrame.

In [31]:
# Create warehouses for parallelism

if create_warehouses:
    whs = warehouses.split(',')
    print(f'Creating {len(whs)} warehouses ..')

    # Role that receives the grants; looked up once since it does not change per warehouse.
    rl = config['APP_DB']['role']

    # Warehouses are created under sysadmin.
    sp_session.sql('use role sysadmin;').collect()
    try:
        for wh_name in whs:
            print(f'    - {wh_name}')
            # Created as XSMALL here; a later cell resizes them to `warehouse_size`.
            # NOTE: "create or replace" drops and recreates a warehouse that already exists.
            sql_stmt = f'''
            create or replace warehouse {wh_name} with
                WAREHOUSE_SIZE = 'XSMALL'
                AUTO_RESUME = TRUE
                AUTO_SUSPEND = 300
                COMMENT = 'warehouse created as part of pricing transperancy industry solution usecase.'
            ;
        '''
            sp_session.sql(sql_stmt).collect()
            sp_session.sql(f'grant ALL PRIVILEGES on warehouse {wh_name} to role {rl};').collect()
    finally:
        # Always switch back to the application role, even when a statement above fails,
        # so the rest of the notebook does not keep running as sysadmin.
        sp_session.use_role(f'''{config['APP_DB']['role']}''')

Creating 6 warehouses ..
    - INDSOL_PRICE_TRANS_TASK_0_WH
    - INDSOL_PRICE_TRANS_TASK_1_WH
    - INDSOL_PRICE_TRANS_TASK_2_WH
    - INDSOL_PRICE_TRANS_TASK_3_WH
    - INDSOL_PRICE_TRANS_TASK_4_WH
    - INDSOL_PRICE_TRANS_TASK_5_WH


In [32]:
# Cleanup block

# We will cleanup specific resources and artifacts from possible previous runs.
# Only rows belonging to the current data file are deleted; the last statement refreshes
# the input stage's directory table.
stmts = [
    f''' delete from segment_task_execution_status where data_file = '{DATA_FILE}'; '''
    ,f''' delete from task_to_segmentids where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_file_header where data_file = '{DATA_FILE}'; '''
    ,f''' delete from in_network_rates_segment_header where data_file = '{DATA_FILE}'; '''
    ,f''' alter stage {INPUT_DATA_STAGE} refresh; '''
]

print(' truncating tables ...')
for stmt in stmts:
    sp_session.sql(stmt).collect()

# The DAG builder is given @TARGET_DATA_STAGE/TARGET_FOLDER as its output location, so the
# files to remove live under TARGET_FOLDER. The query previously filtered on
# DATA_STAGE_FOLDER (the input folder), which did not match the path announced below.
# The path is built once so the message and the filter cannot drift apart again.
# NOTE(review): assumes the parsed output sits in a sub-folder named after the data file's base name — confirm.
parsed_path = f'{TARGET_FOLDER}/{DATA_FILE_BASENAME}/'
print(f''' cleaning up files in external stage under path {parsed_path} ...''')

stmt = f''' select relative_path from directory(@{TARGET_DATA_STAGE}) where relative_path like '%{parsed_path}%'; '''
files = sp_session.sql(stmt).collect()
for r in files:
    stmt = f''' remove @{TARGET_DATA_STAGE}/{r['RELATIVE_PATH']}; '''
    sp_session.sql(stmt).collect()

 truncating tables ...
 cleaning up files in external stage under path raw_parsed_v3/2023_03_01_priority_health_HMO_in-network-rates/ ...


In [22]:
# Cautious enablement, used during development for testing
# Calls the delete_dag_for_datafile procedure with this data file's cleansed base name.
# NOTE(review): the meaning of the second argument (False) is not visible in this notebook —
# confirm against the procedure definition.
print(f'Cleaning dags for datafile: {DATA_FILE_BASENAME_CLEANSED}')
sp_session.call('delete_dag_for_datafile',DATA_FILE_BASENAME_CLEANSED ,False);

Cleaning dags for datafile: 2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates


In [23]:
# Apply the desired concurrency level and size to every task warehouse.

wh_names = warehouses.split(',')
print(f'''No of warehouses: {len(wh_names)}''')
for wh in wh_names:
    # Two separate ALTER statements per warehouse, issued in this order.
    alter_stmts = (
        f''' alter warehouse {wh} set max_concurrency_level = 8; ''',
        f''' alter warehouse {wh} set warehouse_size = {warehouse_size}; ''',
    )
    for alter_stmt in alter_stmts:
        sp_session.sql(alter_stmt).collect()


No of warehouses: 11


---
## Data loading
We will load the segments and the file header using a DAG of Snowflake tasks.

In [33]:
# we build out the DAG
# Arguments passed to the in_network_rates_dagbuilder_matrix procedure, in order:
#   input stage path, data file name, output location (@stage/folder),
#   SEGMENTS_PER_TASK, the comma-separated warehouse list, and the literals 10 and 8.
# NOTE(review): 10 and 8 presumably set the task matrix shape — the recorded status output
# reports "task_matrix_shape": [10, 8]; confirm against the procedure definition.
# NOTE(review): despite its name, df holds the procedure's return value (printed as JSON-like
# text in the recorded output), presumably not a DataFrame.
df = sp_session.call('in_network_rates_dagbuilder_matrix' ,f'{INPUT_DATA_STAGE}/{DATA_STAGE_FOLDER}' ,DATA_FILE 
    ,f"@{TARGET_DATA_STAGE}/{TARGET_FOLDER}" ,SEGMENTS_PER_TASK ,warehouses ,10 ,8)

# Refresh the target stage's directory table after the procedure has run.
sp_session.sql(f''' alter stage {TARGET_DATA_STAGE} refresh; ''').collect()
print(' Status of execution')
print(df)

 Status of execution
{
  "data_file": "2023_03_01_priority_health_HMO_in-network-rates.zip",
  "root_task": "DAG_ROOT_2023_03_01_priority_health_HMO_in_network_rates",
  "status": true,
  "task_matrix_shape": [
    10,
    8
  ],
  "term_task": "TERM_tsk_2023_03_01_priority_health_HMO_in_network_rates"
}


The above operation defines the DAG in Snowflake, as shown below. The task names are specific to the data file being parsed.
![](../../doc/soln_images/task_dags.png)

In [25]:
# Next we invoke the DAG
# The timing below covers only the "execute task" statement(s) issued from this cell.
# NOTE(review): the recorded run finished in under a second, so this presumably measures
# how long it takes to trigger the root task, not how long the DAG takes to complete — confirm.

start_time = time.time()
print(f'Started at: {datetime.now().strftime("%H:%M:%S")}')

# The root task name is derived from the cleansed data file base name.
sql_stmts = [
    f''' execute task DAG_ROOT_{DATA_FILE_BASENAME_CLEANSED}; '''
]
for stmt in sql_stmts:
    print(stmt)
    sp_session.sql(stmt).collect()

end_time = time.time()
print(f'Ended at: {datetime.now().strftime("%H:%M:%S")}')

# Wall-clock seconds, formatted as H:MM:SS.ffffff via timedelta.
elapsed_time = end_time - start_time
elapsed = str(timedelta(seconds=elapsed_time))
print(f'Elapsed: {elapsed}')

Started at: 12:48:44
 execute task DAG_ROOT_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates; 
Ended at: 12:48:44
Elapsed: 0:00:00.938872


---
## Inspection

In [26]:

# Show the TASK_TO_SEGMENTIDS rows recorded for the current data file.
print('Tasks to segments')
# The filter is applied on the Snowpark DataFrame; to_pandas() then brings the matching rows back for display.
file_ingestion_df = sp_session.table('TASK_TO_SEGMENTIDS').filter(F.col('DATA_FILE') == F.lit(DATA_FILE)).to_pandas()
display(file_ingestion_df)

Tasks to segments


Unnamed: 0,BUCKET,DATA_FILE,ASSIGNED_TASK_NAME,FROM_IDX,TO_IDX,SEGMENTS_RECORD_COUNT,INSERTED_AT
0,41,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_615001_630000,615001,630000,14999,2023-02-27 09:45:33.494
1,8,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_120001_135000,120001,135000,14999,2023-02-27 09:45:33.494
2,48,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_720001_735000,720001,735000,14999,2023-02-27 09:45:33.494
3,74,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_1110001_999999999,1110001,999999999,998889998,2023-02-27 09:45:33.494
4,55,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_825001_840000,825001,840000,14999,2023-02-27 09:45:33.494
...,...,...,...,...,...,...,...
70,62,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_930001_945000,930001,945000,14999,2023-02-27 09:45:33.494
71,50,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_750001_765000,750001,765000,14999,2023-02-27 09:45:33.494
72,47,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_705001_720000,705001,720000,14999,2023-02-27 09:45:33.494
73,58,2023-02-01_KFHP_SC-COMMERCIAL_in-network-rates.zip,T_2023_02_01_KFHP_SC_COMMERCIAL_in_network_rates_870001_885000,870001,885000,14999,2023-02-27 09:45:33.494


In [27]:

# List this data file's tasks together with the warehouse each one uses and its state.
print('Tasks ,warehouses and state')
# SHOW TASKS must run immediately before the select below: the select reads the SHOW output
# through result_scan(last_query_id()), so any statement issued in between would break it.
sp_session.sql(f''' SHOW TASKS IN  DATABASE {config['APP_DB']['database']}; ''').collect()
# The double-quoted lowercase identifiers refer to the columns of the SHOW TASKS output.
# The name filter uses the upper-cased cleansed base name.
stmt = f'''
    select "name" as task_name
        ,"warehouse" as warehouse
        ,"state" as state
    from table(result_scan(last_query_id()))
    where "name" like '%{DATA_FILE_BASENAME_CLEANSED.upper()}%'
       -- and state != 'suspended'
    order by state;
'''
df = sp_session.sql(stmt).to_pandas()
display(df)

Tasks ,warehouses and state


Unnamed: 0,TASK_NAME,WAREHOUSE,STATE
0,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_1005001_1020000,INDSOL_PRICE_TRANS_TASK_0_WH,started
1,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_75001_90000,INDSOL_PRICE_TRANS_TASK_2_WH,started
2,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_825001_840000,INDSOL_PRICE_TRANS_TASK_1_WH,started
3,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_855001_870000,INDSOL_PRICE_TRANS_TASK_9_WH,started
4,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_870001_885000,INDSOL_PRICE_TRANS_TASK_2_WH,started
...,...,...,...
75,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_585001_600000,INDSOL_PRICE_TRANS_TASK_2_WH,started
76,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_615001_630000,INDSOL_PRICE_TRANS_TASK_2_WH,started
77,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_720001_735000,INDSOL_PRICE_TRANS_TASK_0_WH,started
78,T_2023_02_01_KFHP_SC_COMMERCIAL_IN_NETWORK_RATES_765001_780000,INDSOL_PRICE_TRANS_TASK_4_WH,started


--- 
### Closeout

With that, we have finished this section of the demo setup.

In [28]:
# Close the Snowpark session opened in the initialization cell; cells above cannot be
# re-run afterwards without re-running the initialization.
sp_session.close()
print('Finished!!!')

Finished!!!
