# Load Demo (on a reduced_sample_data.json) with DAG

In this notebook we demonstrate loading a sample data file, [reduced_sample_data.json](../../data/reduced_sample_data.json), and showcase all the
various resources that get populated. This version of the demo drives the load using a DAG of Snowflake tasks.

#### Pre-requisite
It is assumed that the setup steps are completed successfully. These are:
 - creating the database, schemas, stages
 - defining the external stage
 - defining the functions and stored procedures
 - defining the external tables and views

If you have not done this, use the Streamlit app (./bin/run_app.sh) to create these, as described on the 'Setup' subpage.

In [17]:
# Initialization block
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Project home directory; it is used for locating the config.ini file.
PROJECT_HOME_DIR = '../../'
config = L.get_config(PROJECT_HOME_DIR)
sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)

# Fail fast when no session came back. `is None` is an identity check, so it
# cannot be affected by any custom equality defined on the session object.
if sp_session is None:
    raise Exception('Unable to connect to snowflake. Validate connection information ')

# Pin the session to the role, database.schema and warehouse named in the config
# so that the unqualified object names used in later cells resolve.
sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['SNOW_CONN']['warehouse']}''')

# Show the effective session context as a sanity check.
df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,VSEKAR,PUBLIC,INDSOL_CMSGOV_PRICING,PUBLIC


In [18]:
import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
import time

pd.set_option('display.max_colwidth', None)

def get_basename_of_datafile(p_datafile:str) -> str:
    """Return the file name of ``p_datafile`` without directories or its last extension.

    Example: ``'data/reduced_sample_data.json'`` -> ``'reduced_sample_data'``.
    Only the final extension is removed (``'a.tar.gz'`` -> ``'a.tar'``).
    """
    file_name = os.path.basename(p_datafile)
    stem, _extension = os.path.splitext(file_name)
    return stem


# Stage name used when building the DAG and cleaning up staged files.
# The config-driven lookup is kept below for reference; the name is currently hard-coded.
# DATA_STAGE = config['APP_DB']['ext_stage']
DATA_STAGE = 'data_stg'
# Folder inside the stage, taken from APP_DB.folder_data in the config.
DATA_STAGE_FOLDER = config['APP_DB']['folder_data']
# Data file to load. Its basename (no extension) is reused in stage paths and task names.
DATA_FILE = 'reduced_sample_data.json'
DATA_FILE_BASENAME = get_basename_of_datafile(DATA_FILE)

In [19]:
display(Markdown("### Cleanup block"))
# Remove rows and staged files left behind by earlier runs for this data file.

# Tables that hold per-data-file rows; each gets the same data_file-scoped delete.
tables_to_purge = (
    'segment_task_execution_status'
    ,'task_to_segmentids'
    ,'in_network_rates_file_header'
    ,'in_network_rates_segment_header'
)
cleanup_stmts = [
    f''' delete from {table_name} where data_file = '{DATA_FILE}'; '''
    for table_name in tables_to_purge
]
# Refresh the stage before it is listed through directory() below.
cleanup_stmts.append(f''' alter stage {DATA_STAGE} refresh; ''')

print(' truncating tables ...')
for cleanup_stmt in cleanup_stmts:
    sp_session.sql(cleanup_stmt).collect()

print(f''' cleaning up files in external stage under path {DATA_STAGE_FOLDER}/{DATA_FILE_BASENAME}/ ...''')

# List every staged file under this data file's folder, then remove them one by one.
list_stmt = f''' select relative_path from directory(@{DATA_STAGE}) where relative_path like '%{DATA_STAGE_FOLDER}/{DATA_FILE_BASENAME}/%'; '''
for staged_file in sp_session.sql(list_stmt).collect():
    sp_session.sql(f''' remove @{DATA_STAGE}/{staged_file['RELATIVE_PATH']}; ''').collect()

### Cleanup block

 truncating tables ...
 cleaning up files in external stage under path data/reduced_sample_data/ ...


---
## Data loading
We will load the segments and the file header using a DAG.

In [20]:
# Build the DAG for this data file by calling the in_network_rates_dagbuilder procedure.
data_folder_path = f'{DATA_STAGE}/{DATA_STAGE_FOLDER}'
parsed_folder_path = f"@{DATA_STAGE}/{config['APP_DB']['folder_parsed']}"
df = sp_session.call(
    'in_network_rates_dagbuilder'
    ,data_folder_path
    ,DATA_FILE
    ,parsed_folder_path
    ,200  # NOTE(review): presumably the record range handled per task - confirm against the procedure
    ,config['SNOW_CONN']['warehouse']
)

# Refresh the configured external stage, then print what the procedure returned.
sp_session.sql(f''' alter stage {config['APP_DB']['ext_stage']} refresh; ''').collect()
print(' Status of execution')
print(df)

 Status of execution
{
  "data_file": "reduced_sample_data.json",
  "root_task": "DAG_ROOT_reduced_sample_data",
  "status": true,
  "task_matrix_shape": [
    5,
    5
  ],
  "term_task": "TERM_tsk_reduced_sample_data"
}


The above operation defines the DAG in Snowflake, as shown below. The task names are specific to the data file being parsed.
![](../../doc/images/task_dags.png)

In [21]:
# Kick off the DAG: resize the warehouse, then trigger the root task.
# Only the submission of these two statements is timed here. The recorded run
# took under a second, while the DAG itself needs about 10 minutes to complete.

started = time.time()
print(f'Started at: {datetime.now().strftime("%H:%M:%S")}')

warehouse_name = config['SNOW_CONN']['warehouse']
for dag_stmt in (
    f''' alter warehouse {warehouse_name} set warehouse_size = MEDIUM; '''
    ,f''' execute task DAG_ROOT_{DATA_FILE_BASENAME}; '''
):
    print(dag_stmt)
    sp_session.sql(dag_stmt).collect()

finished = time.time()
print(f'Ended at: {datetime.now().strftime("%H:%M:%S")}')

# A timedelta formats as H:MM:SS.ffffff, the same text str() produces.
print(f'Elapsed: {timedelta(seconds=finished - started)}')

Started at: 14:54:25
 alter warehouse DEMO_BUILD_WH set warehouse_size = MEDIUM; 
 execute task DAG_ROOT_reduced_sample_data; 
Ended at: 14:54:26
Elapsed: 0:00:00.734540


---
## Inspection (TODO)

In [22]:
import time

# Pause until the DAG has had time to finish; it takes about 10 minutes to complete.
DAG_WAIT_MINUTES = 10
time.sleep(DAG_WAIT_MINUTES * 60)

In [23]:
# # Dag cleanup (blank)

# # sp_session.sql(f''' use warehouse DEMO_BUILD_WH; ''').collect()

# sp_session.sql(f''' SHOW TASKS IN  DATABASE {config['APP']['database']}; ''').collect()

# df = sp_session.sql(f''' 
#    SELECT 
#       concat('drop task if exists ', "name" ,';') as drop_stmt
#    FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))
#    --where "state" = 'suspended'
#    ORDER BY "state" asc;
# ''').to_pandas()

# for index, row in df.iterrows():
#    sp_session.sql(row['DROP_STMT']).collect()


In [24]:
display(Markdown("The table in_network_rates_file_header holds the file header elements"))

# Sample up to five rows of the file-header table and bring them into pandas.
file_header_tbl = sp_session.table('in_network_rates_file_header')
df = file_header_tbl.sample(n=5).to_pandas()

display(df)

The table in_network_rates_file_header holds the file header elements

Unnamed: 0,DATA_FILE,HEADER,INSERTED_AT
0,reduced_sample_data.json,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""last_updated_on"": ""2022-10-01"",\n ""reporting_entity_name"": ""Priority Health"",\n ""reporting_entity_type"": ""health insurance issuer"",\n ""total_segments"": 10,\n ""version"": ""http://json-schema.org/draft-07/schema#""\n}",2023-01-05 11:54:56.371


In [25]:
print(' The HEADER has the following data ')
# Parse the HEADER JSON text of the first sampled row; as the last expression it is displayed.
json.loads(df.loc[0, 'HEADER'])

 The HEADER has the following data 


{'DATA_FILE': 'reduced_sample_data.json',
 'last_updated_on': '2022-10-01',
 'reporting_entity_name': 'Priority Health',
 'reporting_entity_type': 'health insurance issuer',
 'total_segments': 10,
 'version': 'http://json-schema.org/draft-07/schema#'}

In [26]:
display(Markdown("The table in_network_rates_segment_header lists the various segments that were loaded"))

# Sample up to five rows of the segment-header table and bring them into pandas.
segment_header_tbl = sp_session.table('in_network_rates_segment_header')
df = segment_header_tbl.sample(n=5).to_pandas()

display(df)

The table in_network_rates_segment_header lists the various segments that were loaded

Unnamed: 0,DATA_FILE,SEGMENT_ID,NEGOTIATED_RATES_INFO,NEGOTIATED_RATES_COUNT,BUNDLED_CODES_COUNT,COVERED_SERVICES_COUNT,INSERTED_AT
0,reduced_sample_data.json,ffs::cpt::76391::,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""SEGMENT_ID"": ""ffs::cpt::76391::"",\n ""SEQ_NO"": 5,\n ""billing_code"": ""76391"",\n ""billing_code_type"": ""CPT"",\n ""billing_code_type_version"": """",\n ""description"": ""Magnetic resonance (eg, vibration) elastography"",\n ""name"": ""Magnetic resonance (eg, vibration) elastography"",\n ""negotiation_arrangement"": ""ffs""\n}",1784,-1,-1,2023-01-05 11:55:13.956
1,reduced_sample_data.json,ffs::cpt::j9144::,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""SEGMENT_ID"": ""ffs::cpt::j9144::"",\n ""SEQ_NO"": 10,\n ""billing_code"": ""J9144"",\n ""billing_code_type"": ""CPT"",\n ""billing_code_type_version"": """",\n ""description"": ""Injection, daratumumab, 10 mg and hyaluronidase-fihj"",\n ""name"": ""Injection, daratumumab, 10 mg and hyaluronidase-fihj"",\n ""negotiation_arrangement"": ""ffs""\n}",573,-1,-1,2023-01-05 11:55:32.399
2,reduced_sample_data.json,ffs::cpt::86160::,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""SEGMENT_ID"": ""ffs::cpt::86160::"",\n ""SEQ_NO"": 4,\n ""billing_code"": ""86160"",\n ""billing_code_type"": ""CPT"",\n ""billing_code_type_version"": """",\n ""description"": ""Complement Antigen Each Component"",\n ""name"": ""Complement Antigen Each Component"",\n ""negotiation_arrangement"": ""ffs""\n}",2779,-1,-1,2023-01-05 11:55:10.542
3,reduced_sample_data.json,ffs::cpt::62322::,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""SEGMENT_ID"": ""ffs::cpt::62322::"",\n ""SEQ_NO"": 1,\n ""billing_code"": ""62322"",\n ""billing_code_type"": ""CPT"",\n ""billing_code_type_version"": """",\n ""description"": ""Injection(s), of diagnostic or therapeutic substance(s) (eg, anesthetic, antispasmodic, opioid, steroid, other solution), not including neurolytic substances, including needle"",\n ""name"": ""Injection(s), of diagnostic or therapeutic substance(s) (eg, anesthetic, antispasmodic, opioid, steroid, other solution), not including neurolytic substances, including needle"",\n ""negotiation_arrangement"": ""ffs""\n}",4170,-1,-1,2023-01-05 11:55:00.294
4,reduced_sample_data.json,ffs::cpt::a4614::,"{\n ""DATA_FILE"": ""reduced_sample_data.json"",\n ""SEGMENT_ID"": ""ffs::cpt::a4614::"",\n ""SEQ_NO"": 3,\n ""billing_code"": ""A4614"",\n ""billing_code_type"": ""CPT"",\n ""billing_code_type_version"": """",\n ""description"": ""Peak expiratory flow rate meter hand held"",\n ""name"": ""Peak expiratory flow rate meter hand held"",\n ""negotiation_arrangement"": ""ffs""\n}",235,-1,-1,2023-01-05 11:55:07.227


In [27]:
print(' The NEGOTIATED_RATES_INFO has the following data ')
# Parse the NEGOTIATED_RATES_INFO JSON text of the first sampled row for display.
json.loads(df.loc[0, 'NEGOTIATED_RATES_INFO'])

 The NEGOTIATED_RATES_INFO has the following data 


{'DATA_FILE': 'reduced_sample_data.json',
 'SEGMENT_ID': 'ffs::cpt::76391::',
 'SEQ_NO': 5,
 'billing_code': '76391',
 'billing_code_type': 'CPT',
 'billing_code_type_version': '',
 'description': 'Magnetic resonance (eg, vibration) elastography',
 'name': 'Magnetic resonance (eg, vibration) elastography',
 'negotiation_arrangement': 'ffs'}

In [28]:
# Explain the layout of the parsed parquet files, then list a few of them from the stage.
para = f''' 
The parsed data are stored as parquet files in the external stage at: @{config['APP_DB']['ext_stage']}/{config['APP_DB']['folder_parsed']}. The directory structure follows the format:

<ext stage>/<folder_parsed>/<data file basename (ex: reduced_sample_data)/<segment_id>/<segment type (negotiated_rates)>/data_<seq_no>_<chunk_no>.parquet.gz

Description:
- ext stage : external stage name
- folder_parsed : configured value of APP_DB.folder_parsed in config.ini
- data file basename : the data file basename, without the file extension
- segment_id : a unique identifier for the segment, this is a composite key of <negotiation_arrangement>::<billing_code_type>::<billing_code>::<billing_code_type_version>
- segment type : indicates the segment children type, this could be either of negotiated_rates/bundled_codes/covered_services
- The file which will contain the record, stored in parquet file.
'''
print(para)

# Filter on DATA_FILE_BASENAME rather than a hard-coded name, so the listing follows
# whichever data file the notebook is configured to load (same SQL for the sample file).
stmt = f''' select relative_path from directory(@{config['APP_DB']['ext_stage']}) where relative_path like '%{config['APP_DB']['folder_parsed']}/{DATA_FILE_BASENAME}%' limit 5; '''
df = sp_session.sql(stmt).to_pandas()
display(df)

 
The parsed data are stored as parquet files in the external stage at: @ext_data_stg/raw_parsed. The directory structure follows the format:

<ext stage>/<folder_parsed>/<data file basename (ex: reduced_sample_data)/<segment_id>/<segment type (negotiated_rates)>/data_<seq_no>_<chunk_no>.parquet.gz

Description:
- ext stage : external stage name
- folder_parsed : configured value of APP_DB.folder_parsed in config.ini
- data file basename : the data file basename, without the file extension
- segment_id : a unique identifier for the segment, this is a composite key of <negotiation_arrangement>::<billing_code_type>::<billing_code>::<billing_code_type_version>
- segment type : indicates the segment children type, this could be either of negotiated_rates/bundled_codes/covered_services
- The file which will contain the record, stored in parquet file.



Unnamed: 0,RELATIVE_PATH
0,raw_parsed/reduced_sample_data/ffs::cpt::15004::/negotiated_rates/data_7_0.parquet.gz
1,raw_parsed/reduced_sample_data/ffs::cpt::15004::/negotiated_rates/data_7_1.parquet.gz
2,raw_parsed/reduced_sample_data/ffs::cpt::33235::/negotiated_rates/data_8_0.parquet.gz
3,raw_parsed/reduced_sample_data/ffs::cpt::33235::/negotiated_rates/data_8_1.parquet.gz
4,raw_parsed/reduced_sample_data/ffs::cpt::62322::/negotiated_rates/data_1_0.parquet.gz


In [29]:
# Describe the external table over the parsed parquet files, then show a small sample.
para = f''' 
The parquet file can be queried inside Snowflake via external table: ext_negotiated_arrangments_staged. Each of the folder structure is partitioned to seperate columns , which can help with pruning to specific segments.
The value contains the negotiated_rates and other children elements stored in JSON format
'''
print(para)

# Cap the rendered column width so the large VALUE column stays readable in the table.
pd.set_option('display.max_colwidth', 25)
staged_tbl = sp_session.table('ext_negotiated_arrangments_staged')
df = staged_tbl.sample(n=5).to_pandas()

display(df)


 
The parquet file can be queried inside Snowflake via external table: ext_negotiated_arrangments_staged. Each of the folder structure is partitioned to seperate columns , which can help with pruning to specific segments.
The value contains the negotiated_rates and other children elements stored in JSON format



Unnamed: 0,VALUE,P_DATA_FL,P_SEGMENT_ID,P_NEGOTIATION_ARRANGEMENT,P_BILLING_CODE_TYPE,P_BILLING_CODE,P_BILLING_CODE_TYPE_VERSION,P_SEGMENT_TYPE
0,"{\n ""CHUNK_NO"": 0,\n...",reduced_sample_data,ffs::cpt::85041::,ffs,cpt,85041,,negotiated_rates
1,"{\n ""CHUNK_NO"": 1,\n...",reduced_sample_data,ffs::cpt::92583::,ffs,cpt,92583,,negotiated_rates
2,"{\n ""CHUNK_NO"": 0,\n...",reduced_sample_data,ffs::cpt::93463::,ffs,cpt,93463,,negotiated_rates
3,"{\n ""CHUNK_NO"": 0,\n...",reduced_sample_data,ffs::cpt::92583::,ffs,cpt,92583,,negotiated_rates
4,"{\n ""CHUNK_NO"": 0,\n...",reduced_sample_data,ffs::cpt::62322::,ffs,cpt,62322,,negotiated_rates


In [30]:
print(' a sample view of one of the records')
# Parse the VALUE JSON text of the first sampled row and print the resulting object.
sample_record = json.loads(df.loc[0, 'VALUE'])
print(sample_record)

 a sample view of one of the records
{'CHUNK_NO': 0, 'DATA_FILE': 'reduced_sample_data.json', 'NEGOTIATED_RATES': [{'negotiated_prices': [{'billing_class': 'professional', 'expiration_date': '9999-12-31', 'negotiated_rate': 1.51, 'negotiated_type': 'negotiated', 'service_code': ['11']}], 'provider_groups': [{'npi': [1902809940], 'tin': {'type': 'ein', 'value': '133757370'}}]}, {'negotiated_prices': [{'billing_class': 'professional', 'expiration_date': '9999-12-31', 'negotiated_rate': 1.51, 'negotiated_type': 'negotiated', 'service_code': ['11']}], 'provider_groups': [{'npi': [1104829159], 'tin': {'type': 'ein', 'value': '840611484'}}]}, {'negotiated_prices': [{'billing_class': 'professional', 'expiration_date': '9999-12-31', 'negotiated_rate': 2.65, 'negotiated_type': 'fee schedule', 'service_code': ['11', '22']}], 'provider_groups': [{'npi': [1245307818], 'tin': {'type': 'ein', 'value': '520890739'}}]}, {'negotiated_prices': [{'billing_class': 'professional', 'expiration_date': '9999-

In [31]:
display(Markdown("The table segment_task_execution_status contains the audit of execution for the above stored procedures"))

# Lift the column-width cap again so the full TASK_RET_STATUS text is visible.
pd.set_option('display.max_colwidth', None)
task_status_tbl = sp_session.table('segment_task_execution_status')
df = task_status_tbl.sample(n=5).to_pandas()

display(df)

The table segment_task_execution_status contains the audit of execution for the above stored procedures

Unnamed: 0,DATA_FILE,TASK_NAME,ELAPSED,TASK_RET_STATUS,INSERTED_AT
0,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_2001_2200,0:00:01.951368,"{""data_file"": ""reduced_sample_data.json"", ""start_rec_num"": 2001, ""end_rec_num"": 2200, ""Parsing_error"": """", ""elapsed"": ""=> 0:00:01.951368 "", ""last_seg_no"": 10, ""EOF_Reached"": True}",2023-01-05 11:55:56.736
1,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_1801_2000,0:00:01.898082,"{""data_file"": ""reduced_sample_data.json"", ""start_rec_num"": 1801, ""end_rec_num"": 2000, ""Parsing_error"": """", ""elapsed"": ""=> 0:00:01.898082 "", ""last_seg_no"": 10, ""EOF_Reached"": True}",2023-01-05 11:54:56.681
2,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_1401_1600,0:00:01.994492,"{""data_file"": ""reduced_sample_data.json"", ""start_rec_num"": 1401, ""end_rec_num"": 1600, ""Parsing_error"": """", ""elapsed"": ""=> 0:00:01.994492 "", ""last_seg_no"": 10, ""EOF_Reached"": True}",2023-01-05 11:55:00.650
3,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_2401_2600,0:00:01.978640,"{""data_file"": ""reduced_sample_data.json"", ""start_rec_num"": 2401, ""end_rec_num"": 2600, ""Parsing_error"": """", ""elapsed"": ""=> 0:00:01.978640 "", ""last_seg_no"": 10, ""EOF_Reached"": True}",2023-01-05 11:55:08.742
4,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_3401_3600,0:00:01.991522,"{""data_file"": ""reduced_sample_data.json"", ""start_rec_num"": 3401, ""end_rec_num"": 3600, ""Parsing_error"": """", ""elapsed"": ""=> 0:00:01.991522 "", ""last_seg_no"": 10, ""EOF_Reached"": True}",2023-01-05 11:55:24.571


In [32]:
display(Markdown("The view segments_counts_for_datafile_v, based of table segment_task_execution_status, will help to identify the number of negotiated_arrangement segments for a specific data file. This view is populated once all the segments in a specific data files are parsed out"))

# Sample up to five rows of the per-data-file segment count view.
segment_counts_view = sp_session.table('segments_counts_for_datafile_v')
df = segment_counts_view.sample(n=5).to_pandas()

display(df)

The view segments_counts_for_datafile_v, based of table segment_task_execution_status, will help to identify the number of negotiated_arrangement segments for a specific data file. This view is populated once all the segments in a specific data files are parsed out

Unnamed: 0,DATA_FILE,TASK_NAME,TOTAL_NO_OF_SEGMENTS
0,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_201_400,10
1,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_1601_1800,10
2,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_1001_1200,10
3,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_4801_5000,10
4,reduced_sample_data.json,INDSOL_CMSGOV_PRICING.PUBLIC.T_REDUCED_SAMPLE_DATA_3401_3600,10


--- 
### Closeout

    With that, we have finished this section of the demo setup.

In [33]:
# The session close is commented out, so the Snowpark session stays open after
# this cell; uncomment the next line to close it.
# sp_session.close()
print('Finished!!!')

Finished!!!
