<a id="CELL1"></a>
## CELL 1 


In [167]:
"""CELL 1
builds and returns a database session
local assumes a psql instance in a local docker container
only postgres database is supported for configuration_application at this time
"""
"""
gets env-based configuration secret
returns a session to the configuration db
for dev env it pre-populates the database with helper and seed data
"""
from core.helpers.session_helper import SessionHelper
# Shared configuration-db session; later cells pass it to builder.build(), query it
# for the Transformation row, hand it to publish(), and finally close it.
session = SessionHelper().session

2019-08-01 13:03:35,510 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-08-01 13:03:35,574 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-08-01 13:03:35,586 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-08-01 13:03:35,587 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-08-01 13:03:35,593 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-08-01 13:03:35,594 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-08-01 13:03:35,598 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-08-01 13:03:35,599 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-08-01 13:03:3

## CONFIGURATION - PLEASE TOUCH
### <font color=pink>This cell will be off in production as configurations will come from the configuration postgres DB</font>

In [13]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
"""
PIPELINE STATE:

raw-->ingest-->master-->enhance-->enrich-->metrics-->dimensional

"""
# config vars: this dataset (passed to builder.build() in the SETUP cell)
config_pharma = "sun" # the pharmaceutical company which owns {brand}
config_brand = "ilumya" # the brand this pipeline operates on
config_state = "master" # the state this transform runs in
config_name = "master_patient_substatus" # the name of this transform!!!, which is the name of this notebook without .ipynb

# input vars: dataset to fetch (used to build the input DatasetContract in the FETCH DATA cell).
# Recall that a contract published to S3 has a key format branch/pharma/brand/state/name
input_branch = "sun-extract-validation"
# Alternative value: None
# If input_branch is None (or empty), the FETCH DATA cell sets it to BRANCH_NAME, i.e. your working branch.
input_pharma = "sun"
input_brand = "ilumya"
input_state = "ingest"
input_name = "symphony_health_association_ingest_column_mapping"

#This contract defines the base of the output structure of data into S3.
#
#contract structure in s3: 
#s3:// {ENV} / {BRANCH} / {PARENT} / {CHILD} / {STATE} / {name of input}
#
#ENV - environment Must be one of development, uat, production.
#Prefixed with integrichain- due to global unique requirement
#BRANCH - the software branch for development this will be the working pull request (eg pr-225)
#in uat this will be edge, in production this will be master
#PARENT - The top level source identifier
#this is generally the customer (and it is aliased as such) but can be IntegriChain for internal sources,
#or another aggregator for future-proofing
#CHILD - The sub level source identifier, generally the brand (and is aliased as such)
#STATE - One of: raw, ingest, master, enhance, enrich, metrics


### <font color=orange>SETUP - DON'T TOUCH </font>
Populating config mocker based on config parameters...

In [168]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

# Create (or look up) the mock configuration for this transform using the config_* values above.
ids = builder.build(config_pharma, config_brand, config_state, config_name, session)
"""
RETURNS: A list of 2 items: [transformation_id, run_id] where transformation_id corresponds
to the configuration created/found for {transformation} and run_id is a randomly generated 6 digit
number (to avoid publishing to the same place with the same dataset)
"""
# transform_id selects the Transformation row queried in the next SETUP cell.
transform_id = ids[0]
# run_id is the id the publish cell passes to publish_contract.publish().
run_id = ids[1]

2019-08-01 13:03:42,135 - core.logging - DEBUG - Adding/getting mocks for specified configurations...
2019-08-01 13:03:42,172 - core.logging - DEBUG - Done. Creating mock run event and committing results to configuration mocker.


In [15]:
# debug only
print(transform_id)
print(run_id)
# e.g:
# 6
# 644707


6
706377


### <font color=orange>SETUP - DON'T TOUCH </font>
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 


In [169]:
"""************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET, BATCH_JOB_QUEUE
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract
from core.logging import get_logger

db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Configuration values for this transform, taken from the configuration database.

    Every field default is read from the module-level ``db_transform`` row (and
    ``BRANCH_NAME``) at the moment this class body executes, so the query above
    must have run first. Instances created with no arguments carry those values.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Contract the publish cell writes to; built from the same branch/state/parent/child/dataset
    # values as the fields above.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)
        
# Name the logger from the DbTransform defaults instead of the `transform` instance:
# `transform` is only created in a later cell, so referencing it here raises NameError
# on a fresh kernel (Restart & Run All). The dataclass defaults hold the same
# state/name values and are already available at this point.
logger = get_logger(f"core.transforms.{DbTransform.state}.{DbTransform.name}")

In [170]:
#debug only
print(BRANCH_NAME)
print(ENV_BUCKET)
print(BATCH_JOB_QUEUE)
#e.g: 
#DC-578_PatientStatus
#ichain-dev
#dev-core

DC-578_PatientSubStatus
ichain-dev
dev-core


***
# CORE Cartridge Notebook::[master_patient_substatus]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

## CONFIGURATION - VARIABLES - PLEASE TOUCH

# TRANSFORM

In [278]:
""" 
CONFIGURATION ********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>
e.g.
class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""
"""
imports
"""
import pandas as pd
import re 



 
class Transform(DbTransform):
    '''
    Masters the patient sub-status column against a per-customer IC-GOLD domain.

    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.

    Properties (assigned on the instance by the development cell):
        col_substatus: name of the dataframe column holding the raw sub-status.
        customer_name: which customer's master lists to apply; one of
            'sun', 'bi' or 'alkermes'.
    '''

    ## col of interest
    ## Field: Sub Status | Description: Sub-StatusCode | Data Type / Format: X(50)
    ## Required: if available | old and new name in data model: substatus
    col_substatus: str
    customer_name: str

    @staticmethod
    def master_substatus(customer_name: str):
        """Return ``(substatus_dict, substatus_conversion_dict)`` for a customer.

        Args:
            customer_name: 'sun', 'bi' or 'alkermes'.

        Returns:
            A 2-tuple: the gold sub-status domain (code -> value) and the map of
            known raw spellings -> gold value. Customers that have no conversion
            map defined get an empty dict (previously this path failed with an
            unbound local variable).

        Raises:
            ValueError: if ``customer_name`` is not one of the supported names.
        """
        if customer_name == 'sun':
            return (Transform.master_substatus_sun(),
                    Transform.master_substatus_conversion_sun())
        if customer_name == 'bi':
            return Transform.master_substatus_bi(), {}
        if customer_name == 'alkermes':
            return Transform.master_substatus_alkermes(), {}
        logger.error('expecting customer name as sun bi or alkermes')
        raise ValueError('expecting customer name as sun bi or alkermes')

    @staticmethod
    def master_substatus_sun():
        """Return the IC-GOLD sub-status domain for Sun as ``{code: value}``.

        Codes are 1-based and follow the order of the list below.
        Temporary hard-coded mapping until a future user-defined IC-GOLD
        persistence solution exists.
        """
        gold_values = (
            'ALT THERAPY',
            'APPEAL',
            'BENEFITS',
            'COPAY ASSISTANCE',
            'DELAY',
            'DOSAGE',
            'FORMULARY',
            'FOUNDATION',
            'HOLD OTHER',
            'HOLD RTS',
            'INFORMATION',
            'INS OTHER',
            'INSURANCE COPAY',
            'INSURANCE DENIED',
            'INSURANCE HOLD',
            'INSURANCE OON',
            'INSURANCE OTHER',
            'INVENTORY HOLD',
            'MATERIAL',
            'NEW',
            'OTHER',
            'PA',
            'PATIENT CONTACT',
            'PATIENT DECEASED',
            'PATIENT END',
            'PATIENT FINANCIAL',
            'PATIENT HOLD',
            'PATIENT RESPONSE',
            'PRESCRIBER',
            'PRESCRIBER END',
            'PRESCRIBER HOLD',
            'PT HOLD',
            'QUANTITY',
            'READY',
            'SERVICES END',
            'SHIPMENT',
            'STEP EDIT',
            'THERAPY COMPLETE',
            'THERAPY END',
            'THERAPY HOLD',
            'TRANSFER HUB',
            'TRANSFER SP',
            'TREATMENT DELAY',
        )
        return dict(enumerate(gold_values, start=1))

    @staticmethod
    def master_substatus_conversion_sun():
        """Return the Sun map of known raw spellings -> gold sub-status.

        Keys are compared against values AFTER cleaning (upper, strip, then
        underscore -> space), which is why some keys carry a trailing space:
        a raw value ending in '_' ends in ' ' once cleaned.
        """
        return {
            'BENEFITS INVESTIGATION': 'BENEFITS',
            'INS OON ': 'INSURANCE OON',
            'OTHER ': 'OTHER',
            'P05': 'PA',
            'PATENT RESPONSE': 'PATIENT RESPONSE',
            'PATIENT  RESPONSE': 'PATIENT RESPONSE',
            'PATIENT RESPOSNE': 'PATIENT RESPONSE',
            'PRESCRIBERHOLD': 'PRESCRIBER HOLD',
            'TRANSER SP': 'TRANSFER SP',
        }

    @staticmethod
    def master_substatus_bi():
        """Return the gold sub-status domain for BI (not defined yet: empty)."""
        return {}

    @staticmethod
    def master_substatus_alkermes():
        """Return the gold sub-status domain for Alkermes (not defined yet: empty)."""
        return {}

    @staticmethod
    def _clean_substatus(value):
        """Normalise one raw sub-status value; non-strings pass through unchanged.

        The step order (upper, strip, then character replacement) is kept exactly
        as before because the conversion map keys depend on it. Non-string values
        (None, NaN, numbers) are returned as-is instead of raising on ``.upper()``;
        they then simply fail the gold-domain check.
        """
        if not isinstance(value, str):
            return value
        cleaned = value.upper().strip()
        # '\\w' removes the literal two characters backslash + w (str.replace is
        # not a regex); written escaped to avoid the invalid-escape warning.
        return (cleaned.replace('_', ' ')
                       .replace('\r', '')
                       .replace('\t', '')
                       .replace('\\w', ''))

    def master_patient_substatus(self, df):
        """Split ``df`` into rows whose sub-status is in the gold domain and rows that are not.

        The sub-status column (``self.col_substatus``) is cleaned, known raw
        spellings are converted to gold values for ``self.customer_name``, and
        the frame is partitioned on membership in the gold domain. The input
        dataframe is not modified; the returned frames are built from a copy.

        Args:
            df: pandas DataFrame containing the column named by ``self.col_substatus``.

        Returns:
            ``(df_pass, df_fail, go)`` where ``df_pass`` holds the rows with a gold
            sub-status, ``df_fail`` the remaining rows, and ``go`` is True (any
            failure raises instead of returning).

        Raises:
            ValueError: if the expected column is missing or the customer name
                is not supported. Other errors are logged and re-raised unchanged.
        """
        try:
            col = self.col_substatus

            in_shape = df.shape
            logger.info('df in  shape: {} {}'.format(in_shape[0], in_shape[1]))
            logger.info('df in {}'.format(df.head()))

            logger.info('expecting column name patient sub status as:{}'.format(col))
            column_names = df.columns.values.tolist()
            logger.info('df column names:{}'.format(column_names))

            if col not in column_names:
                raise ValueError("sub_status ColNameExpected NOT in columnNamesArr")

            logger.info('Clean: space Strip and Upper and other cleanup...')
            cleaned = df[col].apply(Transform._clean_substatus)

            # master data IC-GOLD substatus
            substatus_dict, substatus_conversion_dict = Transform.master_substatus(self.customer_name)
            logger.info('Conversion map:{}'.format(substatus_conversion_dict))
            substatus_list = list(substatus_dict.values())
            logger.info('Gold Domain List:{}'.format(substatus_list))

            # apply master conversions (skipped when the customer defines none)
            if substatus_conversion_dict:
                cleaned = cleaned.replace(substatus_conversion_dict)

            # Assign onto a copy: chained in-place replace on df[col] would mutate
            # the caller's frame and is not applied under pandas copy-on-write.
            df = df.copy()
            df[col] = cleaned

            is_gold = df[col].isin(substatus_list)
            dffail = df.loc[~is_gold]  # what fails
            df = df.loc[is_gold]       # what passes

            # meta data log for what comes out of the function pass and fail df
            out_shape = df.shape
            fail_shape = dffail.shape
            logger.info('df in   shape: {} {}'.format(in_shape[0], in_shape[1]))
            logger.info('df pass shape: {} {}'.format(out_shape[0], out_shape[1]))
            logger.info('df fail shape: {} {}'.format(fail_shape[0], fail_shape[1]))
            logger.info('df pass {}'.format(df.head()))
            logger.info('df fail {}'.format(dffail.head()))
        except Exception as e:
            # Log once with the actual message, then re-raise the original error
            # so its type and traceback are preserved.
            logger.exception("exception:{}".format(e))
            raise
        return df, dffail, True
                

transform = Transform()

### *Please place your value assignments for development below !!!*
### <font color=pink>This cell will be turned off in production; Engineering will set it to pull from the configuration application instead</font>

In [279]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull from the configuration application instead
## For the last example, this could look like...
## transform.some_ratio = 0.6
## transform.site_name = "WALGREENS"

transform.col_substatus = 'sub_status'
transform.customer_name = 'sun'


### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

## Planned
1. Collect all unique raw patient substatus instances
2. Auto-map as many raw patient substatus instances to a defined cleansed data model per **Customer**

3. Process for identifying and manually mapping where auto-map fails.
4. Do not publish un-mapped instances. Drop them, give us the ability to triage and map to IC-gold in a later event.


### FETCH DATA - TOUCH, BUT CAREFULLY
### <font color=pink>This cell will be turned off in production, as the input_contract will be handled by the pipeline</font>

In [173]:
"""
************ FETCH DATA - TOUCH, BUT CAREFULLY **************
This cell will be turned off in production, as the input_contract will be handled by the pipeline.
"""
logger.info("FETCH DATA CELL - TOUCH - This cell will be turned off in production, as the input_contract will be handled by the pipeline. ")

# for testing / development only
# Run id of the already-published INPUT dataset to read. Kept under its own name so it
# does not overwrite `run_id` from builder.build(), which the publish cell uses as the
# run id of the OUTPUT; reusing one name made every publish go to run id 3.
input_run_id = 3

# Fall back to the working branch when no input branch was configured.
if not input_branch:
    input_branch = BRANCH_NAME
input_contract = DatasetContract(branch=input_branch,
                                 state=input_state,
                                 parent=input_pharma,
                                 child=input_brand,
                                 dataset=input_name)
# Restrict the fetch to a single published run of the input dataset.
run_filter = [dict(partition="__metadata_run_id", comparison="==", values=[input_run_id])]
# IF YOU HAVE PUBLISHED DATA MULTIPLE TIMES, uncomment the above line and change the int to the run_id to fetch.
# Otherwise, you will have duplicate values in your fetched dataset!

# bypass/comment out when unit testing individual parquet files
#df = input_contract.fetch(filters=run_filter)



2019-08-01 13:04:27,863 - core.transforms.master.master_patient_substatus - INFO - FETCH DATA CELL - TOUCH - This cell will be turned off in production, as the input_contract will be handled by the pipeline. 


## *<font color=grey>unit test development only</font>*

In [174]:
import pyarrow.parquet as pq
import s3fs

def pandas_from_parquet_s3(file_path):
    """Read a parquet dataset from S3 into a pandas DataFrame.

    Args:
        file_path: S3 location (bucket/key, without scheme) of the parquet
            file or dataset to read.

    Returns:
        The dataset contents as a pandas DataFrame.
    """
    filesystem = s3fs.S3FileSystem()
    dataset = pq.ParquetDataset(file_path, filesystem=filesystem)
    table = dataset.read_pandas()
    return table.to_pandas()

In [290]:
# unit test/development 
# isolate on individual parquet files
#TEST 1
#df = pandas_from_parquet_s3('ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=3/d7ad974cef284e19aa7b5ac410220b96.parquet')
# TEST 2
# df = pandas_from_parquet_s3('ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=3/1a6ffd3598d442e38fbba66ea85a55a2.parquet')
# TEST 3
df = pandas_from_parquet_s3('ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=3/6eceb7ce59bd4dec8720316b4209b0e3.parquet')
# TEST 4
#df = pandas_from_parquet_s3('ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=3/5c00059d9fc04b0e8bc4ce764c50f3fb.parquet')
# TEST 5
#df = pandas_from_parquet_s3('ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=3/90ca3aa7b0bb4246a281591b013ff54e.parquet')

# THEN ALL TEST use 
# then use the FETCH DATA - TOUCH, BUT CAREFULLY CELL

In [291]:
# unit test/development
# before shot unit testing only
dfSize = df.size
dfShape = df.shape
print('size: {} shape: {} {}'.format(dfSize,dfShape[0],dfShape[1])) 

size: 714260 shape: 10060 71


In [283]:
# unit test/development
# needed to see the col(s) of interest
pd.set_option('display.max_columns', 50)

In [284]:
# unit test/development
df.head()

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,pat_last_name,pat_first_name,pat_dob,pat_gender,pat_addr1,pat_addr2,pat_city,pat_state,pat_zip,dx1_code,dx2_code,status_date,status_code,sub_status,pres_last_name,...,ship_carrier,shiptracking_num,ship_location,ship_address,ship_city,ship_state,ship_zip,has_medical,primary_coverage_type,primary_payer_name,primary_payer_type,secondary_coverage_type,secondary_payer_name,secondary_payer_type,plan_paid_amt,pat_copay,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_run_timestamp,__metadata_app_version,__metadata_output_contract,__metadata_transform_timestamp
0,20181024115959,ACCREDO,1346208949,COM,279133432018102401,0,DIRECT,20181019120000,,27913343,,,,,,,,,0,L40.0,,20181024115959,CANCELLED,OTHER,,...,,,,,,,,Y,MEDICAL,GENERAL DIRECT,COMMERCIAL,,,,,,,,,,NNNNV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22
1,20181025115959,ACCREDO,1346208949,COM,278370982018102502,0,DIRECT,20181022120000,,27837098,,,,F,,,,,0,L40.0,,20181025115959,CANCELLED,INSURANCE OON,GREENBERG,...,,,,,,,,Y,MEDICAL,BROWN & TOLAND MEDICAL GRP,COMMERCIAL,,,,,,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22
2,20181029115959,ACCREDO,1346208949,COM,279181482018102903,0,DIRECT,20181024120000,,27918148,,,,M,,,,,0,L40.0,,20181029115959,CANCELLED,OTHER,SCIURBA,...,,,,,,,,Y,MEDICAL,GENERAL HORIZON BCBS NJ,COMMERCIAL,,,,,,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22
3,20181102115959,ACCREDO,1346208949,COM,267244982018110204,0,DIRECT,20181030120000,,26724498,,,,F,,,,,0,Q84,L40.0,20181102115959,CANCELLED,INSURANCE OON,KNUCKLES,...,,,,,,,,Y,MEDICAL,ANTHEM BCBS OF KENTUCKY,MEDICARE,,,,,,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22
4,20181106115959,ACCREDO,1346208949,COM,160618142018110605,0,DIRECT,20181102120000,,16061814,,,,F,,,,,0,696.1,,20181106115959,CANCELLED,OTHER,KORY,...,,,,,,,,Y,MEDICAL,EXPRESS SCRIPTS,COMMERCIAL,,,,,,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22


# <font color=red>**CALL**</font> THE TRANSFORM

In [292]:
### Use the variables above to execute your transformation.
### the final output needs to be a variable named final_dataframe
logger.info("CALL THE TRANSFORM - execute your transformation")

# final_dataframe: rows whose sub-status is in the gold domain (this is what gets published)
# final_fail:      rows whose sub-status is not in the gold domain (kept for triage, not published)
# go:              go / no-go flag read by the publish cell
final_dataframe, final_fail, go = transform.master_patient_substatus(df)

# Log the decision; anything that is neither True nor False is forced to no-go.
if go==True:
    logger.info("CALL THE TRANSFORM -  go no go = GO")
elif go==False:
    logger.info("CALL THE TRANSFORM -  go no go = NO go")
else:
    go=False
    logger.info("CALL THE TRANSFORM -  go no go = unknown make it NO go")
    
    

2019-08-01 21:44:54,244 - core.transforms.master.master_patient_substatus - INFO - CALL THE TRANSFORM - execute your transformation
2019-08-01 21:44:54,246 - core.transforms.master.master_patient_substatus - INFO - try:
2019-08-01 21:44:54,249 - core.transforms.master.master_patient_substatus - INFO - df in  shape: 10060 71
2019-08-01 21:44:54,286 - core.transforms.master.master_patient_substatus - INFO - df in            rec_date pharm_code   pharm_npi transtype  \
0  2018103004:01:55        BRV  1083045140       COM   
1  2018103004:01:55        BRV  1497845317       COM   
2  2018103004:01:55        BRV  1497845317       COM   
3  2018103004:01:55        BRV  1497845317       COM   
4  2018103004:01:55        BRV  1083045140       COM   

          pharm_transaction_id trans_seq ref_source          ref_date  \
0  BRIOVARX_20181030_110343541         0     DIRECT  2018102616:24:58   
1  BRIOVARX_20181030_110401861         0     DIRECT  2018102708:20:12   
2  BRIOVARX_20181030_11040186

### *<font color=grey>unittest python</font>*

In [287]:
import unittest

def ut_shape(final_dataframe,df):
    """
    Report whether the transform output has the same shape as its input.

    The expectation encoded here will change as the transform evolves
    (assertion will change based on coding state).
    """
    output_shape = final_dataframe.shape
    input_shape = df.shape
    return output_shape == input_shape

class TestNotebook(unittest.TestCase):
    """Development-only checks run against the notebook's current variables."""

    def test_ut_shape(self):
        """The transform output and the input dataframe have identical shapes."""
        shapes_match = ut_shape(final_dataframe, df)
        self.assertEqual(shapes_match, True)
                
"""
expect ... ok for now until I add some transform code since transform function is doing nothing yet

"""
# for development only
unittest.main(argv=[''], verbosity= 2, exit=False)        
    

test_ut_shape (__main__.TestNotebook) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.002s

OK


<unittest.main.TestProgram at 0x7fcedc3a6dd8>

In [289]:
# unit test/development look at the fails
#final_fail.head()
final_fail[transform.col_substatus]

Series([], Name: sub_status, dtype: object)

In [288]:
# unit test/development look at the pass(es)
#final_dataframe.head()
final_dataframe[transform.col_substatus]

0                  OTHER
1          INSURANCE OON
2                  OTHER
3          INSURANCE OON
4                  OTHER
5          INSURANCE OON
6                  OTHER
7                  OTHER
8                  OTHER
9          INSURANCE OON
10      INSURANCE DENIED
11                 OTHER
12           PATIENT END
13                 READY
14                 READY
15              SHIPMENT
16              SHIPMENT
17                 READY
18      PATIENT RESPONSE
19                 OTHER
20                 READY
21              SHIPMENT
22         INSURANCE OON
23                 READY
24                 READY
25              SHIPMENT
26              SHIPMENT
27              SHIPMENT
28         INSURANCE OON
29              SHIPMENT
              ...       
1925               OTHER
1926            SHIPMENT
1927     PATIENT CONTACT
1928            SHIPMENT
1929            SHIPMENT
1930               READY
1931               OTHER
1932               OTHER
1933     PATIENT CONTACT


# **publish**
### Writing to S3
Invoke the `publish()` command to write to a given contract. Some things to know:
- To invoke publish a contract must be at the grain of dataset. This is because file names will be set by the dataframe=\>parquet conversion. 
- publish only accepts a pandas dataframe.
- publish does not allow for timedelta data types at this time (this is missing functionality in pyarrow).
- publish handles partitioning the data as per contract, creating file paths, and creating the binary parquet files in S3, as well as the needed metadata. <br>
**- by default, all datasets include a single partition, \_\_metadata\_run\_id, the RunEvent ID of an executed pipeline**

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
# Publish only on an explicit GO. The session is closed in `finally` so that a failed
# publish does not leave the configuration-db session open.
try:
    if go==True:
        logger.info("PUBLISH - that's it - its a GO - just provide the final dataframe to the var final_dataframe and we take it from there")
        transform.publish_contract.publish(final_dataframe, run_id, session)
    elif go==False:
        logger.info("PUBLISH -  go no go = NO go -  so DONT publish")
    else:
        # Anything that is neither True nor False is treated as no-go.
        go=False
        logger.info("PUBLISH -  go no go = unknown make it NO go - so DONT publish")
finally:
    session.close()

***