In [1]:
# workaround via specifying an invalid value first
%config Application.log_level='WORKAROUND'
# => fails, necessary on Fedora 27, ipython3 6.2.1
%config Application.log_level='DEBUG'
import logging
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()
log.debug('Test debug')

ERROR:root:The 'log_level' trait of an IPKernelApp instance must be any of (0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL'), but a value of 'WORKAROUND' <class 'str'> was specified.
DEBUG:root:Test debug


In [2]:
transform_id=40

In [3]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

db_transform = SessionHelper().session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            dataset=db_transform.transformation_template.name)


DEBUG:git.cmd:Popen(['git', 'version'], cwd=/host/core/transforms, universal_newlines=False, shell=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/host/core/transforms, universal_newlines=False, shell=None)


2019-05-28 18:15:29,520 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-28 18:15:29,542 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-28 18:15:29,575 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-28 18:15:29,576 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-28 18:15:29,579 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-28 18:15:29,580 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-28 18:15:29,583 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-28 18:15:29,584 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-28 18:15:2

# CORE Cartridge Notebook :: symphony_health_association_refinement
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinate number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [4]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
        ## YOUR properties go here!!
        ingest_source_transform: str = db_transform.variables.ingest_source_transform # The name of the dataset to pull from
        ingest_source_file_prefix: str = db_transform.variables.ingest_source_file_prefix # If from initial ingest, the file prefix name
        # The following follows possible approach of ingest mapping by individual vars
        rec_date_col: str = db_transform.variables.rec_date_col
        pharm_code_col: str = db_transform.variables.pharm_code_col
        pharm_npi: str = db_transform.variables.pharm_npi
        transtype: str = db_transform.variables.transtype
        pharm_transaction_id: str = db_transform.variables.pharm_transaction_id
        trans_seq: str = db_transform.variables.trans_seq
        ref_source: str = db_transform.variables.ref_source
        ref_date: str = db_transform.variables.ref_date
        program_id: str = db_transform.variables.program_id
        pharmacy_id: str = db_transform.variables.pharmacy_id
        pat_last_name: str = db_transform.variables.pat_last_name
        pat_first_name: str = db_transform.variables.pat_first_name
        pat_dob: str = db_transform.variables.pat_dob
        pat_gender: str = db_transform.variables.pat_gender
        pat_addr1: str = db_transform.variables.pat_addr1
        pat_addr2: str = db_transform.variables.pat_addr2
        pat_city: str = db_transform.variables.pat_city
        pat_state: str = db_transform.variables.pat_state
        pat_zip: str = db_transform.variables.pat_zip
        dx1_code: str = db_transform.variables.dx1_code
        dx2_code: str = db_transform.variables.dx2_code
        status_date: str = db_transform.variables.status_date
        status_code: str = db_transform.variables.status_code
        sub_status: str = db_transform.variables.sub_status
        pres_last_name: str = db_transform.variables.pres_last_name
        pres_first_name: str = db_transform.variables.pres_first_name
        pres_addr1: str = db_transform.variables.pres_addr1
        pres_addr2: str = db_transform.variables.pres_addr2
        pres_city: str = db_transform.variables.pres_city
        pres_state: str = db_transform.variables.pres_state
        pres_zip: str = db_transform.variables.pres_zip
        pres_phone: str = db_transform.variables.pres_phone
        pres_npi: str = db_transform.variables.pres_npi
        pres_dea: str = db_transform.variables.pres_dea
        facility_name: str = db_transform.variables.facility_name
        rxdate: str = db_transform.variables.rxdate
        rxnumber: str = db_transform.variables.rxnumber
        rxrefills: str = db_transform.variables.rxrefills
        rxfill: str = db_transform.variables.rxfill
        refill_remaining: str = db_transform.variables.refill_remaining
        prev_disp: str = db_transform.variables.prev_disp
        rx_ndc_number: str = db_transform.variables.rx_ndc_number
        medication: str = db_transform.variables.medication
        quantity: str = db_transform.variables.quantity
        day_supply: str = db_transform.variables.day_supply
        ship_date: str = db_transform.variables.ship_date
        ship_carrier: str = db_transform.variables.ship_carrier
        shiptracking_num: str = db_transform.variables.shiptracking_num
        ship_location: str = db_transform.variables.ship_location
        ship_address: str = db_transform.variables.ship_address
        ship_city: str = db_transform.variables.ship_city
        ship_state: str = db_transform.variables.ship_state
        ship_zip: str = db_transform.variables.ship_zip
        has_medical: str = db_transform.variables.has_medical
        primary_coverage_type: str = db_transform.variables.primary_coverage_type
        primary_payer_name: str = db_transform.variables.primary_payer_name
        primary_payer_type: str = db_transform.variables.primary_payer_type
        secondary_coverage_type: str = db_transform.variables.secondary_coverage_type
        secondary_payer_name: str = db_transform.variables.secondary_payer_name
        secondary_payer_type: str = db_transform.variables.secondary_payer_type
        plan_paid_amt: str = db_transform.variables.plan_paid_amt
        pat_copay: str = db_transform.variables.pat_copay
        copay_assist_amount: str = db_transform.variables.copay_assist_amount
        oth_payer_amt: str = db_transform.variables.oth_payer_amt
        xfer_pharmname: str = db_transform.variables.xfer_pharmname
        msa_patient_id: str = db_transform.variables.msa_patient_id
        msa_patient_bmap: str = db_transform.variables.msa_patient_bmap
        metadata_run_timestamp: str = db_transform.variables.metadata_run_timestamp
        metadata_app_version: str = db_transform.variables.metadata_app_version
        metadata_output_contract: str = db_transform.variables.metadata_output_contract

In [5]:
from core.logging import get_logger

In [6]:
transform = Transform()
logger = get_logger(f"core.transforms.{transform.state}.{transform.name}")

In [7]:
column_renames = {
    transform.rec_date_col : 'rec_date',
    transform.pharm_code_col : 'pharm_code',
    transform.pharm_npi : 'pharm_npi',
    transform.transtype : 'transtype',
    transform.pharm_transaction_id : 'pharm_transaction_id',
    transform.trans_seq : 'trans_seq',
    transform.ref_source : 'ref_source',
    transform.ref_date : 'ref_date',
    transform.program_id : 'program_id',
    transform.pharmacy_id : 'pharmacy_id',
    transform.pat_last_name : 'pat_last_name',
    transform.pat_first_name : 'pat_first_name',
    transform.pat_dob : 'pat_dob',
    transform.pat_gender : 'pat_gender',
    transform.pat_addr1 : 'pat_addr1',
    transform.pat_addr2 : 'pat_addr2',
    transform.pat_city : 'pat_city',
    transform.pat_state : 'pat_state',
    transform.pat_zip : 'pat_zip',
    transform.dx1_code : 'dx1_code',
    transform.dx2_code : 'dx2_code',
    transform.status_date : 'status_date',
    transform.status_code : 'status_code',
    transform.sub_status : 'sub_status',
    transform.pres_last_name : 'pres_last_name',
    transform.pres_first_name : 'pres_first_name',
    transform.pres_addr1 : 'pres_addr1',
    transform.pres_addr2 : 'pres_addr2',
    transform.pres_city : 'pres_city',
    transform.pres_state : 'pres_state',
    transform.pres_zip : 'pres_zip',
    transform.pres_phone : 'pres_phone',
    transform.pres_npi : 'pres_npi',
    transform.pres_dea : 'pres_dea',
    transform.facility_name : 'facility_name',
    transform.rxdate : 'rxdate',
    transform.rxnumber : 'rxnumber',
    transform.rxrefills : 'rxrefills',
    transform.rxfill : 'rxfill',
    transform.refill_remaining : 'refill_remaining',
    transform.prev_disp : 'prev_disp',
    transform.rx_ndc_number : 'rx_ndc_number',
    transform.medication : 'medication',
    transform.quantity : 'quantity',
    transform.day_supply : 'day_supply',
    transform.ship_date : 'ship_date',
    transform.ship_carrier : 'ship_carrier',
    transform.shiptracking_num : 'shiptracking_num',
    transform.ship_location : 'ship_location',
    transform.ship_address : 'ship_address',
    transform.ship_city : 'ship_city',
    transform.ship_state : 'ship_state',
    transform.ship_zip : 'ship_zip',
    transform.has_medical : 'has_medical',
    transform.primary_coverage_type : 'primary_coverage_type',
    transform.primary_payer_name : 'primary_payer_name',
    transform.primary_payer_type : 'primary_payer_type',
    transform.secondary_coverage_type : 'secondary_coverage_type',
    transform.secondary_payer_name : 'secondary_payer_name',
    transform.secondary_payer_type : 'secondary_payer_type',
    transform.plan_paid_amt : 'plan_paid_amt',
    transform.pat_copay : 'pat_copay',
    transform.copay_assist_amount : 'copay_assist_amount',
    transform.oth_payer_amt : 'oth_payer_amt',
    transform.xfer_pharmname : 'xfer_pharmname',
    transform.msa_patient_id : 'msa_patient_id',
    transform.msa_patient_bmap : 'msa_patient_bmap',
    transform.metadata_run_timestamp : '__metadata_run_timestamp',
    transform.metadata_app_version : '__metadata_app_version',
    transform.metadata_output_contract : '__metadata_output_contract'
}

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

This transform takes the Symphony Health Association base ingested data and separates out useful columns and uppercases all the data.

### Transformation

In [8]:
import pandas as pd
from s3parq import fetch_diff

In [9]:
# Place your import contracts here
ingest_dataset = transform.ingest_source_transform + "/" + transform.ingest_source_file_prefix

ingest_contract = DatasetContract(branch=BRANCH_NAME,
                            parent=transform.pharmaceutical_company,
                            child=transform.brand,
                            state="ingest",
                            dataset=ingest_dataset)

logger.debug(f"Retrieving data from path : {ingest_contract.s3_path}")

2019-05-28 18:15:29,904 - core.transforms.ingest.symphony_health_association_refinement - DEBUG - Retrieving data from path : s3://ichain-dev/stephanie/stephanie/ilumya/ingest/initial_ingest/INTEGRICHAIN_SUN_WALGREENS_STATUSDISPENSE


In [10]:
# Using only the run timestamp at this point in time
diff_partition = "__metadata_run_timestamp"

# This transform must call s3parq directly as it is still working off of the initial ingest schema
final_dataframe = fetch_diff(input_bucket=ENV_BUCKET, 
                             input_key=ingest_contract.key, 
                             comparison_bucket=ENV_BUCKET, 
                             comparison_key=transform.publish_contract.key, 
                             partition=diff_partition)

logger.debug(f"Difference fetched, fetched dataframe shape : {final_dataframe.shape}")

2019-05-28 18:15:30,275 - core.transforms.ingest.symphony_health_association_refinement - DEBUG - Difference fetched, fetched dataframe shape : (0, 0)


In [11]:
logger.debug(f"Expecting the following columns : {list(column_renames.keys())}")
logger.debug(f"Dataframe has these columns : {list(final_dataframe.columns)}")

# First cut down to the necesarry columns in case of accidental extras - Pandas wont catch those
final_dataframe = final_dataframe[list(column_renames.keys())]

# Rename based on above created configuration variables
final_dataframe = final_dataframe.rename(column_renames, axis="columns")

# Uppercase any string columns in the dataframe
final_dataframe = final_dataframe.apply(lambda x: x.str.upper().str.strip() if isinstance(x, object) else x)

2019-05-28 18:15:30,287 - core.transforms.ingest.symphony_health_association_refinement - DEBUG - Expecting the following columns : ['Rec Date', 'Pharm Code', 'Pharm NPI', 'transType', 'Pharm Transaction Id', 'Trans Seq', 'Ref Source', 'Ref Date', 'Program ID', 'Pharmacy ID', 'Pat Last Name', 'Pat First Name', 'Pat DOB', 'PatGender', 'Pat Addr1', 'Pat Addr2', 'Pat City', 'Pat State', 'Pat Zip', 'Dx1 Code', 'Dx2 Code', 'Status Date', 'Status Code', 'Sub Status', 'Pres Last Name', 'Pres First Name', 'Pres Addr1', 'Pres Addr2', 'Pres City', 'Pres State', 'Pres Zip', 'Pres Phone', 'Pres NPI', 'Pres DEA', 'Facility Name', 'RxDate', 'RxNumber', 'RxRefills', 'RxFill', 'Refill Remaining', 'prev Disp', 'Rx NDC Number', 'Medication', 'Quantity', 'Day Supply', 'Ship Date', 'Ship Carrier', 'shipTracking Num', 'Ship Location', 'Ship Address', 'Ship City', 'Ship State', 'Ship Zip', 'Has Medical', 'Primary CoverageType', 'Primary Payer Name', 'Primary Payer Type', 'Secondary CoverageType', 'Secondary

KeyError: "None of [Index(['Rec Date', 'Pharm Code', 'Pharm NPI', 'transType',\n       'Pharm Transaction Id', 'Trans Seq', 'Ref Source', 'Ref Date',\n       'Program ID', 'Pharmacy ID', 'Pat Last Name', 'Pat First Name',\n       'Pat DOB', 'PatGender', 'Pat Addr1', 'Pat Addr2', 'Pat City',\n       'Pat State', 'Pat Zip', 'Dx1 Code', 'Dx2 Code', 'Status Date',\n       'Status Code', 'Sub Status', 'Pres Last Name', 'Pres First Name',\n       'Pres Addr1', 'Pres Addr2', 'Pres City', 'Pres State', 'Pres Zip',\n       'Pres Phone', 'Pres NPI', 'Pres DEA', 'Facility Name', 'RxDate',\n       'RxNumber', 'RxRefills', 'RxFill', 'Refill Remaining', 'prev Disp',\n       'Rx NDC Number', 'Medication', 'Quantity', 'Day Supply', 'Ship Date',\n       'Ship Carrier', 'shipTracking Num', 'Ship Location', 'Ship Address',\n       'Ship City', 'Ship State', 'Ship Zip', 'Has Medical',\n       'Primary CoverageType', 'Primary Payer Name', 'Primary Payer Type',\n       'Secondary CoverageType', 'Secondary PayerName', 'Secondary PayerType',\n       'Plan Paid Amt', 'Pat Copay', 'Copay Assist Amount', 'Oth Payer Amt',\n       'Xfer PharmName', 'MSA PATIENT ID', 'MSA PATIENT BMAP',\n       '__metadata_run_timestamp', '__metadata_app_version',\n       '__metadata_output_contract'],\n      dtype='object')] are in the [columns]"

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
transform.publish_contract.publish(final_dataframe)