In [1]:
from core.helpers.session_helper import SessionHelper
# Open the shared configuration session. Later cells pass it to builder.build,
# query it for the Transformation row, hand it to publish(), and finally close it.
session = SessionHelper().session

2019-07-26 16:46:08,198 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-26 16:46:08,220 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-26 16:46:08,252 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-26 16:46:08,253 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-26 16:46:08,257 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-26 16:46:08,258 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-26 16:46:08,261 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-07-26 16:46:08,262 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-07-26 16:46:0

In [2]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
# config vars: this dataset
config_pharma = "sun" # the pharmaceutical company which owns {brand}
config_brand = "ilumya" # the brand this pipeline operates on
config_state = "enrich" # the state this transform runs in
config_name = "pending_enrichment" # the name of this transform, which is the name of this notebook without .ipynb

# input vars: dataset to fetch. Recall that a contract published to S3 has a key format branch/pharma/brand/state/name
input_pharma = "sun" # pharma segment of the input key (passed as `parent` when the input contract is built)
input_brand = "ilumya" # brand segment of the input key (passed as `child` when the input contract is built)
input_state = "ingest" # state segment of the input key
input_name = "symphony_health_association_ingest_column_mapping" # name segment of the input key (the upstream dataset)
input_branch = "sun-extract-validation" # branch segment of the input key; if None (or empty), the fetch cell falls back to BRANCH_NAME

In [3]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

# builder.build is given the four config values plus the open session. Its result
# is indexed positionally: element 0 is used as the transform id (for the
# Transformation query) and element 1 as the run id (for publish()).
ids = builder.build(config_pharma, config_brand, config_state, config_name, session)
transform_id = ids[0]
run_id = ids[1]

2019-07-26 16:46:09,115 - core.logging - DEBUG - Adding/getting mocks for specified configurations...
2019-07-26 16:46:09,141 - core.logging - DEBUG - Done. Creating mock run event and committing results to configuration mocker.


In [4]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Load the Transformation row whose id was produced by the pipeline builder.
# NOTE(review): presumably a SQLAlchemy query, where .one() raises unless exactly
# one row matches -- confirm against the session implementation.
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Configuration values for this transform, read from ``db_transform``.

    Every field default below is computed from the module-level
    ``db_transform`` row (or from ``BRANCH_NAME``) at the moment this class
    body executes, so ``DbTransform()`` called with no arguments carries the
    values of the transform loaded above.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Output contract built from the same branch/state/company/brand/name values as
    # the fields above. The DatasetContract is constructed once, when the class is
    # defined, and that single object is the default for every instance.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written so that they can be run against the same dataset an infinite number of times and produce the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **free of 'magic numbers':** if it is not instantly obvious, without context, what a variable or function is or does, consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

import pandas as pd
# Raise the notebook display limits to 999 rows and 999 columns.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [31]:
def pending_enrichment(df: pd.DataFrame,
                       cols,
                       pjh,
                       statuses,
                       table_columns,
                       pending_substatuses):
    """Re-label the ``cols.pjh`` value of pending rows based on their status date.

    For every (patient, pharmacy, product) group, four boundary dates are
    derived from the rows that already carry a BVPA / intake / fulfillment
    label. Rows whose status equals ``statuses.pending`` and whose sub-status
    is in ``pending_substatuses`` are then re-labelled, first match wins:

    1. status date before the group's earliest BVPA-or-fulfillment date
       -> ``pjh.intake``
    2. status date within [earliest BVPA, latest BVPA] (inclusive)
       -> ``pjh.bvpa``
    3. status date after the group's latest BVPA-or-intake date
       -> ``pjh.fulfillment``
    4. otherwise the existing label is kept.

    A group with no qualifying rows for a boundary falls back to the
    frame-wide earliest status date (``Min_*`` boundaries) or latest status
    date (``Max_*`` boundaries).

    Args:
        df: input rows; must contain an ``'id'`` column plus the columns
            named by ``cols``. The caller's frame is not modified.
        cols: object exposing the column names as attributes ``patient``,
            ``pharm``, ``status_date``, ``product``, ``ic_status``,
            ``ic_substatus`` and ``pjh``.
        pjh: object exposing the labels ``bvpa``, ``intake`` and
            ``fulfillment``.
        statuses: object exposing the ``pending`` status value.
        table_columns: columns to keep on the enriched rows; must include
            ``'id'``.
        pending_substatuses: sub-status values that make a pending row
            eligible for re-labelling.

    Returns:
        A new DataFrame: the non-enriched rows (which also carry the four
        boundary-date helper columns) followed by the enriched rows restricted
        to ``table_columns``.
    """
    # Imported locally because no notebook-level numpy import is visible.
    import numpy as np

    patient = cols.patient
    pharm = cols.pharm
    status_date = cols.status_date
    product = cols.product
    ic_status = cols.ic_status
    ic_substatus = cols.ic_substatus
    pjh_col = cols.pjh

    pending_status = statuses.pending
    group_keys = [patient, pharm, product]

    # Sort into a new frame instead of in place, so the caller's data is untouched.
    df = df.sort_values(by=group_keys + [status_date], ascending=[True, True, True, True])

    # Frame-wide fallbacks for groups that have no qualifying rows.
    earliest_date = df[status_date].min()
    latest_date = df[status_date].max()

    bvpa_rows = df[df[pjh_col] == pjh.bvpa]
    bvpa_intake_rows = df[df[pjh_col].isin([pjh.bvpa, pjh.intake])]
    bvpa_fulfillment_rows = df[df[pjh_col].isin([pjh.bvpa, pjh.fulfillment])]

    # (helper column, source rows, aggregation, fallback for groups without rows)
    boundaries = [
        ('Min_BVPA_Date', bvpa_rows, 'min', earliest_date),
        ('Max_BVPA_Intake_Date', bvpa_intake_rows, 'max', latest_date),
        ('Max_BVPA_Date', bvpa_rows, 'max', latest_date),
        ('Min_BVPA_Fulfillment_Date', bvpa_fulfillment_rows, 'min', earliest_date),
    ]
    for boundary_name, source_rows, how, fallback in boundaries:
        boundary = (
            source_rows.groupby(group_keys)[status_date]
            .agg(how)
            .reset_index()
            .rename(columns={status_date: boundary_name})
        )
        df = pd.merge(df, boundary, how='left', on=group_keys)
        # Fill only the helper column. A frame-wide fillna would overwrite the
        # missing values of every unrelated column with a date.
        df[boundary_name] = df[boundary_name].fillna(fallback)

    eligible = (df[ic_status] == pending_status) & (df[ic_substatus].isin(pending_substatuses))
    # Explicit copy so the column assignments below do not target a view of df.
    to_enrich_df = df[eligible].copy()

    row_date = to_enrich_df[status_date]
    to_enrich_df['Before_Min_BVPA_Fulfillment'] = np.where(
        row_date < to_enrich_df['Min_BVPA_Fulfillment_Date'], 1, 0)
    to_enrich_df['Before_Min_BVPA'] = np.where(row_date < to_enrich_df['Min_BVPA_Date'], 1, 0)
    to_enrich_df['After_Max_BVPA_Intake'] = np.where(row_date > to_enrich_df['Max_BVPA_Intake_Date'], 1, 0)
    to_enrich_df['After_Max_BVPA'] = np.where(row_date > to_enrich_df['Max_BVPA_Date'], 1, 0)

    is_intake = to_enrich_df['Before_Min_BVPA_Fulfillment'] == 1
    is_bvpa = (to_enrich_df['Before_Min_BVPA'] == 0) & (to_enrich_df['After_Max_BVPA'] == 0)
    is_fulfillment = to_enrich_df['After_Max_BVPA_Intake'] == 1

    # Nested where = first matching rule wins; unmatched rows keep their label.
    to_enrich_df[pjh_col] = np.where(
        is_intake, pjh.intake,
        np.where(is_bvpa, pjh.bvpa,
                 np.where(is_fulfillment, pjh.fulfillment, to_enrich_df[pjh_col])))

    to_enrich_df = to_enrich_df[table_columns]

    enriched_ids = to_enrich_df['id'].values.tolist()
    untouched_df = df[~(df['id'].isin(enriched_ids))]

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([untouched_df, to_enrich_df])

In [5]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.
    '''

    # NOTE(review): this subclass is not decorated with @dataclass, so the
    # annotations below are declarations only -- they add no constructor
    # parameters and no default values; assign them on the instance before use.
    status_type: str  # presumably the status value rows are filtered on (cf. pending_enrich's status_type) -- TODO confirm
    id_cols: list  # presumably the identifying columns used to group records -- TODO confirm
    ambig_list: list  # presumably the ambiguous sub-status values to re-classify -- TODO confirm
    
    
# Created with no arguments: every field inherited from DbTransform has a default.
transform = Transform()

In [6]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull from the configuration application instead
## For the last example, this could look like...
## transform.some_ratio = 0.6
## transform.site_name = "WALGREENS"

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

(clear out and replace with your description)

### Transformation

In [72]:
"""
************ FETCH DATA - TOUCH, BUT CAREFULLY **************
This cell will be turned off in production, as the input_contract will be handled by the pipeline.
"""

# Fall back to the current working branch when no input branch was configured.
if not input_branch:
    input_branch = BRANCH_NAME
input_contract = DatasetContract(branch=input_branch, state=input_state, parent=input_pharma, child=input_brand, dataset=input_name)

# If the input has been published more than once, fetching without a run filter
# returns duplicate rows. Restrict the fetch to one run; change FETCH_RUN_ID to
# pull a different run.
FETCH_RUN_ID = 3
run_filter = [dict(partition="__metadata_run_id", comparison="==", values=[FETCH_RUN_ID])]
df = input_contract.fetch(filters=run_filter)

import pandas as pd
pd.options.display.max_columns=999

# Keep only the first 8 characters of each raw value and parse them as a date.
# Values that cannot be parsed become NaT. The deprecated infer_datetime_format
# flag is dropped; it only affected parsing speed.
for date_col in ("status_date", "ref_date"):
    df[date_col] = pd.to_datetime(df[date_col].str[:8].astype(str), errors='coerce')

2019-07-26 17:46:50,527 - core.dataset_contract.DatasetContract - INFO - Fetching dataframe from s3 location s3://ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping.


In [82]:
# Keep only the rows whose status_code is PENDING.
bvpa_df = df.loc[df['status_code'].eq('PENDING')]

In [102]:
# Distinct (pharm_code, medication, msa_patient_id) combinations among the rows of
# bvpa_df whose sub_status is OTHER, keyed by row position.
bvpa_dict = (
    bvpa_df[bvpa_df.sub_status == 'OTHER'][['pharm_code', 'medication', 'msa_patient_id']]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_dict(orient='index')
)

# Earliest and latest status_date per patient / pharmacy / medication.
min_bvpa = (
    bvpa_df.groupby(['msa_patient_id', 'pharm_code', 'medication']).status_date.min()
    .reset_index(drop=False)
    .rename(columns={'status_date': 'min_bvpa_date'})
)
# The max column was previously also named 'min_bvpa_date' (copy/paste slip), which
# would collide with min_bvpa's column as soon as both are joined to one frame.
max_bvpa = (
    bvpa_df.groupby(['msa_patient_id', 'pharm_code', 'medication']).status_date.max()
    .reset_index(drop=False)
    .rename(columns={'status_date': 'max_bvpa_date'})
)

In [32]:
def pending_enrich(df, ambig_list, status_col, id_col_list, status_type):
    """Unfinished stub: select the rows of ``df`` whose ``status_col`` equals ``status_type``.

    The selection is only bound to a local name; nothing is returned yet, and
    ``ambig_list`` and ``id_col_list`` are not used.
    """
    status_matches = df[status_col] == status_type
    bvpa_df = df[status_matches]

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,...,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_run_timestamp,__metadata_app_version,__metadata_output_contract,__metadata_transform_timestamp,__metadata_run_id
0,20181024115959,ACCREDO,1346208949,COM,279133432018102401,0,DIRECT,20181019120000,,27913343,...,,,,,NNNNV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22,3
1,20181025115959,ACCREDO,1346208949,COM,278370982018102502,0,DIRECT,20181022120000,,27837098,...,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22,3
2,20181029115959,ACCREDO,1346208949,COM,279181482018102903,0,DIRECT,20181024120000,,27918148,...,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22,3
3,20181102115959,ACCREDO,1346208949,COM,267244982018110204,0,DIRECT,20181030120000,,26724498,...,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22,3
4,20181106115959,ACCREDO,1346208949,COM,160618142018110605,0,DIRECT,20181102120000,,16061814,...,,,,,NNNVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:22,3


In [92]:
# Display the column names of the fetched frame as an array.
df.columns.values

array(['rec_date', 'pharm_code', 'pharm_npi', 'transtype',
       'pharm_transaction_id', 'trans_seq', 'ref_source', 'ref_date',
       'program_id', 'pharmacy_id', 'pat_last_name', 'pat_first_name',
       'pat_dob', 'pat_gender', 'pat_addr1', 'pat_addr2', 'pat_city',
       'pat_state', 'pat_zip', 'dx1_code', 'dx2_code', 'status_date',
       'status_code', 'sub_status', 'pres_last_name', 'pres_first_name',
       'pres_addr1', 'pres_addr2', 'pres_city', 'pres_state', 'pres_zip',
       'pres_phone', 'pres_npi', 'pres_dea', 'facility_name', 'rxdate',
       'rxnumber', 'rxrefills', 'rxfill', 'refill_remaining', 'prev_disp',
       'rx_ndc_number', 'medication', 'quantity', 'day_supply',
       'ship_date', 'ship_carrier', 'shiptracking_num', 'ship_location',
       'ship_address', 'ship_city', 'ship_state', 'ship_zip',
       'has_medical', 'primary_coverage_type', 'primary_payer_name',
       'primary_payer_type', 'secondary_coverage_type',
       'secondary_payer_name', 'secondary_

In [74]:
# Order rows by patient, pharmacy, medication, then status date (all ascending).
df = df.sort_values(
    by=['msa_patient_id', 'pharm_code', 'medication', 'status_date'],
    ascending=True,
)

df.head()

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,pat_last_name,pat_first_name,pat_dob,pat_gender,pat_addr1,pat_addr2,pat_city,pat_state,pat_zip,dx1_code,dx2_code,status_date,status_code,sub_status,pres_last_name,pres_first_name,pres_addr1,pres_addr2,pres_city,pres_state,pres_zip,pres_phone,pres_npi,pres_dea,facility_name,rxdate,rxnumber,rxrefills,rxfill,refill_remaining,prev_disp,rx_ndc_number,medication,quantity,day_supply,ship_date,ship_carrier,shiptracking_num,ship_location,ship_address,ship_city,ship_state,ship_zip,has_medical,primary_coverage_type,primary_payer_name,primary_payer_type,secondary_coverage_type,secondary_payer_name,secondary_payer_type,plan_paid_amt,pat_copay,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_run_timestamp,__metadata_app_version,__metadata_output_contract,__metadata_transform_timestamp,__metadata_run_id
274,20181106 23:00:00,CVS,1043382302,COM,182176830,0,HUB,2018-10-19,1303801,9009919609,,,,M,,,,,30,L40.0,,2018-12-31,ACTIVE,SHIPMENT,CHAO,TOMAS,100 STONEFOREST DR,STE 320,WOODSTOCK,GA,30189,7705165199,1316003577,MC0707286,,20180918.0,81872197.0,0.0,0.0,0.0,,47335017795.0,ILUMYA SD PFS,1.0,28.0,20181106 23:00:00,UPS,1Z265561NW85358841,PRESCRIBER OFFICE,"100 STONE FOREST DRIVE,SUITE 320",WOODSTOCK,GA,30.0,,PHARMACY,,OTHER,,,,,,,,,2120001,VVVVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:27,3
275,20181220 23:00:00,CVS,1043382302,COM,183711690,0,HUB,2018-10-19,1303801,9009919609,,,,M,,,,,30,L40.0,,2018-12-31,ACTIVE,SHIPMENT,CHAO,TOMAS,100 STONEFOREST DR,STE 320,WOODSTOCK,GA,30189,7705165199,1316003577,MC0707286,,20180918.0,81872456.0,0.0,0.0,0.0,,47335017795.0,ILUMYA SD PFS,1.0,31.0,20181220 23:00:00,UPS,1Z265561NW86226400,PRESCRIBER OFFICE,"100 STONE FOREST DRIVE,100 STONE FOREST DRIVE ...",WOODSTOCK,GA,30.0,,PHARMACY,,OTHER,,,,,,,,,2120001,VVVVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:27,3
3293,20181106 23:00:00,CVS,1043382302,COM,182176830,0,HUB,2018-10-19,1303801,9009919609,,,,M,,,,,30,L40.0,,2018-12-31,ACTIVE,SHIPMENT,CHAO,TOMAS,100 STONEFOREST DR,STE 320,WOODSTOCK,GA,30189,7705165199,1316003577,MC0707286,,20180918.0,81872197.0,0.0,0.0,0.0,,47335017795.0,ILUMYA SD PFS,1.0,28.0,20181106 23:00:00,UPS,1Z265561NW85358841,PRESCRIBER OFFICE,"100 STONE FOREST DRIVE,SUITE 320",WOODSTOCK,GA,30.0,,PHARMACY,,OTHER,,,,,,,,,2120001,VVVVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:27,3
3294,20181220 23:00:00,CVS,1043382302,COM,183711690,0,HUB,2018-10-19,1303801,9009919609,,,,M,,,,,30,L40.0,,2018-12-31,ACTIVE,SHIPMENT,CHAO,TOMAS,100 STONEFOREST DR,STE 320,WOODSTOCK,GA,30189,7705165199,1316003577,MC0707286,,20180918.0,81872456.0,0.0,0.0,0.0,,47335017795.0,ILUMYA SD PFS,1.0,31.0,20181220 23:00:00,UPS,1Z265561NW86226400,PRESCRIBER OFFICE,"100 STONE FOREST DRIVE,100 STONE FOREST DRIVE ...",WOODSTOCK,GA,30.0,,PHARMACY,,OTHER,,,,,,,,,2120001,VVVVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:27,3
932,20190308 23:00:00,CVS,1043382302,COM,901165655620190308000000,0,HUB,2019-03-08,1337729,9011656556,,,,M,,,,,8,,,2019-03-08,PENDING,NEW,PRADEEP,MEERA,347 MT PLEASANT AVE,STE 103,WEST ORANGE,NJ,7052,9735712121,1346529948,MP3244314,,,,,,,,,,,,,,,,,,,,,MEDICAL,,,,,,,,,,,2120004,VVVVV,2019-07-01 13:25:07,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-07-01 13:35:27,3


In [47]:
# Unique (pharm_npi, medication) pairs with both values present, keyed by row position.
pj_dict = (
    df.loc[:, ['pharm_npi', 'medication']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .to_dict('index')
)

In [48]:
def return_df(df,dct,key):
    """Return the shape of the PENDING rows of ``df`` that match ``dct[key]``.

    ``dct[key]`` maps column names to required values. A row is selected when
    its ``status_code`` is ``'PENDING'`` and every listed column equals its
    required value. Any number of columns is accepted, so both two-column and
    three-column lookup entries work.

    Args:
        df: frame containing ``status_code`` and the columns named in ``dct[key]``.
        dct: mapping of key -> {column name: required value}.
        key: which entry of ``dct`` to apply.

    Returns:
        ``(row_count, column_count)`` of the selected rows. Despite the
        function's name, this is a shape tuple, not a DataFrame.
    """
    selected = df['status_code'] == 'PENDING'
    for column, required_value in dct[key].items():
        selected = selected & (df[column] == required_value)
    return df[selected].shape

In [49]:
# Drop the patient, diagnosis, prescriber and prescription-detail columns.
df = df.drop(
    columns=[
        'pat_last_name', 'pat_first_name', 'pat_dob', 'pat_gender',
        'pat_addr1', 'pat_addr2', 'pat_city', 'pat_state', 'pat_zip',
        'dx1_code', 'dx2_code',
        'pres_last_name', 'pres_first_name', 'pres_addr1', 'pres_addr2',
        'pres_city', 'pres_state', 'pres_zip', 'pres_phone', 'pres_npi', 'pres_dea',
        'facility_name',
        'rxdate', 'rxnumber', 'rxrefills', 'rxfill', 'refill_remaining',
        'prev_disp', 'rx_ndc_number',
    ]
)

In [None]:
# NOTE(review): leftover scratch fragment -- these are the sort_values keyword
# arguments used inside pending_enrichment, pasted without the call around them.
# The names on this line are not defined in any visible notebook-level cell and
# the cell has never been executed; it looks safe to delete.
by=[patient, pharm, product, status_date], ascending=[True, True, True, True]

In [104]:
import unittest

def shape_status(final_dataframe,df):
    """Return True when ``final_dataframe`` has the same (rows, columns) shape as ``df``."""
    expected_shape = df.shape
    actual_shape = final_dataframe.shape
    return expected_shape == actual_shape

def substatus_cleaned(final_dataframe,sub_col,sub_status):
    """Return True when no row of ``final_dataframe`` still has ``sub_status`` in ``sub_col``.

    Args:
        final_dataframe: the transformed frame to check.
        sub_col: name of the sub-status column.
        sub_status: the sub-status value that should have been removed.

    Returns:
        A plain ``bool``. The previous version returned an element-wise
        DataFrame comparison, which cannot be used as a pass/fail value.
    """
    remaining_rows = int((final_dataframe[sub_col] == sub_status).sum())
    return remaining_rows == 0

class TestNotebook(unittest.TestCase):
    
    def test_shape_status(self):
        self.assertEqual(shape_status(final_dataframe,df),True)
        
    def test_substatus_cleaned(final_dataframe,'sub_status','OTHER')
    
# argv=[''] supplies a placeholder argument list so unittest does not try to parse
# the process's own command-line arguments; exit=False stops unittest from calling
# sys.exit() when the run finishes, so the notebook keeps running.
unittest.main(argv=[''],verbosity=2,exit=False)

In [None]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
try:
    transform.publish_contract.publish(final_dataframe, run_id, session)
finally:
    # Always release the session, even when publishing raises.
    session.close()