In [1]:
from core.helpers.session_helper import SessionHelper
session = SessionHelper().session

2019-08-12 19:39:06,169 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-08-12 19:39:06,198 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-08-12 19:39:06,235 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-08-12 19:39:06,236 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-08-12 19:39:06,240 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-08-12 19:39:06,242 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-08-12 19:39:06,252 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-08-12 19:39:06,253 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-08-12 19:39:0

In [2]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
# config vars: this dataset
config_pharma = "sun" # the pharmaceutical company which owns {brand}
config_brand = "ilumya" # the brand this pipeline operates on
config_state = "enrich" # the state this transform runs in
config_name = "accredo_cancel_before_active" # the name of this transform, which is the name of this notebook without .ipynb

# input vars: dataset to fetch. Recall that a contract published to S3 has a key format branch/pharma/brand/state/name
input_pharma = "sun"
input_brand = "ilumya"
input_state = "ingest"
input_name = "symphony_health_association_ingest_column_mapping"
input_branch = "sun-extract-validation" # if None, input_branch is automagically set to your working branch

In [3]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

ids = builder.build(config_pharma, config_brand, config_state, config_name, session)
transform_id = ids[0]
run_id = ids[1]

2019-08-12 19:39:06,438 - core.logging - DEBUG - Adding/getting mocks for specified configurations...
2019-08-12 19:39:06,470 - core.logging - DEBUG - Done. Creating mock run event and committing results to configuration mocker.


In [4]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::accredo_cancel_before_active
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinate number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [5]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.
    '''
    # Column aliases
    status_date: str # Status date column
    ref_date: str # Referral date column
    patient: str # Patient ID column
    pharm: str # Pharmacy Name column
    product: str # Medication Name column
    status: str # Status column
    substatus: str # Substatus column
    ic_status: str # IntegriChain Status column
    ic_substatus: str # IntegriChain Substatus column
    pjh: str # Patient Journey Hierarchy column
        
    # Possible status values
    pending: str # Pending status 'PENDING'
    active: str # Active status 'ACTIVE'
    cancelled: str # Cancelled status 'CANCELLED'
    discontinued: str # Discontinued status 'DISCONTINUED'
        
    # Possible substatus values
    pending_new: str # New substatus when status is 'PENDING'
    shipped: str # Shipment substatus when status is 'ACTIVE'
    
    # Possible PJH values
    bvpa: str # BV/PA pjh
    intake: str # Intake pjh
    fulfillment: str # Fulfillment pjh
    transfer: str # Transferred pjh
    payer: str # Payer pjh

transform = Transform()

In [6]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull from the configuration application instead
## For the last example, this could look like...
## transform.some_ratio = 0.6
## transform.site_name = "WALGREENS"

# Vars
transform.patient = 'msa_patient_id'
transform.pharm = 'pharm_code'
transform.product = 'medication'
transform.ref_date = 'rec_date'
transform.status_date = 'status_date'
transform.status = 'status_code'
transform.substatus = 'sub_status'
transform.ic_status = 'integrichain_status'
transform.ic_substatus = 'integrichain_sub_status'
transform.pjh = 'Patient_Journey_Hierarchy'

# Values
transform.pending = 'PENDING'
transform.active = 'ACTIVE'
transform.cancelled = 'CANCELLED'
transform.discontinued = 'DISCONTINUED'
transform.pending_new = 'PENDING'
transform.shipped = 'SHIPMENT'
transform.bvpa = 'BV/PA'
transform.intake = 'Intake'
transform.fulfillment = 'Fulfillment'
transform.transfer = 'Transferred'
transform.payer = 'Payer'

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

### Transformation

In [7]:
"""
************ FETCH DATA - TOUCH, BUT CAREFULLY **************
This cell will be turned off in production, as the input_contract will be handled by the pipeline.
"""

if not input_branch:
    input_branch = BRANCH_NAME
input_contract = DatasetContract(branch=input_branch, state=input_state, parent=input_pharma, child=input_brand, dataset=input_name)
run_filter = []
run_filter.append(dict(partition="__metadata_run_id", comparison="==", values=[3]))
# IF YOU HAVE PUBLISHED DATA MULTIPLE TIMES, uncomment the above line and change the int to the run_id to fetch.
# Otherwise, you will have duplicate values in your fetched dataset!
final_dataframe = input_contract.fetch(filters=run_filter)

2019-08-12 19:39:07,019 - core.dataset_contract.DatasetContract - INFO - Fetching dataframe from s3 location s3://ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping.


### Set up column mapping

In [8]:
"""
This cell just stores the transform properties as variable names, in order
to avoid having to change cells in the future.
"""

patient = transform.patient
pharm = transform.pharm
product = transform.product
ref_date = transform.ref_date
status_date = transform.status_date
status = transform.status
substatus = transform.substatus
ic_status = transform.ic_status
ic_substatus = transform.ic_substatus
pjh = transform.pjh

pending = transform.pending
active = transform.active
cancelled = transform.cancelled
discontinued = transform.discontinued
pending_new = transform.pending_new
shipped = transform.shipped
bvpa = transform.bvpa
intake = transform.intake
fulfillment = transform.fulfillment
transfer = transform.transfer
payer = transform.payer

### Impute PJH

In [9]:
"""
This cell brings in the Patient Journey Hierarchy information from a local file.
This is just a placeholder to ensure data processes work as expected until the
true ingest data can be used.
"""

import os

import numpy as np
import pandas as pd

pd.options.display.max_columns=999

df = final_dataframe.copy()

# Standardize date columns to datetime format and cut extra chars
df[status_date] = df[status_date].str[:8].astype(str)
df[ref_date] = df[ref_date].str[:8].astype(str)
df[status_date] = pd.to_datetime(
    df[status_date],
    infer_datetime_format=True,
    errors='coerce'
)
df[ref_date] = pd.to_datetime(
    df[ref_date],
    infer_datetime_format=True,
    errors='coerce'
)

# Read in status mappings
os.chdir('{}'.format(os.path.expanduser('~')))
status_config = pd.read_csv('status_mapping.csv')

# Format status mappings
status_config.loc[:, 'statusCode'] = status_config.statusCode.str.upper()
status_config.loc[:, 'subStatus'] = status_config.subStatus.str.upper()
status_config.loc[:,'integrichain_sub_status'] = status_config.integrichain_sub_status.str.upper()
status_config.loc[:,'integrichain_status'] = status_config.integrichain_status.str.upper()
status_config.loc[:,'Patient_Journey_Hierarchy'] = status_config.Patient_Journey_Hierarchy.str.upper()
status_config = status_config.rename(columns={'statusCode':'status_code','subStatus':'sub_status'})

# Bring in status mappings and enforce columns
df[substatus] = df[substatus].str.replace('PRESCRIBERHOLD','PRESCRIBER HOLD')

df = pd.merge(df, status_config, on=['status_code','sub_status'])

df = df[['rec_date', 'pharm_code', 'pharm_npi', 'transtype', 'pharm_transaction_id', 'trans_seq', 'ref_source', 'ref_date', 'program_id', 'pharmacy_id', 'pat_last_name', 'pat_first_name', 'pat_dob', 'pat_gender', 
         'pat_addr1', 'pat_addr2', 'pat_city', 'pat_state', 'pat_zip', 'dx1_code', 'dx2_code', 'status_date', 'status_code', 'sub_status', 'integrichain_status','integrichain_sub_status', 'Patient_Journey_Hierarchy', 
         'pres_last_name', 'pres_first_name', 'pres_addr1', 'pres_addr2', 'pres_city', 'pres_state', 'pres_zip', 'pres_phone', 'pres_npi', 'pres_dea', 'facility_name', 'rxdate', 'rxnumber', 'rxrefills', 'rxfill', 
         'refill_remaining', 'prev_disp', 'rx_ndc_number', 'medication', 'quantity', 'day_supply', 'ship_date', 'ship_carrier', 'shiptracking_num', 'ship_location', 'ship_address', 'ship_city', 'ship_state', 'ship_zip', 
         'has_medical', 'primary_coverage_type', 'primary_payer_name', 'primary_payer_type', 'secondary_coverage_type', 'secondary_payer_name', 'secondary_payer_type', 'plan_paid_amt', 'pat_copay', 'copay_assist_amount', 
         'oth_payer_amt', 'xfer_pharmname', 'msa_patient_id', 'msa_patient_bmap', '__metadata_run_timestamp', '__metadata_app_version', '__metadata_output_contract', '__metadata_transform_timestamp', '__metadata_run_id']]

In [10]:
# Sort values for logic later
df = (
    df
    .sort_values(by=[
            patient,
            pharm,
            product,
            status_date
    ])
)

In [11]:
# Filter down to accredo and find new substatuses
accredo_df = df[df[pharm] == 'ACCREDO']

new_substatus = (
    accredo_df
    .drop_duplicates(subset=[
        patient,
        pharm,
        product,
        ic_status,
        ic_substatus,
        pjh
    ])
)

In [12]:
# Create fields for next patient (requires the sort earlier)
new_substatus.loc[:, 'next_patient_id'] = new_substatus[patient].shift(-1)
new_substatus.loc[:, 'next_product'] = new_substatus[product].shift(-1)
new_substatus.loc[:, 'next_IC_substatus'] = (
    np.where(
        # Pick out patient/medication rows that appear in the next patient fields
        (
            (new_substatus[patient] == new_substatus['next_patient_id']) &
            (new_substatus[product] == new_substatus['next_product'])
        ),
        # Assign the next status if they do
        new_substatus[ic_substatus].shift(-1),
        # Assign a unique identifier saying it is the last
        'Last Status Reported'
    )
)
new_substatus.loc[:, 'next_IC_status'] = (
    np.where(
        # Pick out patient/medication rows that appear in the next patient fields
        (
            (new_substatus[patient] == new_substatus['next_patient_id']) &
            (new_substatus[product] == new_substatus['next_product'])
        ),
        new_substatus[ic_status].shift(-1),
        'Last Status Reported'
    )
)
new_substatus.loc[:, 'next_substatus_date'] = new_substatus[status_date].shift(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [13]:
""" THIS IS A CHECK CELL """
new_substatus[[patient, pharm, product, ic_status, ic_substatus, pjh, 'next_patient_id', 'next_product', 'next_IC_substatus', 'next_substatus_date']].head(10)

Unnamed: 0,msa_patient_id,pharm_code,medication,integrichain_status,integrichain_sub_status,Patient_Journey_Hierarchy,next_patient_id,next_product,next_IC_substatus,next_substatus_date
9525,2120025,ACCREDO,ILUMYA,PENDING,NEW,INTAKE,2120025,ILUMYA,OTHER,2019-06-20
268,2120025,ACCREDO,ILUMYA,CANCELLED,OTHER,PROVIDER,2410002,ILUMYA,Last Status Reported,2019-06-18
10649,2410002,ACCREDO,ILUMYA,PENDING,OTHER,FULFILLMENT,2440008,ILUMYA,Last Status Reported,2019-06-18
9523,2440008,ACCREDO,ILUMYA,PENDING,NEW,INTAKE,2440008,ILUMYA,OTHER,2019-06-20
269,2440008,ACCREDO,ILUMYA,CANCELLED,OTHER,PROVIDER,2540010,ILUMYA,Last Status Reported,2019-04-04
7951,2540010,ACCREDO,ILUMYA,PENDING,PATIENT CONTACT,FULFILLMENT,2660008,ILUMYA,Last Status Reported,2019-04-03
7946,2660008,ACCREDO,ILUMYA,PENDING,PATIENT CONTACT,FULFILLMENT,2660008,ILUMYA 100 MG/ML PFS 1 ML,Last Status Reported,2019-04-04
5018,2660008,ACCREDO,ILUMYA 100 MG/ML PFS 1 ML,ACTIVE,SHIPMENT,FULFILLMENT,2660008,ILUMYA 100 MG/ML PFS 1 ML,READY,2019-04-17
2747,2660008,ACCREDO,ILUMYA 100 MG/ML PFS 1 ML,ACTIVE,READY,FULFILLMENT,2910021,ILUMYA,Last Status Reported,2019-03-20
10327,2910021,ACCREDO,ILUMYA,PENDING,OTHER,FULFILLMENT,2910021,ILUMYA,INSURANCE OON,2019-03-22


In [14]:
# Obtain the last status date for each accredo patient
new_substatus2 = (
    accredo_df
    .groupby([
        patient,
        pharm,
        product,
        ic_status,
        ic_substatus,
        pjh
    ])[status_date]
    .max()
    .reset_index()
    .rename(columns={status_date: 'Last_Status_Date'})
)

In [15]:
""" THIS IS A CHECK CELL """
new_substatus2.head()

Unnamed: 0,msa_patient_id,pharm_code,medication,integrichain_status,integrichain_sub_status,Patient_Journey_Hierarchy,Last_Status_Date
0,2120025,ACCREDO,ILUMYA,CANCELLED,OTHER,PROVIDER,2019-06-20
1,2120025,ACCREDO,ILUMYA,PENDING,NEW,INTAKE,2019-06-19
2,2410002,ACCREDO,ILUMYA,PENDING,OTHER,FULFILLMENT,2019-06-18
3,2440008,ACCREDO,ILUMYA,CANCELLED,OTHER,PROVIDER,2019-06-20
4,2440008,ACCREDO,ILUMYA,PENDING,NEW,INTAKE,2019-06-18


In [16]:
# Merge to original new_substatus to get last status date
new_substatus = pd.merge(
    new_substatus,
    new_substatus2,
    how='inner',
    on=[patient, pharm, product, ic_status, ic_substatus, pjh]
)

In [17]:
""" THIS IS A CHECK CELL """
new_substatus.iloc[:5, -5:]

Unnamed: 0,next_product,next_IC_substatus,next_IC_status,next_substatus_date,Last_Status_Date
0,ILUMYA,OTHER,CANCELLED,2019-06-20,2019-06-19
1,ILUMYA,Last Status Reported,Last Status Reported,2019-06-18,2019-06-20
2,ILUMYA,Last Status Reported,Last Status Reported,2019-06-18,2019-06-18
3,ILUMYA,OTHER,CANCELLED,2019-06-20,2019-06-18
4,ILUMYA,Last Status Reported,Last Status Reported,2019-04-04,2019-06-20


In [18]:
# Create days column to show how many days patient has been in their status
new_substatus['days_in_current'] = (
    (new_substatus['next_substatus_date'] - new_substatus[status_date]) /
    np.timedelta64(1, 'D')
)

# BUG HERE: days in current is negative

In [19]:
""" THIS IS A CHECK CELL """
new_substatus.iloc[:5, -6:]

Unnamed: 0,next_product,next_IC_substatus,next_IC_status,next_substatus_date,Last_Status_Date,days_in_current
0,ILUMYA,OTHER,CANCELLED,2019-06-20,2019-06-19,1.0
1,ILUMYA,Last Status Reported,Last Status Reported,2019-06-18,2019-06-20,-2.0
2,ILUMYA,Last Status Reported,Last Status Reported,2019-06-18,2019-06-18,0.0
3,ILUMYA,OTHER,CANCELLED,2019-06-20,2019-06-18,2.0
4,ILUMYA,Last Status Reported,Last Status Reported,2019-04-04,2019-06-20,-77.0


In [21]:
# Flag patients that need enrichment
new_substatus['Needs_Enrichment'] = (
    np.where(
        # Pick only rows that are pending/new, cancelled, or discontinued
        (
            (
                (new_substatus[ic_status] == pending) &
                (new_substatus[ic_substatus] == pending_new)
            ) |
            (new_substatus[ic_status].isin([cancelled, discontinued]))
        ) &
        # Pick only those with the next status as active/shipment
        (new_substatus['next_IC_status'] == active) &
        (new_substatus['next_IC_substatus'] == shipped) &
        # Pick only those with same patient/medication as the next
        (new_substatus[patient].astype(str) == new_substatus['next_patient_id'].astype(str)) &
        (new_substatus[product].astype(str) == new_substatus['next_product'].astype(str)) &
        # Pick only those that have been in their pending/new, cancelled,
        # or discontinued status for more than 2 days
        (new_substatus['days_in_current'] > 2),
        1,
        0
    )
)

# BUG: coming back as nothing needing enrichment

In [26]:
""" THIS IS A CHECK CELL """
new_substatus[new_substatus.Needs_Enrichment == 1]

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,pat_last_name,pat_first_name,pat_dob,pat_gender,pat_addr1,pat_addr2,pat_city,pat_state,pat_zip,dx1_code,dx2_code,status_date,status_code,sub_status,integrichain_status,integrichain_sub_status,Patient_Journey_Hierarchy,pres_last_name,pres_first_name,pres_addr1,pres_addr2,pres_city,pres_state,pres_zip,pres_phone,pres_npi,pres_dea,facility_name,rxdate,rxnumber,rxrefills,rxfill,refill_remaining,prev_disp,rx_ndc_number,medication,quantity,day_supply,ship_date,ship_carrier,shiptracking_num,ship_location,ship_address,ship_city,ship_state,ship_zip,has_medical,primary_coverage_type,primary_payer_name,primary_payer_type,secondary_coverage_type,secondary_payer_name,secondary_payer_type,plan_paid_amt,pat_copay,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_run_timestamp,__metadata_app_version,__metadata_output_contract,__metadata_transform_timestamp,__metadata_run_id,next_patient_id,next_product,next_IC_substatus,next_IC_status,next_substatus_date,Last_Status_Date,days_in_current,Needs_Enrichment


In [None]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
transform.publish_contract.publish(final_dataframe, run_id, session)
session.close()