In [1]:
from core.helpers.session_helper import SessionHelper
session = SessionHelper().session

2019-08-08 13:58:15,313 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-08-08 13:58:15,340 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-08-08 13:58:15,378 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-08-08 13:58:15,379 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-08-08 13:58:15,383 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-08-08 13:58:15,385 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-08-08 13:58:15,388 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-08-08 13:58:15,390 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-08-08 13:58:1

In [2]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
# config vars: this dataset
config_pharma = "sun" # the pharmaceutical company which owns {brand}
config_brand = "ilumya" # the brand this pipeline operates on
config_state = "enrich" # the state this transform runs in
config_name = "fill_rate_schema" # the name of this transform, which is the name of this notebook without .ipynb

# input vars: dataset to fetch. Recall that a contract published to S3 has a key format branch/pharma/brand/state/name
input_pharma = "sun"
input_brand = "ilumya"
input_state = "ingest"
input_name = "symphony_health_association_ingest_column_mapping"
input_branch = "sun-extract-validation" # if None, input_branch is automagically set to your working branch

In [3]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

ids = builder.build(config_pharma, config_brand, config_state, config_name, session)
transform_id = ids[0]
run_id = ids[1]

2019-08-08 13:58:15,530 - core.logging - DEBUG - Adding/getting mocks for specified configurations...
2019-08-08 13:58:15,556 - core.logging - DEBUG - Done. Creating mock run event and committing results to configuration mocker.


In [4]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinate number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [121]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.
    '''
    
    pat_id: str # Patiend ID column
    pharm_name: str # Pharmacy Name column
    brand_id: str # Brand Name column
    status_date: str # Status Date column
    ref_date: str # Referral Date column
    transaction_id: str # Transaction ID column
    ref_source: str # Referral Source column
    provider_fn: str # Provider First Name column
    provider_ln: str # Provider Last Name column
    provider_s: str # Provider State column
    payer: str # Payer Name column
    payer_type: str # Payer Type column
    shipment_status: str # string of shipment status. Should be something like 'SHIPMENT'
    transfered_status: str # string of transfered status. Should be something like 'TRANSFERRED'
    cancelled_status: str # string of cancelled status. Should be something like 'CANCELLED'
    ic_status: str # column name of integrichain status
    ic_sub_status: str # column name of integrichain sub status
    pjh: str # column name of patient journey heirarchy
    shipdate: str # string of shipment date. Should be something like 'shipdate'
    firstfilldate: str # string of first fill date. Should be something like 'firstfilldate'
    canceldate: str # string of cancel date. Should be something like 'canceldate'
    cancelsubstatus: str # string of cancel sub status. Should be something like 'cancelsubstatus'
    cancelreason: str # string of cancel reason. Should be something like 'cancelreason'
    referralstatus: str # string of referral status. Should be something like 'referralstatus'
    filled: str # column name for created one hot column if PJ has been filled. Should be something like 'filled'
    transferred: str # column name for created one hot column if PJ has been transferred. Should be something like 'transferred'
    cancelled: str # column name for created one hot column if PJ has been cancelled. Should be something like 'cancelled'
    stillopen: str # column name for created one hot column if PJ is still open. Should be something like 'stillopen'
    
    
    def fill_rate_schema(self,df):
        
        col_ids = [self.pat_id, self.pharm_name, self.brand_id]
        
        df = df.sort_values([self.pat_id, self.pharm_name, self.brand_id, self.status_date],ascending=[True, True, True, True])
        
        df, max_status_date = self._return_groupby_max(df,col_ids,self.status_date)
        
        df['adjusted_transaction_id'] = df[transform.transaction_id].str.extract(r'(\d+$)')
        df['adjusted_transaction_id'] = df.adjusted_transaction_id.apply(lambda x: x[0:19])
        df['adjusted_transaction_id'] = pd.to_numeric(df.adjusted_transaction_id)
        
        df['max_date'] = np.where(df[self.status_date] == df[max_status_date],1,0)
        
        df = self._return_groupby_min_status_date(df,[self.pat_id, self.pharm_name, self.brand_id, self.ic_sub_status],'min_status_date')
        
        df = df[df.max_date == 1][[self.pat_id, self.pharm_name, self.brand_id, self.transaction_id, 'adjusted_transaction_id', 'min_status_date',
                                   self.status_date, self.ref_source, self.provider_fn, self.provider_ln, self.provider_s, self.payer, self.payer_type, 
                                   self.status_date, self.ref_date, self.ic_status, self.ic_sub_status, self.pjh]].drop_duplicates()
        
        df, max_transaction_id = self._return_groupby_max(df,col_ids,'adjusted_transaction_id')
        
        df['max_transaction'] = np.where(df.adjusted_transaction_id == df[max_transaction_id],1,0)
        
        df = df[df.max_transaction == 1].drop(labels=['max_transaction',max_transaction_id,'adjusted_transaction_id'],axis=1)
        
        df[self.filled] = np.where((df[self.ic_sub_status] == self.shipment_status),1,0)
        df[self.transferred] = np.where((df[self.pjh] == self.transfered_status),1,0)
        df[self.cancelled] = np.where((df[self.ic_status] == self.cancelled_status) & (df[self.pjh] != self.transfered_status),1,0)
        df[self.stillopen] = np.where((df.filled == 0) & (df.transferred == 0) & (df.cancelled == 0),1,0)
        
        df = df.drop_duplicates(subset=[self.pat_id, self.pharm_name, self.brand_id, self.filled, self.transferred, self.cancelled, self.stillopen]).reset_index(drop=True)
        
        df.loc[df[self.filled] == 1,'firstfilldate'] = df['min_status_date']
        df['firstfilldate'] = pd.to_datetime(df['firstfilldate'])
        df.loc[:,'shipdate'] = df['firstfilldate']

        df.loc[df[self.cancelled] == 1,'canceldate'] = df['min_status_date']
        df['canceldate'] = pd.to_datetime(df['canceldate'])
        
        df.loc[df.filled == 1, self.referralstatus] = 'FILLED'
        df.loc[df.cancelled == 1, self.referralstatus] = 'CANCELLED'
        df.loc[(df.transferred == 1) | (df.stillopen == 1), self.referralstatus] = ' OPEN'
        
        return df
        
        df = df[[self.transaction_id,self.pat_id, self.pharm_name, self.brand_id, self.ref_source, self.provider_fn, self.provider_ln, self.provider_s, self.payer, 
                 self.payer_type, self.ic_status, self.ic_sub_status, self.status_date, self.ref_date, self.shipdate, self.firstfilldate, self.canceldate, self.cancelsubstatus, 
                 self.cancelreason, self.referralstatus, self.filled, self.transferred, self.cancelled, self.stillopen]]
        
        # This needs to be the last change made before returning.
        df = df.rename(columns={self.transaction_id:'transactionid', self.pat_id:'patientid', self.pharm_name:'pharmname', self.brand_id:'brand', self.ref_source:'ref_source', 
                                self.provider_fn:'providerfirstname', self.provider_ln:'providerlastname', self.provider_s:'providerstate', self.payer:'payer', self.payer_type:'payertype',
                                self.ic_status:'statuscode', self.ic_sub_status:'substatus', self.status_date:'statusdate', self.ref_date:'referraldate'})
        
        return df
    
    
    def _return_groupby_min_status_date(self,df,group_by_cols,col_rename):
        min_date = (
            df
            .groupby(group_by_cols)
            [self.status_date]
            .min()
            .reset_index(drop=False)
            .rename(columns={self.status_date:col_rename})
        )

        df = pd.merge(df,min_date,on=group_by_cols,how='left')
        
        return df
    
    
    @staticmethod
    def _return_groupby_max(df,group_by_cols,series):
        
        max_col_name = 'max_{0}'.format(series)
        
        max_df = (
            df
            .groupby(group_by_cols)
            [series]
            .max()
            .reset_index(drop=False)
            .rename(columns={series:max_col_name})
        )
        
        return pd.merge(df,max_df,on=group_by_cols), max_col_name


transform = Transform()

In [122]:
transform.pat_id = 'msa_patient_id'
transform.pharm_name = 'pharm_code'
transform.brand_id = 'medication'
transform.status_date = 'status_date'
transform.ref_date = 'ref_date'
transform.transaction_id = 'pharm_transaction_id'
transform.ref_source = 'ref_source'
transform.provider_fn = 'pres_first_name'
transform.provider_ln = 'pres_last_name'
transform.provider_s = 'pres_state'
transform.payer = 'primary_payer_name'
transform.payer_type = 'primary_payer_type'
transform.shipment_status = 'SHIPMENT'
transform.transfered_status = 'TRANSFERRED'
transform.cancelled_status = 'CANCELLED'
transform.ic_status = 'integrichain_status'
transform.ic_sub_status = 'integrichain_sub_status'
transform.pjh = 'Patient_Journey_Hierarchy'
transform.shipdate = 'shipdate'
transform.firstfilldate = 'firstfilldate'
transform.canceldate = 'canceldate'
transform.cancelsubstatus = 'cancelsubstatus'
transform.cancelreason = 'cancelreason'
transform.referralstatus = 'referralstatus'
transform.filled = 'filled'
transform.transferred = 'transferred'
transform.cancelled = 'cancelled'
transform.stillopen = 'stillopen'

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

![Fill Rate Schema](assets/FillRateSchema.png)



### Transformation

In [112]:
"""
************ FETCH DATA - TOUCH, BUT CAREFULLY **************
This cell will be turned off in production, as the input_contract will be handled by the pipeline.
"""

if not input_branch:
    input_branch = BRANCH_NAME
input_contract = DatasetContract(branch=input_branch, state=input_state, parent=input_pharma, child=input_brand, dataset=input_name)
run_filter = []
# run_filter.append(dict(partition="run_id", comparison="==", values=[1]))
# IF YOU HAVE PUBLISHED DATA MULTIPLE TIMES, uncomment the above line and change the int to the run_id to fetch.
# Otherwise, you will have duplicate values in your fetched dataset!
df = input_contract.fetch(filters=run_filter)

import os

import numpy as np
import pandas as pd

pd.options.display.max_columns=999

df.status_date = df.status_date.str[:8].astype(str)
df.ref_date = df.ref_date.str[:8].astype(str)

df.status_date = pd.to_datetime(df.status_date, infer_datetime_format=True, errors='coerce')
df.ref_date = pd.to_datetime(df.ref_date, infer_datetime_format=True, errors='coerce')

os.chdir('{}'.format(os.path.expanduser('~')))
status_config = pd.read_csv('status_mapping.csv')

status_config.loc[:,'statusCode'] = status_config.statusCode.str.upper()
status_config.loc[:,'subStatus'] = status_config.subStatus.str.upper()
status_config.loc[:,'integrichain_sub_status'] = status_config.integrichain_sub_status.str.upper()
status_config.loc[:,'integrichain_status'] = status_config.integrichain_status.str.upper()
status_config.loc[:,'Patient_Journey_Hierarchy'] = status_config.Patient_Journey_Hierarchy.str.upper()

status_config = status_config.rename(columns={'statusCode':'status_code','subStatus':'sub_status'})

df.sub_status = df.sub_status.str.replace('PRESCRIBERHOLD','PRESCRIBER HOLD')

df = pd.merge(df,status_config,on=['status_code','sub_status'])

df = df[['rec_date', 'pharm_code', 'pharm_npi', 'transtype', 'pharm_transaction_id', 'trans_seq', 'ref_source', 'ref_date', 'program_id', 'pharmacy_id', 'pat_last_name', 'pat_first_name', 'pat_dob', 'pat_gender', 
         'pat_addr1', 'pat_addr2', 'pat_city', 'pat_state', 'pat_zip', 'dx1_code', 'dx2_code', 'status_date', 'status_code', 'sub_status', 'integrichain_status','integrichain_sub_status', 'Patient_Journey_Hierarchy', 
         'pres_last_name', 'pres_first_name', 'pres_addr1', 'pres_addr2', 'pres_city', 'pres_state', 'pres_zip', 'pres_phone', 'pres_npi', 'pres_dea', 'facility_name', 'rxdate', 'rxnumber', 'rxrefills', 'rxfill', 
         'refill_remaining', 'prev_disp', 'rx_ndc_number', 'medication', 'quantity', 'day_supply', 'ship_date', 'ship_carrier', 'shiptracking_num', 'ship_location', 'ship_address', 'ship_city', 'ship_state', 'ship_zip', 
         'has_medical', 'primary_coverage_type', 'primary_payer_name', 'primary_payer_type', 'secondary_coverage_type', 'secondary_payer_name', 'secondary_payer_type', 'plan_paid_amt', 'pat_copay', 'copay_assist_amount', 
         'oth_payer_amt', 'xfer_pharmname', 'msa_patient_id', 'msa_patient_bmap', '__metadata_run_timestamp', '__metadata_app_version', '__metadata_output_contract', '__metadata_transform_timestamp', '__metadata_run_id']]

2019-08-08 16:37:10,621 - core.dataset_contract.DatasetContract - INFO - Fetching dataframe from s3 location s3://ichain-dev/sun-extract-validation/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping.


In [123]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe
final_dataframe = transform.fill_rate_schema(df)

In [124]:
final_dataframe

Unnamed: 0,msa_patient_id,pharm_code,medication,pharm_transaction_id,min_status_date,status_date,ref_source,pres_first_name,pres_last_name,pres_state,primary_payer_name,primary_payer_type,status_date.1,ref_date,integrichain_status,integrichain_sub_status,Patient_Journey_Hierarchy,filled,transferred,cancelled,stillopen,firstfilldate,shipdate,canceldate,referralstatus
0,2120001,CVS,ILUMYA SD PFS,183711690,2018-12-31,2018-12-31,HUB,TOMAS,CHAO,GA,,OTHER,2018-12-31,2018-10-19,ACTIVE,SHIPMENT,FULFILLMENT,1,0,0,0,2018-12-31,2018-12-31,NaT,FILLED
1,2120006,CVS,ILUMYA SD PFS,901177673820190405000000,2019-04-05,2019-04-05,DIRECT,RUSSELL,COHEN,NY,,,2019-04-05,2019-03-15,CANCELLED,PATIENT RESPONSE,PATIENT,0,0,1,0,NaT,NaT,2019-04-05,CANCELLED
2,2120009,BRV,ILUMYA 100MG/ML PFS INJ,BRIOVARX_20190306_118503541,2019-01-04,2019-03-05,HUB,ROBERT,GREENBERG,CA,UHC E AND I,COMMERCIAL,2019-03-05,2019-01-03,PENDING,BENEFITS,BV/PA,0,0,0,1,NaT,NaT,NaT,OPEN
3,2120012,BRV,ILUMYA 100MG/ML PFS INJ,BRIOVARX_20190517_112180852,2018-12-18,2019-05-16,DIRECT,PAUL,MALLARI,MA,CAREMARK,MEDICARE D,2019-05-16,2018-11-12,CANCELLED,OTHER,PROVIDER,0,0,1,0,NaT,NaT,2018-12-18,CANCELLED
4,2120024,BRV,ILUMYA 100MG/ML PFS INJ,BRIOVARX_20181128_113184881,2018-11-27,2018-11-27,DIRECT,GENA,GORE,IN,UHC E AND I,COMMERCIAL,2018-11-27,2018-11-20,PENDING,OTHER,FULFILLMENT,0,0,0,1,NaT,NaT,NaT,OPEN
5,2120025,ACCREDO,ILUMYA,280363282019062013,2019-06-20,2019-06-20,HUB,TIMOTHY,ANDERS,OH,ANTHEM PRESC MGMT - EXPRESS SCRIPTS,COMMERCIAL,2019-06-20,2019-06-19,CANCELLED,OTHER,PROVIDER,0,0,1,0,NaT,NaT,2019-06-20,CANCELLED
6,2120026,CGN,ILUMYA INJ 100MG/ML,14115382,2019-04-12,2019-04-12,PHARM,CLIVE,LIU,WA,CIGNA,,2019-04-12,2018-12-03,DISCONTINUED,INSURANCE DENIED,PAYER,0,0,0,1,NaT,NaT,NaT,OPEN
7,2120026,WAG,ILUMYA 100MG/ML PFS 1ML,806743554918,2019-04-12,2019-04-12,DIRECT,CLIVE,LIU,WA,,OTHER,2019-04-12,2019-04-11,PENDING,PRESCRIBER,FULFILLMENT,0,0,0,1,NaT,NaT,NaT,OPEN
8,2120046,CVS,ILUMYA SD PFS,901079636720190502000000,2019-05-02,2019-05-02,DIRECT,LAUREN,BOUDREAUX,WA,,,2019-05-02,2019-01-10,PENDING,OTHER,FULFILLMENT,0,0,0,1,NaT,NaT,NaT,OPEN
9,2120052,WAG,ILUMYA 100MG/ML PFS 1ML,616712029452,2018-11-08,2018-12-03,DIRECT,ROBERT,POSNICK,NH,FEP,GOVERNMENT,2018-12-03,2018-10-29,ACTIVE,SHIPMENT,FULFILLMENT,1,0,0,0,2018-11-08,2018-11-08,NaT,FILLED


In [13]:
import unittest

def no_multiple_types(df):
    return df[df.filled + df.transferred + df.cancelled + df.stillopen != 1].shape[0] == 0

def duplicates_exist(df):
    return df[col_ids].drop_duplicates().shape[0] == df.shape[0]

class TestNotebook(unittest.TestCase):
    
    def test_no_multiple_types(self):
        self.assertEqual(no_multiple_types(final_dataframe),True)
        
    def test_duplicates_exist(self):
        self.assertEqual(duplicates_exist(final_dataframe),True)
        
unittest.main(argv=[''], verbosity=2, exit=False)

test_duplicates_exist (__main__.TestNotebook) ... ERROR
test_no_multiple_types (__main__.TestNotebook) ... ok

ERROR: test_duplicates_exist (__main__.TestNotebook)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-13-de976a93bf77>", line 15, in test_duplicates_exist
    self.assertEqual(duplicates_exist(final_dataframe),True)
  File "<ipython-input-13-de976a93bf77>", line 7, in duplicates_exist
    return df[col_ids].drop_duplicates().shape[0] == df.shape[0]
NameError: name 'col_ids' is not defined

----------------------------------------------------------------------
Ran 2 tests in 0.016s

FAILED (errors=1)


<unittest.main.TestProgram at 0x7f2b180694a8>

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
transform.publish_contract.publish(final_dataframe, run_id, session)
session.close()