In [2]:
transform_id = 1

"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

db_transform = SessionHelper().session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


2019-07-19 13:49:02,887 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-19 13:49:02,914 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-19 13:49:02,951 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-19 13:49:02,953 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-19 13:49:02,955 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-19 13:49:02,957 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-19 13:49:02,960 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-07-19 13:49:02,961 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-07-19 13:49:0

# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinate number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [5]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    YOUR properties go here!!
    Include your input dataset(s) here. Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.
    '''
    #ingest_source_transform: str = db_transform.variables.ingest_source_transform # The name of the dataset to pull from
    #ingest_source_file_prefix: str = db_transform.variables.ingest_source_file_prefix # If from initial ingest, the file prefix name
    #filesystem_path: str = db_transform.variables.filesystem_path ## incoming file path
    #input_transform: str = db_transform.variables.input_transform # The name of the transform to input source data from
    #secret_name: str = db_transform.variables.secret_name # secret name
    #secret_type_of: str = db_transform.variables.secret_type_of # secret type

In [6]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull form the configuration application instead

transform = Transform()

#transform.name = 'DS_289_s3'
#transform.brand = 'ofev'
#transform.state = 'enrich'
#transform.pharmaceutical_company = 'bi'
##transform.filesystem_path = 's3://ichain-dev/preprocess/bi/ofev/ingest/01_load_raw_and_map_headers'
#transform.filesystem_path = 's3://ichain-dev/schafrn/seed-data/bi-all-529-data/01_load_raw_and_map_headers'
##transform.filesystem_path = 's3://ichain-dev/sun-extract-prod-vars/sun/ilumya/ingest/symphony_health_association_ingest_column_mapping/__metadata_run_id=4'
#
#
#transform.secret_name = 'bi'
#transform.secret_type_of = 'database'

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

Cancelled Before Active enrichment.
Assigns hierarchy values in cases where cancelled status is reported before first active shipment.  This is used as part of the TTFF enrichment.  See logic diagram below:

<img src='assets/cancel_before_active.png'>

### Transformation

In [5]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

In [7]:
import re

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from s3fs import S3FileSystem
import mysql.connector as mysql

from core.secret import Secret

In [8]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

In [18]:
# Place your import contracts here

ingest_contract = DatasetContract(
    branch='sun-extract-validation',
    parent='sun',
    child='ilumya',
    state='ingest',a
    dataset='symphony_health_association_ingest_column_mapping'
)

In [19]:
ingest_contract.bucket

'ichain-dev'

In [22]:
from s3parq import fetch

run_id = 4

run_filter = [{"partition": "__metadata_run_id", "comparison": "==", "values": [run_id]}]
df = fetch(bucket=ingest_contract.bucket, key=ingest_contract.key, filters=run_filter)

In [24]:
df.head()

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,pat_last_name,pat_first_name,pat_dob,pat_gender,pat_addr1,pat_addr2,pat_city,pat_state,pat_zip,dx1_code,dx2_code,status_date,status_code,sub_status,pres_last_name,pres_first_name,pres_addr1,pres_addr2,pres_city,pres_state,pres_zip,pres_phone,pres_npi,pres_dea,facility_name,rxdate,rxnumber,rxrefills,rxfill,refill_remaining,prev_disp,rx_ndc_number,medication,quantity,day_supply,ship_date,ship_carrier,shiptracking_num,ship_location,ship_address,ship_city,ship_state,ship_zip,has_medical,primary_coverage_type,primary_payer_name,primary_payer_type,secondary_coverage_type,secondary_payer_name,secondary_payer_type,plan_paid_amt,pat_copay,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_run_timestamp,__metadata_app_version,__metadata_output_contract,__metadata_transform_timestamp,__metadata_run_id
0,20190103 23:00:00,CVS,1043382302,COM,901040097220190103000000,0,DIRECT,20181206 23:00:00,,9010400972,,,,F,,,,,46,,,20190103 23:00:00,CANCELLED,INSURANCE DENIED,MARTIN,DAVID,2101 JACKSON ST,STE 201,ANDERSON,IN,46016,7656490161,1952309510,AM6082969,,,,,,,,,,,,,,,,,,,,,MEDICAL,,,,,,,,,,,,VVNVV,2019-06-26 15:28:20,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-06-26 15:33:32,4
1,20190103 23:00:00,CVS,1013998921,COM,901040428420190103000000,0,HUB,20181206 23:00:00,1420657.0,9010404284,,,,M,,,,,90,,,20190103 23:00:00,CANCELLED,PRESCRIBER END,YAMAUCHI,PAUL,2001 SANTA MONICA,BLVD STE 1160W,SANTA MONICA,CA,90404,3108294104,1720074537,BY5527366,,,,,,,,,,,,,,,,,,,,,MEDICAL,,,,,,,,,,,,VVNVV,2019-06-26 15:28:20,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-06-26 15:33:32,4
2,20190124 23:00:00,CVS,1902887805,COM,901040735320190124000000,0,HUB,20181206 23:00:00,1329832.0,9010407353,,,,M,,,,,77,,,20190124 23:00:00,PENDING,PA,PEREZ,EDWARD,21800 KATY FWY,SUITE 200,KATY,TX,77449,7137711100,1881755650,BP5749417,,,,,,,,,,,,,,,,,,,,,MEDICAL,,,,,,,,,,,,VVNVV,2019-06-26 15:28:20,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-06-26 15:33:32,4
3,20190128 23:00:00,CVS,1902887805,COM,901044975420190128000000,0,DIRECT,20181211 23:00:00,,9010449754,,,,F,,,,,42,,,20190128 23:00:00,PENDING,PRESCRIBER,CASE,JEFFREY,110 MED CTR DR,,PADUCAH,KY,42003,2704432471,1710916176,AC1380524,,,,,,,,,,,28.0,,,,,,,,,,MEDICAL,,,,,,,,,,,,VVNVV,2019-06-26 15:28:20,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-06-26 15:33:32,4
4,20190112 23:00:00,CVS,1518948413,COM,901045114420190112000000,0,DIRECT,20181211 23:00:00,,9010451144,,,,F,,,,,10,,,20190112 23:00:00,CANCELLED,PATIENT RESPONSE,CHEN,CHRISTINE,220 E 161ST STREET,,BRONX,NY,10451,7182929197,1649518127,MC2928779,,,,,,,,,,,28.0,,,,,,,,,,MEDICAL,,,,,,,,,,,,VVNVV,2019-06-26 15:28:20,0.0.11,s3://ichain-dev/sun-extract-validation/sun/ilu...,2019-06-26 15:33:32,4


In [25]:
ingest_contract.parent

'sun'

In [27]:
df.shape

(13206, 72)

## CUSTOMER PARAMETERS

In [28]:
if ingest_contract.parent == 'sun':
    trans_id = 'pharm_transaction_id'
    brand = 'medication'
    patient_id = 'msa_patient_id'
    pharmacy = 'pharm_code'
    status_date = 'status_date'
    referral_date = 'ref_date'
    status =  'status_code'
    substatus =  'sub_status'
    datetime = '%Y%m%d'#%H%M%S'

elif ingest_contract.parent == 'bi':
    trans_id = 'pmcTransactionId'
    brand = 'medication'
    patient_id = 'pmcPatientID'
    pharmacy = 'pharmName'
    status_date = 'statusDate'
    referral_date = 'refDate'
    status =  'statusCode'
    substatus =  'custStatusCode'
    datetime = '%Y-%b-%d'

In [29]:
def clean_data(cust_input_df, datetime, trans_id, brand, patient_id, pharmacy, status_date, referral_date, status, substatus):
    
    # Extract and map relevant columns
    cust_df = cust_input_df.loc[:,[trans_id,
                                   patient_id,
                                   pharmacy,
                                   brand,
                                   status_date,
                                   referral_date,
                                   status,
                                   substatus]]

    cust_df = cust_df.rename(columns={trans_id:'trans_id',
                                      patient_id:'patient_id',
                                      pharmacy:'pharmacy',
                                      brand:'brand',
                                      status_date:'status_date',
                                      referral_date:'referral_date',
                                      status:'status_code',
                                      substatus:'substatus_code'})
    
    ## Convert dates to datetime format
    cust_df.status_date = cust_df.status_date.str[:8].astype(str)
    cust_df.referral_date = cust_df.referral_date.str[:8].astype(str)
#    cust_df.status_date = cust_df.status_date.str.replace(' ', '', regex=False)
#    cust_df.status_date = cust_df.status_date.str.replace(':', '', regex=False)
#    cust_df.referral_date = cust_df.referral_date.str.replace(' ', '', regex=False)
#    cust_df.referral_date = cust_df.referral_date.str.replace(':', '', regex=False)
    
    cust_df.status_date = pd.to_datetime(cust_df.status_date, format=datetime, errors='coerce')
    cust_df.referral_date = pd.to_datetime(cust_df.referral_date, format=datetime, errors='coerce')
    
    ## Extract brand from medication
    cust_df = cust_df.dropna(subset=['brand'])
    cust_df['brand'] = cust_df['brand'].apply(lambda x: x.split()[0].strip())
    
    cust_df.status_code = cust_df.status_code.str.upper()
    cust_df.substatus_code = cust_df.substatus_code.str.upper()
    
    min_status_dates=cust_df.groupby(['patient_id','pharmacy','brand'])['status_date'].min().reset_index().rename(columns={'status_date':'min_status_date'})
    
    cust_df = pd.merge(cust_df, min_status_dates, how='inner', on=['patient_id','pharmacy','brand'])
    
    cust_df['referral_date'] = cust_df['referral_date'].fillna(cust_df['min_status_date'])
    
    cust_df = cust_df.drop(['min_status_date'],axis=1).drop_duplicates()
    
    cust_df.sort_values(by=['patient_id', 'pharmacy', 'brand', 'status_date','status_code'], ascending=[True, True, True, True, False], inplace=True)
    cust_df = cust_df.reset_index(drop=True)

    # Output clean customer file
    return cust_df

In [30]:
clean_df = clean_data(
    df,
    datetime,
    trans_id,
    brand,
    patient_id,
    pharmacy,
    status_date,
    referral_date,
    status,
    substatus
)

clean_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code
0,182176830,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT
1,183711690,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT
2,901177673820190325000000,2120006,CVS,ILUMYA,2019-03-25,2019-03-15,PENDING,PATIENT CONTACT
3,901177673820190405000000,2120006,CVS,ILUMYA,2019-04-05,2019-03-15,CANCELLED,PATIENT RESPONSE
4,BRIOVARX_20190104_118503541,2120009,BRV,ILUMYA,2019-01-03,2019-01-03,PENDING,OTHER


In [31]:
clean_df.shape

(10095, 8)

In [33]:
sun_df = clean_df.assign()

# Patient Journey ID
sun_df['pj_id'] = sun_df.groupby(['patient_id','pharmacy','brand']).grouper.group_info[0]

# Patient Journey Step
sun_df['pj_step'] = sun_df.index

sun_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step
0,182176830,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT,0,0
1,183711690,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT,0,1
2,901177673820190325000000,2120006,CVS,ILUMYA,2019-03-25,2019-03-15,PENDING,PATIENT CONTACT,1,2
3,901177673820190405000000,2120006,CVS,ILUMYA,2019-04-05,2019-03-15,CANCELLED,PATIENT RESPONSE,1,3
4,BRIOVARX_20190104_118503541,2120009,BRV,ILUMYA,2019-01-03,2019-01-03,PENDING,OTHER,2,4


In [34]:
## IDENTIFY FIRST ACTIVE SHIPMENTS

actives = (
    sun_df
    .query('substatus_code == "SHIPMENT"')[['pj_id','pj_step','status_date']]
    .drop_duplicates().rename(columns={'pj_step':'active_step','status_date':'active_status_date'})
)

actives = (
    actives
    .groupby(['pj_id'])['active_step','active_status_date']
    .first()
    .reset_index()
)

actives.head()

Unnamed: 0,pj_id,active_step,active_status_date
0,0,0,2018-12-31
1,6,37,2019-01-23
2,8,42,2019-02-04
3,9,45,2018-11-08
4,10,47,2019-02-12


In [36]:
cancel_before_active_df = (
    pd.merge(sun_df,
             actives,
             how='left',
             on=['pj_id']
            )
    .dropna()
)

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date
0,182176830,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT,0,0,0.0,2018-12-31
1,183711690,2120001,CVS,ILUMYA,2018-12-31,2018-10-19,ACTIVE,SHIPMENT,0,1,0.0,2018-12-31
27,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23
28,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23
29,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23


In [37]:
cancel_before_active_df['active_cancel_diff'] = np.where(cancel_before_active_df['status_code']=='CANCELLED',
                                                         (cancel_before_active_df['active_status_date'] - cancel_before_active_df['status_date'])/np.timedelta64(1,'D'),
                                                         np.nan
                                                        )

cancel_before_active_df = cancel_before_active_df[cancel_before_active_df.pj_id
                                                  .isin(cancel_before_active_df
                                                        .query('active_cancel_diff > 0')
                                                        .pj_id
                                                        .drop_duplicates()
                                                        .tolist()
                                                       )
                                                 ]


cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff
27,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0
28,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,
29,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,
30,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,
31,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,


In [38]:
cancel_before_active_df['prior_status'] = np.where((cancel_before_active_df['status_code']=='CANCELLED'),
                                                   np.where((cancel_before_active_df['pj_id'] == cancel_before_active_df['pj_id'].shift(1)),
                                                            cancel_before_active_df['status_code'].shift(1),
                                                            'no_prior_status'),
                                                   np.NaN
                                                  )

cancel_before_active_df['prior_status_diff'] = np.where((cancel_before_active_df['status_code']=='CANCELLED'),
                                                        np.where((cancel_before_active_df['pj_id'] == cancel_before_active_df['pj_id'].shift(1)),
                                                                 (cancel_before_active_df['status_date'] - cancel_before_active_df['status_date'].shift(1))/np.timedelta64(1,'D'),
                                                                 pd.NaT),
                                                        pd.NaT
                                                       )

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff,prior_status,prior_status_diff
27,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0,no_prior_status,NaT
28,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,,,NaT
29,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,,,NaT
30,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,,,NaT
31,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,,,NaT


In [39]:
cancel_before_active_df.query('status_code == "CANCELLED"').substatus_code.drop_duplicates()

27                  OTHER
269      PATIENT RESPONSE
926           PATIENT END
1162          TRANSFER SP
1211       PRESCRIBER END
1437     INSURANCE DENIED
2581        INSURANCE OON
4324    PATIENT FINANCIAL
Name: substatus_code, dtype: object

In [40]:
cancel_before_active_df['hierarchy'] = np.where(cancel_before_active_df['status_code'] != 'CANCELLED',
                                                None,
                                                np.where(cancel_before_active_df['active_cancel_diff'] > 60,
                                                         'REMOVE FROM TTFF',
                                                         np.where((cancel_before_active_df['prior_status_diff'] > 60) |
                                                                  (cancel_before_active_df['prior_status'] == 'no_prior_status'),
                                                                  'NO STATUS CLARITY',
                                                                  np.where(cancel_before_active_df['substatus_code'] == 'INSURANCE DENIED',
                                                                           'BVPA',
                                                                           'ADOPT PRIOR STATUS HIERARCHY'
                                                                          )
                                                                 )
                                                        )
                                               )

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff,prior_status,prior_status_diff,hierarchy
27,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0,no_prior_status,NaT,NO STATUS CLARITY
28,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,,,NaT,
29,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,,,NaT,
30,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,,,NaT,
31,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,,,NaT,


In [41]:
cancel_before_active_df = cancel_before_active_df.reset_index(drop=True)

cancel_before_active_df['prior_status_diff'] = cancel_before_active_df.groupby(['pj_id'], sort=False).apply(lambda x: x.prior_status_diff.bfill()).reset_index(drop=True)

cancel_before_active_df['hierarchy'] = np.where((cancel_before_active_df['prior_status_diff'] > 60) &
                                                (cancel_before_active_df['status_code'] != 'CANCELLED'),
                                                'REMOVE FROM TTFF',
                                                cancel_before_active_df['hierarchy']
                                               )

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff,prior_status,prior_status_diff,hierarchy
0,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0,no_prior_status,NaT,NO STATUS CLARITY
1,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,,,NaT,
2,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,,,NaT,
3,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,,,NaT,
4,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,,,NaT,


In [42]:
cancel_before_active_df['hierarchy'] = cancel_before_active_df.groupby(['pj_id'], sort=False).apply(lambda x: x.hierarchy.bfill()).reset_index(drop=True)

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff,prior_status,prior_status_diff,hierarchy
0,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0,no_prior_status,NaT,NO STATUS CLARITY
1,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,,,NaT,
2,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,,,NaT,
3,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,,,NaT,
4,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,,,NaT,


In [43]:
cancel_before_active_df['hierarchy'] = np.where((cancel_before_active_df['hierarchy'] != 'REMOVE FROM TTFF') &
                                                (cancel_before_active_df['status_code'] != 'CANCELLED'),
                                                cancel_before_active_df['status_code'] + ' - '+ cancel_before_active_df['substatus_code'],
                                                cancel_before_active_df['hierarchy']
                                               )


In [44]:
cancel_before_active_df['hierarchy'] = np.where(cancel_before_active_df['pj_step'] >= cancel_before_active_df['active_step'],
                                                'ACTIVE - SHIPMENT',
                                                cancel_before_active_df['hierarchy']
                                               )

In [45]:
cancel_before_active_df['hierarchy'] = np.where(cancel_before_active_df['hierarchy'] == 'ADOPT PRIOR STATUS HIERARCHY',
                                                None,
                                                cancel_before_active_df['hierarchy']
                                               )

cancel_before_active_df['hierarchy'] = cancel_before_active_df.groupby(['pj_id'], sort=False).apply(lambda x: x.hierarchy.ffill()).reset_index(drop=True)

cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,active_step,active_status_date,active_cancel_diff,prior_status,prior_status_diff,hierarchy
0,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,37.0,2019-01-23,42.0,no_prior_status,NaT,NO STATUS CLARITY
1,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,37.0,2019-01-23,,,NaT,PENDING - PATIENT CONTACT
2,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,37.0,2019-01-23,,,NaT,PENDING - BENEFITS
3,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,37.0,2019-01-23,,,NaT,PENDING - BENEFITS
4,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,37.0,2019-01-23,,,NaT,PENDING - BENEFITS


In [46]:
cancel_before_active_df = cancel_before_active_df.drop(['active_step','active_status_date','active_cancel_diff','prior_status','prior_status_diff'], axis = 1)
cancel_before_active_df.head()

Unnamed: 0,trans_id,patient_id,pharmacy,brand,status_date,referral_date,status_code,substatus_code,pj_id,pj_step,hierarchy
0,10819246,2120026,CGN,ILUMYA,2018-12-12,2018-12-03,CANCELLED,OTHER,6,27,NO STATUS CLARITY
1,11240428,2120026,CGN,ILUMYA,2018-12-31,2018-12-03,PENDING,PATIENT CONTACT,6,28,PENDING - PATIENT CONTACT
2,11443176,2120026,CGN,ILUMYA,2019-01-08,2018-12-03,PENDING,BENEFITS,6,29,PENDING - BENEFITS
3,11551269,2120026,CGN,ILUMYA,2019-01-11,2018-12-03,PENDING,BENEFITS,6,30,PENDING - BENEFITS
4,11585205,2120026,CGN,ILUMYA,2019-01-14,2018-12-03,PENDING,BENEFITS,6,31,PENDING - BENEFITS


In [47]:
final_dataframe = cancel_before_active_df.assign()

In [48]:
#final_dataframe.to_csv('ds_289_test_output.csv',index_label='index')

### Publish

In [51]:
ds_contract = DatasetContract(
    branch=transform.branch,
    parent=transform.pharmaceutical_company,
    child=transform.brand,
    state=transform.state,
    dataset='DS-289'
)

ds_contract.s3_path

's3://ichain-dev/ds_288/sitwell/cornballer/raw/DS-289'

In [131]:
prev_run_id = ds_contract.fetch().__metadata_run_id.max()
prev_run_id

2019-07-03 15:58:59,091 - core.dataset_contract.DatasetContract - INFO - Fetching dataframe from s3 location s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/metrics/bi_sequencing_test.


3

In [132]:
run_id = int(prev_run_id + 1)
run_id

4

In [133]:
transform.publish_contract.key

'add_bi_sequencing_nb/bi/ofev/metrics/bi_sequencing_test'

In [134]:
transform.publish_contract.brand = 'ofev'
transform.publish_contract.dataset = 'bi_sequencing_test'
transform.publish_contract.state = 'metrics'
transform.publish_contract.customer = 'bi'

In [135]:
transform.publish_contract.key

'add_bi_sequencing_nb/bi/ofev/metrics/bi_sequencing_test'

In [136]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
transform.publish_contract.publish(final_dataframe, run_id)

2019-07-03 15:59:14,648 - core.dataset_contract.DatasetContract - INFO - Publishing dataframe to s3 location s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/metrics/bi_sequencing_test.
2019-07-03 15:59:14,651 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-03 15:59:14,673 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-03 15:59:14,677 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-03 15:59:14,678 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-03 15:59:14,680 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-03 15:59:14,681 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-03 15:59:14,683 - core.helpers.configuration_mocker.Configurati

KeyError: 'No RunEvent found with id=4.'

In [None]:
df_fetch = ds_contract.fetch(filters=[{'partition':'__metadata_run_id','comparison':'==','values':[run_id]}])

In [84]:
df_fetch.head()

Unnamed: 0,trans_id,patient_id,brand,pharmacy,referral_date,status_date,status_code,substatus_code,days_in_current,prev_substatus_date,prev_status_code,prev_substatus_code,days_in_previous,next_substatus_date,next_status_code,next_substatus_code,days_in_next,__metadata_run_timestamp,__metadata_app_version,__metadata_transform_timestamp,__metadata_output_contract,__metadata_run_id
0,1230730,2221,OFEV,ACRO Pharmaceutical Services,2014-12-02,2014-12-08,Denied,DN10,554.0,NaT,FIRST,FIRST,,2016-06-14,Shipment,S01,748.0,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
1,13427567,2221,OFEV,ACRO Pharmaceutical Services,2016-06-06,2016-06-14,Shipment,S01,748.0,2014-12-08,Denied,DN10,554.0,2018-07-02,Discontinued,DC07,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
2,31537145,2221,OFEV,ACRO Pharmaceutical Services,2018-03-15,2018-07-02,Discontinued,DC07,,2016-06-14,Shipment,S01,748.0,NaT,LAST,LAST,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
3,1226420,2221,OFEV,ACS Pharmacy,2014-12-11,2014-12-11,Pending,P03,5.0,NaT,FIRST,FIRST,,2014-12-16,Shipment,S01,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
4,1251068,2221,OFEV,ACS Pharmacy,2014-12-11,2014-12-16,Shipment,S01,,2014-12-11,Pending,P03,5.0,NaT,LAST,LAST,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3


In [150]:
df_fetch.__metadata_run_id.drop_duplicates()

0    3
Name: __metadata_run_id, dtype: int64

In [151]:
df_fetch.head()

Unnamed: 0,trans_id,patient_id,brand,pharmacy,referral_date,status_date,status_code,substatus_code,days_in_current,prev_substatus_date,prev_status_code,prev_substatus_code,days_in_previous,next_substatus_date,next_status_code,next_substatus_code,days_in_next,__metadata_run_timestamp,__metadata_app_version,__metadata_transform_timestamp,__metadata_output_contract,__metadata_run_id
0,1230730,2221,OFEV,ACRO Pharmaceutical Services,2014-12-02,2014-12-08,Denied,DN10,554.0,NaT,FIRST,FIRST,,2016-06-14,Shipment,S01,748.0,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
1,13427567,2221,OFEV,ACRO Pharmaceutical Services,2016-06-06,2016-06-14,Shipment,S01,748.0,2014-12-08,Denied,DN10,554.0,2018-07-02,Discontinued,DC07,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
2,31537145,2221,OFEV,ACRO Pharmaceutical Services,2018-03-15,2018-07-02,Discontinued,DC07,,2016-06-14,Shipment,S01,748.0,NaT,LAST,LAST,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
3,1226420,2221,OFEV,ACS Pharmacy,2014-12-11,2014-12-11,Pending,P03,5.0,NaT,FIRST,FIRST,,2014-12-16,Shipment,S01,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
4,1251068,2221,OFEV,ACS Pharmacy,2014-12-11,2014-12-16,Shipment,S01,,2014-12-11,Pending,P03,5.0,NaT,LAST,LAST,,2019-07-03 15:11:44,0.0.11,2019-07-03 15:11:44,s3://ichain-dev/add_bi_sequencing_nb/bi/ofev/m...,3
