In [None]:
"""************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET, BATCH_JOB_QUEUE
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract
import logging
# Set the root logger (getLogger() with no name) to DEBUG and keep a handle to it.
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()

# Open a session through the project helper and load the one Transformation row
# whose id equals `transform_id`.
# NOTE(review): `transform_id` is not assigned in any visible cell; presumably it is
# injected by whatever executes the notebook — confirm.
# NOTE(review): if `session` is a SQLAlchemy session, `.one()` raises unless exactly
# one row matches — confirm.
session = SessionHelper().session
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Snapshot of the configuration record held in `db_transform`.

    Every field default is an expression over `db_transform` (or `BRANCH_NAME`)
    that is evaluated once, when this class body runs, so instances created with
    no arguments all carry the values that were loaded at that moment.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # The contract is built from the same branch/state/company/brand/name values as the
    # fields above. NOTE(review): this default is a single DatasetContract object created
    # at class-definition time, so every instance that does not override it shares it.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)
    

***
# CORE Cartridge Notebook::[enrich_patient_journey_hierarchy]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

# TRANSFORM

In [None]:
""" 
CONFIGURATION ********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>
e.g.
class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""
"""
imports
"""
import pandas as pd
from core.logging import get_logger
 
class Transform(DbTransform):
    '''
    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.

    The transform looks up each (status, substatus) pair of the incoming frame
    in a per-customer mapping table and attaches the matching
    patient_journey_hierarchy bucket. Rows without a mapping are returned in a
    separate "fail" frame.
    '''
    col_status: str = db_transform.variables.col_status # Column containing the status
    col_substatus: str = db_transform.variables.col_substatus # Column containing the substatus
    customer_name: str = db_transform.variables.customer_name # Name of pharmaceutical company

    @staticmethod
    def enrich_status_substatus(customer_name=None):
        """Return the status/substatus -> patient journey mapping as a DataFrame.

        Args:
            customer_name: customer whose mapping is wanted. When omitted, the
                value is read from the module-level ``transform`` instance,
                which is what this function always did before the parameter
                existed.

        Returns:
            DataFrame with columns ``status``, ``substatus`` and
            ``patient_journey_hierarchy``.

        Raises:
            Exception: for an unknown customer, or for 'bi' / 'alkermes', whose
                mappings are not defined yet. The original error is chained as
                the cause.
        """
        if customer_name is None:
            customer_name = transform.customer_name
        try:
            if customer_name == 'sun':
                return pd.DataFrame.from_dict(Transform.enrich_patient_journey_hierarchy_sun())
            if customer_name in ('bi', 'alkermes'):
                # tbd: these customers used to fall through to an unbound local
                # variable; fail with an explicit message instead.
                raise NotImplementedError(
                    f'patient journey hierarchy mapping is not defined yet for customer {customer_name}')
            raise Exception('expecting customer name as sun bi or alkermes')
        except Exception as e:
            logger.exception(f'exception:{e}')
            raise Exception(f'raise exception:{e}') from e

    @staticmethod
    def patient_journey_hierarchy_sun():
        """Return the numbered patient journey buckets for customer 'sun'.

        Kept for consistency with the transforms built in a prior sprint and as
        documentation until a future user story defines the IC-GOLD persistence
        solution; nothing in this class calls it.
        """
        buckets = ('BV/PA', 'FINANCIAL', 'FULFILLMENT', 'INTAKE',
                   'PATIENT', 'PAYER', 'PROVIDER', 'TRANSFERRED')
        return dict(enumerate(buckets, start=1))

    @staticmethod
    def enrich_patient_journey_hierarchy_sun():
        """Return the 'sun' mapping as a dict of three equally long lists.

        Keys are ``status``, ``substatus`` and ``patient_journey_hierarchy``;
        position i of each list belongs to the same mapping row.
        """
        # One tuple per mapping row so each pairing can be audited at a glance.
        # Several substatus keys carry irregular whitespace (trailing or doubled
        # spaces, and one embedded newline that was a raw line break inside the
        # literal). They are reproduced exactly as found; presumably they mirror
        # raw values after '_' has been replaced by ' ' — TODO confirm.
        # NOTE(review): 'PATENT RESPONSE' may be a typo for 'PATIENT RESPONSE' — confirm.
        rows = (
            # status, substatus, patient_journey_hierarchy
            ('ACTIVE', 'HOLD OTHER ', 'FULFILLMENT'),
            ('ACTIVE', 'HOLD RTS', 'FULFILLMENT'),
            ('ACTIVE', 'INSURANCE HOLD', 'PAYER'),
            ('ACTIVE', 'MATERIAL', 'FULFILLMENT'),
            ('ACTIVE', 'PATENT RESPONSE', 'PATIENT'),
            ('ACTIVE', 'PRESCRIBER', 'PROVIDER'),
            ('ACTIVE', 'PT HOLD', 'PATIENT'),
            ('ACTIVE', 'READY', 'FULFILLMENT'),
            ('ACTIVE', 'SHIPMENT', 'FULFILLMENT'),
            ('ACTIVE', 'TREATMENT DELAY', 'FULFILLMENT'),
            ('CANCELLED', 'ALT THERAPY', 'PROVIDER'),
            ('CANCELLED', 'INSURANCE COPAY', 'PAYER'),
            ('CANCELLED', 'INSURANCE DENIED', 'PAYER'),
            ('CANCELLED', 'INSURANCE OON', 'PAYER'),
            ('CANCELLED', 'INSURANCE OTHER', 'PAYER'),
            ('CANCELLED', 'OTHER', 'PROVIDER'),
            ('CANCELLED', 'PATIENT DECEASED', 'PATIENT'),
            ('CANCELLED', 'PATIENT END', 'PATIENT'),
            ('CANCELLED', 'PATIENT FINANCIAL', 'PATIENT'),
            ('CANCELLED', 'PATIENT RESPONSE', 'PATIENT'),
            ('CANCELLED', 'PRESCRIBER END', 'PROVIDER'),
            ('CANCELLED', 'TRANSFER HUB', 'TRANSFERRED'),
            ('CANCELLED', 'TRANSFER SP', 'TRANSFERRED'),
            ('DENIED', 'DOSAGE', 'PAYER'),
            ('DENIED', 'FORMULARY', 'PAYER'),
            ('DENIED', 'OTHER', 'PAYER'),
            ('DENIED', 'PA', 'PAYER'),
            ('DENIED', 'QUANTITY', 'PAYER'),
            ('DENIED', 'STEP EDIT', 'PAYER'),
            ('DISCONTINUED', 'ALT THERAPY', 'PROVIDER'),
            ('DISCONTINUED', 'INS OON ', 'PAYER'),
            ('DISCONTINUED', 'INS OTHER', 'PAYER'),
            ('DISCONTINUED', 'INSURANCE COPAY', 'PAYER'),
            ('DISCONTINUED', 'INSURANCE DENIED', 'PAYER'),
            ('DISCONTINUED', 'OTHER', 'PROVIDER'),
            ('DISCONTINUED', 'PATIENT DECEASED', 'PATIENT'),
            ('DISCONTINUED', 'PATIENT END', 'PATIENT'),
            ('DISCONTINUED', 'PATIENT FINANCIAL', 'PATIENT'),
            ('DISCONTINUED', 'PATIENT RESPONSE', 'PATIENT'),
            ('DISCONTINUED', 'PRESCRIBER END', 'PROVIDER'),
            ('DISCONTINUED', 'SERVICES END', 'PROVIDER'),
            ('DISCONTINUED', 'THERAPY COMPLETE', 'PATIENT'),
            ('DISCONTINUED', 'THERAPY END', 'PROVIDER'),
            ('DISCONTINUED', 'TRANSFER HUB', 'TRANSFERRED'),
            ('DISCONTINUED', 'TRANSFER SP', 'TRANSFERRED'),
            ('PENDING', 'APPEAL', 'BV/PA'),
            ('PENDING', 'BENEFITS', 'BV/PA'),
            ('PENDING', 'COPAY ASSISTANCE', 'FINANCIAL'),
            ('PENDING', 'DELAY', 'FULFILLMENT'),
            ('PENDING', 'FOUNDATION', 'FINANCIAL'),
            ('PENDING', 'INFORMATION', 'INTAKE'),
            ('PENDING', 'INVENTORY HOLD', 'FULFILLMENT'),
            ('PENDING', 'NEW', 'INTAKE'),
            ('PENDING', 'OTHER', 'FULFILLMENT'),
            ('PENDING', 'PA', 'BV/PA'),
            ('PENDING', 'PATIENT  RESPONSE', 'FULFILLMENT'),
            ('PENDING', 'PATIENT CONTACT', 'FULFILLMENT'),
            ('PENDING', 'PATIENT FINANCIAL', 'FINANCIAL'),
            ('PENDING', 'PATIENT HOLD', 'FULFILLMENT'),
            ('PENDING', 'PRESCRIBER', 'FULFILLMENT'),
            ('PENDING', 'PRESCRIBER  HOLD', 'FULFILLMENT'),
            ('PENDING', 'THERAPY \nHOLD', 'FULFILLMENT'),
        )
        statuses, substatuses, hierarchies = (list(column) for column in zip(*rows))
        return {'status': statuses,
                'substatus': substatuses,
                'patient_journey_hierarchy': hierarchies}

    @staticmethod
    def _clean_status(value):
        """Upper-case then strip a status cell; non-string cells pass through unchanged."""
        return value.upper().strip() if isinstance(value, str) else value

    @staticmethod
    def _clean_substatus(value):
        """Upper-case, strip and tidy a substatus cell; non-string cells pass through unchanged."""
        if not isinstance(value, str):
            return value
        cleaned = value.upper().strip()
        # Underscores become spaces; carriage returns and tabs are removed.
        # The previous code also chained .replace('\w', ''): after upper() no
        # lower-case 'w' can remain, so it never matched and has been dropped.
        return cleaned.replace('_', ' ').replace('\r', '').replace('\t', '')

    def enrich_patient_journey_hierarchy(self, df):
        """Attach the patient journey bucket to every row of ``df``.

        The status and substatus columns (named by ``self.col_status`` and
        ``self.col_substatus``) are normalised, then left-joined against the
        mapping for ``self.customer_name``.

        Args:
            df: input DataFrame. It is copied first, so the caller's frame is
                not modified.

        Returns:
            Tuple ``(df_pass, dffail, go)``: rows that found a mapping (with a
            ``patient_journey_hierarchy`` column added), rows that did not
            (original columns only), and ``True``. Every failure raises, so a
            normal return always means "go".

        Raises:
            Exception: if either configured column is missing, the mapping
                cannot be built, or the merge fails. The original error is
                chained as the cause.
        """
        enriched_col_name = 'patient_journey_hierarchy'
        try:
            logger.info('try:')

            # log metadata for the incoming frame
            # NOTE(review): head() writes row contents to the log — confirm that
            # is acceptable for this data.
            in_shape = df.shape
            logger.info(f'df in  shape: {in_shape[0]} {in_shape[1]}')
            logger.info(f'df in {df.head()}')

            status_col = self.col_status
            substatus_col = self.col_substatus
            logger.info(f'expecting column name  status   as:{status_col}')
            logger.info(f'expecting column name  substatus as:{substatus_col}')
            column_names = df.columns.values.tolist()
            logger.info(f'df column names:{column_names}')

            if status_col not in column_names or substatus_col not in column_names:
                raise Exception('expecting column names for patient status substatus if/else exception raise')

            # Normalise on a copy so the caller's frame is left untouched.
            df = df.copy()
            df[status_col] = df[status_col].apply(self._clean_status)
            df[substatus_col] = df[substatus_col].apply(self._clean_substatus)

            # Rename the mapping keys to the configured column names so the join
            # follows the configuration and adds no duplicate key columns.
            mapping = Transform.enrich_status_substatus(self.customer_name)
            mapping = mapping.rename(columns={'status': status_col, 'substatus': substatus_col})

            # m:1 validation makes the merge fail if the mapping has duplicate keys.
            merged = pd.merge(df, mapping, how='left', on=[status_col, substatus_col],
                              validate='m:1', indicator=True)

            # Rows with no mapping are kicked out into the fail frame.
            dffail = merged.loc[merged['_merge'] == 'left_only'].drop(columns=['_merge', enriched_col_name])
            df_pass = merged.loc[merged['_merge'] == 'both'].drop(columns=['_merge'])

            # log metadata for what comes out: pass and fail frames
            pass_shape = df_pass.shape
            fail_shape = dffail.shape
            logger.info(f'df in   shape: {in_shape[0]} {in_shape[1]}')
            logger.info(f'df pass shape: {pass_shape[0]} {pass_shape[1]}')
            logger.info(f'df fail shape: {fail_shape[0]} {fail_shape[1]}')
            logger.info(f'df pass {df_pass.head()}')
            logger.info(f'df fail {dffail.head()}')
        except Exception as e:
            logger.exception(f'exception:{e}')
            raise Exception(str(e)) from e
        return df_pass, dffail, True
                

# Module-level instance and logger. Code inside the Transform class refers to the
# global names `transform` and `logger` when its methods run, so both must be bound
# before any method is called.
transform = Transform()
# The logger name is built from the transform's pipeline state and name.
logger = get_logger(f'core.transforms.{transform.state}.{transform.name}')

In [None]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull from the configuration application instead
## For the last example, this could look like...
## transform.some_ratio = 0.6
## transform.site_name = "WALGREENS"

# Development-only values, set on the instance; they shadow the class-level
# defaults that Transform reads from db_transform.variables.
transform.customer_name = 'sun'
transform.col_status = 'status_code'
transform.col_substatus = 'sub_status'

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

## Planned

We map status/sub-status combinations to a patient journey model in order to provide actionable insights to our customers.

Definition of Done:

- by status/sub-status combination - map to a patient journey bucket defined in a bridge table.
- process to kick out any statuses/sub-statuses that are NOT mapped. 

**Transform assumes MASTER patient status and MASTER substatus transforms at a minimum have been run.**



## *<font color=grey>unit test development only</font>*
*<font color=grey>The next **5** cells will be deleted in production.</font>*

In [None]:
### Use the variables above to execute your transformation.
### the final output needs to be a variable named final_dataframe
logger.info("CALL THE TRANSFORM - execute your transformation - the final output needs to be a variable named final_dataframe")

# NOTE(review): `df` is not assigned in any visible cell; presumably it comes from
# the development cells mentioned above — confirm it exists on a fresh kernel.
final_dataframe, final_fail, go = transform.enrich_patient_journey_hierarchy(df)

# Report the go / no-go decision. Anything that is neither True nor False is
# coerced to a NO go.
if not (go == True or go == False):
    go = False
    logger.info("CALL THE TRANSFORM -  go no go = unknown make it NO go")
elif go == True:
    logger.info("CALL THE TRANSFORM -  go no go = GO")
else:
    logger.info("CALL THE TRANSFORM -  go no go = NO go")
        

### *<font color=grey>unittest python</font>*

# **publish**
### Writing to S3
Invoke the `publish()` command to write to a given contract. Some things to know:
- To invoke publish a contract must be at the grain of dataset. This is because file names will be set by the dataframe=\>parquet conversion. 
- publish only accepts a pandas dataframe.
- publish does not allow for timedelta data types at this time (this is missing functionality in pyarrow).
- publish handles partitioning the data as per contract, creating file paths, and creating the binary parquet files in S3, as well as the needed metadata. <br>
**- by default, all datasets include a single partition, \_\_metadata\_run\_id, the RunEvent ID of an executed pipeline**

In [None]:

# Publish only on an explicit GO. The session is closed in `finally` so it is
# released even when publish() raises.
# NOTE(review): `run_id` is not assigned in any visible cell; presumably it is
# injected by whatever executes the notebook — confirm.
try:
    if go == True:
        logger.info("PUBLISH - that's it - its a GO - just provide the final dataframe to the var final_dataframe and we take it from there")
        transform.publish_contract.publish(final_dataframe, run_id, session)
    elif go == False:
        logger.info("PUBLISH -  go no go = NO go -  so DONT publish")
    else:
        # Neither True nor False: treat as a NO go.
        go = False
        logger.info("PUBLISH -  go no go = unknown make it NO go - so DONT publish")
finally:
    session.close()

***