In [None]:
from core.helpers.session_helper import SessionHelper
# Database session shared by the whole notebook: it is used below to query the
# transform's configuration, handed to publish(), and closed in the final cell.
session = SessionHelper().session

In [None]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Load the configuration row for this transform instance.
# NOTE(review): transform_id is not defined anywhere in this notebook; presumably
# it is injected by whatever executes the notebook -- confirm.
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Values describing this transform instance, read from ``db_transform``.

    Every default below is evaluated once, when this class body runs, so the
    values reflect ``db_transform`` and ``BRANCH_NAME`` as they were at that
    moment. ``publish_contract`` is likewise built a single time here.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Contract the final dataframe is published through; built from the same
    # branch / state / company / brand / transform-name values as the fields above.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::Patient Status Date Standardization
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [None]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Configuration for the Patient Status Date Standardization transform.

    Adds the operator-configured variables to the values inherited from
    ``DbTransform``. This subclass is not itself decorated with ``@dataclass``,
    so the annotated names below are plain class attributes whose values are
    read from ``db_transform.variables`` once, when this class body runs.

    Apart from ``input_transform``, each attribute holds the date format string
    for the dataframe column of the same name. The transformation cell looks
    these attributes up by column name, so the names must stay in sync with
    ``date_columns``.
    """
    input_transform: str = db_transform.variables.input_transform # The source data to pull from
    status_date: str = db_transform.variables.status_date # String format of the status date. Leave blank ("") to auto-parse.
    transaction_date: str = db_transform.variables.transaction_date # String format of the transaction date. Leave blank ("") to auto-parse.
    referral_date: str = db_transform.variables.referral_date # String format of the referral date. Leave blank ("") to auto-parse.
    patient_dob: str = db_transform.variables.patient_dob # String format of the patient date of birth. Leave blank ("") to auto-parse.
    rx_date: str = db_transform.variables.rx_date # String format of the rx date. Leave blank ("") to auto-parse.
    ship_date: str = db_transform.variables.ship_date # String format of the ship date. Leave blank ("") to auto-parse.
    primary_prior_auth_expiration_date: str = db_transform.variables.primary_prior_auth_expiration_date # String format of the primary prior auth expiration date. Leave blank ("") to auto-parse.
    patient_consent_date: str = db_transform.variables.patient_consent_date # String format of the patient consent date. Leave blank ("") to auto-parse.
    enroll_received_date: str = db_transform.variables.enroll_received_date # String format of the enroll received date. Leave blank ("") to auto-parse.
    fitness_for_duty_ship_date: str = db_transform.variables.fitness_for_duty_ship_date # String format of the fitness for duty ship date. Leave blank ("") to auto-parse.
    triage_date: str = db_transform.variables.triage_date # String format of the triage date. Leave blank ("") to auto-parse.

# Single instance used by the rest of the notebook; no arguments are passed, so
# every inherited dataclass field keeps its default.
transform = Transform()

In [None]:
# Dataframe columns standardized by this transform, in processing order.
# Each name doubles as the Transform attribute that holds the column's
# configured date format, so the two must be kept in sync.
date_columns = [
    "status_date", "transaction_date", "referral_date",
    "patient_dob", "rx_date", "ship_date",
    "primary_prior_auth_expiration_date",
    "patient_consent_date", "enroll_received_date",
    "fitness_for_duty_ship_date", "triage_date",
]

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

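This transform takes the predetermined date columns and converts their string values into standard dates. For each column it applies the date format configured for that column; when no format is configured, it attempts to auto-parse the value.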

### Transformation

In [None]:
from datetime import datetime
from operator import attrgetter
import re
from typing import Optional

import dateutil.parser as parser
import pandas as pd

from core.logging import get_logger

In [None]:
# Logger for this notebook, named from the transform's pipeline state and name.
logger = get_logger(f"core.transforms.{transform.state}.{transform.name}")

In [None]:
def date_parse_and_format(date_string: Optional[str] = ""):
    """Auto-parse a single cell value into a datetime.

    Meant to be mapped over a dataframe column for which no explicit date
    format has been configured.

    Args:
        date_string: The raw cell value. ``pd.Timestamp`` values are returned
            untouched; ``None``, ``""``, any non-string value and the
            placeholder ``"Under18"`` are treated as missing.

    Returns:
        The original ``pd.Timestamp``, ``pd.NaT`` for missing values, or the
        ``datetime`` produced by ``dateutil.parser.parse``.

    Raises:
        Exception: Whatever the parser raises for an unparseable value. The
            raw value is logged before the error is re-raised.
    """
    if isinstance(date_string, pd.Timestamp):
        return date_string
    if not isinstance(date_string, str) or date_string == "":
        # Covers None, NaN and any other non-string cell value.
        return pd.NaT
    if date_string == "Under18":
        # Given by some manufacturers; handling may be adjusted later.
        return pd.NaT

    # Drop every separator (non-word characters and underscores) so only
    # letters and digits are handed to the parser.
    compact = re.sub(r"[\W_]+", "", date_string)
    try:
        return parser.parse(compact)
    except Exception:
        # Log only the failing value: logging every row is not practical
        # with 500,000+ rows.
        logger.error(f"Failed to auto-parse date: {date_string}")
        raise


def date_predifined_format(date_format: str, date_string: Optional[str] = ""):
    """Parse a single cell value using an explicit ``strptime`` format.

    Args:
        date_format: The ``datetime.strptime`` format the value is expected
            to follow, e.g. ``"%Y-%m-%d"``.
        date_string: The raw cell value. ``pd.Timestamp`` values are returned
            untouched; ``None``, ``""``, any non-string value and the
            placeholder ``"Under18"`` are treated as missing.

    Returns:
        The original ``pd.Timestamp``, ``pd.NaT`` for missing values, or the
        ``datetime`` parsed from ``date_string``.

    Raises:
        Exception: Whatever ``strptime`` raises (``ValueError`` when the value
            does not match the format). The value and format are logged
            before the error is re-raised.
    """
    if isinstance(date_string, pd.Timestamp):
        return date_string
    if not isinstance(date_string, str) or date_string == "":
        # Covers None, NaN and any other non-string cell value.
        return pd.NaT
    if date_string == "Under18":
        # Given by some manufacturers; handling may be adjusted later.
        return pd.NaT

    try:
        return datetime.strptime(date_string, date_format)
    except Exception:
        # Log only the failing value: logging every row is not practical
        # with 500,000+ rows.
        logger.error(f"Failed to parse date: {date_string}   with format {date_format}")
        raise

In [None]:
# Contract for the upstream dataset this transform reads: same branch, pipeline
# state, company and brand as this transform, but pointing at the configured
# input_transform dataset.
input_contract = DatasetContract(
    branch=BRANCH_NAME,
    state=db_transform.pipeline_state.pipeline_state_type.name,
    parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
    child=db_transform.pipeline_state.pipeline.brand.name,
    dataset=transform.input_transform,
)

# Restrict the fetch to the partition written for the current run.
# NOTE(review): run_id is not defined in this notebook; presumably it is
# injected by whatever executes the notebook -- confirm.
run_filter = {"partition": "run_id", "comparison": "==", "values": [run_id]}

final_dataframe = input_contract.fetch(filters=run_filter)

In [None]:
### Use the variables above to execute your transformation. The final output needs to be a variable named final_dataframe

for date_column in date_columns:
    # Each column name is also the name of the Transform attribute that holds
    # the column's configured date format.
    date_format = getattr(transform, date_column)

    if not date_format:
        # No format configured (blank or None): auto-parse each value.
        final_dataframe[date_column] = final_dataframe[date_column].map(date_parse_and_format)
    else:
        # Series.map's second positional parameter is na_action, not an extra
        # argument for the mapped function, so the format must be bound to the
        # parser explicitly instead of being passed to map().
        final_dataframe[date_column] = final_dataframe[date_column].map(
            lambda value, fmt=date_format: date_predifined_format(date_format=fmt, date_string=value)
        )

    # Floor the date to avoid weird errors later.
    # Date doesn't transfer but datetime does, and not all data has times, so
    # even out the precision by truncating everything to the day.
    final_dataframe[date_column] = final_dataframe[date_column].dt.floor('D')

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
try:
    transform.publish_contract.publish(final_dataframe, run_id, session)
finally:
    # Always release the database session, even when publishing fails.
    session.close()