In [None]:
from core.helpers.session_helper import SessionHelper
# Shared session object used by the cells below to query the configuration
# database, publish the final dataset, and is closed in the last cell.
session = SessionHelper().session

In [None]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from dataclasses import dataclass

import numpy as np
import pandas as pd

from core.constants import BRANCH_NAME, ENV_BUCKET
from core.dataset_contract import DatasetContract
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation

# Fetch the configuration row for the transform being executed.
# NOTE(review): `transform_id` is not defined in any cell visible here —
# presumably it is injected as a notebook parameter at execution time; confirm.
# NOTE(review): this looks like a SQLAlchemy query, in which case `.one()`
# raises unless exactly one row matches — confirm.
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """
    Values describing this transform, read from the configuration database.

    Every default below is computed from the module-level `db_transform` row
    (and `BRANCH_NAME`) when this class body executes, so the values are fixed
    at class-definition time rather than per instance.
    """
    id: int = db_transform.id  # the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name  # the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name  # the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME  # the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name  # the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name  # the pharma company name
    # Contract used in the final cell to publish `final_dataframe`; it is built
    # from the same branch / state / company / brand / transform-name values as
    # the fields above.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::[Referral Date Enrichment]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [None]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    Referral date enrichment transform.

    YOUR properties go here!!
    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.

    Every property below is read from `db_transform.variables`. Apart from
    `shipment_var` and `input_transform`, each one holds the NAME of a column
    in the input dataframe, not the column's data.
    '''
    trans_id: str = db_transform.variables.trans_id  # column identifying a row; used to swap enriched rows in
    product: str = db_transform.variables.product  # column that receives the product parsed from 'medication'
    patient: str = db_transform.variables.patient  # grouping column
    pharm: str = db_transform.variables.pharm  # grouping column
    status_date: str = db_transform.variables.status_date  # status date column (parsed as YYYYMMDD)
    ref_date: str = db_transform.variables.ref_date  # referral date column (parsed as YYYYMMDD)
    status: str = db_transform.variables.status  # status code column (upper-cased)
    substatus: str = db_transform.variables.substatus  # sub-status code column (upper-cased)
    shipment_var: str = db_transform.variables.shipment_var  # NOTE(review): not used by any logic in this class
    input_transform: str = db_transform.variables.input_transform # The name of the transform to input source data from

    def load_clean_file(self, df):
        '''
        Normalize the date, product and status columns of ``df``.

        The frame is modified in place and is also returned.

        * ``status_date`` / ``ref_date``: the first 8 characters of each value
          are parsed with the ``%Y%m%d`` format; values that do not parse
          become ``NaT``.
        * ``product``: set to the first whitespace-delimited token of the
          hard-coded ``'medication'`` column; null or blank medications
          become null.
        * ``status`` / ``substatus``: upper-cased.

        :param df: dataframe with string-typed date, status and medication columns.
        :return: the same dataframe object, cleaned.
        '''
        date_format = '%Y%m%d'
        date_columns = (self.status_date, self.ref_date)

        # Keep only the leading YYYYMMDD characters of each date string.
        for date_column in date_columns:
            df[date_column] = df[date_column].str[:8].astype(str)

        # errors='coerce' turns anything that is not a valid date into NaT.
        for date_column in date_columns:
            df[date_column] = pd.to_datetime(df[date_column], format=date_format, errors='coerce')

        ## Extract brand from medication (first word; blank/null stays null)
        df[self.product] = df['medication'].str.split().str[0]

        ## Convert status codes to uppercase
        df[self.status] = df[self.status].str.upper()
        df[self.substatus] = df[self.substatus].str.upper()

        return df

    @staticmethod
    def _add_group_minimum(df, group_keys, value_column, result_column):
        '''Left-merge the per-group minimum of ``value_column`` onto ``df`` as ``result_column``.'''
        group_minimum = (
            df
            .groupby(group_keys)[value_column].min()
            .reset_index()
            .rename(columns={value_column: result_column})
        )
        return pd.merge(df, group_minimum, how='left', on=group_keys)

    def referral_date_enrichment(self, df, table_columns, ref_date_enrichment_threshold):
        """
        Replace referral dates that fall too long before the first status date.

        For every (patient, pharm, product) group the earliest status date and
        the earliest referral date are computed. A row is enriched when both
        the group's earliest referral date and the row's own referral date are
        more than ``ref_date_enrichment_threshold`` days before the group's
        earliest status date; its referral date is then set to that earliest
        status date.

        Enriched rows are placed after the untouched rows in the result. Any
        untouched row that shares a ``trans_id`` value with an enriched row is
        dropped from the result.

        :param df: cleaned dataframe (dates already converted to datetimes).
        :param table_columns: columns to keep in the returned dataframe; must
            include the ``trans_id`` column.
        :param ref_date_enrichment_threshold: threshold in days.
        :return: a new dataframe restricted to ``table_columns``.
        """
        group_keys = [self.patient, self.pharm, self.product]

        df = self._add_group_minimum(df, group_keys, self.status_date, 'First_Status_Date')
        # NOTE(review): an unfinished statement `if status_code == shipment_var`
        # stood here. It was not valid Python (no colon, no body, undefined
        # names) and its intent cannot be determined from this notebook, so it
        # was removed; `shipment_var` is therefore still unused.
        df = self._add_group_minimum(df, group_keys, self.ref_date, 'First_Ref_Date')

        # Express both gaps as a (float) number of days.
        one_day = pd.Timedelta(days=1)
        df['min_ref_day_diff'] = (df['First_Status_Date'] - df['First_Ref_Date']) / one_day
        df['ref_day_diff'] = (df['First_Status_Date'] - df[self.ref_date]) / one_day

        needs_enrichment = (
            (df['min_ref_day_diff'] > ref_date_enrichment_threshold)
            & (df['ref_day_diff'] > ref_date_enrichment_threshold)
        )

        # .copy() so the assignment below writes to an independent frame rather
        # than to a slice of `df`.
        to_enrich_df = df[needs_enrichment].copy()
        to_enrich_df[self.ref_date] = to_enrich_df['First_Status_Date']
        to_enrich_df = to_enrich_df[table_columns]

        enriched_ids = to_enrich_df[self.trans_id].values.tolist()
        untouched_df = df.loc[~df[self.trans_id].isin(enriched_ids), table_columns]

        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
        return pd.concat([untouched_df, to_enrich_df])

# Single Transform instance used by the remaining cells for its configured
# column names, its cleaning/enrichment methods and its publish contract.
transform = Transform()

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

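This transform provides the preliminary referral-date enrichment. It takes patient data in from S3 and, for each patient / pharmacy / product combination, corrects possibly inaccurate referral dates: any referral date that falls more than 60 days before the first status date is replaced with that first status date.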

### Transformation

In [None]:
### Retrieve current dataset from contract
from core.dataset_diff import DatasetDiff

# Build the diff helper for this transform and pull the input dataframe from
# the upstream transform named in the configuration (`input_transform`).
# NOTE(review): `run_id` is not defined in any cell visible here — presumably
# it is injected as a notebook parameter at execution time; confirm.
diff = DatasetDiff(db_transform.id)
df = diff.get_diff(transform_name=transform.input_transform, values=[run_id])

In [None]:
### Use the variables above to execute your transformation. The final output needs to be a variable named final_dataframe

# Referral dates more than this many days before the first status date are enriched.
REF_DATE_ENRICHMENT_THRESHOLD_DAYS = 60

# Capture the incoming column list before cleaning so the output is restricted
# to the columns the input dataset arrived with.
columns = list(df.columns)
df = transform.load_clean_file(df)
final_dataframe = transform.referral_date_enrichment(df, columns, REF_DATE_ENRICHMENT_THRESHOLD_DAYS)

## Data Validation

In [None]:
import unittest

def shape_status(final_dataframe,df):
    """
    Report whether the transformed frame has the same dimensions as the input frame.
    This is a test:
    >>> shape_status(final_dataframe,df)
    True
    """
    output_dimensions = final_dataframe.shape
    input_dimensions = df.shape
    return output_dimensions == input_dimensions
    
def check_threshold(final_dataframe, df, threshold_days=60, group_columns=None,
                    status_date_column=None, ref_date_column=None):
    """
    Make sure there are not any first status dates greater than 60 days from referral date
    This is a test:
    >>> check_threshold(final_dataframe, df)
    True

    The first status date is the earliest status date within each group of
    ``group_columns``. The check fails when any row's referral date is more
    than ``threshold_days`` days before its group's first status date. Rows
    with a missing date on either side are not counted as violations.

    :param final_dataframe: the enriched dataframe to validate; it must contain
        the grouping, status date and referral date columns.
    :param df: unused; kept so existing two-argument calls keep working.
    :param threshold_days: maximum allowed gap in days (default 60).
    :param group_columns: grouping column names; defaults to the notebook-level
        ``transform``'s patient, pharm and product columns.
    :param status_date_column: defaults to ``transform.status_date``.
    :param ref_date_column: defaults to ``transform.ref_date``.
    :return: True when no row exceeds the threshold, otherwise False.
    """
    # Fall back to the notebook-level configuration only for values not supplied.
    if group_columns is None:
        group_columns = [transform.patient, transform.pharm, transform.product]
    if status_date_column is None:
        status_date_column = transform.status_date
    if ref_date_column is None:
        ref_date_column = transform.ref_date
    group_columns = list(group_columns)

    first_status = (
        final_dataframe
        .groupby(group_columns)[status_date_column].min()
        .reset_index()
        .rename(columns={status_date_column: 'First_Status_Date'})
    )
    # merge() returns a new frame, so final_dataframe itself is left untouched.
    checked = final_dataframe.merge(first_status, how='left', on=group_columns)

    gap_in_days = (checked['First_Status_Date'] - checked[ref_date_column]) / pd.Timedelta(days=1)
    # Comparisons against NaN are False, so missing dates never count as violations.
    return not bool((gap_in_days > threshold_days).any())

class TestNotebook(unittest.TestCase):
    # Validation checks run against the notebook-level `final_dataframe` and `df`.

    def test_shape_status(self):
        # The transformed frame must have the same row and column counts as `df`.
        self.assertEqual(shape_status(final_dataframe,df),True)
    
    # NOTE(review): the threshold check below is currently disabled.
    #def test_check_threshold(self):
    #    self.assertEqual(check_threshold(final_dataframe, df),True)
    
# argv=[''] stops unittest from parsing the kernel's own command-line arguments;
# exit=False stops it from calling sys.exit() when the tests finish.
unittest.main(argv=[''], verbosity=2, exit=False)

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
try:
    transform.publish_contract.publish(final_dataframe, run_id, session)
finally:
    # Close the session even when publishing raises, so it is never left open.
    session.close()