In [None]:
from core.helpers.session_helper import SessionHelper
# Shared session for this notebook: passed to the pipeline builder, used to query
# the Transformation row, handed to publish(), and closed in the final cell.
session = SessionHelper().session

In [None]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
# config vars: identify THIS transform; they are passed to builder.build() in the setup cell
config_pharma = "sun" # the pharmaceutical company which owns the brand in config_brand
config_brand = "ilumya" # the brand this pipeline operates on
config_state = "raw" # the state this transform runs in
config_name = "fill_null_long_pat_id" # the name of this transform, which is the name of this notebook without .ipynb

# input vars: identify the dataset to fetch. Recall that a contract published to S3 has a key format branch/pharma/brand/state/name
# These are used to build the input DatasetContract in the fetch cell.
input_pharma = "sun"
input_brand = "ilumya"
input_state = "raw"
input_name = "upstream"
input_branch = None # if None, input_branch is set to BRANCH_NAME (your working branch) in the fetch cell

In [None]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

# Build the mocked configuration for this transform; the result is indexable,
# with the transform id in position 0 and the run id in position 1.
ids = builder.build(
    config_pharma,
    config_brand,
    config_state,
    config_name,
    session,
)
transform_id, run_id = ids[0], ids[1]

In [None]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Load the configuration row for this transform; .one() expects exactly one matching row.
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """
    Snapshot of the transform's configuration, read from ``db_transform``.

    Every field default is evaluated once, when this class body executes, so
    re-run this cell after ``db_transform`` changes to pick up new values.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution 
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Contract used by the publish cell to publish this transform's output. It is built
    # from the same values as the fields above: branch / state / company / brand / name.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)


# CORE Cartridge Notebook::fill_null_long_pat_id
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [None]:
""" 
********* VARIABLES - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<variable_name>: <data_type> #<comment explaining what the value is to future us>

e.g.

class Transform(DbTransform):
    some_ratio: float
    site_name: str

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    '''
    Configuration and logic for the fill_null_long_pat_id transform.

    Variable properties should be assigned to the exact name of
    the transformation as it appears in the Jupyter notebook filename.
    '''

    col_1: str #This column is for the brand/medication. Used for identification purposes
    col_2: str #This column is for the pharmacy code. Used for identification purposes
    col_3: str #This column is for the SP-ID. Used for identification purposes and to fill in null values where there is no Long-ID
    col_null: str #This column is for the Long-ID, i.e. the column where null values are to be filled in

    def fill_null_long_pat_id(self, df):
        """
        Fill null Long-IDs in ``df[self.col_null]`` and return ``df``.

        Rows are matched on the identifier triple (col_1, col_2, col_3). Only
        triples that have at least one null Long-ID and no null identifier
        value are processed:

        - if every Long-ID for the triple is null, the SP-ID (col_3) is used
          as the Long-ID;
        - otherwise each null is filled with the next non-null Long-ID in row
          order (bfill), falling back to the previous one (ffill).

        Args:
            df: pandas DataFrame containing the four configured columns.

        Returns:
            The same DataFrame object that was passed in; it is modified in place.
        """
        id_columns = [self.col_1, self.col_2, self.col_3]

        # Distinct identifier triples that have at least one null Long-ID.
        # Triples with a null identifier are dropped: they cannot be matched.
        ids_with_nulls = (
            df.loc[df[self.col_null].isna(), id_columns]
            .dropna()
            .drop_duplicates()
            .to_dict(orient='records')
        )

        for identifiers in ids_with_nulls:

            mask = (
                (df[self.col_1] == identifiers[self.col_1])
                & (df[self.col_2] == identifiers[self.col_2])
                & (df[self.col_3] == identifiers[self.col_3])
            )
            long_ids = df.loc[mask, self.col_null]

            # isna() treats None and NaN alike. Comparing with `== None` is
            # False for NaN, which left all-NaN groups unfilled.
            if long_ids.isna().all():
                # No Long-ID anywhere for this triple: fall back to the SP-ID.
                df.loc[mask, self.col_null] = identifiers[self.col_3]
            else:
                # Prefer the following Long-ID, then the preceding one.
                df.loc[mask, self.col_null] = long_ids.bfill().ffill()

        return df

transform = Transform()

In [None]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set to pull from the configuration application instead
## For the last example, this could look like...
## transform.some_ratio = 0.6
## transform.site_name = "WALGREENS"

# Column names in the input dataset, one per field declared on Transform.
transform.col_1 = 'medication'  # brand/medication column
transform.col_2 = 'pharm_code'  # pharmacy code column
transform.col_3 = 'pharmacy_id'  # SP-ID column; also the fallback value when no Long-ID exists
transform.col_null = 'msa_patient_id'  # Long-ID column whose nulls get filled

1. If the L-ID is null throughout the history, use the SP-ID. If the L-ID is null only in past records, backfill it with the current L-ID.

2. If null and two L-ID use the most recent next status

### Description
What does this transformation do? be specific.

![what does your transform do](assets/what.gif)

Replaces null values in the Longitudinal Patient ID with either the SP-ID or the closest following Longitudinal Patient ID.

### Transformation

In [None]:
import unittest

def shape_status(final_dataframe,df):
    """
    Check that the transform left the dataframe's shape untouched.

    Returns True when `final_dataframe` and `df` have the same
    (rows, columns) shape, False otherwise.
    Example:
    >>> shape_status(final_dataframe,df)
    True
    """
    transformed_shape = final_dataframe.shape
    source_shape = df.shape
    return transformed_shape == source_shape
    
def nulls_removed(final_dataframe, column="msa_patient_id"):
    """
    Make sure a column of the df does not contain Null values.

    Args:
        final_dataframe: the transformed pandas DataFrame to check.
        column: name of the column to inspect; defaults to "msa_patient_id",
            the Long-ID column used during development.

    Returns:
        True when the column holds no null (None/NaN) values, else False.

    This is a test:
    >>> nulls_removed(final_dataframe)
    True
    """
    return not final_dataframe[column].isna().any()

class TestNotebook(unittest.TestCase):
    # Sanity checks that read the notebook globals `final_dataframe` and `df`.

    def test_shape_status(self):
        # The transform must not add or drop rows or columns.
        shapes_match = shape_status(final_dataframe, df)
        self.assertEqual(shapes_match, True)

    def test_nulls_removed(self):
        # No null Long-IDs may remain after the transform.
        no_nulls_left = nulls_removed(final_dataframe)
        self.assertEqual(no_nulls_left, True)

# Run the suite inline; exit=False keeps the kernel alive after the run.
unittest.main(argv=[''], verbosity=2, exit=False)

In [None]:
"""
************ FETCH DATA - TOUCH, BUT CAREFULLY **************
This cell will be turned off in production, as the input_contract will be handled by the pipeline.
"""

# Fall back to the working branch when no input branch was configured.
if not input_branch:
    input_branch = BRANCH_NAME
input_contract = DatasetContract(branch=input_branch, state=input_state, parent=input_pharma, child=input_brand, dataset=input_name)
run_filter = []
# run_filter.append(dict(partition="run_id", comparison="==", values=[1]))
# IF YOU HAVE PUBLISHED DATA MULTIPLE TIMES, uncomment the above line and change the int to the run_id to fetch.
# Otherwise, you will have duplicate values in your fetched dataset!

# Store the input as `df`: that is the name the transformation cell passes to
# the transform and the tests compare against. Assigning it to `final_dataframe`
# left `df` undefined on a fresh kernel.
df = input_contract.fetch(filters=run_filter)

In [None]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe
# NOTE(review): this expects the input dataset in a variable named `df`; confirm it is defined before this cell runs.
# fill_null_long_pat_id modifies the frame it is given and returns that same object.
final_dataframe = transform.fill_null_long_pat_id(df)

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
try:
    transform.publish_contract.publish(final_dataframe, run_id, session)
finally:
    # Always release the session, even when publishing raises.
    session.close()