In [1]:
# Id of the Transformation row that the setup code below looks up in the configuration database.
transform_id = 3

"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Load the Transformation row whose id equals transform_id.
# NOTE(review): `.one()` presumably follows SQLAlchemy semantics (error unless exactly one row matches) - confirm.
db_transform = SessionHelper().session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Snapshot of the configured transformation identified by ``transform_id``.

    Every default below is read from ``db_transform`` (or ``BRANCH_NAME``) once,
    when this class body executes. Creating instances later does not query the
    configuration database again.
    """
    id: int = db_transform.id ## instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## transform (template) name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## pipeline state: one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name ## pharma company name
    # Contract used by the publish cell; built from the same branch, state, company (parent),
    # brand (child) and transform name (dataset) as the fields above.
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)

2019-07-10 16:31:03,473 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-10 16:31:03,498 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-10 16:31:03,557 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-10 16:31:03,558 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-10 16:31:03,562 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-10 16:31:03,564 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-10 16:31:03,568 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-07-10 16:31:03,569 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-07-10 16:31:0

# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
2. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
3. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [2]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
        """Transform-specific settings layered on top of ``DbTransform``.

        Each value is read from ``db_transform.variables`` when this class body
        executes.

        NOTE(review): this subclass is not decorated with ``@dataclass``, so the
        annotated names below are plain class attributes and are not parameters
        of the inherited ``__init__`` - confirm that this is intended.
        """
        ## YOUR properties go here!!
        remote_path: str = db_transform.variables.filesystem_path # Path to follow on the remote server (read from the `filesystem_path` variable)
        prefix: str = db_transform.variables.prefix # Prefix of the files to get on the remote server
        secret_name: str = db_transform.variables.secret_name # Name of the secret in Secret Manager for the remote server
        secret_type_of: str = db_transform.variables.secret_type_of # Type of the secret in Secret Manager for the remote server

In [None]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set it to pull from the configuration application instead

In [40]:
import pandas as pd
import pyarrow as pa
from s3parq import fetch
from s3fs import S3FileSystem
from core.logging import get_logger
from encodings.aliases import aliases
# The package is `IPython` and the module is `interactiveshell`; the previous
# spelling (`Ipython.core.interactivershell`) cannot be imported.
from IPython.core.interactiveshell import InteractiveShell

# Show up to 999 columns when a frame is displayed.
pd.options.display.max_columns = 999
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Configuration object used by the cells below (class defined in the configuration cell).
transform = Transform()

In [208]:
# Development input: the ingest-state dataset for one hard-coded run.
run_id = 5
ingest_contract = DatasetContract(
    branch='sun-extract-prod-vars',
    parent='sun',
    child='ilumya',
    state='ingest',
    dataset='symphony_health_association_ingest_column_mapping',
)

# Restrict the fetch to the partition written by `run_id`.
run_filter = [
    {'partition': '__metadata_run_id', 'comparison': '==', 'values': [run_id]},
]

# Fetch the filtered dataset and replace its index with a fresh 0..n-1 range.
df = fetch(
    bucket=ingest_contract.bucket,
    key=ingest_contract.key,
    filters=run_filter,
).reset_index(drop=True)

# Rows whose ref_date is missing.
null_df = df[df['ref_date'].isna()]

In [179]:
def get_unique_id(df=None):
    """Collect the identifying triple of every row that has no ``ref_date``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``ref_date``, ``pharmacy_id``, ``medication`` and
        ``msa_patient_id`` columns. When omitted, the notebook-level ``df`` is
        looked up at call time, so a re-fetched frame is picked up without
        re-running this definition.

    Returns
    -------
    dict
        ``{row_label: {'pharma_id': ..., 'brand': ..., 'pat_id': ...}}`` with one
        entry per row whose ``ref_date`` is missing and whose three identifying
        values are all present. Rows where any identifier is missing (``None``,
        ``NaN`` or ``NaT``) are skipped.

    Raises
    ------
    KeyError
        If ``df`` is omitted and no notebook-level ``df`` exists.
    """
    if df is None:
        # Resolve the default at call time instead of freezing whichever frame
        # existed when this cell was last executed.
        df = globals()['df']

    id_columns = ['pharmacy_id', 'medication', 'msa_patient_id']
    # dropna() treats None, NaN and NaT alike; an `is not None` test lets NaN through.
    complete_ids = df.loc[df.ref_date.isna(), id_columns].dropna()

    return {
        row_label: {'pharma_id': pharma_id, 'brand': brand, 'pat_id': pat_id}
        for row_label, pharma_id, brand, pat_id
        in complete_ids.itertuples(index=True, name=None)
    }

In [210]:
def populate_ref_date(df=None, unique_id_dict=None):
    """Fill missing ``ref_date`` values from earlier rows of the same patient journey.

    For each ``row_label`` in ``unique_id_dict``, the rows of ``df`` with a
    smaller index label and the same pharmacy, medication and patient id are
    inspected. If their ``ref_date`` column holds exactly one distinct value,
    that value is written to ``df.at[row_label, 'ref_date']``. Rows with zero or
    several distinct earlier values are left unchanged (a missing value among
    the earlier rows counts as a distinct value).

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to update. It is modified in place; values written for earlier
        keys are visible when later keys are processed. When omitted, the
        notebook-level ``df`` is looked up at call time.
    unique_id_dict : dict, optional
        Mapping shaped like the return value of ``get_unique_id``:
        ``{row_label: {'pharma_id': ..., 'brand': ..., 'pat_id': ...}}``.
        When omitted, the notebook-level ``unique_id_dict`` is looked up at
        call time.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (or resolved), after updating.

    Raises
    ------
    KeyError
        If an argument is omitted and the notebook-level name does not exist.
    """
    # Resolve defaults at call time: binding them in the signature would require
    # both names to exist when this cell runs, and would freeze stale objects.
    if df is None:
        df = globals()['df']
    if unique_id_dict is None:
        unique_id_dict = globals()['unique_id_dict']

    for key, ids in unique_id_dict.items():
        same_journey_before = (
            (df.index < key)
            & (df.pharmacy_id == ids['pharma_id'])
            & (df.medication == ids['brand'])
            & (df.msa_patient_id == ids['pat_id'])
        )
        prior_ref_dates = df.loc[same_journey_before, 'ref_date'].unique()
        # Only an unambiguous history is propagated.
        # TODO: decide how to handle several distinct earlier ref_dates.
        if len(prior_ref_dates) == 1:
            df.at[key, 'ref_date'] = prior_ref_dates[0]
    return df

# Development value: the S3 location that is passed to pandas_from_parquet_s3 below.
# NOTE(review): Transform declares `remote_path`, not `filesystem_path` - confirm which attribute production sets.
transform.filesystem_path = 's3://ichain-dev/schafrn/seed-data/bi-all-529-data/01_load_raw_and_map_headers'

def pandas_from_parquet_s3(file_path):
    """Read the parquet dataset at ``file_path`` on S3 into a pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Location of the parquet dataset, passed unchanged to
        ``pyarrow.parquet.ParquetDataset`` together with an ``S3FileSystem``.

    Returns
    -------
    pandas.DataFrame
        Result of ``read_pandas().to_pandas()`` on the dataset.
    """
    # `import pyarrow as pa` does not load the `parquet` submodule, so
    # `pa.parquet` only resolves if some other import has already pulled it in.
    # Import it explicitly here.
    import pyarrow.parquet as pq

    s3 = S3FileSystem()
    dataset = pq.ParquetDataset(file_path, filesystem=s3)
    return dataset.read_pandas().to_pandas()

# Load the parquet dataset stored at transform.filesystem_path.
bi_df = pandas_from_parquet_s3(transform.filesystem_path)

# Preview the first rows of the loaded frame.
bi_df.head()

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

(clear out and replace with your description)

### Transformation

In [None]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

In [None]:
# Identify the rows that are missing a ref_date, then fill them from earlier rows.
unique_id_dict = get_unique_id(null_df)
# Pass both inputs explicitly so the freshly computed mapping and the current
# frame are used rather than whatever the function's defaults resolve to.
final_dataframe = populate_ref_date(df=df, unique_id_dict=unique_id_dict)

### Publish

In [None]:
## That's it - assign the finished frame to `final_dataframe` and we take it from there.
## The frame is published through the transform's publish contract together with `run_id`.
transform.publish_contract.publish(final_dataframe, run_id)