In [1]:
# ID of the Transformation record that the setup cell looks up in the configuration database.
transform_id = 14

In [3]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Look up the Transformation row whose id equals transform_id; .one() expects exactly one match.
db_transform = (
    SessionHelper()
    .session
    .query(Transformation)
    .filter(Transformation.id == transform_id)
    .one()
)

@dataclass
class DbTransform:
    """Values describing the configured transform, read from ``db_transform``.

    Every default below is evaluated once, when this class body runs, so the
    values reflect ``db_transform`` as it was at that moment.
    """

    ## the instance id of the transform in the config app
    id: int = db_transform.id
    ## the transform name in the config app
    name: str = db_transform.transformation_template.name
    ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    state: str = db_transform.pipeline_state.pipeline_state_type.name
    ## the git branch for this execution
    branch: str = BRANCH_NAME
    ## the pharma brand name
    brand: str = db_transform.pipeline_state.pipeline.brand.name
    ## the pharma company name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name
    ## the contract this transform publishes to, assembled from the fields bound above
    publish_contract: DatasetContract = DatasetContract(
        branch=branch,
        state=state,
        parent=pharmaceutical_company,
        child=brand,
        dataset=name,
    )

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/host/core/transforms, universal_newlines=False, shell=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/host/core/transforms, universal_newlines=False, shell=None)


2019-05-28 16:06:23,449 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-28 16:06:23,473 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-28 16:06:23,505 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-28 16:06:23,506 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-28 16:06:23,510 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-28 16:06:23,511 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-28 16:06:23,517 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-28 16:06:23,518 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-28 16:06:2

# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [4]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

@dataclass
class Transform(DbTransform):
    """Configuration variables this transform expects from the configuration application.

    Extends ``DbTransform`` with the transform-specific variables read from
    ``db_transform.variables``. The class is decorated with ``@dataclass`` so
    that these variables are real dataclass fields: they appear in ``repr()``,
    take part in equality, and can be overridden as keyword arguments, e.g.
    ``Transform(index_col="rx_ndc_number")``. ``Transform()`` with no arguments
    still yields the values read from the configuration database.
    """

    input_transform: str = db_transform.variables.input_transform  # The name of the transform to input source data from
    index_col: str = db_transform.variables.index_col  # The index column to map NDCs in the source dataset (default is rx_ndc_number)
    secret_name: str = db_transform.variables.secret_name  # The name of the secret in Secret Manager for platform2
    secret_type_of: str = db_transform.variables.secret_type_of  # The type of the secret in Secret Manager for platform2
    ## YOUR properties go here!!

In [5]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set it to pull from the configuration application instead

transform = Transform()
#transform.input_transform = "symphony_health_association_refinement" # final ingest transform not written by me
#transform.index_col = "rx_ndc_number"
#transform.secret_name = "platform2"
#transform.secret_type_of = "database"

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transformation maps the NDC IDs found in a source dataset to product information found in IC master on platform2. The credentials used for platform2 are stored and read from an AWS secret.

### Transformation

In [20]:
### Retrieve current dataset from contract

import core.helpers.contract_creator as cc
from core.dataset_diff import DatasetDiff
import s3parq
import pandas as pd

# Name of the run-timestamp metadata column.
# NOTE(review): diff_partition is not referenced anywhere else in the visible notebook - confirm it is still needed.
diff_partition = "__metadata_run_timestamp"
# Look up the contract of the input transform, relative to this transform's publish contract
# (presumably the same branch/brand with a different dataset name - confirm in contract_creator).
ingest_contract = cc.get_relative_contract(t_name=transform.input_transform, contract=transform.publish_contract)
# Earlier fetch approaches, currently disabled:
#df = s3parq.fetch_parq.fetch(bucket=deleteme.bucket, key=deleteme.key)
#diff = DatasetDiff(db_transform.id)
#df = diff.get_diff(transform_name=transform.input_transform)
# Read the input dataset from the environment bucket at the input contract's key.
df = s3parq.fetch(bucket=ENV_BUCKET, key=ingest_contract.key)

2019-05-28 16:25:25,605 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-28 16:25:25,624 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-28 16:25:25,630 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-28 16:25:25,631 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-28 16:25:25,635 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-28 16:25:25,636 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-28 16:25:25,642 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-28 16:25:25,643 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-28 16:25:2

In [21]:
# Preview the first rows of the fetched input dataset.
df.head()

Unnamed: 0,rec_date,pharm_code,pharm_npi,transtype,pharm_transaction_id,trans_seq,ref_source,ref_date,program_id,pharmacy_id,...,plan_paid_amt,pat_copay,copay_assist_amount,oth_payer_amt,xfer_pharmname,msa_patient_id,msa_patient_bmap,__metadata_app_version,__metadata_output_contract,__metadata_run_timestamp
0,20181024115959,ACCREDO,1346208949,COM,279133432018102401,0,DIRECT,20181019120000,,27913343,...,,,,,,,NNNNV,0.0.11,s3://ichain-dev/stephanie/stephanie/ilumya/ing...,2019-05-28 15:22:56
1,20181025115959,ACCREDO,1346208949,COM,278370982018102502,0,DIRECT,20181022120000,,27837098,...,,,,,,,NNNVV,0.0.11,s3://ichain-dev/stephanie/stephanie/ilumya/ing...,2019-05-28 15:22:56
2,20181029115959,ACCREDO,1346208949,COM,279181482018102903,0,DIRECT,20181024120000,,27918148,...,,,,,,,NNNVV,0.0.11,s3://ichain-dev/stephanie/stephanie/ilumya/ing...,2019-05-28 15:22:56
3,20181102115959,ACCREDO,1346208949,COM,267244982018110204,0,DIRECT,20181030120000,,26724498,...,,,,,,,NNNVV,0.0.11,s3://ichain-dev/stephanie/stephanie/ilumya/ing...,2019-05-28 15:22:56
4,20181106115959,ACCREDO,1346208949,COM,160618142018110605,0,DIRECT,20181102120000,,16061814,...,,,,,,,NNNVV,0.0.11,s3://ichain-dev/stephanie/stephanie/ilumya/ing...,2019-05-28 15:22:56


In [22]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

from core.secret import Secret
import mysql.connector

# Read-mode secret for platform2, kept as a plain dict of the Secret object's attributes.
# The dict carries credentials - do not print or display it.
secret = vars(
    Secret(
        name=transform.secret_name,
        type_of=transform.secret_type_of,
        mode="read",
    )
)
HOSTNAME = 'platform2.integrichain.com'

# stub call to platform2 until we have a sandbox equivalent of platform2
def product_master_details(unique_ndcs, use_stub=True):
    """Return product-master details (brand, strength, prod_grp_id) indexed by NDC.

    Args:
        unique_ndcs: tuple of NDC identifiers to look up.
        use_stub: when True (the default) return three hard-coded sample rows
            and ignore ``unique_ndcs``; when False, query platform2 for the
            requested NDCs instead.

    Returns:
        pandas.DataFrame with columns ``brand``, ``strength`` and
        ``prod_grp_id``. The stub result is indexed by "rx_ndc_number".
    """
    if not use_stub:
        return _fetch_product_master_details(unique_ndcs)

    sample_rows = {
        'brand': ['ILUMYA', 'ODOMZO', 'YONSA'],
        'strength': ['100mg/1mL', '200mg', '125mg'],
        'prod_grp_id': [54826, 51105, 52813],
    }
    df = pd.DataFrame(sample_rows, index=['47335017795', '47335030383', '47335040181'])
    df.index.name = "rx_ndc_number"
    return df


def _build_product_master_query(unique_ndcs):
    """Build the parameterized product-master query and its parameter list.

    One ``%s`` placeholder is emitted per NDC so the values are bound by the
    database driver rather than interpolated into the SQL text. This also
    avoids the invalid ``in ('x',)`` clause that formatting a one-element
    tuple into the statement would produce.
    """
    # numpy scalars (e.g. from Series.unique()) are converted to plain Python
    # values so the driver can bind them.
    ndc_params = [ndc.item() if hasattr(ndc, "item") else ndc for ndc in unique_ndcs]
    placeholders = ", ".join(["%s"] * len(ndc_params))
    sql = """
                select pg.brand
                    , pg.strength
                    #, pg.packsize
                    , pm.prod_grp_id
                    , pm.prod_cust_id as rx_ndc_number
                from pts.product_master as pm
                    inner join pts.product_group as pg
                        on pm.prod_grp_id = pg.id
                where pm.cust_id = 'SUN' 
                    and pm.prod_cust_id in ({})
    """.format(placeholders)
    return sql, ndc_params


def _fetch_product_master_details(unique_ndcs):
    """Query platform2 for the product details of the given NDCs.

    Connection errors and query errors propagate to the caller; the
    connection is always closed.
    """
    if len(unique_ndcs) == 0:
        # "in ()" is not valid SQL, so answer an empty lookup without a round trip.
        return pd.DataFrame(
            columns=['brand', 'strength', 'prod_grp_id'],
            index=pd.Index([], name=transform.index_col),
        )

    sql, ndc_params = _build_product_master_query(unique_ndcs)
    platform2 = mysql.connector.connect(host=HOSTNAME, user=secret['user'], passwd=secret['password'], port=secret['port'], charset='utf8')
    try:
        # NOTE(review): the query aliases the NDC column as rx_ndc_number, so
        # transform.index_col must be "rx_ndc_number" for index_col to resolve.
        return pd.read_sql(sql=sql, con=platform2, index_col=transform.index_col, params=ndc_params)
    finally:
        platform2.close()

2019-05-28 16:25:32,736 - core.secret.Secret - DEBUG - Secret idenditifier dev/database/platform2/read.


In [23]:
# Distinct non-null values of the configured NDC column, in order of first appearance.
unique_ndcs = tuple(df[transform.index_col].dropna().unique())
# Product details for those NDCs.
product_details = product_master_details(unique_ndcs)

In [24]:
# Left-join the product details onto the source rows so every source row is kept.
# The result is bound only to final_dataframe and df is left untouched: re-running
# this cell then starts from the same input instead of merging the product columns
# a second time (which would yield _x/_y suffixed duplicates).
# NOTE(review): the join key is hard-coded to 'rx_ndc_number' while the NDC column is
# otherwise read from transform.index_col - confirm the two are always the same.
final_dataframe = df.merge(right=product_details, on='rx_ndc_number', how='left')

### Publish

In [25]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
## hand final_dataframe to this transform's publish contract
transform.publish_contract.publish(final_dataframe)

2019-05-28 16:25:43,399 - core.dataset_contract.DatasetContract - INFO - Publishing dataframe to s3 location s3://ichain-dev/stephanie/stephanie/ilumya/master/symphony_health_association_map_product_ndcs.
2019-05-28 16:25:43,402 - s3parq.publish_parq - INFO - Checking params...
2019-05-28 16:25:43,405 - s3parq.publish_parq - INFO - Params valid.
2019-05-28 16:25:43,426 - s3parq.publish_parq - INFO - Writing to S3...
