In [1]:
# Id of the Transformation row this notebook runs as; the setup cell below uses it
# to look the row up in the configuration database.
transform_id = 14

Contract path `schafrn/seed-data/sun-data-seed/ingest/symphony_health_association_refinement` -> when you make the contract, it maps to branch=schafrn, parent=seed-data, child=sun-data-seed, state=ingest, dataset=symphony_health_association_refinement

In [2]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract
import pandas as pd

# Load the Transformation row whose id equals transform_id from the configuration database.
# NOTE(review): `.one()` presumably raises unless exactly one row matches - confirm
# against the session/query implementation.
db_transform = SessionHelper().session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Values read from the `db_transform` configuration row, exposed as dataclass defaults.

    Every default below is evaluated once, when this class statement runs, so the
    values reflect `db_transform` as it was at that moment.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    # Development override: the branch is pinned to "schafrn"; the BRANCH_NAME form is kept commented out below.
    branch: str = "schafrn"
    #branch: str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Development contract with hard-coded branch/parent/child. This one DatasetContract
    # instance is created at class-definition time and is shared by every instance
    # that does not override publish_contract.
    publish_contract: DatasetContract = DatasetContract(branch="schafrn",
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent="seed-data",
                            child="sun-data-seed",
                            dataset=db_transform.transformation_template.name)
    # The string below is the configuration-driven contract (BRANCH_NAME and the
    # company/brand names from the database), disabled by wrapping it in a string literal.
    '''
    publish_contract: DatasetContract = DatasetContract(branch=BRANCH_NAME,
                            state=db_transform.pipeline_state.pipeline_state_type.name,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            dataset=db_transform.transformation_template.name)
    '''


2019-05-17 13:28:25,184 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-17 13:28:25,208 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-17 13:28:25,236 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-17 13:28:25,237 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-17 13:28:25,241 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-17 13:28:25,242 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-17 13:28:25,246 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-17 13:28:25,248 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-17 13:28:2

# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [3]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

# Variables this transform expects from the configuration application.
# This subclass is not decorated with @dataclass, so the attributes below are plain
# class-level defaults layered on top of the DbTransform fields.
class Transform(DbTransform):
        input_transform: str = db_transform.variables.input_transform # The name of the upstream transform whose dataset is the input to this one
        index_col: str = db_transform.variables.index_col # The source-dataset column holding NDC ids (default is rx_ndc_number; the lookup query and the merge in this notebook hard-code that name)
        secret_name: str = db_transform.variables.secret_name # The name of the secret in Secret Manager for platform2
        secret_type_of: str = db_transform.variables.secret_type_of # The type of the secret in Secret Manager for platform2
        ## YOUR properties go here!!

In [4]:
## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set it to pull from the configuration application instead

# Transform() starts from the class-level defaults; the assignments below override
# them on this one instance for local development.
transform = Transform()
transform.input_transform = "symphony_health_association_refinement" # final ingest transform not written by me
transform.index_col = "rx_ndc_number"
transform.secret_name = "platform2"
transform.secret_type_of = "database"

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transformation maps the NDC IDs found in a source dataset to product information found in IC master on platform2. The credentials used for platform2 are stored and read from an AWS secret.

### Transformation

In [5]:
### Retrieve current dataset from contract

import core.helpers.contract_creator as cc
from core.dataset_diff import DatasetDiff
import s3parq

# Resolve the contract of the upstream (input) transform relative to this transform's
# publish contract, then fetch the dataset stored at that contract's bucket/key.
input_contract = cc.get_relative_contract(t_name=transform.input_transform, contract=transform.publish_contract)
print(transform.publish_contract.key)
print(input_contract.bucket + " " + input_contract.key)
df = s3parq.fetch_parq.fetch(bucket=input_contract.bucket, key=input_contract.key)
# Alternative load path via DatasetDiff, currently disabled:
#diff = DatasetDiff(db_transform.id)
#df = diff.get_diff(transform_name=transform.input_transform)

2019-05-17 13:28:28,921 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-17 13:28:28,943 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-17 13:28:28,948 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-17 13:28:28,951 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-17 13:28:28,956 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-17 13:28:28,960 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-17 13:28:28,965 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-17 13:28:28,969 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-17 13:28:2

In [11]:
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe

from core.secret import Secret
import pandas as pd
import mysql.connector

# Read the secret named by the transform configuration and expose the Secret object's
# attributes as a plain dict; the lookup function below reads 'user', 'password'
# and 'port' from it.
secret = Secret(name=transform.secret_name, type_of=transform.secret_type_of, mode="read").__dict__
# Address of the platform2 MySQL server, hard-coded here.
# NOTE(review): the recorded run of this notebook failed with "No route to host" for
# this address - consider sourcing the host from configuration instead.
HOSTNAME = '172.18.0.204'

def product_master_details(unique_ndcs):
    """Look up product-group details on platform2 for a collection of NDC ids.

    Args:
        unique_ndcs: iterable of NDC ids to match against
            ``pts.product_master.prod_cust_id``. May be empty. numpy scalars are
            converted to native Python values before being bound as parameters.

    Returns:
        pandas.DataFrame indexed by ``transform.index_col`` with the columns
        ``brand``, ``strength`` and ``prod_grp_id``. An empty frame of the same
        layout is returned, without touching the database, when no ids are given.

    Raises:
        Whatever ``mysql.connector.connect`` or ``pandas.read_sql`` raise; the
        connection is closed before the error propagates.
    """
    # Native Python values only: DB drivers generally cannot bind numpy scalar types.
    ndc_values = [ndc.item() if hasattr(ndc, "item") else ndc for ndc in unique_ndcs]

    if not ndc_values:
        # "in ()" is not valid SQL, so answer the empty case locally.
        empty = pd.DataFrame(columns=["brand", "strength", "prod_grp_id", "rx_ndc_number"])
        return empty.set_index(transform.index_col)

    # One bound placeholder per id. Formatting a Python tuple into the statement
    # produced "('x',)" for a single id (invalid SQL) and allowed SQL injection.
    placeholders = ", ".join(["%s"] * len(ndc_values))
    sql = ("""
                select pg.brand
                    , pg.strength
                    #, pg.packsize
                    , pm.prod_grp_id
                    , pm.prod_cust_id as rx_ndc_number
                from pts.product_master as pm
                    inner join pts.product_group as pg
                        on pm.prod_grp_id = pg.id
                where pm.cust_id = 'SUN' 
                    and pm.prod_cust_id in ({})
    """.format(placeholders))

    platform2 = mysql.connector.connect(host=HOSTNAME, user=secret['user'], passwd=secret['password'], port=secret['port'], charset='utf8')
    try:
        # The query must run on the open connection, not on the host name string.
        return pd.read_sql(sql=sql, con=platform2, index_col=transform.index_col, params=ndc_values)
    finally:
        # Always release the connection, including when the query fails.
        platform2.close()

2019-05-17 13:48:32,934 - core.secret.Secret - DEBUG - Secret idenditifier dev/database/platform2/read.


In [12]:
# Distinct, non-null NDC ids present in the source dataset.
unique_ndcs = tuple(df[transform.index_col].dropna().unique())
product_details = product_master_details(unique_ndcs)

# Connection context for debugging. The password is intentionally not printed:
# notebook output is saved with the file and would leak the credential.
print(secret['user'], HOSTNAME)

InterfaceError: 2003: Can't connect to MySQL server on '172.18.0.204:3306' (113 No route to host)

In [None]:
# Left-join the product details onto the source rows so every source row is kept,
# matched or not. The result goes straight into final_dataframe instead of rebinding
# df, so re-running this cell does not merge onto an already merged frame.
final_dataframe = df.merge(right=product_details, on='rx_ndc_number', how='left')

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
# Hand the final frame to this transform's publish contract.
transform.publish_contract.publish(final_dataframe)