In [2]:
# Id of the Transformation row to load; the setup cell filters the configuration
# database query on Transformation.id == transform_id.
transform_id = 1

In [3]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Obtain a session from SessionHelper, then look up the Transformation whose id
# equals transform_id.
session = SessionHelper().session
db_transform = (
    session.query(Transformation)
    .filter(Transformation.id == transform_id)
    .one()
)

@dataclass
class DbTransform:
    """Configuration values for this transform; defaults are read from db_transform when the class is defined."""

    # the instance id of the transform in the config app
    id: int = db_transform.id
    # the transform name in the config app
    name: str = db_transform.transformation_template.name
    # the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    state: str = db_transform.pipeline_state.pipeline_state_type.name
    # the git branch for this execution
    branch: str = BRANCH_NAME
    # the pharma brand name
    brand: str = db_transform.pipeline_state.pipeline.brand.name
    # the pharma company name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name
    # the contract used to publish the final dataframe; built from the field
    # defaults above, which are already bound as names in this class body
    publish_contract: DatasetContract = DatasetContract(
        branch=branch,
        parent=pharmaceutical_company,
        child=brand,
        state=state,
        dataset=name,
    )


2019-08-09 15:09:43,409 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-08-09 15:09:43,411 - core.helpers.session_helper.SessionHelper - INFO - Forcing postgres instead of configuration mocker...
2019-08-09 15:09:43,415 - core.helpers.session_helper.SessionHelper - INFO - Done. Created dev session.


# CORE Cartridge Notebook :: symphony_health_association_refinement
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [None]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Transform-specific configuration layered on top of DbTransform."""

    ## YOUR properties go here!!
    # The name of the dataset to pull from
    ingest_source_transform: str = db_transform.variables.ingest_source_transform

In [4]:
from core.logging import get_logger

In [5]:
# Instantiate the configuration object. The Configuration cell that defines
# Transform must have been run first, otherwise this raises NameError.
transform = Transform()
# Logger named after this transform's pipeline state and transform name.
logger = get_logger(f"core.transforms.{transform.state}.{transform.name}")

NameError: name 'Transform' is not defined

In [6]:
# Columns to cast back to integers (all data is ingested as strings).
int_columns = [
    'pharm_npi', 'pharm_ncpdp', 'pharm_zip', 'txn_id', 'txn_seq', 'long_pat_id',
    'pat_zip', 'hcp_zip', 'hcp_phone', 'hcp_npi', 'rx_fills', 'rx_fill_num',
    'rx_refills_rem', 'prev_disp', 'ndc', 'qty_disp', 'days_supply', 'ship_zip',
    'prim_payer_bin', 'prim_payer_iin', 'prim_payer_pcn', 'sec_payer_bin',
    'sec_payer_iin', 'sec_payer_pcn', 'agg_ship_id', 'ref_num',
]
int_as_type = {col: 'int64' for col in int_columns}
# e.g.: int_as_type = {'pharm_npi': 'int64', 'pharm_ncpdp': 'int64', etc...}

# Columns to cast back to floats.
# NOTE(review): 'prim_cost_amt' is written without an embedded space to match the
# naming of every other column — confirm against the column mapping spreadsheet.
float_columns = [
    'prim_plan_paid', 'sec_plan_paid', 'prim_copay', 'prim_coins', 'prim_deductible',
    'prim_pat_resp', 'sec_copay', 'sec_coins', 'sec_deductible', 'sec_pat_resp',
    'copay_as_amt', 'oth_payer_amt', 'prim_cost_amt',
]
# Built from float_columns (not int_columns) so that the float columns are the
# ones converted and the int columns are not recast to float64.
float_as_type = {col: 'float64' for col in float_columns}
# e.g.: float_as_type = {'prim_plan_paid': 'float64', 'sec_plan_paid': 'float64', etc...}



### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transform takes patient status data and converts all number columns to either int or float.  We have a specified mapping of columns to their data types here: https://docs.google.com/spreadsheets/d/18MRVquLrHSNarSjXLtJF_igiEPMnjBRX

We need this because all data is pulled in as strings, so numeric columns must be converted back to their respective types.

### Transformation

In [7]:
import pandas as pd
import numpy as np
import boto3
from s3parq import fetch

In [None]:
### Retrieve current dataset from contract
from core.dataset_diff import DatasetDiff

diff = DatasetDiff(db_transform.id)
final_dataframe = diff.get_diff(transform_name=transform.input_transform, values=[run_id])

In [9]:
# NOTE(review): this cell hard-codes a development branch, brand and run id, and
# overwrites the final_dataframe loaded through DatasetDiff — it looks like a
# debugging override; confirm before running it in a pipeline.
ingest_contract = DatasetContract(
    branch='dc-628_fix_sun_mappings',
    parent='sun',
    child='ilumya',
    state='ingest',
    dataset='symphony_health_association_ingest_column_mapping',
)

# Restrict the fetch to the __metadata_run_id partition equal to 10.
run_filter = [
    {
        'partition': '__metadata_run_id',
        'comparison': '==',
        'values': [10],
    }
]

final_dataframe = fetch(
    bucket=ingest_contract.bucket,
    key=ingest_contract.key,
    filters=run_filter,
)

In [14]:
# Inspect which columns the fetched dataset actually contains.
final_dataframe.columns

Index(['rec_date', 'pharm_code', 'pharm_npi', 'transtype',
       'pharm_transaction_id', 'trans_seq', 'ref_source', 'ref_date',
       'program_id', 'pharmacy_id', 'pat_last_name', 'pat_first_name',
       'pat_dob', 'pat_gender', 'pat_addr1', 'pat_addr2', 'pat_city',
       'pat_state', 'pat_zip', 'dx1_code', 'dx2_code', 'status_date',
       'status_code', 'sub_status', 'pres_last_name', 'pres_first_name',
       'pres_addr1', 'pres_addr2', 'pres_city', 'pres_state', 'pres_zip',
       'pres_phone', 'pres_npi', 'pres_dea', 'facility_name', 'rxdate',
       'rxnumber', 'rxrefills', 'rxfill', 'refill_remaining', 'prev_disp',
       'rx_ndc_number', 'medication', 'quantity', 'day_supply', 'ship_date',
       'ship_carrier', 'shiptracking_num', 'ship_location', 'ship_address',
       'ship_city', 'ship_state', 'ship_zip', 'has_medical',
       'primary_coverage_type', 'primary_payer_name', 'primary_payer_type',
       'secondary_coverage_type', 'secondary_payer_name',
       'secondary_

In [15]:
# Display the column -> dtype mapping that is passed to astype for the int columns.
int_as_type

{'pharm_npi': 'int64',
 'pharm_ncpdp': 'int64',
 'pharm_zip': 'int64',
 'txn_id': 'int64',
 'txn_seq': 'int64',
 'long_pat_id': 'int64',
 'pat_zip': 'int64',
 'hcp_zip': 'int64',
 'hcp_phone': 'int64',
 'hcp_npi': 'int64',
 'rx_fills': 'int64',
 'rx_fill_num': 'int64',
 'rx_refills_rem': 'int64',
 'prev_disp': 'int64',
 'ndc': 'int64',
 'qty_disp': 'int64',
 'days_supply': 'int64',
 'ship_zip': 'int64',
 'prim_payer_bin': 'int64',
 'prim_payer_iin': 'int64',
 'prim_payer_pcn': 'int64',
 'sec_payer_bin': 'int64',
 'sec_payer_iin': 'int64',
 'sec_payer_pcn': 'int64',
 'agg_ship_id': 'int64',
 'ref_num': 'int64'}

In [11]:
#  Convert all int columns
final_dataframe = final_dataframe.astype(int_as_type)

#  Conert all float columns
final_dataframe = final_dataframe.astype(float_as_type)

KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
transform.publish_contract.publish(final_dataframe, run_id, session, False)
session.close()