In [1]:
# Primary key of the Transformation row that the setup cell loads from the configuration database.
transform_id = 2

In [2]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
import codecs
from dataclasses import dataclass, field

from core.constants import BRANCH_NAME, ENV_BUCKET
from core.dataset_contract import DatasetContract
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from core.raw_contract import RawContract, download_s3_object

# Fetch the Transformation row whose id equals transform_id.
# NOTE(review): presumably a SQLAlchemy query, where .one() raises unless exactly one row matches — confirm.
db_transform = (
    SessionHelper()
    .session
    .query(Transformation)
    .filter(Transformation.id == transform_id)
    .one()
)

@dataclass
class DbTransform:
    """Settings for this transform, read from the configuration row ``db_transform``.

    Every field except ``publish_contract`` takes its default from ``db_transform``
    (or ``BRANCH_NAME``) at the moment this class is defined.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch: str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Built fresh for each instance. The transformation cell assigns to
    # ``publish_contract.dataset`` and ``publish_contract.partitions``; with a single
    # class-level DatasetContract those writes would be shared by every instance.
    publish_contract: DatasetContract = field(
        default_factory=lambda: DatasetContract(
            branch=BRANCH_NAME,
            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
            child=db_transform.pipeline_state.pipeline.brand.name,
            state=db_transform.pipeline_state.pipeline_state_type.name,
            dataset=db_transform.transformation_template.name,
        )
    )


2019-07-16 19:41:29,871 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-16 19:41:29,901 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-16 19:41:29,961 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-16 19:41:29,965 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-16 19:41:29,978 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-16 19:41:29,979 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-16 19:41:29,982 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-07-16 19:41:29,984 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-07-16 19:41:2

# CORE Cartridge Notebook::initial_ingest
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written so that they can be run against the same dataset an infinite number of times and produce the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [3]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Variables Ops sets for this transform, added on top of the DbTransform fields.

    Each default below is read from ``db_transform.variables`` when this class body runs.
    """
    delimiter: str = db_transform.variables.delimiter  # the input file delimiter
    skip_rows: int = db_transform.variables.skip_rows  # the number of rows to skip before reading
    encoding: str = db_transform.variables.encoding  # the file encoding
    input_file_prefix: str = db_transform.variables.input_file_prefix  # the prefix of input files to read in

## Please place your value assignments for development here!!
## This cell will be turned off in production and Engineering will set it to pull from the configuration application instead
# NOTE(review): the "Setup for transformation object" cell further down rebinds `transform`
# to a new Transform(), so the values assigned on this instance are not the ones the
# transformation cells see — confirm that is intended.
transform = Transform()
transform.delimiter = ", "
transform.skip_rows = 0
transform.encoding = "utf8"
transform.input_file_prefix = "BRIOVA_"
# dataset_name is not declared on Transform; it exists only on this instance.
transform.dataset_name = "init_ingest_patient_status"

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transformation takes a collection of raw files, selected by location and filename prefix, creates a dataset from their contents, and outputs it to S3. The delimiter and encoding can be configured, as can the number of rows to skip at the top of each file.

### Transformation

In [4]:
### Setup important imports
import pandas as pd
from pandas.errors import EmptyDataError
import os
import tempfile
from os import path
from encodings.aliases import aliases
from core.logging import get_logger

In [5]:
### Setup for transformation object
# NOTE(review): this rebinds `transform`, replacing the instance created in the
# configuration cell together with the development values assigned on it.
transform = Transform()
# Logger name is built from the pipeline state and the transform name.
logger = get_logger(f"core.transforms.{transform.state}.{transform.name}")
# Starts out empty; the workflow expects the final output in a variable of this name.
final_dataframe = pd.DataFrame()

In [6]:
#stephanie/stephanie/ilumya/raw/
# Contract describing where the raw input files are read from.
# NOTE(review): branch/parent/child are hard-coded here, whereas publish_contract in the
# setup cell takes the same three values from BRANCH_NAME and db_transform — confirm
# whether this should be driven by the configuration as well.
input_contract = RawContract(branch="stephanie",
                            parent="stephanie",
                            child="ilumya")

logger.debug(f"Ingesting files from path : {input_contract.s3_path}")

2019-07-16 19:54:51,519 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting files from path : s3://ichain-dev/stephanie/stephanie/ilumya/raw/


In [7]:
def ingest_file(f, filename, input_contract):
    """Read one raw delimited file into a DataFrame.

    The encoding, delimiter and number of rows to skip are taken from the
    notebook-level ``transform`` object, not from the arguments.

    Args:
        f: path or file-like object passed straight to ``pd.read_csv``.
        filename: name of the file; used only in the debug log message.
        input_contract: not used by this function; kept so existing calls stay valid.

    Returns:
        The parsed DataFrame, read with ``dtype="str"`` so values are not
        converted to numeric types.

    Raises:
        pandas.errors.EmptyDataError: propagated from ``pd.read_csv`` when the
            file contains nothing to parse.
    """
    logger.debug(f"Ingesting file: {filename}")
    return pd.read_csv(
        f,
        dtype="str",
        encoding=transform.encoding,
        compression="infer",  # let pandas decide from the input whether to decompress
        sep=transform.delimiter,
        skiprows=transform.skip_rows,
    )

In [12]:
# Development override: point the run at a specific set of uploaded test files.
transform.input_file_prefix = '.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE'

In [13]:
# Run identifier; in the visible cells it is referenced only by the commented-out publish call.
run_id=1

In [21]:
%%time
### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe
valid_encodings = set(aliases.keys()) | set(aliases.values())
if transform.encoding not in valid_encodings:
    invalid_encoding_message = f"initial_ingest_configuration {transform.id} has invalid encoding: {transform.encoding}"
    logger.critical(invalid_encoding_message)
    raise ValueError(invalid_encoding_message)
if len(transform.delimiter) < 1:
    invalid_delimiter_message = f"initial_ingest_configuration {transform.id} has no delimiter"
    logger.critical(invalid_delimiter_message)
    raise ValueError(invalid_delimiter_message)

in_files = input_contract.list_files(transform.input_file_prefix)

for f in in_files:
    filename = path.basename(f)
    with input_contract.download_raw_file(filename) as fn:
        try:
            transform.publish_contract.dataset = transform.name + "/" + transform.input_file_prefix + "/" + filename
            transform.publish_contract.partitions = []
            df = ingest_file(fn, filename, input_contract)
#             transform.publish_contract.publish(df, run_id)
        except EmptyDataError as e:
            pass

['stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20181001_01_299.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190124_02_266.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190125_01_267.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190128_01_264.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190129_01_263.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190130_01_269.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190131_01_273.txt', 'stephanie/stephanie/ilumya/raw/.upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STAT

2019-07-18 18:49:05,644 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20181001_01_299.txt
           Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0    20181024115959    ACCREDO  1346208949       COM   279133432018102401   
1    20181025115959    ACCREDO  1346208949       COM   278370982018102502   
2    20181029115959    ACCREDO  1346208949       COM   279181482018102903   
3    20181102115959    ACCREDO  1346208949       COM   267244982018110204   
4    20181106115959    ACCREDO  1346208949       COM   160618142018110605   
5    20181107115959    ACCREDO  1346208949       COM   267244982018110706   
6    20181108115959    ACCREDO  1346208949       COM   160850092018110807   
7    20181109115959    ACCREDO  1346208949       COM   270756892018110908   
8    20181113115959    ACCREDO  1346208949       COM   273885592018111309   
9    20181114115959    ACCREDO  1346208949   

2019-07-18 18:49:05,941 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190124_02_266.txt
         Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0  20190123115959    ACCREDO  1346208949       COM   280632942019012301   
1  20190123115959    ACCREDO  1346208949       COM   280684312019012302   
2  20190123115959    ACCREDO  1346208949       COM   277722152019012303   
3  20190123115950    ACCREDO  1538281837       COM   274297502019012307   
4  20190123020727    ACCREDO  1346208949       COM           2633266305   
5  20190123115959    ACCREDO  1346208949       COM   278071622019012306   
6  20190123120000    ACCREDO  1346208949       COM   159915182019012304   

  Trans Seq Ref Source        Ref Date Program ID Pharmacy ID  ...  \
0         0     DIRECT  20190121120000        NaN    28063294  ...   
1         0     DIRECT  20190123120000        NaN    28068431  ...   
2   

         Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0  20190128010104    ACCREDO  1346208949       COM   280730582019012801   
1  20190128084457    ACCREDO  1346208949       COM   280176922019012802   
2  20190128090129    ACCREDO  1346208949       COM   159915182019012803   
3  20190128121301    ACCREDO  1346208949       COM           2182187107   
4  20190128120000    ACCREDO  1346208949       COM   267244982019012805   
5  20190128014911    ACCREDO  1346208949       COM           2767243806   
6  20190128115950    ACCREDO  1639375066       COM   267689452019012804   

  Trans Seq Ref Source        Ref Date Program ID Pharmacy ID  ...  \
0         0     DIRECT  20190124120000        NaN    28073058  ...   
1         0     DIRECT  20190128120000        NaN    28017692  ...   
2         0     DIRECT  20190123120000        NaN    15991518  ...   
3         0     DIRECT  20180511120000        NaN    21821871  ...   
4         0     DIRECT  20190128120000        NaN

2019-07-18 18:49:07,162 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190201_01_278.txt
         Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0  20190131014733    ACCREDO  1346208949       COM   280849132019013101   
1  20190131120000    ACCREDO  1346208949       COM   280363282019013102   
2  20190131115950    ACCREDO  1346208949       COM   279165712019013105   
3  20190131030547    ACCREDO  1346208949       COM   276818652019013104   
4  20190131122811    ACCREDO  1346208949       COM   278570712019013103   

  Trans Seq Ref Source        Ref Date Program ID Pharmacy ID  ...  \
0         0     DIRECT  20190131120000        NaN    28084913  ...   
1         0     DIRECT  20190131120000        NaN    28036328  ...   
2         0     DIRECT  20181023120000        NaN    27916571  ...   
3         0     DIRECT  20190131120000        NaN    27681865  ...   
4         0   

         Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0  20190205012224    ACCREDO  1346208949       COM           2416757901   
1  20190205043116    ACCREDO  1346208949       COM   280125262019020503   
2  20190205033834    ACCREDO  1346208949       COM   257284222019020502   

  Trans Seq Ref Source        Ref Date Program ID Pharmacy ID  ...  \
0         0     DIRECT  20181218120000        NaN    24167579  ...   
1         0     DIRECT  20190102120000        NaN    28012526  ...   
2         0     DIRECT  20190205120000        NaN    25728422  ...   

  Secondary CoverageType              Secondary PayerName Secondary PayerType  \
0                    NaN                              NaN                 NaN   
1                MEDICAL  ILUMYA COPAY ASSIST-RELAYHEALTH                Cash   
2                    NaN                              NaN                 NaN   

  Plan Paid Amt Pat Copay Copay Assist Amount Oth Payer Amt Xfer PharmName  \
0           NaN

2019-07-18 18:49:08,273 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190211_01_314.txt
         Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0  20190209123027    ACCREDO  1346208949       COM   280955252019020908   
1  20190209110833    ACCREDO  1346208949       COM   204899882019020907   
2  20190208115959    ACCREDO  1346208949       COM   279519262019020803   
3  20190208015612    ACCREDO  1346208949       COM           2802015009   
4  20190208115959    ACCREDO  1346208949       COM   251522792019020805   
5  20190208115959    ACCREDO  1346208949       COM   257284222019020806   
6  20190208094507    ACCREDO  1346208949       COM   280984012019020801   
7  20190208095822    ACCREDO  1346208949       COM   280896122019020802   
8  20190208115959    ACCREDO  1346208949       COM   280125262019020804   

  Trans Seq Ref Source        Ref Date Program ID Pharmacy ID  .

2019-07-18 18:49:09,218 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190214_01_332.txt
          Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0   20190213115959    ACCREDO  1346208949       COM   281052692019021309   
1   20190213105502    ACCREDO  1346208949       COM   248351272019021314   
2   20190213081056    ACCREDO  1346208949       COM   187096502019021305   
3   20190213083159    ACCREDO  1346208949       COM   187472752019021306   
4   20190213031323    ACCREDO  1346208949       COM   281068102019021302   
5   20190213115959    ACCREDO  1346208949       COM   281052692019021310   
6   20190213034744    ACCREDO  1346208949       COM   280416312019021303   
7   20190213031302    ACCREDO  1346208949       COM   281052692019021301   
8   20190213120000    ACCREDO  1346208949       COM   266680632019021311   
9   20190213095106    ACCREDO  1346208949       COM    

2019-07-18 18:49:09,580 - core.transforms.ingest.initial_ingest - DEBUG - Ingesting file: .upload.sha_dry_run_test_raw_fetch.INTEGRICHAIN_SUN_ACCREDO_STATUSDISPENSE_20190218_01_344.txt
          Rec Date Pharm Code   Pharm NPI transType Pharm Transaction Id  \
0   20190216115959    ACCREDO  1346208949       COM   281068102019021611   
1   20190215115959    ACCREDO  1346208949       COM   204429872019021506   
2   20190215044621    ACCREDO  1346208949       COM           2794438815   
3   20190215040239    ACCREDO  1346208949       COM   166486612019021503   
4   20190217030401    ACCREDO  1346208949       COM   281070562019021712   
5   20190216012121    ACCREDO  1346208949       COM   281125602019021609   
6   20190215022028    ACCREDO  1346208949       COM   280976712019021502   
7   20190215042115    ACCREDO  1346208949       COM   281040902019021514   
8   20190216113156    ACCREDO  1346208949       COM   280865542019021610   
9   20190217092738    ACCREDO  1346208949       COM    

KeyboardInterrupt: 