In [None]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.dataset_contract import DatasetContract

# Open a configuration-database session and load the single Transformation
# row whose id matches `transform_id` (`transform_id` is not defined in this
# notebook; presumably injected by the runner -- confirm). `.one()` raises
# unless exactly one row matches.
session = SessionHelper().session
db_transform = (
    session.query(Transformation)
    .filter(Transformation.id == transform_id)
    .one()
)

@dataclass
class DbTransform:
    """Settings for this transform as stored in the configuration database.

    Every field default is read from the module-level ``db_transform`` record
    (or ``BRANCH_NAME``) once, when this class body is executed.
    """

    # The instance id of the transform in the config app.
    id: int = db_transform.id
    # The transform name in the config app.
    name: str = db_transform.transformation_template.name
    # The pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional.
    state: str = db_transform.pipeline_state.pipeline_state_type.name
    # The git branch for this execution.
    branch: str = BRANCH_NAME
    # The pharma brand name.
    brand: str = db_transform.pipeline_state.pipeline.brand.name
    # The pharma company name.
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name
    # Contract the final dataframe is published through. It is assembled from
    # the field defaults bound just above in this class body, which hold the
    # same values as the underlying db_transform attributes.
    publish_contract: DatasetContract = DatasetContract(
        branch=branch,
        parent=pharmaceutical_company,
        child=brand,
        state=state,
        dataset=name,
    )


# CORE Cartridge Notebook :: symphony_health_association_refinement
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [None]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Configuration for this transform: the DbTransform fields plus the
    variables Ops sets in the configuration application.

    All values are read from ``db_transform.variables`` when the class body
    is executed.
    """

    ## YOUR properties go here!!
    input_transform: str = db_transform.variables.input_transform  # The name of the dataset to pull from
    number_columns: str = db_transform.variables.number_columns  # Comma-separated names of the columns in our dataframe which need to be converted to numbers
    # Re-bind the raw string as a list of column names. Surrounding whitespace
    # and empty entries are dropped so "a, b" behaves exactly like "a,b".
    number_columns = [col.strip() for col in number_columns.split(',') if col.strip()]
    # Column -> dtype mapping suitable for DataFrame.astype. The dtype name
    # must be 'float64'; 'float_64' is not a dtype pandas/NumPy understands
    # and makes astype raise TypeError.
    float64_as_type = {col: 'float64' for col in number_columns}
        

In [None]:
from core.logging import get_logger

In [None]:
# Build the configuration object for this run, then a logger whose name is
# namespaced by the pipeline state and the transform name.
transform = Transform()
logger = get_logger(
    f"core.transforms.{transform.state}.{transform.name}"
)

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transform takes patient status data and converts all number columns to the pandas float64 datatype. We have a specified mapping of columns to their data types here: https://docs.google.com/spreadsheets/d/18MRVquLrHSNarSjXLtJF_igiEPMnjBRX

We need this because we pull in all data as strings, so number columns need to be converted back to their respective types.

### Transformation

In [None]:
import pandas as pd
import numpy as np
import boto3
from s3parq import fetch
import logging

In [None]:
### Retrieve current dataset from contract
from core.dataset_diff import DatasetDiff

# Ask DatasetDiff for the input dataset named in the configuration, passing
# this run's id as the only value. (`run_id` is not defined in this notebook;
# presumably injected by the runner -- confirm.)
diff = DatasetDiff(db_transform.id)
final_dataframe = diff.get_diff(
    transform_name=transform.input_transform,
    values=[run_id],
)

In [None]:
# Convert all columns that need to be numbers to float64.
#
# Configured columns that are missing from the dataframe are reported and
# skipped; the ones that exist are cast together in a single astype call, so
# the frame is copied once instead of once per column. The mapping is built
# here with the valid dtype name 'float64' rather than relying on a
# pre-built mapping, because an invalid dtype name makes astype raise
# TypeError. Values that cannot be parsed as numbers still raise from astype.
existing_columns = set(final_dataframe.columns)
float64_columns = {}
for number_col in transform.number_columns:
    if number_col in existing_columns:
        float64_columns[number_col] = 'float64'
    else:
        print(f'{number_col} was not a column in our dataframe')
final_dataframe = final_dataframe.astype(float64_columns)


### Publish

In [None]:
## that's it - just provide the final dataframe to the var final_dataframe and we take it from there
# NOTE(review): the meaning of the trailing positional `False` is not visible
# in this notebook -- confirm against DatasetContract.publish.
try:
    transform.publish_contract.publish(final_dataframe, run_id, session, False)
finally:
    # Always release the database session, even when publishing raises, so a
    # failed run does not leave the session open.
    session.close()