In [1]:
transform_id = 28  # id of the Transformation row that the setup cell loads from the configuration database

In [2]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.raw_contract import RawContract

# Load the single Transformation row whose id equals transform_id.
# NOTE(review): this looks like a SQLAlchemy-style query, in which case .one()
# raises unless exactly one row matches — confirm against SessionHelper.
db_transform = SessionHelper().session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Descriptive metadata for the transform loaded into `db_transform`.

    Every field default is read from `db_transform` (or from BRANCH_NAME) at the
    moment this class body is executed, so `DbTransform()` called with no
    arguments describes the transform selected by `transform_id`.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name


2019-05-28 13:38:08,480 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-28 13:38:08,505 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-28 13:38:08,537 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-28 13:38:08,538 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-28 13:38:08,541 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-28 13:38:08,542 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-28 13:38:08,547 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-28 13:38:08,548 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-28 13:38:0

# CORE Cartridge Notebook::[transform name here]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [3]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Configuration values this transform expects from the configuration application.

    Each attribute default is read from `db_transform.variables` when this class
    body is executed.

    NOTE(review): this subclass is not itself decorated with `@dataclass`, so the
    attributes below are plain class attributes rather than dataclass fields. They
    are not parameters of `Transform(...)`; the notebook sets them by attribute
    assignment on the instance instead. Confirm this is intended.
    """
    input_transform: str = db_transform.variables.input_transform # name of transformation to pull dataset from
    prefix: str = db_transform.variables.prefix # file prefix to publish to ftp
    suffix: str = db_transform.variables.suffix # file suffix to publish to ftp
    filetype: str = db_transform.variables.filetype # filetype to publish to ftp (DO NOT INCLUDE . IN FILETYPE)
    separator: str = db_transform.variables.separator # single character separator for output file
    compression: bool = db_transform.variables.compression # if true, published file will be compressed as gzip
    date_format: str = db_transform.variables.date_format # string formatting for datetime
    remote_path: str = db_transform.variables.remote_path # path to publish to on FTP server
    secret_name: str = db_transform.variables.secret_name # AWS secret name containing FTP credentials
    secret_type_of: str = db_transform.variables.secret_type_of # AWS secret type of, should almost always be "FTP"

In [12]:
## Development-only values: put your assignments for local runs here!!
## This cell will be turned off in production, where Engineering pulls these
## values from the configuration application instead.

transform = Transform()

# One entry per configuration variable declared on Transform.
_dev_values = {
    "input_transform": "symphony_health_association_filter_shipment_only",
    "prefix": "PrvtLbl_INTEGRI_SUN",
    "suffix": "001_NPHI",
    "filetype": "dat",
    "separator": "|",
    "compression": False,
    "date_format": "%Y%m%d_%Y%m%d%H%M%S",
    "remote_path": "test/",
    "secret_name": "natie-test",
    "secret_type_of": "FTP",
}
for _attr_name, _dev_value in _dev_values.items():
    setattr(transform, _attr_name, _dev_value)

### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transformation publishes a dataset in S3 to an external FTP server. The credentials for the FTP server should be stored securely in an AWS Secret, with the secret_name and secret_type_of provided to the transformation.

### Transformation

In [13]:
### Retrieve current dataset from contract

import core.helpers.contract_creator as cc
from core.dataset_diff import DatasetDiff
import s3parq
import pandas as pd

# Build a DatasetDiff keyed by this transform's id, then request the data for the
# transform named in the `input_transform` configuration variable.
diff = DatasetDiff(db_transform.id)
# NOTE(review): df is presumably a pandas DataFrame (the publish cell calls
# .fillna() and .to_csv() on it) — confirm get_diff's return type.
df = diff.get_diff(transform_name=transform.input_transform)

2019-05-28 13:45:04,347 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-05-28 13:45:04,373 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-05-28 13:45:04,377 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-05-28 13:45:04,378 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-05-28 13:45:04,383 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-05-28 13:45:04,385 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-05-28 13:45:04,390 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-05-28 13:45:04,391 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-05-28 13:45:0

In [17]:
from core.helpers import file_mover
from core.secret import Secret
import tempfile, datetime

# --- Validate the configured output format before writing anything ---
if len(transform.separator) != 1:
    raise ValueError("Error: Separator must be a single character.")

if "." in transform.filetype:
    raise ValueError("Error: Filetype should not contain '.'")

# --- Build the published file name ---
# Shape: <prefix><BRAND>_<timestamp>_<suffix>.<filetype>, plus ".gz" when compressed.
brand = transform.brand.upper()
extension = "." + transform.filetype.lower()
# Named `timestamp` (not `time`) so the notebook namespace never shadows the stdlib module name.
timestamp = datetime.datetime.now().strftime(transform.date_format)

published_name = "_".join([transform.prefix + brand, timestamp, transform.suffix]) + extension
if transform.compression:
    published_name += ".gz"

# --- Assemble the to_csv options once instead of duplicating the call per branch ---
# `compression` is only passed when gzip is requested, so the uncompressed path
# keeps pandas' own default for that argument.
csv_options = dict(sep=transform.separator, header=True, index=False)
if transform.compression:
    csv_options["compression"] = "gzip"

# --- Write locally, then publish ---
# The publish call must stay inside the `with` block: the temporary directory
# and the file in it are removed as soon as the block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    local_path = temp_dir + "/" + published_name

    # Missing values are written as empty strings.
    df.fillna('').to_csv(local_path, **csv_options)

    ftp_secret = Secret(name=transform.secret_name, type_of=transform.secret_type_of, mode="write")
    file_mover.publish_file(local_path=local_path, remote_path=transform.remote_path, secret=ftp_secret)

2019-05-28 13:55:23,567 - core.secret.Secret - DEBUG - Secret idenditifier dev/FTP/natie-test/write.
2019-05-28 13:55:23,701 - core.helpers.file_mover.FileMover - DEBUG - Connecting to host: 172.30.0.165 on port: 2221
2019-05-28 13:55:25,834 - paramiko.transport - ERROR - Exception: Error reading SSH protocol banner
2019-05-28 13:55:25,837 - paramiko.transport - ERROR - Traceback (most recent call last):
2019-05-28 13:55:25,839 - paramiko.transport - ERROR -   File "/usr/local/lib/python3.6/site-packages/paramiko/transport.py", line 2138, in _check_banner
2019-05-28 13:55:25,841 - paramiko.transport - ERROR -     buf = self.packetizer.readline(timeout)
2019-05-28 13:55:25,843 - paramiko.transport - ERROR -   File "/usr/local/lib/python3.6/site-packages/paramiko/packet.py", line 367, in readline
2019-05-28 13:55:25,844 - paramiko.transport - ERROR -     buf += self._read_timeout(timeout)
2019-05-28 13:55:25,846 - paramiko.transport - ERROR -   File "/usr/local/lib/python3.6/site-package

SSHException: Error reading SSH protocol banner

### Publish

In [None]:
# Connection details for debugging the publish step.
# The password is deliberately not displayed: cell output is saved with the
# notebook, so rendering it here would leak the FTP credential.
ftp_secret.user, ftp_secret.port, ftp_secret.host

In [None]:
## Files are published to FTP in this transformation. This transformation does not publish to a contract in S3.