In [3]:
# Build the shared `session` object once for the whole notebook. The setup cells
# below hand it to builder.build(...) and call session.query(...) on it.
from core.helpers.session_helper import SessionHelper
session = SessionHelper().session

2019-07-22 17:26:40,752 - core.helpers.session_helper.SessionHelper - INFO - Creating session for dev environment...
2019-07-22 17:26:40,780 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating administrator mocks.
2019-07-22 17:26:40,788 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating administrator mocks.
2019-07-22 17:26:40,790 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating pharmaceutical company mocks.
2019-07-22 17:26:40,798 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating pharmaceutical company mocks.
2019-07-22 17:26:40,800 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating brand mocks.
2019-07-22 17:26:40,805 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Done generating brand mocks.
2019-07-22 17:26:40,808 - core.helpers.configuration_mocker.ConfigurationMocker - DEBUG - Generating segment mocks.
2019-07-22 17:26:4

In [4]:
"""
************ CONFIGURATION - PLEASE TOUCH **************
Pipeline Builder configuration: creates configurations from variables specified here!!
This cell will be off in production as configurations will come from the configuration postgres DB.
"""
# config vars: identify THIS dataset/transform. All four values (plus the session)
# are passed to builder.build(...) in the setup cell that follows.
config_pharma = "bi" # the pharmaceutical company which owns the brand named in config_brand
config_brand = "bi_test3" # the brand this pipeline operates on
config_state = "raw" # the pipeline state this transform runs in
config_name = "extract_from_ftp" # the name of this transform, which is the name of this notebook without .ipynb

# input vars: the upstream dataset to fetch. Recall that a contract published to S3
# has a key format branch/pharma/brand/state/name.
# NOTE(review): none of the input_* values are referenced by the cells visible in this
# extract notebook; they look like template placeholders - confirm before relying on them.
input_pharma = "pharma"
input_brand = "brand"
input_state = "raw"
input_name = "upstream"
input_branch = None # if None, input_branch is automatically set to your working branch


In [5]:
"""
************ SETUP - DON'T TOUCH **************
Populating config mocker based on config parameters...
"""
import core.helpers.pipeline_builder as builder

# Register (or fetch) the mock configuration for this notebook's settings.
# The result is indexed positionally: slot 0 is the transform id, slot 1 the run id.
ids = builder.build(
    config_pharma,
    config_brand,
    config_state,
    config_name,
    session,
)
transform_id, run_id = ids[0], ids[1]


2019-07-22 17:27:59,934 - core.logging - DEBUG - Adding/getting mocks for specified configurations...
2019-07-22 17:27:59,959 - core.logging - DEBUG - Done. Creating mock run event and committing results to configuration mocker.


In [6]:
"""
************ SETUP - DON'T TOUCH **************
This section imports data from the configuration database
and should not need to be altered, molested or otherwise messed with. 
~~These are not the droids you are looking for~~
"""
from core.constants import BRANCH_NAME, ENV_BUCKET
from core.helpers.session_helper import SessionHelper
from core.models.configuration import Transformation
from dataclasses import dataclass
from core.raw_contract import RawContract

# Load the configuration row for the transform id produced by the setup cell above.
# NOTE(review): `session` is presumably a SQLAlchemy session, in which case .one()
# raises unless exactly one row matches - confirm.
db_transform = session.query(Transformation).filter(Transformation.id == transform_id).one()

@dataclass
class DbTransform:
    """Values describing the current transform, read from the `db_transform` row.

    Every field default is an expression evaluated once, when this class statement
    runs, so instances created later all start from the values `db_transform` held
    at that moment. Re-run this cell after changing the configuration to refresh them.

    The `publish_contract` default is a single RawContract object built at class
    definition time; every instance that does not override the field shares it.
    """
    id: int = db_transform.id ## the instance id of the transform in the config app
    name: str = db_transform.transformation_template.name ## the transform name in the config app
    state: str = db_transform.pipeline_state.pipeline_state_type.name ## the pipeline state, one of raw, ingest, master, enhance, enrich, metrics, dimensional
    branch:str = BRANCH_NAME ## the git branch for this execution
    brand: str = db_transform.pipeline_state.pipeline.brand.name ## the pharma brand name
    pharmaceutical_company: str = db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name # the pharma company name
    # Contract used to publish this transform's files: branch / pharma company (parent) /
    # brand (child) / state. `state=state` picks up the class-body name assigned by the
    # `state` field above, i.e. the pipeline state name.
    publish_contract: RawContract = RawContract(branch=BRANCH_NAME,
                            parent=db_transform.pipeline_state.pipeline.brand.pharmaceutical_company.name,
                            child=db_transform.pipeline_state.pipeline.brand.name,
                            state=state)


# CORE Cartridge Notebook::[Extract Transform]
![CORE Logo](assets/coreLogo.png) 

---
## Keep in Mind
Good Transforms Are...
- **singular in purpose:** good transforms do one and only one thing, and handle all known cases for that thing. 
- **repeatable:** transforms should be written in a way that they can be run against the same dataset an infinite number of times and get the same result every time. 
- **easy to read:** 99 times out of 100, readable, clear code that runs a little slower is more valuable than a mess that runs quickly. 
- **No 'magic numbers':** if a variable or function is not instantly obvious as to what it is or does, without context, maybe consider renaming it.

## Workflow - how to use this notebook to make science
#### Data Science
1. **Document your transform.** Fill out the _description_ cell below describing what it is this transform does; this will appear in the configuration application where Ops will create, configure and update pipelines. 
1. **Define your config object.** Fill out the _configuration_ cell below the commented-out guide to define the variables you want ops to set in the configuration application (these will populate here for every pipeline). 
2. **Build your transformation logic.** Use the transformation cell to do that magic that you do. 
![caution](assets/cautionTape.png)

### Configuration

In [11]:
""" 
********* CONFIGURATION - PLEASE TOUCH ********* 
This section defines what you expect to get from the configuration application 
in a single "transform" object. Define the vars you need here, and comment inline to the right of them 
for all-in-one documentation. 
Engineering will build a production "transform" object for every pipeline that matches what you define here.

@@@ FORMAT OF THE DATA CLASS IS: @@@ 

<value_name>: <data_type> #<comment explaining what the value is to future us>

~~These ARE the droids you are looking for~~
"""

class Transform(DbTransform):
    """Configuration object for this notebook's transform.

    Inherits every field of DbTransform and currently declares none of its own
    (the body is only `pass`). The commented-out lines below are the template for
    declaring transform-specific properties, one per line, in the form
    `<value_name>: <data_type> = <default>  # <what the value is>`.
    """
    pass
        ## YOUR properties go here!!
        ## NOTE(review): while these stay commented out, the development cell that
        ## follows sets remote_path, prefix, secret_name and secret_type_of directly
        ## on the instance instead.
#         remote_path: str = db_transform.variables.remote_path # The path to follow on the remote server
#         prefix: str = db_transform.variables.prefix # The prefix of files to get on the remote server
#         secret_name: str = db_transform.variables.secret_name # The name of the secret in Secret Manager for the remote server
#         secret_type_of: str = db_transform.variables.secret_type_of # The type of the secret in Secret Manager for the remote server

In [12]:
## Please place your value assignments for development here!!
## This cell will be turned off in production; Engineering will set it to pull from the configuration application instead.
#*****************************
# This is an example of how we are going to go about this for now.
# Transform declares none of these four attributes, so each assignment below
# creates the attribute on this one instance.
transform = Transform()
transform.remote_path = "/home/bi/SpecilatyAnalytics"  # remote directory that is listed and fetched from
#transform.prefix = "IPF_PAT_20190404"
transform.prefix = "IPF_PAT_20190"  # file-name prefix handed to file_mover.get_file
transform.secret_name = "bi"  # `name` argument of secret.Secret(...)
transform.secret_type_of = "FTP"  # `type_of` argument of secret.Secret(...)
#****************************
# This is a more abstract example
# transform = Transform()
# transform.remote_path = "/test/"
# transform.prefix = "important_file_"
# transform.secret_name = "dev_sftp"
# transform.secret_type_of = "FTP"


### Description
What does this transformation do? Be specific.

![what does your transform do](assets/what.gif)

This transform gets raw files from the client's remote server and places them in our S3 Bucket. Configurations specify the remote path to the files and a prefix to filter the files by before fetching, as well as the secret name and secret type so it can get the connection info for the remote server out of Secrets Manager.

### Transformation

In [13]:
### Setup important outside imports here! (pandas, etc)
from core.helpers import file_mover
from core.logging import get_logger
from core import secret
import os
import tempfile

In [18]:
### Setup your transform dataclass object
# `extract` is a second name for the very same object as `transform` (no copy is made).
extract = transform
# Logger named after this transform's pipeline state and transform name.
logger = get_logger(f"core.transforms.{transform.state}.{transform.name}")

### Use the variables above to execute your transformation. the final output needs to be a variable named final_dataframe
# NOTE(review): this extract cell never assigns `final_dataframe`, and the Publish cell
# says publishing does not apply to extract - the template line above looks inapplicable here.
def file_needs_update(output_contract: RawContract, local_file_path: str, local_file_modified_time: float) -> bool:
    """Decide whether a local file should be pushed to S3.

    A file needs pushing when the contract has no 'source_modified_time'
    recorded for it, or when the recorded time is older than the local file's
    modification time.

    Args:
        output_contract: contract whose get_raw_file_metadata(local_file_path)
            result is indexed with 'source_modified_time'.
        local_file_path: path of the local copy; passed to the metadata lookup.
        local_file_modified_time: modification time of the local copy. The
            caller passes os.stat(...).st_mtime (a float); anything float()
            accepts, such as a numeric string, also works.

    Returns:
        True if the file should be (re)published, False otherwise.

    Raises:
        ValueError: if either timestamp cannot be converted with float().
    """
    # Only the metadata lookup is expected to raise KeyError, so keep the
    # guarded region that small.
    try:
        s3_last_modified = output_contract.get_raw_file_metadata(local_file_path)['source_modified_time']
    except KeyError:
        logger.info("File not found on S3, uploading.")
        return True

    logger.info(f"File last modified on S3 at {s3_last_modified}")
    logger.info(f"File last modified on SFTP at {local_file_modified_time}")
    # Strictly newer only: equal timestamps mean the published copy is current.
    return float(s3_last_modified) < float(local_file_modified_time)

# Read-mode secret for the remote server, selected by the configured name and type.
source_secret = secret.Secret(
    name=extract.secret_name,
    type_of=extract.secret_type_of,
    mode="read",
)

# Everything the remote server lists under the configured path.
remote_files = file_mover.list_remote_files(extract.remote_path, secret=source_secret)

# One scratch directory is shared by the whole run; it is not emptied between
# remote files, so each listing below also sees earlier downloads.
with tempfile.TemporaryDirectory() as tmp_dir:
    for remote_file in remote_files:
        file_mover.get_file(
            tmp_dir=tmp_dir,
            prefix=extract.prefix,
            remote_path=extract.remote_path,
            remote_file=remote_file,
            secret=source_secret,
        )
        # After the transfer, decide for each matching local file whether S3 needs it.
        # TODO: Clear temp dir after each file goes to s3
        # TODO: Validate timestamp against sftp server file to avoid sftp of files again
        # TODO: Try using Python's get file method to transfer files in file_mover instead of moving them line by line
        for local_file in os.listdir(tmp_dir):
            # Skip anything whose name does not contain the current remote file's name.
            if remote_file.filename not in local_file:
                continue

            local_file_path = os.path.join(tmp_dir, local_file)
            local_file_modified_time = os.stat(local_file_path).st_mtime

            needs_push = file_needs_update(
                output_contract=extract.publish_contract,
                local_file_path=local_file_path,
                local_file_modified_time=local_file_modified_time,
            )
            if needs_push:
                extract.publish_contract.publish_raw_file(local_file_path)

        


2019-07-22 17:43:27,526 - core.secret.Secret - DEBUG - Secret idenditifier dev/FTP/bi/read.
2019-07-22 17:43:27,704 - core.helpers.file_mover.FileMover - DEBUG - Connecting to host: 192.168.201.202 on port: 22
2019-07-22 17:43:27,722 - paramiko.transport - INFO - Connected (version 1.99, client OpenSSH_3.9p1)
2019-07-22 17:43:27,847 - paramiko.transport - INFO - Authentication (password) successful!
2019-07-22 17:43:27,896 - paramiko.transport.sftp - INFO - [chan 0] Opened sftp connection (server version 3)
2019-07-22 17:43:27,991 - paramiko.transport.sftp - INFO - [chan 0] sftp session closed.
REMOTE FILE:  1532701309
2019-07-22 17:43:27,996 - core.helpers.file_mover.FileMover - DEBUG - Connecting to host: 192.168.201.202 on port: 22
2019-07-22 17:43:28,013 - paramiko.transport - INFO - Connected (version 1.99, client OpenSSH_3.9p1)
2019-07-22 17:43:28,182 - paramiko.transport - INFO - Authentication (password) successful!
2019-07-22 17:43:28,232 - paramiko.transport.sftp - INFO - [ch

KeyboardInterrupt: 

### Publish

In [None]:
"""
***** This does not apply to extract *****
"""