In [1]:
%%javascript
// Replace the output area's _should_scroll hook with one that always answers
// false, whatever line count it is given (presumably this keeps long cell
// outputs from being collapsed into a scroll box -- confirm for your Jupyter version).
IPython.OutputArea.prototype._should_scroll = (lines) => false;


<IPython.core.display.Javascript object>

In [2]:
from lib.p3_ProcessLogger import ProcessLogger
# Notebook-wide logger: later cells call clear(), collect(...) and getMarkdown()
# on it to build the Markdown summary that each cell displays.
cell_log = ProcessLogger() 

 # Project: Adopt a Drain
 * Author: James Wilfong, wilfongjt@gmail.com
 
## Basics
* data is processed in a local clone of source-data
* intermediate files are put into source-data repo
* the final data.world data set name is the same as the raw-data file name
* the source-data repo folders /raw-data, /clean-data, /notebook are updated during the process

## Raw-data Process
* input: raw-data/ 
* use Python via a Jupyter notebook to manipulate the data into a usable file
* update results to github
* output: clean-data/

## GIT Process
* input: clean-data/
* process: add, commit, push files from raw-data/, clean-data/, notebook/ folders
* output: GitHub source-data repo

## Data.World Process
* input: GitHub source-data/clean-data/
* process: transfer github clean-data/ to data.world
* output: data.world

## Table of Contents
* [Introduction](#intro)

* [Data Wrangling](#wrangling_steps)


## Jupyter Notebook
Launch Jupyter Notebook from scripts/app-name folder.

<a id='intro'></a>
## Introduction
* why adopt a drain


<a id='prerequisites'></a>
## Prerequisites
* create a [GitHub](#github) repository to hold raw data
* create [Data World](#data-world) account
* [Notebook Config](#notebook-config)
* [Environment Variable Setup](#env-setup)

<a id='data-world'></a>
## Data.World
* Set up an account
* The DW_AUTH_TOKEN value comes from your [data.world](https://data.world/) account: Account → Settings → Advanced → Admin.
* Application data is stored in data.world
* A Data.world dataset is mostly read-only
* A Data.world dataset is updated via file replacement


<a id='github'></a>
## Github

* raw-data is loaded from the remote source-data repo on Github
* raw-data is stored in the /raw-data folder of the source-data repo
* raw-data is pushed to the remote source-data repo before running this notebook

<a id='env-setup'></a>
## Environment Variable Setup

* Create a file named `.env` and put it in the `/scripts/<app-name>/` folder
* .env does not get included in the github repository. Exclude .env from github in the .gitignore file
* Add environment variables to .env file
    * GH_URL=https://raw.githubusercontent.com/Wilfongjt/source-data/master/raw-data/
    * DW_DB_URL=https://api.data.world/v0/datasets/wilfongjt/
    * DW_USER=your-data-world-user-name
    * DW_AUTH_TOKEN=dataworld-adm-token




In [3]:
# Import cell: loads settings, third-party and project packages, and records
# each stage in cell_log so the cell can render a short Markdown summary.
cell_log.clear()
cell_log.collect('## Load Packages')
# import dotenv
cell_log.collect('* Load environment variables')
# Star-import: every public name of settings lands in the notebook namespace.
# DW_USER, DW_DB_URL and GH_URL are used by later cells but are not defined in
# this notebook, so presumably they come from here -- confirm in settings.py.
from settings import *
import interface
cell_log.collect('* Import third party packages')

from datadotworld.client import _swagger
# from datadotworld.client.api import RestApiError
# import datadotworld as dw

import numpy as np 
import pandas as pd

# NOTE: the module bound to `pprint` here is rebound to the pprint() function
# by `from pprint import pprint` a few lines below.
import pprint
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv # read and write csv files
from IPython.display import display, HTML
from IPython.display import Markdown
from pprint import pprint
# import time
import os
import subprocess

# convenience functions -- cleaning
cell_log.collect('* Import custom packages')
from lib.p3_CellCounts import CellCounts
import lib.p3_clean as clean
from lib.p3_configuration import get_configuration
import lib.p3_explore as explore
import lib.p3_gather as gather # gathering functions
import lib.p3_helper_functions as helper
import lib.p3_map as maps

# Last expression of the cell: render the collected log lines as Markdown.
Markdown('''{}'''.format(cell_log.getMarkdown()))

settings


## Load Packages
* Load environment variables
* Import third party packages
* Import custom packages

In [4]:
cell_log.clear()
cell_log.collect('## Helper Functions ')
cell_log.collect('')
cell_log.collect('* get_app_name( script_folder_name )')
def get_app_name(scripts_path):
    '''
    returns application name from script path

    The application name is the last '/'-separated component of
    scripts_path. Trailing slashes are ignored, so '/repo/scripts/app/'
    yields 'app' exactly like '/repo/scripts/app' does.

    scripts_path: str, path of the application's scripts folder
    returns: str, the last path component ('' when the path has none)
    '''
    # strip trailing separators first; otherwise the last component is ''
    return scripts_path.rstrip('/').split('/')[-1]
cell_log.collect('* get_repo_folder( script_folder_name )')
def get_repo_folder(scripts_path):
    '''
    returns path to the repo folder from script path

    Expects a layout of <repo>/scripts/<app-name>. Only the trailing
    <app-name> component and a directly preceding 'scripts' component are
    removed; identical text earlier in the path (for example a parent
    folder called 'scripts-old', or one named like the app) is left alone.

    scripts_path: str, path of the application's scripts folder
    returns: str, path of the repository folder
    '''
    trimmed = scripts_path.rstrip('/')
    parent, sep, _ = trimmed.rpartition('/')
    if not sep:
        # no folder separator at all: there is nothing to strip
        return trimmed
    if parent.endswith('/scripts'):
        # step out of the scripts folder as well
        parent = parent[:-len('/scripts')]
    return parent
cell_log.collect('* get_raw_data_folder( script_folder_name )')
def get_raw_data_folder(scripts_path):
    '''
    builds the raw-data folder path that belongs to a scripts folder:
    <repo folder>/raw-data/<app name>
    '''
    repo_folder = get_repo_folder(scripts_path)
    app_name = get_app_name(scripts_path)
    return '/'.join([repo_folder, 'raw-data', app_name])
cell_log.collect('* get_clean_data_folder( script_folder_name )')    
def get_clean_data_folder(scripts_path):
    '''
    returns path to clean data from script path

    The path is <repo folder>/clean-data/<app name>. The folder (and any
    missing parents) is created when it does not exist yet, so callers can
    write into the returned path right away.

    scripts_path: str, path of the application's scripts folder
    returns: str, path of the clean-data folder
    raises: FileExistsError when the path exists but is not a folder
    '''
    rc = get_repo_folder(scripts_path) + '/clean-data/' + get_app_name(scripts_path)
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(rc, exist_ok=True)
    return rc
def getSourceData(tblDef):
    '''
    loads the raw csv named by tblDef["local_raw"] and hands it back
    as a pandas dataframe
    '''
    raw_csv = tblDef["local_raw"]
    source_frame = pd.read_csv(raw_csv)
    return source_frame
Markdown('''{}'''.format(cell_log.getMarkdown()))

## Helper Functions 

* get_app_name( script_folder_name )
* get_repo_folder( script_folder_name )
* get_raw_data_folder( script_folder_name )
* get_clean_data_folder( script_folder_name )

In [5]:
# Notebook config cell: installs the pip packages the notebook relies on and
# logs what was installed.
# NOTE(review): a bare %env is expected to return all environment variables;
# its value is not the last expression of this cell, so it is presumably never
# displayed -- confirm whether the line is still needed.
%env

cell_log.clear()
cell_log.collect("<a id='notebook-config'></a>")
cell_log.collect("## Notebook Config")
# ------------ environment variable magic

# Install pip packages with the interpreter that runs the current Jupyter kernel
# ------------ Python-dotenv
cell_log.collect("* python-dotenv")
import sys
!{sys.executable} -m pip install python-dotenv
# ------------ data.world API 
# NOTE(review): the earlier import cell already imports from datadotworld, so
# on a fresh environment that import runs before this install does.
cell_log.collect("* datadotworld")
!{sys.executable} -m pip install datadotworld[pandas]

# Last expression of the cell: render the collected log lines as Markdown.
Markdown('''{}'''.format(cell_log.getMarkdown()))

[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


<a id='notebook-config'></a>
## Notebook Config
* python-dotenv
* datadotworld

# Process
## Prepare Data
* download github repo with data
* put new file in raw-data/
* make copy of this jupyter notebook 
* configure to transform raw-data/ into clean-data/
* put clean data into clean-data/ folder
* push final changes to github
## Load Data
* 

<a id='wrangling_steps'></a>
# Data Wrangling


<a id='wrangle-process'></a>
## Process

<a id="download"></a>
# Download Data

## Configuration

In [6]:
# Configuration cell: names the data set being processed, derives the local
# repository folders from the current working directory, and logs every value
# so the rendered Markdown documents the run.
cell_log.clear()
# dev stops github and data.world updates
# prod allows github and data.world updates
# MODE='prod' # dev, prod

# ------------- configure raw-data

table_name = 'gr_drains'
cell_log.collect('* table_name: {}'.format(table_name))

ext = 'csv'
cell_log.collect('* ext: {}'.format(ext))

title_name = 'GRB Storm Drains'
# logged under its own label; it was previously reported as a second "table_name"
cell_log.collect('* title_name: {}'.format(title_name))

desc = 'Storm Drains of the Grand River Basin, Michigan'
cell_log.collect('* desc: {}'.format(desc))

# DW_USER, DW_DB_URL and GH_URL are not defined in this notebook;
# presumably they are provided by `from settings import *`.
cell_log.collect('* DW_USER: {}'.format(DW_USER))
cell_log.collect('* DW_DB_URL: {}'.format(DW_DB_URL))
cell_log.collect('* GH_URL: {}'.format(GH_URL))

# -------------- setup GitHub

LOCAL_REPO_BRANCH = 'refresh-data'

# ------------- setup constants to GitHub folders
# Everything is derived from the working directory, so the notebook has to be
# launched from the <repo>/scripts/<app-name> folder.

LOCAL_SCRIPT_FOLDER = os.getcwd()
LOCAL_APP_NAME = get_app_name(LOCAL_SCRIPT_FOLDER)
LOCAL_REPO_FOLDER = get_repo_folder(LOCAL_SCRIPT_FOLDER)
LOCAL_RAW_FOLDER = get_raw_data_folder(LOCAL_SCRIPT_FOLDER)
# get_clean_data_folder also creates the folder when it is missing
LOCAL_CLEAN_FOLDER = get_clean_data_folder(LOCAL_SCRIPT_FOLDER)

cell_log.collect('* LOCAL_SCRIPT_FOLDER: {}'.format(LOCAL_SCRIPT_FOLDER))
cell_log.collect('* LOCAL_APP_NAME: {}'.format(LOCAL_APP_NAME))
cell_log.collect('* LOCAL_REPO_FOLDER: {}'.format(LOCAL_REPO_FOLDER))
cell_log.collect('* LOCAL_REPO_BRANCH: {}'.format(LOCAL_REPO_BRANCH))
cell_log.collect('* LOCAL_RAW_FOLDER: {}'.format(LOCAL_RAW_FOLDER))
cell_log.collect('* LOCAL_CLEAN_FOLDER: {}'.format(LOCAL_CLEAN_FOLDER))

# Last expression of the cell: render the collected log lines as Markdown.
Markdown('''{}'''.format(cell_log.getMarkdown()))

table_name: gr_drains
* ext: csv
* table_name: GRB Storm Drains
* desc: Storm Drains of the Grand River Basin, Michigan
* DW_USER: wilfongjt
* DW_DB_URL: https://api.data.world/v0/datasets/wilfongjt/
* GH_URL: https://raw.githubusercontent.com/Wilfongjt/source-data/refresh-data/clean-data/
* LOCAL_SCRIPT_FOLDER: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/scripts/adopt-a-drain
* LOCAL_APP_NAME: adopt-a-drain
* LOCAL_REPO_FOLDER: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data
* LOCAL_REPO_BRANCH: refresh-data
* LOCAL_RAW_FOLDER: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/raw-data/adopt-a-drain
* LOCAL_CLEAN_FOLDER: /Users/jameswilfong/Documents/Github/Wilfongjt/source-data/clean-data/adopt-a-drain

## Package Configuration for Convenience

In [7]:
# Derived names shared by the table definition below.
# NOTE(review): the appendix at the end of this notebook says data.world removes
# an underscore in a title when it builds the dataset id, while this expression
# turns '_' into '-'. The current title has no underscore, so the result is the
# same today -- confirm which rule is right before using titles with underscores.
dw_dataset_id = DW_USER + "/" + title_name.lower().replace('_','-').replace(' ','-')
gh_csv_name = table_name
gh_csv_name_ext = gh_csv_name + '.' + ext
gh_csv_path_name = GH_URL + gh_csv_name_ext

# ------------- configure source csv
# tbl bundles everything later cells need to know about the data set:
# data.world names, GitHub urls, and the local raw/clean file paths.
tbl = {
    "owner_id": DW_USER,
    "app_name": LOCAL_APP_NAME,
    "dw_title": title_name,
    "dw_desc": desc,
    "dw_table": table_name,
    "dw_dataset_id": dw_dataset_id,
    "dw_url": DW_DB_URL + gh_csv_name_ext,
    "gh_url": GH_URL + table_name,
    "visibility": "OPEN",
    "license": "Public Domain",
    # keyed by the file name built from `ext` (was hard-coded to 'csv'),
    # so the key always matches the file the url points at
    "files": {gh_csv_name_ext: {"url": gh_csv_path_name}},
    "local_raw": LOCAL_RAW_FOLDER + '/' + gh_csv_name_ext,
    "local_clean": LOCAL_CLEAN_FOLDER + '/' + gh_csv_name_ext,
}

# ------------- configure outliers
# One entry per checked column: the accepted (low, high) range, a reason text
# with a '{}' placeholder, and a count that starts at 0.
# NOTE(review): presumably clean.remove_obvious_outliers fills in 'count' and
# the placeholder stands for that number -- confirm in lib/p3_clean.
# NOTE(review): the first reason's "(1 <= dr_facility_id or => 50000000)" does
# not read as a valid condition; confirm the intended wording.
_outliers = {
  'outliers': [
    {'column':'dr_facility_id',
     'range':(1, 50000000),
     'reason':'ignore {} outliers (1 <= dr_facility_id or => 50000000).',
     'count': 0
    },
    {'column':'dr_lon',
     'range':(-90.0, -80.0),
     'reason':'Remove {} observations too far west or east.',
     'count': 0
    },
    {'column':'dr_lat',
     'range':(40.0, 50.0),
     'reason':'Remove {} observations too far north or south.',
     'count': 0
    }
  ]
}
# pprint( tbl)

## Wrangling Script

In [8]:
# Wrangling cell: reads the raw csv, normalises column names, drops incomplete
# rows, builds the sync id, removes outliers and duplicates, and writes the
# result to the clean-data file. Each step is recorded in cell_log.
cell_log.clear()

cell_log.collect("# CSV Process")

# --------------------------------- input
cell_log.collect("* input:  {}".format( tbl["local_raw"]))

# --------------------------------- load data
df_source = getSourceData(tbl) # open raw-data
cell_log.collect("* input: {} observations".format(len(df_source)))
cell_log.collect("* input: columns {}".format(df_source.columns.values))

# --------------------------------- clean column names
cell_log.collect('* format: Apply a style of lowercase and underscores to column names.')
df_source = clean.clean_column_names(df_source) # column names

# --------------------------------- map expected columns to raw-data columns
df_source = df_source.rename(columns={ # rename columns in df
    "subtype": "dr_subtype",
    "drain__owner": "dr_owner",
    "local__id": "dr_local_id",
    "facilityid": "dr_facility_id",
    "drain__jurisdiction": "dr_jurisdiction",
    "subwatershed": "dr_subwatershed",
    "point__x":"dr_lon",
    "point__y":"dr_lat"})

# --------------------------------- change empty values

## ------------------------------ DROP empty Facility id
# mark all empties (None, NaN, '' and ' ') with the same value so dropna catches them;
# `x != x` is only true for NaN-like values
df_source['dr_facility_id'] = df_source['dr_facility_id'].apply(
    lambda x: np.nan if x is None or x != x or x in ('', ' ') else x)
scnt = len(df_source)
# 'soure__id' is the column name as it exists in the frame at this point
# (the spelling presumably comes from the raw file's header)
df_source = df_source.dropna(subset=['dr_facility_id', 'soure__id','dr_lon', 'dr_lat'])
ecnt = len(df_source)
cell_log.collect("* clean: dropped {} observations with empty dr_facility_id, soure__id, dr_lon, or dr_lat".format(scnt - ecnt))

# --------------------------------- change column types
cell_log.collect('* format: convert dr_facility_id column to int64')
df_source['dr_facility_id'] = df_source['dr_facility_id'].astype('int64')

# --------------------------------- remove numbers from soure__id
# keep only the text before the first underscore, plus a trailing '_';
# non-string values become the placeholder prefix 'XXX_'
df_source['soure__id'] = df_source['soure__id'].apply(lambda x: x.split('_')[0] + '_' if isinstance(x, str) else 'XXX_')

df_source['dr_asset_no'] = df_source['dr_facility_id']
# every row gets the same drain type
df_source['dr_type'] = 'Storm Water Inlet Drain'

# --------------------------------- create a sync id
# prefix from soure__id followed by the facility id
df_source['dr_sync_id'] = df_source['soure__id'] + df_source['dr_facility_id'].astype(str)

# --------------------------------- drop soure__id
df_source = df_source.drop(['soure__id'], axis=1)

# --------------------------------- outliers
df_source = clean.remove_obvious_outliers(_outliers, df_source)
for r in _outliers['outliers']:
    # Fill the '{}' placeholder of the reason text with the entry's count.
    # NOTE(review): presumably remove_obvious_outliers updates 'count'; if it
    # already filled the placeholder itself, format() leaves the text unchanged.
    cell_log.collect('* outlier: {}'.format(r['reason'].format(r['count'])))

# --------------------------------- Drop DUPLICATES
# keep=False removes every row whose facility id occurs more than once
scnt = len(df_source)
df_source = df_source.drop_duplicates(subset='dr_facility_id', keep=False)
ecnt = len(df_source)
cell_log.collect('* duplicates: dropped {} duplicate facility ids'.format(scnt - ecnt))


# --------------------------------- save csv
# assume new file and remove old one
if os.path.isfile(tbl["local_clean"]):
    os.remove(tbl['local_clean'])
    cell_log.collect('* system: overwrite {} '.format(tbl['local_clean']))

cell_log.collect("* inter-output: columns {}".format(df_source.columns.values))
cell_log.collect('* inter-output: {} obs to {}'.format(len(df_source) , tbl["local_clean"]))


df_source.to_csv(tbl["local_clean"], index=False)


* clean_column_names: 0.005354881286621094 sec
* remove_obvious_outliers: 0.011285066604614258 sec


## Interface with GitHub and Data.World

In [9]:
# Hand the cleaned frame, the table definition and the shared logger to the
# project's interface module.
# NOTE(review): presumably these calls push the files to GitHub and transfer the
# clean data to data.world, and need network access -- confirm in interface.py.
interface.github(df_source, tbl, cell_log)

interface.data_world(df_source, tbl, cell_log)

# cell_log is not cleared in this cell, so unless interface clears it the
# rendered Markdown also contains the lines collected by the wrangling cell.
Markdown('''{}'''.format(cell_log.getMarkdown()))



MaxRetryError: HTTPSConnectionPool(host='api.data.world', port=443): Max retries exceeded with url: /v0/datasets/wilfongjt/grb-storm-drains (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1c1a964400>: Failed to establish a new connection: [Errno 51] Network is unreachable',))

# Appendix - Data.World Names

## Keeping the names straight

| CSV Name      | Table Name    | Title          | Dataset ID      | Restful |
| :------------ |:------------- | :------------- | :-------------  | :------------- |
| xxxx_xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?     | 
| xxxx_xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?            |
| xxxx_xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx Xx        | xxxx-xx         |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx_Xx        | xxxxxx          |    ?         |
| xxxx-xx       | xxxx_xx       | Xxxx-Xx        | xxxx-xx         |    ?         |

* CSV Name is root of Table name
* Title is root of Dataset ID
* a space in Title will be automatically converted to hyphen in dataset id
* an underscore in Title will be removed in Dataset ID
* a hyphen in CSV Name will be replaced with underscore in Table Name
