In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [3]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import bigquery as bq
import google.datalab.bigquery as dlbq

  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '


In [3]:
# Directory that the Beam pipelines in this notebook write their text output to.
OUT_DIR = "/tmp/output"

In [4]:
# Column names of the flight CSV in file order; cells below look up column
# positions with KEYS.index(<name>).
KEYS = 'date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay'.split(',')
print(KEYS)

['date', 'airline', 'airline_code', 'departure_airport', 'departure_state', 'departure_lat', 'departure_lon', 'arrival_airport', 'arrival_state', 'arrival_lat', 'arrival_lon', 'departure_schedule', 'departure_actual', 'departure_delay', 'arrival_schedule', 'arrival_actual', 'arrival_delay']


In [5]:
# Copy the first ten lines of the full CSV (header row included) into a small sample file.
!head -10 ATL_1_4_9.csv > atl_1_4_9.csv

In [6]:
# Pipeline options built without any explicit flags; passed to beam.Pipeline in the cells below.
options = PipelineOptions()

## Attention: Mind The Transforms' Return Values!
The important thing to notice when designing ```Transform```s is that the ```process``` method behind a ```ParDo``` always wraps what it returns in an outer list (an iterable of output elements). See ```ParseToList``` below for an illustration. Functions passed to ```Map``` return their result exactly as they computed it. I believe that's because ```ParDo```s are monadic by nature, i.e. they always have flatmap semantics: one input element may yield zero, one or many output elements.

In [7]:
class ParseToList(beam.DoFn):
    """Split one comma-separated text line into the list of its fields."""

    def process(self, element):
        # The parsed row is a single output element, so it is wrapped in an
        # outer list before being handed back.
        fields = element.split(",")
        return [fields]

In [8]:
class ColumnFilter(beam.DoFn):
    """Pass through only those rows whose column at ``index`` satisfies ``predicate``."""

    def __init__(self, index, predicate):
        """
        param index: the index of the column to be compared against in the file
        param predicate: a function taking a single argument and returning a boolean
        """
        # super() must name *this* class: super(beam.DoFn, self) starts the
        # method lookup after DoFn in the MRO and therefore skips DoFn.__init__.
        super(ColumnFilter, self).__init__()
        self.predicate = predicate
        self.index = index

    def process(self, element):
        """Return ``[element]`` when the selected column passes the predicate, else ``[]``."""
        if self.predicate(element[self.index]):
            return [element]
        # Explicit empty result: this element produces no output.
        return []

In [9]:
# Read the sample CSV, keep the rows whose arrival airport is ORD and write
# them back out as comma-separated lines under OUT_DIR.
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv', skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList())
        | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD'))
        | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e))
        | "Write" >> beam.io.WriteToText(os.path.join(OUT_DIR, "out.csv")))
# Show the output shard, then clean up. The output prefix lives in OUT_DIR, so
# any leftover beam-temp-* staging directory sits there too, not in the
# current working directory.
! cat $OUT_DIR/out.csv-00000-of-00001
! rm -f $OUT_DIR/out.csv*
! rm -rf $OUT_DIR/beam-temp-out.csv*

2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9


---
If you just want to play around without any IO on either end, simply pipe a list into your chain of transforms.

In [10]:
# Load every line of the sample file (header included, newlines kept) into a list.
with open('atl_1_4_9.csv') as csv_file:
    content = list(csv_file)

In [11]:
# A plain list can be piped straight into the transforms; the value of the
# whole expression is the resulting list, which the notebook displays.
(content
 | "Parser" >> beam.ParDo(ParseToList())
 | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'),
                                       lambda airport: airport == 'ORD'))
 # strip() drops the trailing newline that readlines() left on each line.
 | "ToCommaSepString" >> beam.Map(lambda fields: ",".join(fields).strip()))

['2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37',
 '2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9']

---
### Combiners

In [12]:
# Print the sample file so the input of the aggregation below is visible.
!cat atl_1_4_9.csv

date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,944,939,-5,1110,1110,0
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1600,1629,29,1724,1815,51
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1920,1920,0,2046,2049,3
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9
2009-04-01,CO,19704,ATL,GA,33.63,-84.42,EWR,NJ,40.69,-74.16,1915,2057,102,2146,2

In [13]:
# Average arrival delay per arrival airport:
#   parse -> (arrival_airport, arrival_delay) -> group by airport -> mean -> CSV line.
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv', skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList())
        | "Select" >> beam.Map(lambda elem: (elem[KEYS.index('arrival_airport')], int(elem[KEYS.index('arrival_delay')])))
        # The key is the arrival airport, so the step label says so.
        | "Group_by_arrival" >> beam.GroupByKey()
        | "Average" >> beam.Map(lambda e: (e[0], np.sum(e[1], dtype='float')/len(e[1])))
        | "ToCommaSepString" >> beam.Map(lambda e: "{},{}".format(e[0], e[1]))
        | "Write" >> beam.io.WriteToText(os.path.join(OUT_DIR, "out.csv")))
# The pipeline writes under OUT_DIR, so the shard has to be read from there
# (not from the working directory). Any leftover beam-temp-* staging entry is a
# directory next to the output, hence `rm -rf` on the OUT_DIR path.
! cat $OUT_DIR/out.csv-00000-of-00001
! rm -f $OUT_DIR/out.csv*
! rm -rf $OUT_DIR/beam-temp-out.csv*

ORD,-19.6
EWR,105.0
DEN,18.0


In [14]:
# Reduce the whole input list to a single value; the result is a one-element list.
[1, 2, 3, 4, 5] | beam.CombineGlobally(lambda values: sum(values))

[15]

---
### Reading from BigQuery
Using the ```%%bigquery``` cell magic, we can populate a pandas DataFrame directly from a BigQuery query and bind the result to a given variable.

In [15]:
# Load the IPython extension that provides the %%bigquery cell magic used below.
%load_ext google.cloud.bigquery

For legacy SQL, use ```--use_legacy_sql```.
For more info on ```%%bigquery```, see [this link](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.magics.html).

In [28]:
%%bigquery df_result
SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM
  `going-tfx.examples.ATL_JUNE`
-- Hash-based filter: keep rows where (|fingerprint| + CRS_ARR_TIME) mod 10000 equals 3.
WHERE
  MOD(
    ABS(FARM_FINGERPRINT(CONCAT(STRING(TIMESTAMP(FL_DATE)), UNIQUE_CARRIER, DEST)))
      + CRS_ARR_TIME,
    10000
  ) = 3
LIMIT 10

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2012,6,5,AA,DFW,855,-8,-8
1,ATL,2010,6,1,DL,MSP,2335,8,-2
2,ATL,2017,6,6,DL,SLC,2127,33,61
3,ATL,2008,6,7,EV,TYS,1040,-5,-15
4,ATL,2009,6,7,EV,FAY,1229,-3,-5
5,ATL,2012,6,2,EV,VPS,1522,-4,-9
6,ATL,2008,6,4,FL,MSP,1819,9,-33
7,ATL,2009,6,4,FL,SJU,1244,13,8
8,ATL,2010,6,2,FL,TPA,826,-6,-19
9,ATL,2010,6,5,FL,FLL,1213,-1,2


In [29]:
# Show the first two rows of the frame filled in by the %%bigquery cell.
df_result.head(2)

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2012,6,5,AA,DFW,855,-8,-8
1,ATL,2010,6,1,DL,MSP,2335,8,-2


Unfortunately, the output of a ```%%bigquery``` cell can't be silenced, even when a variable is provided. So even in notebooks you may want to resort to pandas' own approach to BigQuery:

In [18]:
# Standard-SQL text shared by the pandas, datalab, google.cloud and Beam examples
# below. Same SELECT as the %%bigquery cell above, but without the LIMIT clause.
query="""
SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `going-tfx.examples.ATL_JUNE` 
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) = 3
"""

In [19]:
# Run the query through pandas' BigQuery reader (standard SQL dialect), then
# report the frame's shape and preview its first ten rows.
df = pd.read_gbq(query, project_id='going-tfx', dialect='standard')
print(df.shape)
df.head(10)

(47, 9)


Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2006,6,1,DL,CHS,947,3,-5
1,ATL,2007,6,1,DL,PHX,1217,-1,3
2,ATL,2008,6,1,FL,BOS,1304,7,-5
3,ATL,2010,6,1,DL,MSP,2335,8,-2
4,ATL,2010,6,1,EV,CAK,1259,-8,-21
5,ATL,2015,6,1,DL,PIT,1923,0,-9
6,ATL,2015,6,1,EV,CHO,1626,-3,-4
7,ATL,2016,6,1,DL,RIC,1907,-5,-22
8,ATL,2017,6,1,UA,IAH,2040,-4,-1
9,ATL,2017,6,1,DL,LAX,2304,35,-2


Or use Google Datalab's BigQuery utility:

In [35]:
# `dlbq` (google.datalab.bigquery) is already imported in the imports cell at
# the top of the notebook, so it is not re-imported here.
# Execute the query, convert the result to a DataFrame and preview four rows.
dlbq.Query(query).execute().result().to_dataframe().head(4)

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2006,6,1,DL,CHS,947,3,-5
1,ATL,2007,6,1,DL,PHX,1217,-1,3
2,ATL,2008,6,1,FL,BOS,1304,7,-5
3,ATL,2010,6,1,DL,MSP,2335,8,-2


A row-oriented interface is provided by the ```google.cloud``` integration:

In [34]:
# `bq` (google.cloud.bigquery) is already imported in the imports cell at the
# top of the notebook, so it is not re-imported here.
client = bq.Client()
query_job = client.query(query)
rows = query_job.result()  # iterable of result rows
print(type(rows))
print()
# Each row exposes its columns as attributes; collect one column as a demo.
years = [row.FL_YEAR for row in rows]
print(years)

<class 'google.cloud.bigquery.table.RowIterator'>

[2006, 2007, 2008, 2010, 2010, 2015, 2015, 2016, 2017, 2017, 2006, 2008, 2008, 2009, 2010, 2010, 2012, 2012, 2015, 2007, 2008, 2009, 2008, 2008, 2009, 2011, 2011, 2011, 2012, 2014, 2006, 2007, 2010, 2010, 2012, 2017, 2017, 2011, 2011, 2012, 2013, 2016, 2017, 2008, 2009, 2011, 2013]


### Beam
A Beam pipeline that dumps the result of a SQL query straight into a CSV file. Note that you must supply a project here, since BigQuery needs someone to send a bill to!

In [20]:
# Rebuild the pipeline options with a project set, for the BigQuery read below.
# from_dictionary is called on the PipelineOptions class imported at the top.
options = PipelineOptions.from_dictionary({'project': 'going-tfx'})
print(options.get_all_options())

{'profile_cpu': False, 'machine_type': None, 'runner': None, 'labels': None, 'save_main_session': False, 'streaming': False, 'experiments': None, 'requirements_cache': None, 'max_num_workers': None, 'template_location': None, 'pubsubRootUrl': None, 'environment_type': None, 'requirements_file': None, 'sdk_location': 'default', 'network': None, 'dry_run': False, 'profile_location': None, 'service_account_email': None, 'flink_master': None, 'profile_memory': False, 'direct_runner_use_stacked_bundle': True, 'type_check_strictness': 'DEFAULT_TO_ANY', 'min_cpu_platform': None, 'job_name': None, 'environment_config': None, 'use_public_ips': None, 'num_workers': None, 'hdfs_host': None, 'disk_size_gb': None, 'runtime_type_check': False, 'on_success_matcher': None, 'temp_location': None, 'setup_file': None, 'disk_type': None, 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'worker_harness_container_image': None, 'hdfs_port': None, 'autoscaling_algorithm': None, 'zone': None, 'hdfs_user

In [21]:
# Column order for the BigQuery CSV export. This is a separate name on purpose:
# rebinding the global KEYS (the flight-CSV header) would break the earlier
# cells that call KEYS.index('arrival_airport') whenever they are re-run.
BQ_COLUMNS = list(df.columns)


def toCsvRow(row_dict, columns=None):
    """Render one row dictionary as a comma-separated line.

    param row_dict: mapping from column name to value
    param columns: column names to emit, in order; defaults to BQ_COLUMNS
    returns: the str() of each selected value, joined by commas
    raises: KeyError if a requested column is missing from row_dict
    """
    if columns is None:
        columns = BQ_COLUMNS
    return ",".join(str(row_dict[c]) for c in columns)

In [22]:
# Run the query in BigQuery, turn every result row into a CSV line and write
# the lines to OUT_DIR.
with beam.Pipeline(options=options) as p:
    rows = p | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
    # No backslash needed: the surrounding parentheses already continue the line.
    (rows
     | "encode" >> beam.Map(toCsvRow)
     | "Write" >> beam.io.WriteToText(os.path.join(OUT_DIR, "out.csv")))
# Count the exported lines, then clean up. Any leftover beam-temp-* staging
# directory sits next to the output in OUT_DIR, not in the working directory.
!cat $OUT_DIR/out.csv-00000-of-00001 | wc -l
! rm -rf $OUT_DIR/out.csv-00000-of-00001 $OUT_DIR/beam-temp-out.csv*

  pipeline.replace_all(_get_transform_overrides(pipeline.options))


47


---
A couple of map-reduce scenarios (still to come)

In [23]:
with open('atl_1_4_9.csv') as csv_file:
    # Echo the header row and the first data row; both are consumed here and
    # therefore do not end up in `content`.
    print(csv_file.readline())
    print(csv_file.readline())
    # Everything that is left in the file.
    content = csv_file.readlines()

date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay

2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,944,939,-5,1110,1110,0



In [24]:
# Build (departure_state, arrival_delay) pairs: positions 4 and 16 in the flight-CSV header.
print(content
 | beam.Map(lambda raw: raw.strip().split(","))
 | beam.Map(lambda fields: (fields[4], fields[16]))
)

[('GA', '51'), ('GA', '3'), ('GA', '-37'), ('GA', '-21'), ('GA', '-25'), ('GA', '-24'), ('GA', '9'), ('GA', '105')]
