In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [2]:
import tensorflow as tf
import numpy as np
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import bigquery as bq
import pandas as pd

In [3]:
# Column names of the flights CSV, in file order. Later cells use KEYS.index(name)
# to translate a column name into its position within a parsed row.
KEYS = 'date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay'.split(',')
print(KEYS)

['date', 'airline', 'airline_code', 'departure_airport', 'departure_state', 'departure_lat', 'departure_lon', 'arrival_airport', 'arrival_state', 'arrival_lat', 'arrival_lon', 'departure_schedule', 'departure_actual', 'departure_delay', 'arrival_schedule', 'arrival_actual', 'arrival_delay']


In [4]:
# Copy the first ten lines (the header plus nine data rows) into a small sample file.
# NOTE(review): the source and target names differ only by case; on a
# case-insensitive filesystem the redirect would truncate the input before
# `head` gets to read it -- confirm this is only run where the names are distinct.
!head -10 ATL_1_4_9.csv > atl_1_4_9.csv

In [5]:
# Pipeline options built without any arguments; passed to the beam.Pipeline
# instances created in the cells below.
options = PipelineOptions()

## Attention: Mind The Transforms' Return Values!
The important thing to notice when designing `Transform`s is that a `DoFn` used with `ParDo` must return an *iterable* of output elements, so even a single result has to be wrapped in an outer list. See `ParseToList` below for an illustration. Functions passed to `Map`, by contrast, return their result exactly as they computed it.

In [6]:
class ParseToList(beam.DoFn):
    """Split a comma-separated line into the list of its fields."""

    def process(self, element):
        """
        param element: one line of text
        returns: a one-element list whose single item is the list of fields
        """
        fields = element.split(",")
        # The outer list is the collection of outputs for this input (exactly
        # one here); the inner list is the parsed row itself.
        return [fields]

In [7]:
class ColumnFilter(beam.DoFn):
    """Pass through only the rows whose value in one column satisfies a predicate."""

    def __init__(self, index, predicate):
        """
        param index: the index of the column to be compared against in the file
        param predicate: a function taking a single argument and returning a boolean
        """
        # super() must name *this* class: naming beam.DoFn would start the
        # lookup after DoFn in the MRO and skip DoFn's own initialiser.
        super(ColumnFilter, self).__init__()
        self.predicate = predicate
        self.index = index

    def process(self, element):
        """
        param element: an indexable row, e.g. the list of fields from ParseToList
        returns: [element] if the predicate accepts element[index], otherwise None
        raises: whatever element[index] raises when the row has no such column
        """
        if self.predicate(element[self.index]):
            return [element]
        # No output for this row (the pipeline output below contains only the
        # matching rows); made explicit instead of falling off the end.
        return None

In [8]:
# Read the sample CSV (skipping its header line), keep only the flights whose
# arrival airport is ORD, and write the surviving rows back out as CSV lines.
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        # line of text -> list of fields
        | "Parser" >> beam.ParDo(ParseToList()) 
        # keep the rows whose arrival_airport column equals 'ORD'
        | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD')) 
        # list of fields -> line of text
        | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e))
        | "Write" >> beam.io.WriteToText("out.csv"))
# Display the single output shard, then remove it together with Beam's temp files
# so that the next pipeline run starts clean.
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9


---
If you just want to experiment without any I/O on either end, pipe a plain Python list directly into your chain of transforms.

In [9]:
# Load the sample CSV into memory as a list of lines. Nothing is stripped or
# skipped: every line keeps its trailing newline and the header is included.
with open('atl_1_4_9.csv') as csv_file:
    content = list(csv_file)

In [10]:
# Apply the same parse -> filter -> join chain directly to the in-memory list of
# lines; the value of this expression is a plain list of the matching lines.
# The header line is not skipped here, it is simply rejected by the filter.
(content         
 | "Parser" >> beam.ParDo(ParseToList()) 
 | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD'))
 | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e)))

['2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37\n',
 '2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9\n']

---
### Combiners

In [11]:
# Print the sample file that the aggregation in the next cell reads.
!cat atl_1_4_9.csv

date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,944,939,-5,1110,1110,0
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1600,1629,29,1724,1815,51
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1920,1920,0,2046,2049,3
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9
2009-04-01,CO,19704,ATL,GA,33.63,-84.42,EWR,NJ,40.69,-74.16,1915,2057,102,2146,2

In [12]:
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList()) 
        | "Select" >> beam.Map(lambda elem: (elem[KEYS.index('arrival_airport')],int(elem[KEYS.index('arrival_delay')])))
        | "Group_by_dep" >> beam.GroupByKey()
        | "Average" >> beam.Map(lambda e: (e[0], np.sum(e[1], dtype='float')/len(e[1])))
        | "ToCommaSepString" >> beam.Map(lambda e: "{},{}".format(e[0],e[1]))
        | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

ORD,-19.6
EWR,105.0
DEN,18.0


In [13]:
# Reduce the whole collection to a single value. The builtin `sum` already takes
# an iterable and returns the total, so it is passed directly as the combining
# function instead of being wrapped in a lambda.
[1, 2, 3, 4, 5] | beam.CombineGlobally(sum)

[15]

---
### Reading from BigQuery

In [14]:
# NOTE(review): pandas is already imported as `pd` in the import cell at the top
# of the notebook, so this re-import is redundant.
import pandas as pd
# Count the rows per year, for years after 2000, in the natality sample table.
query = """
SELECT
  year,
  COUNT(1) as num_babies
FROM
  publicdata.samples.natality
WHERE
  year > 2000
GROUP BY
  year
"""
# Run the query through pandas' BigQuery reader using the standard SQL dialect.
df = pd.read_gbq(query,
                     project_id='going-tfx',
                     dialect='standard')

# Preview the first rows of the result as plain text.
print(df.head())

   year  num_babies
0  2003     4096092
1  2004     4118907
2  2008     4255156
3  2006     4273225
4  2002     4027376


In [15]:
# Select nine columns from the ATL_JUNE table, keeping only the rows for which
# (abs(fingerprint of date + carrier + destination) + CRS_ARR_TIME) mod 10000 == 3.
# NOTE(review): presumably meant as a small, repeatable sample of the table -- confirm.
# This rebinds `query`, replacing the natality query assigned in the previous cell.
query="""
SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `going-tfx.examples.ATL_JUNE` 
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) = 3
"""

In [16]:
# Execute the current `query` and load the result into a DataFrame
# (this rebinds `df`, replacing the natality result).
df = pd.read_gbq(query,
                     project_id='going-tfx',
                     dialect='standard')
# Report (rows, columns), then display the first ten rows.
print(df.shape)
df.head(10)

(47, 9)


Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2006,6,1,DL,CHS,947,3,-5
1,ATL,2007,6,1,DL,PHX,1217,-1,3
2,ATL,2008,6,1,FL,BOS,1304,7,-5
3,ATL,2010,6,1,DL,MSP,2335,8,-2
4,ATL,2010,6,1,EV,CAK,1259,-8,-21
5,ATL,2015,6,1,DL,PIT,1923,0,-9
6,ATL,2015,6,1,EV,CHO,1626,-3,-4
7,ATL,2016,6,1,DL,RIC,1907,-5,-22
8,ATL,2017,6,1,UA,IAH,2040,-4,-1
9,ATL,2017,6,1,DL,LAX,2304,35,-2


In [17]:
# Rebind KEYS to the column names of the query result (it previously held the
# column names of the CSV sample), then display them.
KEYS = df.columns.tolist()
KEYS

['ORIGIN',
 'FL_YEAR',
 'FL_MONTH',
 'FL_DOW',
 'UNIQUE_CARRIER',
 'DEST',
 'CRS_ARR_TIME',
 'DEP_DELAY',
 'ARR_DELAY']

In [18]:
# Rebuild the pipeline options with the project id set; these replace the
# argument-less options created earlier and are used by the BigQuery pipeline.
# from_dictionary is a classmethod, so it is called on the class directly
# rather than on a throwaway PipelineOptions() instance.
options = PipelineOptions.from_dictionary({'project': 'going-tfx'})

In [28]:
# Column names of the current query result; toCsvRow emits the values in this order.
KEYS = list(df.keys())
def toCsvRow(dict, keys=None):
    """Render one row dictionary as a comma-separated line.

    param dict: mapping from column name to value (the parameter name is kept
        for backward compatibility although it shadows the builtin)
    param keys: column names to emit, in output order; when omitted, the
        module-level KEYS is used
    returns: the str() of each selected value, joined with ","; values are
        neither quoted nor escaped
    raises KeyError: if a requested column is missing from the row
    """
    if keys is None:
        # Resolve the global at call time so that the current binding of KEYS
        # is used, exactly as before.
        keys = KEYS
    return ",".join(str(dict[column]) for column in keys)

In [30]:
with beam.Pipeline(options=options) as p:
    rows = p | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
    (rows | "encode" >> beam.Map(toCsvRow) \
    | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 



ATL,2012,6,5,AA,DFW,855,-8,-8
ATL,2006,6,1,DL,CHS,947,3,-5
ATL,2010,6,1,DL,MSP,2335,8,-2
ATL,2016,6,1,DL,RIC,1907,-5,-22
ATL,2017,6,1,DL,LAX,2304,35,-2
ATL,2015,6,1,DL,PIT,1923,0,-9
ATL,2007,6,1,DL,PHX,1217,-1,3
ATL,2015,6,2,DL,LGA,1915,84,104
ATL,2010,6,2,DL,ABQ,1204,-1,-11
ATL,2008,6,2,DL,ORD,800,-7,-5
ATL,2008,6,3,DL,RSW,1603,-1,-3
ATL,2011,6,4,DL,LGA,1259,4,-8
ATL,2011,6,4,DL,MCO,1640,3,6
ATL,2014,6,4,DL,LAS,1107,-1,-2
ATL,2010,6,5,DL,LAS,1245,-3,-8
ATL,2017,6,5,DL,MDW,946,-1,-10
ATL,2016,6,6,DL,SGF,2107,3,-4
ATL,2012,6,6,DL,BWI,1433,-2,-3
ATL,2011,6,6,DL,COS,2021,-2,-21
ATL,2017,6,6,DL,SLC,2127,33,61
ATL,2011,6,6,DL,CHS,935,-2,1
ATL,2013,6,7,DL,EYW,1933,-3,-23
ATL,2011,6,7,DL,EYW,1709,-5,0
ATL,2015,6,1,EV,CHO,1626,-3,-4
ATL,2010,6,1,EV,CAK,1259,-8,-21
ATL,2012,6,2,EV,ROA,1044,12,9
ATL,2012,6,2,EV,VPS,1522,-4,-9
ATL,2006,6,2,EV,FNT,1601,2,0
ATL,2007,6,3,EV,MDT,2300,181,185
ATL,2011,6,4,EV,CRW,2148,-3,-3
ATL,2012,6,4,EV,CHO,1631,18,1
ATL,2013,6,6,EV,EWN,1335,-5,2
ATL,2009,6,7,EV,FAY