In [None]:
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

In [None]:
import tensorflow as tf
import numpy as np
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import bigquery as bq
import pandas as pd

In [None]:
# Column names of the flight CSV rows, in the order they appear in each line.
KEYS = (
    'date,airline,airline_code,'
    'departure_airport,departure_state,departure_lat,departure_lon,'
    'arrival_airport,arrival_state,arrival_lat,arrival_lon,'
    'departure_schedule,departure_actual,departure_delay,'
    'arrival_schedule,arrival_actual,arrival_delay'
).split(',')
print(KEYS)

In [None]:
# Write the first 10 lines of ATL_1_4_9.csv to atl_1_4_9.csv, the small sample file used below.
!head -10 ATL_1_4_9.csv > atl_1_4_9.csv

In [None]:
# Pipeline options built without any explicit flags; passed to each beam.Pipeline below.
options = PipelineOptions()

## Attention: Mind The Transforms' Return Values!
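The important thing to notice when designing ```Transform```s is that a ```DoFn``` used with ```ParDo``` must return an iterable (for example a list) of output elements, so a single result has to be wrapped in an outer list. See ```ParseToList``` below for an illustration. Functions passed to ```Map```, by contrast, return their result exactly as they computed it.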

In [None]:
class ParseToList(beam.DoFn):
    """DoFn that splits one comma-separated line into a list of its fields."""

    def process(self, element):
        # The field list is wrapped in an outer list so that the single
        # returned item is the whole row rather than each field separately.
        fields = element.split(",")
        return [fields]

In [None]:
class ColumnFilter(beam.DoFn):
    """DoFn that forwards only the rows whose value in one column satisfies a predicate.

    Each element must be indexable (for example the list produced by
    ``ParseToList``).
    """

    def __init__(self, index, predicate):
        """
        param index: the index of the column the predicate is applied to
        param predicate: a function taking a single argument and returning a boolean
        """
        # super() has to be given this class. super(beam.DoFn, self) would start
        # the MRO lookup *after* DoFn and therefore skip DoFn itself.
        super(ColumnFilter, self).__init__()
        self.predicate = predicate
        self.index = index

    def process(self, element):
        """Return ``[element]`` when the selected column passes the predicate, else None."""
        if self.predicate(element[self.index]):
            # Wrapped in a list: a DoFn returns a collection of output elements.
            return [element]
        # Nothing is emitted for rows that fail the predicate.
        return None

In [None]:
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList()) 
        | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD')) 
        | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e))
        | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

---
If you just want to experiment without any I/O on either end, pipe a plain Python list directly into your chain of transforms.

In [None]:
# Load the raw lines of the sample file (line endings included) into a plain Python list.
with open('atl_1_4_9.csv') as f:
    content = list(f)

In [None]:
# A plain list is piped straight into the transforms; the result is shown as the cell output.
(
    content
    | "Parser" >> beam.ParDo(ParseToList())
    | "Filter" >> beam.ParDo(
        ColumnFilter(KEYS.index('arrival_airport'), lambda airport: airport == 'ORD'))
    # strip() removes the line ending that readlines() left on the last field.
    | "ToCommaSepString" >> beam.Map(lambda fields: ",".join(fields).strip())
)

---
### Combiners

In [None]:
# Print the sample file so the input of the next pipeline is visible.
!cat atl_1_4_9.csv

In [None]:
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList()) 
        | "Select" >> beam.Map(lambda elem: (elem[KEYS.index('arrival_airport')],int(elem[KEYS.index('arrival_delay')])))
        | "Group_by_dep" >> beam.GroupByKey()
        | "Average" >> beam.Map(lambda e: (e[0], np.sum(e[1], dtype='float')/len(e[1])))
        | "ToCommaSepString" >> beam.Map(lambda e: "{},{}".format(e[0],e[1]))
        | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

In [None]:
# Combine all elements of the list into one value using the given function.
[1, 2, 3, 4, 5] | beam.CombineGlobally(lambda values: sum(values))

---
### Reading from BigQuery

In [None]:
# !

In [2]:
%load_ext google.cloud.bigquery

In [6]:
%%bigquery blabla
SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `going-tfx.examples.ATL_JUNE` 
-- Repeatable sampling: fingerprint date + carrier + destination, add the scheduled
-- arrival time, and keep only rows where that value modulo 10000 equals 3.
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) = 3

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2006,6,1,DL,CHS,947,3,-5
1,ATL,2007,6,1,DL,PHX,1217,-1,3
2,ATL,2008,6,1,FL,BOS,1304,7,-5
3,ATL,2010,6,1,DL,MSP,2335,8,-2
4,ATL,2010,6,1,EV,CAK,1259,-8,-21
5,ATL,2015,6,1,DL,PIT,1923,0,-9
6,ATL,2015,6,1,EV,CHO,1626,-3,-4
7,ATL,2016,6,1,DL,RIC,1907,-5,-22
8,ATL,2017,6,1,UA,IAH,2040,-4,-1
9,ATL,2017,6,1,DL,LAX,2304,35,-2


In [7]:
# Preview the first ten rows of the query result.
blabla.head(10)

Unnamed: 0,ORIGIN,FL_YEAR,FL_MONTH,FL_DOW,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DEP_DELAY,ARR_DELAY
0,ATL,2006,6,1,DL,CHS,947,3,-5
1,ATL,2007,6,1,DL,PHX,1217,-1,3
2,ATL,2008,6,1,FL,BOS,1304,7,-5
3,ATL,2010,6,1,DL,MSP,2335,8,-2
4,ATL,2010,6,1,EV,CAK,1259,-8,-21
5,ATL,2015,6,1,DL,PIT,1923,0,-9
6,ATL,2015,6,1,EV,CHO,1626,-3,-4
7,ATL,2016,6,1,DL,RIC,1907,-5,-22
8,ATL,2017,6,1,UA,IAH,2040,-4,-1
9,ATL,2017,6,1,DL,LAX,2304,35,-2


In [None]:
# Same statement as the %%bigquery cell above, kept as a Python string so it can be
# passed to pd.read_gbq and to the Beam BigQuery source below.
# The WHERE clause keeps only rows whose fingerprint-derived value modulo 10000 equals 3.
query="""
SELECT
  ORIGIN,
  FL_YEAR,
  FL_MONTH,
  FL_DOW,
  UNIQUE_CARRIER,
  DEST,
  CRS_ARR_TIME,
  DEP_DELAY,
  ARR_DELAY
FROM `going-tfx.examples.ATL_JUNE` 
where
  MOD(ABS(FARM_FINGERPRINT(
    CONCAT(
      STRING(TIMESTAMP(FL_DATE)),
      UNIQUE_CARRIER,
      DEST
    )
  )) + CRS_ARR_TIME, 10000) = 3
"""

In [None]:
# Run the query through pandas and keep the result as a DataFrame.
df = pd.read_gbq(
    query,
    project_id='going-tfx',
    dialect='standard',
)
print(df.shape)
df.head(10)

In [None]:
# From here on KEYS holds the column names of the BigQuery result,
# replacing the CSV column list defined at the top of the notebook.
KEYS = df.columns.tolist()
KEYS

In [None]:
# Options for the BigQuery pipeline: only the GCP project is set.
# from_dictionary is a classmethod, so it is called on the class directly
# (no throwaway PipelineOptions() instance), using the name imported at the top.
# Note: this rebinds `options`, which the earlier local pipelines also use.
options = PipelineOptions.from_dictionary({'project': 'going-tfx'})
# To inspect the resolved settings, evaluate: options.get_all_options()

In [None]:
KEYS = list(df.keys())
def toCsvRow(dict):
    """Render one row mapping as a comma-separated line, with columns ordered as in KEYS.

    param dict: a mapping that contains every name listed in KEYS
    returns: the str() of each value, joined by commas
    """
    # NOTE: the parameter name shadows the built-in `dict`; kept so callers are unaffected.
    return ",".join(str(dict[column]) for column in KEYS)

In [None]:
with beam.Pipeline(options=options) as p:
    rows = p | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
    (rows | "encode" >> beam.Map(toCsvRow) \
    | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 