In [1]:
import tensorflow as tf
import numpy as np
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

In [2]:
# Column names of the flight CSV, in file order. Used further down to look up
# a column's position by name via KEYS.index(...).
KEYS = (
    'date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay'
).split(',')
print(KEYS)

['date', 'airline', 'airline_code', 'departure_airport', 'departure_state', 'departure_lat', 'departure_lon', 'arrival_airport', 'arrival_state', 'arrival_lat', 'arrival_lon', 'departure_schedule', 'departure_actual', 'departure_delay', 'arrival_schedule', 'arrival_actual', 'arrival_delay']


In [3]:
# Build a small sample file: the first 10 lines of ATL_1_4_9.csv (header plus 9 flights).
# NOTE(review): source and target names differ only by case - on a case-insensitive
# filesystem the redirect would truncate the input before head reads it.
!head -10 ATL_1_4_9.csv > atl_1_4_9.csv

In [4]:
# Options object handed to every beam.Pipeline below; no flags are passed,
# so Beam's defaults apply.
options = PipelineOptions()

## The Transforms' return values
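The important thing to notice when designing ```Transform```s is that the ```process``` method of a ```ParDo``` must return an iterable (for example a list) of output elements, so a single result has to be wrapped in an outer list. See ```ParseToList``` below for an illustration. Functions passed to ```Map```, in contrast, return the result exactly as they computed it. 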

In [5]:
class ParseToList(beam.DoFn):
    """Split one comma-separated text line into a list of its fields."""

    def process(self, element):
        """Return the fields of ``element`` as a single output element."""
        fields = element.split(",")
        # The outer list is the collection of outputs for this input:
        # exactly one output, which is itself the list of fields.
        return [fields]

In [20]:
class ColumnFilter(beam.DoFn):
    """Keep only the rows whose value in one column satisfies a predicate.

    Each element must be indexable by ``index`` (for example a list of
    column values).
    """

    def __init__(self, index, predicate):
        """
        param index: the index of the column to be compared against in the file
        param predicate: a function taking a single argument and returning a boolean
        """
        # super() must name this class, not the parent: super(beam.DoFn, self)
        # starts the MRO lookup *after* DoFn and therefore skips DoFn.__init__.
        super(ColumnFilter, self).__init__()
        self.predicate = predicate
        self.index = index

    def process(self, element):
        """Return ``[element]`` if its column passes the predicate, else ``[]``."""
        if self.predicate(element[self.index]):
            return [element]
        # Explicit "no output" instead of an implicit None.
        return []

In [21]:
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList()) 
        | "Filter" >> beam.ParDo(ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD')) 
        | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e))
        | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9


---
If you just want to play around with no IO on either end, just pipe a plain Python list into your chain of transforms.

In [22]:
# Load the sample file into a plain list of lines. Nothing is skipped or
# stripped, so the header row and the trailing newlines are kept.
with open('atl_1_4_9.csv') as f:
    content = list(f)

In [23]:
# Same parse/filter/join chain as the file-based pipeline, but applied
# directly to the in-memory list; the resulting list is the cell's output.
ord_only = ColumnFilter(KEYS.index('arrival_airport'), lambda x: x == 'ORD')
(content
 | "Parser" >> beam.ParDo(ParseToList())
 | "Filter" >> beam.ParDo(ord_only)
 | "ToCommaSepString" >> beam.Map(lambda e: ",".join(e)))

['2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37\n',
 '2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24\n',
 '2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9\n']

---
### Combiners

In [15]:
# Print the sample file (header plus data rows) for reference.
!cat atl_1_4_9.csv

date,airline,airline_code,departure_airport,departure_state,departure_lat,departure_lon,arrival_airport,arrival_state,arrival_lat,arrival_lon,departure_schedule,departure_actual,departure_delay,arrival_schedule,arrival_actual,arrival_delay
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,944,939,-5,1110,1110,0
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1600,1629,29,1724,1815,51
2009-04-01,F9,20436,ATL,GA,33.63,-84.42,DEN,CO,39.86,-104.67,1920,1920,0,2046,2049,3
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,2000,1955,-5,2110,2033,-37
2009-04-01,MQ,20398,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1325,1324,-1,1435,1414,-21
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,704,700,-4,813,748,-25
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1705,1658,-7,1821,1757,-24
2009-04-01,UA,19977,ATL,GA,33.63,-84.42,ORD,IL,41.98,-87.9,1009,1004,-5,1118,1127,9
2009-04-01,CO,19704,ATL,GA,33.63,-84.42,EWR,NJ,40.69,-74.16,1915,2057,102,2146,2

In [16]:
with beam.Pipeline(options=options) as p:
    lines = p | 'ReadFile' >> beam.io.ReadFromText('atl_1_4_9.csv',skip_header_lines=1)
    out = (lines
        | "Parser" >> beam.ParDo(ParseToList()) 
        | "Select" >> beam.Map(lambda elem: (elem[KEYS.index('arrival_airport')],int(elem[KEYS.index('arrival_delay')])))
        | "Group_by_dep" >> beam.GroupByKey()
        | "Average" >> beam.Map(lambda e: (e[0], np.sum(e[1], dtype='float')/len(e[1])))
        | "ToCommaSepString" >> beam.Map(lambda e: "{},{}".format(e[0],e[1]))
        | "Write" >> beam.io.WriteToText("out.csv"))
!cat out.csv-00000-of-00001
! rm -rf out.csv-00000-of-00001 beam-temp-out.csv* 

ORD,-19.6
EWR,105.0
DEN,18.0


In [10]:
# Combine the whole in-memory list into one value; the built-in sum is
# passed directly as the combining function.
[1, 2, 3, 4, 5] | beam.CombineGlobally(sum)

[15]