In [4]:
import apache_beam as beam
import time

#Combine

- Combine is a Beam transform for combining collections of elements or values in your data.
-	Combine has variants that work on entire PCollections, and some that combine the values for each key in PCollections of key/value pairs.
-	When you apply a Combine transform, you must provide the function that contains the logic for combining the elements or values.
-	The combining function should be commutative and associative.
-	The Beam SDK also provides some pre-built combine functions for common numeric combination operations such as sum, min, and max.
- Complex combination operations might require you to create a subclass of CombineFn that has an accumulation type distinct from the input/output type.

The associativity and commutativity of a CombineFn allows runners to automatically apply some optimizations:

- Combiner lifting: This is the most significant optimization. Input elements are combined per key and window before they are shuffled, so the volume of data shuffled might be reduced by many orders of magnitude. Another term for this optimization is “mapper-side combine.”
- Incremental combining: When you have a CombineFn that reduces the data size by a lot, it is useful to combine elements as they emerge from a streaming shuffle. This spreads out the cost of doing combines over the time that your streaming computation might be idle. Incremental combining also reduces the storage of intermediate accumulators.


#CombineGlobally

Combines all the elements in a PCollection.

CombineGlobally accepts a function that takes an iterable of elements as an input, and combines them to return a single element.



In [6]:
pc = [1, 10, 100, 1000]


def sum_elements(element):
  """Add up every value in the iterable that is passed in.

  Args:
    element: iterable of numbers. Despite the singular name, this is the
      whole collection of values being combined, not a single value.

  Returns:
    The total of all the values (0 for an empty iterable).
  """
  total = sum(element)
  return total

# Combine every element of `pc` into a single sum and print it.
# The result is bound to `global_sum` rather than `input`, so the built-in
# input() is not shadowed for the rest of the notebook session.
with beam.Pipeline() as p:
  global_sum = (
      p
      | 'Create' >> beam.Create(pc)
      | 'Sum' >> beam.CombineGlobally(sum_elements)  # the built-in sum can be passed directly instead
      | 'Print' >> beam.Map(print)
  )

1111


#CombineFn

- For more complex combine functions, you can define a subclass of CombineFn. You should use a CombineFn if the combine function requires a more sophisticated accumulator.

- The more general way to combine elements, and the most flexible, is with a class that inherits from CombineFn.

    - CombineFn.create_accumulator(): This creates an empty accumulator. For example, an empty accumulator for a sum would be 0, while an empty accumulator for a product (multiplication) would be 1.

    - CombineFn.add_input(): Called once per element. Takes an accumulator and an input element, combines them and returns the updated accumulator.

    - CombineFn.merge_accumulators(): Multiple accumulators may be built in parallel, so this function merges them into a single accumulator.

    - CombineFn.extract_output(): Turns the final accumulator into the result, allowing any additional calculation first (for example, dividing a sum by a count to get an average).

In [7]:
# Using CombineFn with an explicit accumulator
class AverageFn(beam.CombineFn):
  """CombineFn that computes the arithmetic mean of its inputs.

  The accumulator is a (running total, number of inputs) pair.
  """

  def create_accumulator(self):
    """Return the empty accumulator: total 0.0 over 0 inputs."""
    return (0.0, 0)

  def add_input(self, sum_count, input):
    """Fold one input into the accumulator and return the updated pair."""
    running_total, seen = sum_count
    return running_total + input, seen + 1

  def merge_accumulators(self, accumulators):
    """Combine several (total, count) pairs into a single pair."""
    # zip(*...) transposes the pairs into one tuple of totals and one of counts.
    partial_totals, partial_counts = zip(*accumulators)
    return sum(partial_totals), sum(partial_counts)

  def extract_output(self, sum_count):
    """Return total / count, or NaN when nothing was counted."""
    total, n = sum_count
    if n:
      return total / n
    return float('NaN')


In [8]:
# Average a handful of integers with AverageFn and write the result as text
# using the 'data/result' file prefix.
with beam.Pipeline() as p:
  numbers = p | "Create data" >> beam.Create([21,45,78,99,1,22,5])
  average = numbers | "Combine Globally" >> beam.CombineGlobally(AverageFn())
  input_data = average | "Write to Local" >> beam.io.WriteToText('data/result')

In [9]:
!{'head -n 10 /content/data/result-00000-of-00001'}

38.714285714285715


In [11]:
class PercentagesFn(beam.CombineFn):
  """CombineFn that reports each distinct element's share of all elements.

  The accumulator is a dict mapping an element to how many times it was seen.
  """

  def create_accumulator(self):
    """Return an empty count table."""
    return {}

  def add_input(self, accumulator, input):
    """Count one more occurrence of `input` and return the accumulator."""
    # accumulator == {}
    # input == '🥕'
    accumulator[input] = accumulator.get(input, 0) + 1  # {'🥕': 1}
    return accumulator

  def merge_accumulators(self, accumulators):
    """Merge several partial count tables into one.

    Accumulators can be built in parallel, so the counts for the same element
    may be spread over several dicts; they are added together here.
    """
    # accumulators == [
    #     {'🥕': 1, '🍅': 2},
    #     {'🥕': 1, '🍅': 1, '🍆': 1},
    #     {'🥕': 1, '🍅': 3},
    # ]
    merged = {}
    for partial in accumulators:
      for item, count in partial.items():
        merged[item] = merged.get(item, 0) + count
    # merged == {'🥕': 3, '🍅': 6, '🍆': 1}
    return merged

  def extract_output(self, accumulator):
    """Convert the counts into fractions of the total count."""
    # accumulator == {'🥕': 3, '🍅': 6, '🍆': 1}
    total = sum(accumulator.values())  # 10
    percentages = {item: count / total for item, count in accumulator.items()}
    # percentages == {'🥕': 0.3, '🍅': 0.6, '🍆': 0.1}
    return percentages

In [None]:
# Ten produce items: 3 carrots, 6 tomatoes and 1 eggplant.
produce = ['🥕', '🍅', '🍅', '🥕', '🍆', '🍅', '🍅', '🍅', '🥕', '🍅']

with beam.Pipeline() as p:
  percentages = (
      p
      | 'Create produce' >> beam.Create(produce)
      | 'Get percentages' >> beam.CombineGlobally(PercentagesFn())
      | beam.Map(print)
  )

  # Expected printed value (key order may vary):
  # {'🥕': 0.3, '🍅': 0.6, '🍆': 0.1}

#CombinePerKey

- Combines all elements for each key in a collection

- CombinePerKey accepts a function that takes a list of values as an input, and combines them for each key.

In [16]:
# (plant, count) pairs; several pairs share a key so there is something to combine.
plant_counts = [
    ('🥕', 3),
    ('🥕', 2),
    ('🍆', 1),
    ('🍅', 4),
    ('🍅', 5),
    ('🍅', 3),
]

with beam.Pipeline() as pipeline:
  total = (
      pipeline
      | 'Create plant counts' >> beam.Create(plant_counts)
      | 'Sum' >> beam.CombinePerKey(sum)
      | beam.Map(print)
  )

('🥕', 5)
('🍆', 1)
('🍅', 12)


In [17]:
# saturated_sum takes an iterable of numbers and adds them together, up to a maximum number.
def saturated_sum(values, max_value=8):
    """Sum `values`, capping the result at `max_value`.

    Args:
      values: iterable of numbers to add together.
      max_value: upper bound for the returned total. Defaults to 8, the cap
        this function previously hard-coded, so existing calls are unaffected.
        A different cap can be supplied as a keyword argument, e.g.
        beam.CombinePerKey(saturated_sum, max_value=10).

    Returns:
      The smaller of sum(values) and max_value.
    """
    return min(sum(values), max_value)

# Per-key sum where saturated_sum caps each key's total.
plant_counts = [
    ('🥕', 3),
    ('🥕', 2),
    ('🍆', 1),
    ('🍅', 4),
    ('🍅', 5),
    ('🍅', 3),
]

with beam.Pipeline() as pipeline:
  saturated_total = (
      pipeline
      | 'Create plant counts' >> beam.Create(plant_counts)
      | 'Saturated sum' >> beam.CombinePerKey(saturated_sum)
      | beam.Map(print)
  )

('🥕', 5)
('🍆', 1)
('🍅', 8)


#CombineValues

- Combines an iterable of values in a keyed collection of elements.

- CombineValues accepts a function that takes an iterable of elements as an input, and combines them to return a single element.

- CombineValues expects a keyed PCollection of elements, where the value is an iterable of elements to be combined.

In [18]:
# Each value is already an iterable of counts, which is the shape CombineValues expects.
grouped_produce_counts = [
    ('🥕', [3, 2]),
    ('🍆', [1]),
    ('🍅', [4, 5, 3]),
]

with beam.Pipeline() as pipeline:
  total = (
      pipeline
      | 'Create produce counts' >> beam.Create(grouped_produce_counts)
      | 'Sum' >> beam.CombineValues(sum)
      | beam.Map(print)
  )

('🥕', 5)
('🍆', 1)
('🍅', 12)


#Latest

Gets the element with the latest timestamp.

In [None]:
# Uses the `time` module to turn a date string into a numeric timestamp.
def to_unix_time(time_str, format='%Y-%m-%d %H:%M:%S'):
    """Parse `time_str` with `format` and return it as seconds since the epoch.

    Args:
      time_str: date/time text, e.g. '2020-02-24 00:00:00'.
      format: strptime pattern describing `time_str`.

    Returns:
      A float produced by time.mktime, which interprets the parsed value as
      local time.
    """
    parsed = time.strptime(time_str, format)
    return time.mktime(parsed)

# Attach each crop's harvest date as its element timestamp, then keep the
# element with the latest timestamp.
with beam.Pipeline() as pipeline:
  latest_element = (
      pipeline
      | 'Create crops' >> beam.Create([
          {
              'item': '🥬', 'harvest': '2020-02-24 00:00:00'
          },
          {
              'item': '🍓', 'harvest': '2020-06-16 00:00:00'
          },
          {
              'item': '🥕', 'harvest': '2020-07-17 00:00:00'
          },
          {
              'item': '🍆', 'harvest': '2020-10-26 00:00:00'
          },
          {
              'item': '🍅', 'harvest': '2020-10-01 00:00:00'
          },
      ])
      # Reuse to_unix_time instead of repeating the strptime/mktime logic inline.
      | 'With timestamps' >> beam.Map(
          lambda crop: beam.window.TimestampedValue(
              crop, to_unix_time(crop['harvest'])))
      | 'Get latest element' >> beam.combiners.Latest.Globally()
      | beam.Map(print))



{'item': '🍆', 'harvest': '2020-10-26 00:00:00'}



#Max

- Gets the element with the maximum value within each aggregation.

- We use CombineGlobally() with max to get the maximum element from the entire PCollection, and CombinePerKey() with max to get the maximum value for each key.


In [22]:
# Largest value across the whole PCollection.
with beam.Pipeline() as pipeline:
    numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])
    max_element = (
        numbers
        # `or [None]` swaps in a one-item list when `elements` is falsy, so
        # max() is never handed an empty sequence.
        | 'Get max value' >> beam.CombineGlobally(lambda elements: max(elements or [None]))
        | beam.Map(print)
    )

4


In [23]:
# Largest value for each key in a PCollection of (key, value) pairs.
produce_counts = [
    ('🥕', 3),
    ('🥕', 2),
    ('🍆', 1),
    ('🍅', 4),
    ('🍅', 5),
    ('🍅', 3),
]

with beam.Pipeline() as pipeline:
    elements_with_max_value_per_key = (
        pipeline
        | 'Create produce' >> beam.Create(produce_counts)
        | 'Get max value per key' >> beam.CombinePerKey(max)
        | beam.Map(print)
    )

('🥕', 3)
('🍆', 1)
('🍅', 5)


#Mean

- We use Mean.Globally() to get the average of the elements from the entire PCollection.

In [26]:
# Arithmetic mean of every element in the PCollection.
# The variable and step label now say "mean"; they previously said "max",
# left over from the Max example.
with beam.Pipeline() as pipeline:
    mean_element = (
        pipeline
        | 'Create numbers' >> beam.Create([3, 4, 1, 2])
        | 'Get mean value' >> beam.combiners.Mean.Globally()
        | beam.Map(print)
    )

2.5


In [28]:
# We use Mean.PerKey() to get the average of the elements for each unique key
# in a PCollection of key-values.
# The variable and step label now say "mean"; they previously said "max".
with beam.Pipeline() as pipeline:
    elements_with_mean_value_per_key = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('🥕', 3),
            ('🥕', 2),
            ('🍆', 1),
            ('🍅', 4),
            ('🍅', 5),
            ('🍅', 3),
        ])
        | 'Get mean value per key' >> beam.combiners.Mean.PerKey()
        | beam.Map(print)
    )

('🥕', 2.5)
('🍆', 1.0)
('🍅', 4.0)


#Min

- Use CombineGlobally() with min to get the minimum element from the entire PCollection.

- Use CombinePerKey() with min to get the minimum value for each key in a PCollection of key-value pairs.
    


In [29]:
# Smallest value across the whole PCollection.
with beam.Pipeline() as pipeline:
    numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])
    min_element = (
        numbers
        # `or [-1]` swaps in a one-item list when `elements` is falsy, so
        # min() is never handed an empty sequence.
        | 'Get min value' >> beam.CombineGlobally(lambda elements: min(elements or [-1]))
        | beam.Map(print)
    )

1


In [30]:
# Smallest value for each key in a PCollection of (key, value) pairs.
# The variable and step label now say "min"; they previously said "max",
# left over from the Max example.
with beam.Pipeline() as pipeline:
    elements_with_min_value_per_key = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('🥕', 3),
            ('🥕', 2),
            ('🍆', 1),
            ('🍅', 4),
            ('🍅', 5),
            ('🍅', 3),
        ])
        | 'Get min value per key' >> beam.CombinePerKey(min)
        | beam.Map(print)
    )

('🥕', 2)
('🍆', 1)
('🍅', 3)


#Sum

- We use CombineGlobally() with sum to get the sum of all the element values from the entire PCollection.

- We use CombinePerKey() with sum to get the sum of all the element values for each unique key in a PCollection of key-values.

In [31]:
# Sum of every element in the PCollection.
# The variable and step label now say "sum"; they previously said "min",
# left over from the Min example.
with beam.Pipeline() as pipeline:
    sum_of_elements = (
        pipeline
        | 'Create numbers' >> beam.Create([3, 4, 1, 2])
        | 'Get sum' >> beam.CombineGlobally(sum)
        | beam.Map(print)
    )

10


In [32]:
# Sum of the values for each key in a PCollection of (key, value) pairs.
# The variable and step label now say "sum"; they previously said "max".
with beam.Pipeline() as pipeline:
    sum_per_key = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('🥕', 3),
            ('🥕', 2),
            ('🍆', 1),
            ('🍅', 4),
            ('🍅', 5),
            ('🍅', 3),
        ])
        | 'Get sum per key' >> beam.CombinePerKey(sum)
        | beam.Map(print)
    )

('🥕', 5)
('🍆', 1)
('🍅', 12)


#Top

- Gets the largest, smallest, or custom-ranked top elements from a PCollection.

- We use Top.Largest() to get the largest elements from the entire PCollection.

- We use Top.Smallest() to get the smallest elements from the entire PCollection.

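- We use Top.LargestPerKey() and Top.SmallestPerKey() to get the largest or smallest values for each key in a PCollection of key-value pairs.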

In [34]:
# Largest: the two biggest numbers in the PCollection.
with beam.Pipeline() as pipeline:
  numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])
  largest_elements = (
      numbers
      | 'Largest N values' >> beam.combiners.Top.Largest(2)
      | beam.Map(print)
  )

[4, 3]


In [35]:
# Largest, for key-value pairs: the two biggest values for each key.
produce_counts = [
    ('🥕', 3),
    ('🥕', 2),
    ('🍆', 1),
    ('🍅', 4),
    ('🍅', 5),
    ('🍅', 3),
]

with beam.Pipeline() as pipeline:
    largest_elements_per_key = (
        pipeline
        | 'Create produce' >> beam.Create(produce_counts)
        | 'Largest N values per key' >> beam.combiners.Top.LargestPerKey(2)
        | beam.Map(print)
    )

('🥕', [3, 2])
('🍆', [1])
('🍅', [5, 4])


In [36]:
# Smallest: the two smallest numbers in the PCollection.
# The variable and step label now say "smallest"; they previously said
# "largest", left over from the Top.Largest example.
with beam.Pipeline() as pipeline:
  smallest_elements = (
      pipeline
      | 'Create numbers' >> beam.Create([3, 4, 1, 2])
      | 'Smallest N values' >> beam.combiners.Top.Smallest(2)
      | beam.Map(print))

[1, 2]


In [37]:
# Smallest, for key-value pairs: the two smallest values for each key.
# The variable and step label now say "smallest"; they previously said
# "largest", left over from the Top.LargestPerKey example.
with beam.Pipeline() as pipeline:
    smallest_elements_per_key = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('🥕', 3),
            ('🥕', 2),
            ('🍆', 1),
            ('🍅', 4),
            ('🍅', 5),
            ('🍅', 3),
        ])
        | 'Smallest N values per key' >> beam.combiners.Top.SmallestPerKey(2)
        | beam.Map(print))

('🥕', [2, 3])
('🍆', [1])
('🍅', [3, 4])


- We use Top.Of() to get elements with customized rules from the entire PCollection.
- You can change how the elements are compared with key.
- By default you get the largest elements, but you can get the smallest by setting reverse=True.

In [38]:
# Rank produce names by length and keep the two shortest.
produce_names = [
    '🍓 Strawberry',
    '🥕 Carrot',
    '🍏 Green apple',
    '🍆 Eggplant',
    '🌽 Corn',
]

with beam.Pipeline() as pipeline:
    names = pipeline | 'Create produce names' >> beam.Create(produce_names)
    shortest_elements = (
        names
        | 'Shortest names' >> beam.combiners.Top.Of(
            2,             # how many elements to keep
            key=len,       # compare by length; optional, defaults to the element itself
            reverse=True,  # smallest first; optional, defaults to False (largest/descending)
        )
        | beam.Map(print)
    )

['🌽 Corn', '🥕 Carrot']


In [39]:
# For each season, rank its produce names by length and keep the two shortest.
produce_by_season = [
    ('spring', '🥕 Carrot'),
    ('spring', '🍓 Strawberry'),
    ('summer', '🥕 Carrot'),
    ('summer', '🌽 Corn'),
    ('summer', '🍏 Green apple'),
    ('fall', '🥕 Carrot'),
    ('fall', '🍏 Green apple'),
    ('winter', '🍆 Eggplant'),
]

with beam.Pipeline() as pipeline:
    seasonal_names = pipeline | 'Create produce names' >> beam.Create(produce_by_season)
    shortest_elements_per_key = (
        seasonal_names
        | 'Shortest names per key' >> beam.combiners.Top.PerKey(
            2,             # how many elements to keep per key
            key=len,       # compare by length; optional, defaults to the value itself
            reverse=True,  # smallest first; optional, defaults to False (largest/descending)
        )
        | beam.Map(print)
    )

('spring', ['🥕 Carrot', '🍓 Strawberry'])
('summer', ['🌽 Corn', '🥕 Carrot'])
('fall', ['🥕 Carrot', '🍏 Green apple'])
('winter', ['🍆 Eggplant'])
