In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Build the Spark session used by every later cell; getOrCreate() returns the
# session produced by this builder chain.
spark = (
    SparkSession.builder
    .appName('Transformations1')
    .getOrCreate()
)

In [17]:
# Sample (key, value) pairs: three values per key, one of them negative for
# keys 'A' and 'B'.
tuples = [
    ('A', 7), ('A', 8), ('A', -4),
    ('B', 3), ('B', 9), ('B', -1),
    ('C', 1), ('C', 5), ('C', 9),
]

# Turn the local list into an RDD, then bring it back to the driver to display it.
rdd1 = spark.sparkContext.parallelize(tuples)
rdd1.collect()

[('A', 7),
 ('A', 8),
 ('A', -4),
 ('B', 3),
 ('B', 9),
 ('B', -1),
 ('C', 1),
 ('C', 5),
 ('C', 9)]

In [18]:
# Keep only the pairs whose value is strictly greater than zero.

positives = rdd1.filter(
    lambda pair: pair[1] > 0
)
# top(20) returns at most 20 elements in descending order, so the display is
# sorted rather than in the RDD's original order.
positives.top(20)

[('C', 9), ('C', 5), ('C', 1), ('B', 9), ('B', 3), ('A', 8), ('A', 7)]

In [19]:
# Group the values by key and display the result. Each value is shown as a
# ResultIterable object, not as the numbers it holds.
(
    positives
    .groupByKey()
    .collect()
)

[('B', <pyspark.resultiterable.ResultIterable at 0x258c0c53370>),
 ('C', <pyspark.resultiterable.ResultIterable at 0x258c0c53970>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x258c0c532b0>)]

In [20]:
# Sum and average per key using groupByKey().

def _sum_and_mean(values):
    """Return ``(total, mean)`` for the grouped values of one key.

    ``values`` is the iterable groupByKey() produces for a key; it must
    support ``len()``. The total is computed once and reused for the mean,
    instead of summing the same values twice.
    """
    total = sum(values)
    return total, float(total) / len(values)


sum_and_avg = positives.groupByKey().mapValues(_sum_and_mean)

sum_and_avg.collect()

[('B', (12, 6.0)), ('C', (15, 5.0)), ('A', (15, 7.5))]

In [21]:
# Sum and average per key using reduceByKey().
# First pair every value with a count of 1, giving (key, (value, 1)).

sum_count = positives.mapValues(lambda value: (value, 1))
sum_count.collect()

[('A', (7, 1)),
 ('A', (8, 1)),
 ('B', (3, 1)),
 ('B', (9, 1)),
 ('C', (1, 1)),
 ('C', (5, 1)),
 ('C', (9, 1))]

In [22]:
# Merge the (sum, count) pairs of each key element-wise, producing
# (key, (total, number_of_values)).
sum_count_agg2 = sum_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
sum_count_agg2.collect()

[('B', (12, 2)), ('C', (15, 3)), ('A', (15, 2))]

In [23]:
# Turn each (sum, count) pair into (sum, average). float() is applied to the
# numerator before dividing -- the same form the groupByKey() cell uses -- so
# the average does not depend on how the interpreter divides two integers.
sum_and_avg2 = sum_count_agg2.mapValues(lambda x: (x[0], float(x[0]) / x[1]))
sum_and_avg2.collect()

[('B', (12, 6.0)), ('C', (15, 5.0)), ('A', (15, 7.5))]

In [16]:
# The groupByKey() transformation groups the values for each key
# in the RDD into a single sequence, similar to a SQL GROUP BY statement.
# This transformation can cause out-of-memory (OOM) errors, because
# every value is sent over the network of Spark servers and collected
# on the reducers/workers; this becomes a problem when the number of
# values per key is in the thousands or millions.
# With the reduceByKey() transformation, however, data is combined
# in each partition, so there is only one output for each key in
# each partition to send over the network of Spark servers. This
# makes it more scalable than groupByKey(). reduceByKey() merges
# the values for each key using an associative and commutative
# reduce function. It combines all the values (per key) into another
# value with the exact same data type (this is a limitation, which can
# be overcome by using the combineByKey() transformation). Overall,
# reduceByKey() is more scalable than groupByKey().