[RDD Actions](https://sparkbyexamples.com/pyspark/pyspark-rdd-actions/)

In [1]:
from operator import add

from pyspark.sql import SparkSession

# Create (or reuse) a Spark session that runs locally with 3 worker threads.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("RDD_Actions")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/17 09:11:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Create the two sample RDDs used by the action examples below.

# rdd1: (key, value) pairs; note that ("B", 30) appears twice.
data = [
    ("Z", 1),
    ("A", 20),
    ("B", 30),
    ("C", 40),
    ("B", 30),
    ("B", 60),
]
rdd1 = spark.sparkContext.parallelize(data)

# rdd2: plain integers; 2 and 3 each appear twice.
rdd2 = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 3, 2])

In [3]:
# collect() - returns every element of the RDD to the driver as a Python list.
# Fine for this tiny sample; on a large RDD it can exhaust driver memory.
rdd1_data = rdd1.collect()
print(rdd1_data)

[Stage 0:>                                                          (0 + 3) / 3]

[('Z', 1), ('A', 20), ('B', 30), ('C', 40), ('B', 30), ('B', 60)]


                                                                                

In [4]:
# count() - returns the number of elements in the RDD (duplicates included)
rdd2_count = rdd2.count()
print(rdd2_count)

[Stage 1:>                                                          (0 + 3) / 3]

7


                                                                                

In [11]:
# countApprox() - approximate version of count(): returns the count available
# once `timeout` (in milliseconds) expires, even if some tasks are still running.
# For this small RDD the result equals the exact count.
# Stored under its own name so the exact `rdd2_count` from the count() cell
# is not silently overwritten by an approximate value.
rdd2_approx_count = rdd2.countApprox(timeout=1000)
print(rdd2_approx_count)

7


In [12]:
# countApproxDistinct() - returns the approximate number of distinct elements
# (an estimate, so on large data it may differ slightly from the exact distinct count)
rdd2_distinct = rdd2.countApproxDistinct()
print(rdd2_distinct)

5


In [15]:
# countByValue() - returns a dict-like mapping {element: number of occurrences},
# collected to the driver
rdd2_cntByVal = rdd2.countByValue()
print(rdd2_cntByVal)

# For the pair RDD, each whole (key, value) tuple is counted as one element.
rdd1_cntByVal = rdd1.countByValue()
print(rdd1_cntByVal)

defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 2, 4: 1, 5: 1})
defaultdict(<class 'int'>, {('Z', 1): 1, ('A', 20): 1, ('B', 30): 2, ('C', 40): 1, ('B', 60): 1})


In [16]:
# first() - returns the first element of the RDD (raises ValueError if the RDD is empty)
print(rdd1.first())
print(rdd2.first())

('Z', 1)
1


In [17]:
# top(n) - returns the n largest elements of the RDD, in descending order.
# Tuples are compared element by element, so rdd1 is ranked by key first.
print(rdd1.top(2))
print(rdd2.top(3))

[('Z', 1), ('C', 40)]
[5, 4, 3]


In [18]:
# min() - returns the smallest element of the RDD
# (rdd1 tuples are compared element by element, i.e. by key first)
print(rdd1.min())
print(rdd2.min())

('A', 20)
1


In [19]:
# max() - returns the largest element of the RDD
# (rdd1 tuples are compared element by element, i.e. by key first)
print(rdd1.max())
print(rdd2.max())

('Z', 1)
5


In [20]:
# take(n) - returns the first n elements of the RDD as a list (no sorting is applied)
print(rdd1.take(2))
print(rdd2.take(2))

[('Z', 1), ('A', 20)]
[1, 2]


In [21]:
# takeOrdered(n) - returns the n smallest elements of the RDD, in ascending order
print(rdd1.takeOrdered(2))
print(rdd2.takeOrdered(3))

[('A', 20), ('B', 30)]
[1, 2, 2]


In [23]:
# takeSample() - returns a fixed-size random sample (up to `num` elements) of the RDD as a list.
# withReplacement=True allows the same element to be drawn more than once.
# A fixed seed keeps the printed sample identical across re-runs of the notebook.
print(rdd1.takeSample(withReplacement=True, num=3, seed=42))
print(rdd2.takeSample(withReplacement=False, num=3, seed=42))

[('B', 30), ('Z', 1), ('C', 40)]
[1, 3, 3]


In [24]:
# reduce() - combines all elements of the RDD into a single value using a
# commutative and associative binary operator.
# `add` comes from the standard-library `operator` module, imported in the
# notebook's import cell at the top.
reduced_rdd2 = rdd2.reduce(add)  # equivalent to rdd2.reduce(lambda x, y: x + y)
print(reduced_rdd2)

20


In [25]:
# treeReduce() - same result as reduce(), but the partial results are merged
# in a multi-level tree pattern
reduced_rdd2 = rdd2.treeReduce(lambda left, right: left + right)
print(reduced_rdd2)

20


In [27]:
# fold() - like reduce(), but starts from a zeroValue: the zeroValue seeds the
# aggregation inside every partition and once more when the per-partition
# results are merged. With zeroValue=0 and addition this is simply the sum.
fold_rdd2 = rdd2.fold(0, lambda acc, item: acc + item)
print(fold_rdd2)

20


In [29]:
# fold() with a non-neutral zeroValue: the result depends on the number of partitions.
print("Number of partitions : ", rdd2.getNumPartitions())

# glom() turns each partition into a list, so collect() reveals the partition layout.
print(rdd2.glom().collect())

fold_rdd2 = rdd2.fold(1, lambda acc, item: acc + item)
print(fold_rdd2)

# Worked example for 3 partitions laid out as [[1, 2], [3, 4], [5, 3, 2]].
# zeroValue=1 is added once inside each partition and once more in the final merge:
#   per partition: 1 + (1 + 2) = 4,   1 + (3 + 4) = 8,   1 + (5 + 3 + 2) = 11
#   final merge:   1 + 4 + 8 + 11 = 24

Number of partitions :  3
[[1, 2], [3, 4], [5, 3, 2]]
24


In [None]:
# fold() evaluation order made visible: pairing the operands instead of adding
# them records every call as a nested tuple. zeroValue=1 appears once at the
# start of each partition and once at the start of the final merge.
print(rdd2.fold(1, lambda acc, item: (acc, item)))
# The same fold with real addition, for comparison.
print(rdd2.fold(1, lambda acc, item: acc + item))

(((1, ((1, 1), 2)), ((1, 3), 4)), (((1, 5), 3), 2))
24


In [32]:
# aggregate() - like fold(), but with two separate functions:
#   seqOp  folds the elements inside each partition, starting from zeroValue
#   combOp merges the per-partition results, again starting from zeroValue


def seqOp(acc, item):
    """Per-partition step: add the next element to the running total."""
    return acc + item


def combOp(left, right):
    """Merge step: multiply two partial results together."""
    return left * right


agg = rdd2.aggregate(1, seqOp, combOp)
print(agg)

# Worked example with rdd2 split into 3 partitions as [[1, 2], [3, 4], [5, 3, 2]]
# and zeroValue = 1:
#   seqOp  (per partition):        1 + 1 + 2 = 4,   1 + 3 + 4 = 8,   1 + 5 + 3 + 2 = 11
#   combOp (over partition results): 1 * 4 * 8 * 11 = 352

352


In [40]:
# aggregate() computing (sum, count) in a single pass.
# The accumulator is a (running_sum, running_count) tuple and zeroValue is (0, 0).


def seqOp2(acc, value):
    """Fold one element into a partition's (sum, count) accumulator."""
    total, count = acc
    return (total + value, count + 1)


def combOp2(left, right):
    """Merge the (sum, count) accumulators of two partitions."""
    return (left[0] + right[0], left[1] + right[1])


agg2 = rdd2.aggregate((0, 0), seqOp2, combOp2)
print(agg2)

(20, 7)


In [101]:
# treeAggregate() - like aggregate(), but the per-partition results are merged
# in a multi-level tree pattern (the optional `depth` argument defaults to 2).
# Reuses seqOp2 / combOp2 from the aggregate() cell above to compute (sum, count).
agg_tree = rdd2.treeAggregate((0, 0), seqOp2, combOp2)
print(agg_tree)

In [102]:
# Stop the Spark session; run this cell last, since the RDDs above need an active session.
spark.stop()