# Init SparkContext

In [3]:
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

In [5]:
# Build (or reuse) a session on the standalone master. The application name
# embeds the current timestamp.
spark = (
    SparkSession.builder
    .appName("pyspark-rdd-demo-{}".format(datetime.today()))
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession; the session to wrap is passed as the second argument.
sqlContext = SQLContext(spark.sparkContext, spark)
# Uncomment to inspect the effective Spark configuration:
# spark.sparkContext.getConf().getAll()

In [6]:
# Keep a short handle on the session's SparkContext; the RDD examples below
# all go through `sc`. The bare name echoes it as the cell output.
sc = spark.sparkContext
sc

# RDD overview
- Programmer specifies number of partitions
- Driver passes each partition to corresponding Workers
- Master parameter specifies the cluster to connect to (in local mode, `local[N]`, it specifies the number of worker threads).
- Spark automatically pushes closures to workers.

# Some transformations
- map(func): return a new distributed dataset formed by passing each element of the source through a function func.
- filter(func): return a new dataset formed by selecting those elements of the source on which func returns true.
- distinct([numTasks]): return a new dataset that contains the distinct elements of the source dataset.
- flatMap(func): similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item).

In [5]:
# map: double every element, then bring the results back to the driver.
rdd = sc.parallelize([1, 2, 3, 4])
rdd.map(lambda n: n * 2).collect()

[2, 4, 6, 8]

In [7]:
# filter: keep only the even elements.
rdd.filter(lambda n: n % 2 == 0).collect()

[2, 4]

In [8]:
# distinct: the duplicated 2 collapses to a single element.
rdd = sc.parallelize([1, 4, 2, 2, 3])
rdd.distinct().collect()

[4, 2, 3, 1]

In [9]:
# map with a list-returning function: one nested list per input element.
rdd = sc.parallelize([1, 2, 3])
rdd.map(lambda n: [n, n + 5]).collect()

[[1, 6], [2, 7], [3, 8]]

In [10]:
# flatMap: same function as the map example, but the returned lists are
# flattened into a single sequence of elements.
rdd.flatMap(lambda n: [n, n + 5]).collect()

[1, 6, 2, 7, 3, 8]

# Some actions
- reduce(func): aggregate dataset's elements using function func, func takes two arguments and returns one, and is commutative and associative so that it can be computed correctly in parallel.
- take(n): return an array with the first n elements.
- collect(): return all the elements as an array. WARNING: make sure the result will fit in the driver program's memory.
- takeOrdered(n, key=func): return n elements ordered in ascending order or as specified by the optional key function.

In [12]:
# reduce: fold the elements pairwise with multiplication (3 * 4 * 5).
rdd = sc.parallelize([3, 4, 5])
rdd.reduce(lambda left, right: left * right)

60

In [15]:
# take: fetch only the first two elements.
rdd.take(2)

[3, 4]

In [16]:
# collect: bring every element back to the driver as a list.
rdd.collect()

[3, 4, 5]

In [9]:
# Read the text file with a minimum of 4 partitions and mark it for caching in
# the same chain (cache() returns the RDD itself), then count its lines.
lines = sc.textFile(
    "s3a://warehouse/bronze/sample_text.txt", minPartitions=4
).cache()
lines.count()

5

In [19]:
# rdd is unaffected by the text-file cell above; it still holds [3, 4, 5].
rdd.collect()

[3, 4, 5]

In [20]:
# takeOrdered sorts ascending by key; negating each element therefore yields
# the three largest values, largest first.
rdd = sc.parallelize([5, 3, 1, 2])
rdd.takeOrdered(3, key=lambda s: -s)

[5, 3, 2]

In [10]:
# Count again; `lines` was marked with cache() when it was created.
lines.count()

5

In [22]:
# Remove `lines` from the cache. unpersist() returns the RDD, which the
# notebook echoes as the cell output.
lines.unpersist()

s3a://warehouse/2023 MapPartitionsRDD[22] at textFile at NativeMethodAccessorImpl.java:0

# Key-Value RDDs
- Similar to Map Reduce, Spark supports Key-Value pairs
- Each element of a Pair RDD is a pair tuple
## Some Key-Value transformations
- reduceByKey(func): return a new distributed dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function func, which must be of type (V, V) -> V.
- sortByKey(): return a new dataset of (K, V) pairs sorted by keys in ascending order.
- groupByKey(): return a new dataset of (K, Iterable<V>) pairs.

In [16]:
# A pair RDD: every element is a (key, value) tuple.
rdd = sc.parallelize([(1, 2), (3, 4)])
rdd.collect()

[(1, 2), (3, 4)]

In [23]:
# reduceByKey: sum the values per key (key 3 -> 5 + 6).
rdd = sc.parallelize([(1, 2), (3, 5), (3, 6)])
rdd.reduceByKey(lambda total, value: total + value).collect()

[(1, 2), (3, 11)]

In [25]:
# sortByKey: order the pairs by key only; values play no part in the sort.
rdd = sc.parallelize([(1, "b"), (2, "c"), (1, "a")])
rdd.sortByKey().collect()

[(1, 'b'), (1, 'a'), (2, 'c')]

In [26]:
# groupByKey yields (key, iterable) pairs; materialise each iterable as a list
# so the grouped values are visible in the output.
rdd.groupByKey().mapValues(list).collect()

[(2, ['c']), (1, ['b', 'a'])]

# Broadcast variables
- Keep read-only variable cached on workers, ship to each worker only once instead of with each task

In [27]:
# On the driver: wrap the list in a broadcast variable.
bcVar = sc.broadcast([1, 2, 3])

# On a worker (inside code shipped via a closure): read it through .value.
bcVar.value

[1, 2, 3]

# Accumulators
- Variables that can only be "added" to by associative op
- Used to efficiently implement parallel counters and sums
- Only driver can read an accumulator's value, not tasks
- Tasks at workers cannot access accumulator's values
- Tasks see accumulators as write-only variables
- Actions: each task's update to accumulator is applied only once
- Transformations: no guarantees (use only for debugging)
- Types: integers, double, long, float

In [28]:
# Sum the RDD's elements into an accumulator from inside the tasks, then read
# the total back on the driver.
rdd = sc.parallelize([1, 2, 3, 4])
accum = sc.accumulator(0)


def f(x):
    """Add one RDD element to the shared accumulator."""
    accum.add(x)


rdd.foreach(f)
accum.value

10

# X.join(Y)
- Return RDD of all pairs of elements with matching keys in X and Y.
- Each pair is (k, (v1, v2)) tuple, where (k, v1) is in X and (k, v2) is in Y.

In [30]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
# Inner join: key "b" has no match in y and is dropped. The result is
# collected once and sorted on the driver so the displayed order is stable.
sorted(x.join(y).collect())

[('a', (1, 2)), ('a', (1, 3))]

# X.leftOuterJoin(Y)
- For each element (k, v) in X, resulting RDD will either contain
 - All pairs (k, (v, w)) for w in Y.
 - Or the pair (k, (v, None)) if no elements in Y have key k.

In [32]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
# Key "b" has no match in y, so it is kept with None on the right-hand side.
sorted(x.leftOuterJoin(y).collect())

[('a', (1, 2)), ('b', (4, None))]

# X.rightOuterJoin(Y)
- For each element (k, w) in Y, resulting RDD will either contain
 - All pairs (k, (v, w)) for v in X.
 - Or the pair (k, (None, w)) if no elements in X have key k.

In [34]:
x = sc.parallelize([("a", 1)])
y = sc.parallelize([("a", 2), ("b", 4)])
# Key "b" has no match in x, so it is kept with None on the left-hand side.
sorted(x.rightOuterJoin(y).collect())

[('a', (1, 2)), ('b', (None, 4))]

# X.fullOuterJoin(Y)
- For each element (k, v) in X, resulting RDD will either contain
 - All pairs (k, (v, w)) for w in Y.
 - Or the pair (k, (v, None)) if no elements in Y have key k.
- For each element (k, w) in Y, resulting RDD will either contain
 - All pairs (k, (v, w)) for v in X.
 - Or the pair (k, (None, w)) if no elements in X have key k.

In [35]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("c", 8)])
# Keys present on only one side ("b" in x, "c" in y) are kept, with None
# filling the missing side.
sorted(x.fullOuterJoin(y).collect())

[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]