# Init SparkContext

In [1]:
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
spark = (SparkSession.builder.appName("pyspark-rdd-demo-{}".format(datetime.today()))
        .master("spark://spark-master:7077")      
        .getOrCreate())

sqlContext = SQLContext(spark)




In [3]:
sc = spark.sparkContext
sc

# Spark RDD MapReduce

In [4]:
# lines = sc.textFile("s3a://warehouse/bronze/sample_text.txt", 4)
lines = sc.textFile("s3a://warehouse/tpch_data/h_lineitem.dsv", 4)
# lines.cache()
# lines.count()

In [6]:
# lines.count()

11996783

In [7]:
# lines.unpersist()

s3a://warehouse/tpch_data/h_lineitem.dsv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
%%time
# word counts
# Map phase: Split each line into words and assign count of 1 to each word
word_counts = lines.flatMap(lambda line: line.split("|")) \
                       .map(lambda word: (word, 1))

# Reduce phase: Sum the counts for each word
word_counts = word_counts.reduceByKey(lambda a, b: a + b)

# Collect the results
results = word_counts.take(10)

# Print the word counts
for word, count in results:
    print(f"{word}: {count}")

"L_SUPPKEY": 1
"L_EXTENDEDPRICE": 1
2876391: 1
"final accounts are across .": 278
03.04.92: 12056
5690: 624
6947: 647
"packages eat fluffily above": 90
39859: 34
18498,96: 28
CPU times: user 80.6 ms, sys: 26.8 ms, total: 107 ms
Wall time: 1min 41s


In [6]:
%%time
word_count = {}  # Dictionary to store word counts
filename = "h_lineitem.dsv"
with open(filename, 'r') as file:
    for line in file:
        words = line.strip().split("|")  # Split line into words

        for word in words:
            # Remove punctuation and convert to lowercase for consistent counting
            word = word.strip('.,!?').lower()

            if word:  # Ignore empty words
                if word in word_count:
                    word_count[word] += 1
                else:
                    word_count[word] = 1
                    
# Print the top 10 words and their counts
count = 0
for word, count in word_count.items():
    print(f'{word}: {count}')
    if count > 10:
        break
    count += 1

"l_orderkey": 1
"l_partkey": 1
"l_suppkey": 1
"l_linenumber": 1
"l_quantity": 1
"l_extendedprice": 1
"l_discount": 1
"l_tax": 1
"l_returnflag": 1
"l_linestatus": 1
"l_shipdate": 1
"l_commitdate": 1
"l_receiptdate": 1
"l_shipinstruct": 1
"l_shipmode": 1
"l_comment": 1
2828519: 7
24450: 33
CPU times: user 1min 31s, sys: 2.97 s, total: 1min 34s
Wall time: 1min 35s


In [10]:
spark.sql("CREATE DATABASE brazillian_ecom")

DataFrame[]

In [11]:
spark.sql("SHOW DATABASES").show()

+---------------+
|      namespace|
+---------------+
|brazillian_ecom|
|        default|
+---------------+



# RDD overview
- Programmer specifies number of partitions
- Driver passes each partition to corresponding Workers
- Master parameter specifies number of workers.
- Spark automatically pushes closures to workers.

# Some transformations
- map(func): return a new distributed dataset formed by passing each element of the source through a function func.
- filter(func): return a new dataset formed by selecting those elements of the source on which func returns true.
- distinct([numTasks]): return a new dataset that contains the distinct elements of the source dataset.
- flatMap(func): similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item).

In [4]:
rdd = sc.parallelize([1, 4, 2, 2, 3, 4])
rdd.map(lambda x: x * 2).collect()

[2, 8, 4, 4, 6, 8]

In [5]:
rdd.filter(lambda x: x % 2 == 0).collect()

[4, 2, 2, 4]

In [6]:
rdd = sc.parallelize([1, 4, 2, 2, 3])
rdd.distinct().collect()

[4, 2, 3, 1]

In [7]:
rdd = sc.parallelize([1, 2, 3])
rdd.map(lambda x: [x, x + 5]).collect()

[[1, 6], [2, 7], [3, 8]]

In [8]:
rdd.flatMap(lambda x: [x, x + 5]).collect()

[1, 6, 2, 7, 3, 8]

# Some actions
- reduce(func): aggregate dataset's elements using function func, func takes two arguments and returns one, and is commutative and associative so that it can be computed correctly in parallel.
- take(n): return an array with the list n elements.
- collect(): return all the elements as an array. WARNING: make sure will fit in driver program.
- takeOrdered(n, key=func): return n elements ordred in ascending order or as specified by the optional key function.

In [9]:
rdd = sc.parallelize([1, 4, 2, 2, 3, 4])
rdd.reduce(lambda a, b: a * b)

192

In [10]:
rdd.take(2)

[1, 4]

In [11]:
rdd.collect()

[1, 4, 2, 2, 3, 4]

In [12]:
rdd.takeOrdered(3, lambda s: -1 * s)

[4, 4, 3]

# Key-Value RDDs
- Similar to Map Reduce, Spark supports Key-Value pairs
- Each element of a Pair RDD is a pair tuple
## Some Key-Value transformation
- reduceByKey(func): return a new distributed dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function func, which must be of type (V, V) -> V.
- sortByKey(): return a new dataset (K, V) pairs sorted by keys in asceding order.
- groupByKey(): return a new dataset of (K, Iterable<V>) pairs.

In [16]:
rdd = sc.parallelize([(1, 2), (3, 4)])
rdd.collect()

[(1, 2), (3, 4)]

In [17]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])
rdd.reduceByKey(lambda a, b: a + b).collect()

[(3, 10), (1, 2)]

In [18]:
rdd = sc.parallelize([(1, "a"), (2, "c"), (1, "b")])
rdd.sortByKey().collect()

[(1, 'a'), (1, 'b'), (2, 'c')]

In [19]:
rdd.groupByKey().map(lambda x : (x[0], list(x[1]))).collect()

[(2, ['c']), (1, ['a', 'b'])]

# Broadcast variables
- Keep read-only variable cached on workers, ship to each worker only once instead of with each task

In [None]:
# at the driver:
bcVar = sc.broadcast([1, 2, 3])

# at the worker (in code passed via a closure)
bcVar.value

# Accumulators
- Variables that can only be "added" to by associative op
- Used to efficiently implement parallel counters and sums
- Only driver can read an accumulator's value, not tasks
- Tasks at workers cannot access accumulator's values
- Tasks see accumulators as write-only variables
- Actions: each task's update to accumulator is applied only once
- Transformations: no guarantees (use only for debugging)
- Types: integers, double, long, float

In [None]:
accum = sc.accumulator(0)
rdd = sc.parallelize([1, 2, 3, 4])
def f(x):
    global accum
    accum += x
    
rdd.foreach(f)
accum.value

# X.join(Y)
- Return RDD of all pairs of elements with matching keys in X and Y.
- Each pair is (k, (v1, v2)) tuple, where (k, v1) is in X and (k, v2) is in Y.

In [21]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.join(y).collect())

[('a', (1, 2)), ('a', (1, 3))]

# X.leftOuterJoin(Y)
- For each element (k, v) in X, resulting RDD will either contain
 - All pairs (k, (v, w)) for w in Y.
 - Or the pair (k, (v, None)) if no elements in Y have key k.

In [22]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
sorted(x.leftOuterJoin(y).collect())

[('a', (1, 2)), ('b', (4, None))]

# X.rightOuterJoin(Y)
- For each element (k, w) in Y, resulting RDD will either contain
 - All pairs (k, (v, w)) for v in X.
 - Or the pair (k, (None, w)) if no elements in X have key k.

In [23]:
x = sc.parallelize([("a", 1)])
y = sc.parallelize([("a", 2), ("b", 4)])
sorted(x.rightOuterJoin(y).collect())

[('a', (1, 2)), ('b', (None, 4))]

# X.fullOuterJoin(Y)
- For each element (k, v) in X, resulting RDD will either contain
 - All pairs (k, (v, w)) for w in Y.
 - Or the pair (k, (v, None)) if no elements in Y have key k.
- For each element (k, w) in Y, resulting RDD will either contain
 - All pairs (k, (v, w)) for v in X.
 - Or the pair (k, (None, w)) if no elements in X have key k.

In [24]:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("c", 8)])
sorted(x.fullOuterJoin(y).collect())

[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]