In [0]:
from pyspark import SparkContext

# Word-count walkthrough of the core RDD operations: flatMap, filter, map,
# groupByKey, reduceByKey and coalesce, run on a local SparkContext.
sc = SparkContext("local[*]", "RDDPipeline")

# Everything that uses the context runs inside try/finally so the context is
# stopped even when a step raises. A context left running would make the next
# SparkContext(...) call in the same process (e.g. re-running this cell) fail.
try:
    # Original input data, explicitly split into 4 partitions
    lines = sc.parallelize([
        "spark makes big data simple",
        "spark runs fast on clusters",
        "data engineers love spark",
    ], 4)

    # NOTE: the steps below are transformations; they only describe the
    # computation. Nothing runs until the collect() action near the end.
    # The "Output" comments list the expected elements of each RDD.

    # flatMap: Split each line into words (whitespace-separated)
    words = lines.flatMap(lambda s: s.split())
    # Output:
    # ['spark', 'makes', 'big', 'data', 'simple',
    #  'spark', 'runs', 'fast', 'on', 'clusters',
    #  'data', 'engineers', 'love', 'spark']

    # filter: Keep words with length > 3 (drops 'big' and 'on')
    words_filtered = words.filter(lambda w: len(w) > 3)
    # Output:
    # ['spark', 'makes', 'data', 'simple',
    #  'spark', 'runs', 'fast', 'clusters',
    #  'data', 'engineers', 'love', 'spark']

    # map: Create (word, 1) key-value pairs
    pairs = words_filtered.map(lambda w: (w, 1))
    # Output:
    # [('spark', 1), ('makes', 1), ('data', 1), ('simple', 1),
    #  ('spark', 1), ('runs', 1), ('fast', 1), ('clusters', 1),
    #  ('data', 1), ('engineers', 1), ('love', 1), ('spark', 1)]

    # groupByKey: Group values by key; mapValues(list) turns each group of
    # values into a plain list. Shown for comparison with reduceByKey only:
    # no action is called on grouped_lists in this cell, so it is never
    # actually computed here.
    grouped = pairs.groupByKey()
    grouped_lists = grouped.mapValues(list)
    # Output (order of keys after the shuffle is not guaranteed):
    # [('spark', [1, 1, 1]),
    #  ('makes', [1]),
    #  ('data', [1, 1]),
    #  ('simple', [1]),
    #  ('runs', [1]),
    #  ('fast', [1]),
    #  ('clusters', [1]),
    #  ('engineers', [1]),
    #  ('love', [1])]

    # reduceByKey: Aggregate counts per word by summing the 1s for each key
    counts = pairs.reduceByKey(lambda a, b: a + b)
    # Output (order of keys after the shuffle is not guaranteed):
    # [('spark', 3),
    #  ('makes', 1),
    #  ('data', 2),
    #  ('simple', 1),
    #  ('runs', 1),
    #  ('fast', 1),
    #  ('clusters', 1),
    #  ('engineers', 1),
    #  ('love', 1)]

    # coalesce: Reduce number of partitions to 1 before collecting
    counts_one_part = counts.coalesce(1)
    # partitions before coalesce: 4 (assuming reduceByKey keeps the parent's
    #   partition count, i.e. no default parallelism override is configured)
    # partitions after coalesce: 1

    # collect() is the only action: it triggers the whole pipeline and
    # returns the (word, count) pairs to the driver as a Python list.
    print("✅ Final Result:", counts_one_part.collect())
    # Final Output (same pairs as above; order may differ):
    # [('spark', 3),
    #  ('makes', 1),
    #  ('data', 2),
    #  ('simple', 1),
    #  ('runs', 1),
    #  ('fast', 1),
    #  ('clusters', 1),
    #  ('engineers', 1),
    #  ('love', 1)]
finally:
    # Always release the context, on success and on failure alike.
    sc.stop()
