In [1]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists; otherwise start one named "RDD_Practice".
spark = (
    SparkSession.builder
    .appName("RDD_Practice")
    .getOrCreate()
)

# The SparkContext is the handle used for every RDD created in this notebook.
sc = spark.sparkContext

In [2]:
# Bare expression on the cell's last line: the notebook renders the session object.
spark

In [3]:
# The integers 1..6, held on the driver and then distributed as an RDD.
# The following cells reuse `rdd` for the transformation and action examples.
data = list(range(1, 7))
rdd = sc.parallelize(data)

# collect() brings every element back to the driver -- fine for a toy dataset.
elements = rdd.collect()
print("RDD Elements:", elements)

RDD Elements: [1, 2, 3, 4, 5, 6]


In [4]:
# map: apply a function to every element, producing a new RDD of the results
squared = rdd.map(lambda n: n * n)
squared_values = squared.collect()
print("Squared:", squared_values)

# filter: keep only the elements for which the predicate is true
even = rdd.filter(lambda n: n % 2 == 0)
even_values = even.collect()
print("Even numbers:", even_values)


Squared: [1, 4, 9, 16, 25, 36]
Even numbers: [2, 4, 6]


In [5]:
# Each entry pairs a label with an RDD action. The action is only invoked
# inside the loop, so every job runs immediately before its line is printed.
summary_actions = (
    ("Count:", rdd.count),
    ("Sum:", rdd.sum),
    ("Max:", rdd.max),
    ("First element:", rdd.first),
)
for label, action in summary_actions:
    print(label, action())


Count: 6
Sum: 21
Max: 6
First element: 1


In [6]:
# Three short sentences, one per RDD element.
lines = sc.parallelize([
    "Spark RDD is powerful",
    "RDD transformations are lazy",
    "RDD actions trigger execution"
])

# flatMap: one line in, many words out.
# split() with no argument splits on any run of whitespace and drops leading
# and trailing blanks, so doubled spaces or tabs never yield empty-string
# "words" -- split(" ") would emit '' for them and they would be counted.
words = lines.flatMap(lambda line: line.split())

# Classic word count: emit (word, 1) for each word, then add the ones per word.
pairs = words.map(lambda word: (word, 1))
word_counts = pairs.reduceByKey(lambda a, b: a + b)

print("Word Count:", word_counts.collect())

Word Count: [('are', 1), ('trigger', 1), ('powerful', 1), ('lazy', 1), ('Spark', 1), ('execution', 1), ('actions', 1), ('RDD', 3), ('is', 1), ('transformations', 1)]


In [7]:
# NOTE: this cell rebinds `data` and `rdd` (previously the integers 1..6) to
# (key, value) pairs; re-running the earlier integer cells afterwards would
# operate on these pairs instead.
data = [("a", 1), ("b", 2), ("a", 3), ("b", 4)]
rdd = sc.parallelize(data)

# groupByKey: gather every value that shares a key; mapValues(list) turns each
# group into a plain list so it prints readably.
values_by_key = rdd.groupByKey()
grouped = values_by_key.mapValues(list)
print("Grouped:", grouped.collect())

# reduceByKey (better than groupByKey for aggregations): values are merged
# pairwise with the given function instead of being materialised as a group.
summed = rdd.reduceByKey(lambda left, right: left + right)
print("Summed:", summed.collect())


Grouped: [('a', [1, 3]), ('b', [2, 4])]
Summed: [('a', 4), ('b', 6)]


In [8]:
# Sort by key (ascending, which is the default)
sorted_by_key = summed.sortByKey()
print("Sorted:", sorted_by_key.collect())

# Sort by value: each element is a (key, value) pair, so index 1 is the value;
# ascending=False puts the largest total first.
sorted_by_value = summed.sortBy(lambda pair: pair[1], ascending=False)
print("Sort by Value:", sorted_by_value.collect())


Sorted: [('a', 4), ('b', 6)]
Sort by Value: [('b', 6), ('a', 4)]


In [9]:
# Two pair RDDs sharing keys "a" and "b"; only rdd2 has key "c".
rdd1 = sc.parallelize([("a", 1), ("b", 2)])
rdd2 = sc.parallelize([("a", 3), ("b", 4), ("c", 5)])

# Inner join: only keys present on both sides.
inner_joined = rdd1.join(rdd2)
print("Join:", inner_joined.collect())

# Left outer join: every key of rdd1 is kept.
left_joined = rdd1.leftOuterJoin(rdd2)
print("Left Outer Join:", left_joined.collect())

# Right outer join: every key of rdd2 is kept; a key missing from rdd1 ("c")
# gets None as its left-hand value.
right_joined = rdd1.rightOuterJoin(rdd2)
print("Right Outer Join:", right_joined.collect())


Join: [('b', (2, 4)), ('a', (1, 3))]
Left Outer Join: [('b', (2, 4)), ('a', (1, 3))]
Right Outer Join: [('b', (2, 4)), ('c', (None, 5)), ('a', (1, 3))]


In [10]:
# NOTE: rebinds `rdd` (and `rdd2` from the join example) to new RDDs.
# Spread the integers 1..100 over an explicit number of partitions.
rdd = sc.parallelize(range(1, 101), numSlices=4)
initial_partitions = rdd.getNumPartitions()
print("Partitions:", initial_partitions)

# Repartition: redistribute the same data into a different partition count.
rdd2 = rdd.repartition(numPartitions=2)
final_partitions = rdd2.getNumPartitions()
print("Repartitioned:", final_partitions)


Partitions: 4
Repartitioned: 2


In [11]:
# cache() returns the same RDD, so it can be chained onto the creation call.
# It only marks the RDD to be kept in memory; the data is actually computed
# (and stored) when the first action below runs.
big_rdd = sc.parallelize(range(1, 1_000_000)).cache()

total = big_rdd.sum()
print("Cached sum:", total)

# Release the cached blocks. As the last expression of the cell, the RDD
# returned by unpersist() is what the notebook displays.
big_rdd.unpersist()


Cached sum: 499999500000


PythonRDD[72] at RDD at PythonRDD.scala:53

In [12]:
# NOTE: rebinds `pairs` (previously the (word, 1) tuples of the word count).
pairs = sc.parallelize([("a", 1), ("a", 2), ("b", 3), ("a", 4), ("b", 5)])

# aggregateByKey(zeroValue, seqFunc, combFunc)
#   zeroValue - starting accumulator for each key
#   seqFunc   - folds one value into the accumulator (within a partition)
#   combFunc  - merges two accumulators (across partitions)
result = pairs.aggregateByKey(
    zeroValue=0,
    seqFunc=lambda running, value: running + value,
    combFunc=lambda left, right: left + right,
)

print("AggregateByKey:", result.collect())


AggregateByKey: [('a', 7), ('b', 8)]


In [14]:
def process_partition(iterable):
    """Yield a single value: the sum of all elements of ``iterable``.

    Written as a generator so it can be handed to ``mapPartitions``, which
    expects a function that takes an iterator and produces an iterable.
    An empty input yields ``0``.
    """
    partition_total = sum(iterable)
    yield partition_total

# NOTE: rebinds `rdd` to the integers 1..10 split over two partitions, so
# process_partition is applied once per partition and yields one sum for each.
rdd = sc.parallelize(range(1, 11), numSlices=2)
partition_sum = rdd.mapPartitions(process_partition)

per_partition_totals = partition_sum.collect()
print("Partition Sums:", per_partition_totals)


Partition Sums: [15, 40]


In [15]:
from pyspark.rdd import portable_hash

# Single source of truth for the partition count used below.
NUM_PARTITIONS = 3

# NOTE: rebinds `data` and `rdd` to a small fruit -> count pair dataset.
data = [("apple", 1), ("banana", 2), ("apple", 3), ("orange", 4)]
rdd = sc.parallelize(data)

# Hash partitioner
# partitionBy() already takes partitionFunc(key) modulo numPartitions when it
# assigns a key to a partition, so the hash function is passed as-is rather
# than wrapped in a lambda that repeats the modulo with a second hard-coded 3.
# Passing the same function object each time (instead of a fresh lambda) also
# lets Spark recognise RDDs partitioned this way as sharing a partitioner.
partitioned = rdd.partitionBy(NUM_PARTITIONS, partitionFunc=portable_hash)
print("Partitions:", partitioned.getNumPartitions())


Partitions: 3
