In [1]:
from pyspark.sql import SparkSession
# Build the session for this notebook; getOrCreate() hands back an already
# running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("RDD_Practice")
    .getOrCreate()
)

# The RDD examples below go through the session's underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Bare last expression: the notebook renders the session object as this cell's output.
spark

In [3]:
# Local driver-side list holding the integers 1 through 6.
data = list(range(1, 7))

# parallelize() distributes the local list as an RDD.
rdd = sc.parallelize(data)

# collect() is an action: it brings every element back to the driver as a list.
print("RDD Elements:", rdd.collect())

RDD Elements: [1, 2, 3, 4, 5, 6]


In [4]:
# map: apply a function to every element, producing one output per input.
squared = rdd.map(lambda n: n ** 2)
print("Squared:", squared.collect())

# filter: keep only the elements for which the predicate is true.
even = rdd.filter(lambda n: n % 2 == 0)
print("Even numbers:", even.collect())


Squared: [1, 4, 9, 16, 25, 36]
Even numbers: [2, 4, 6]


In [5]:
# Each entry pairs a label with an RDD action. The action is invoked inside
# the loop, so the jobs run and print one after another in the listed order.
for label, action in (
    ("Count:", rdd.count),
    ("Sum:", rdd.sum),
    ("Max:", rdd.max),
    ("First element:", rdd.first),
):
    print(label, action())


Count: 6
Sum: 21
Max: 6
First element: 1


In [6]:
# Word count: split lines into words, pair each word with 1, sum per word.
lines = sc.parallelize([
    "Spark RDD is powerful",
    "RDD transformations are lazy",
    "RDD actions trigger execution"
])

# flatMap flattens the per-line word lists into a single RDD of words.
# split() with no argument splits on any run of whitespace and never yields
# empty strings, so repeated, leading, or trailing spaces cannot be counted
# as a bogus '' word (split(" ") would emit one for each extra space).
words = lines.flatMap(lambda line: line.split())

# Emit (word, 1) for every occurrence, then add up the 1s for each word.
pairs = words.map(lambda word: (word, 1))
word_counts = pairs.reduceByKey(lambda a, b: a + b)

# The steps above are lazy transformations; collect() is the action that
# actually runs the job and returns the (word, count) pairs to the driver.
print("Word Count:", word_counts.collect())

Word Count: [('are', 1), ('trigger', 1), ('powerful', 1), ('lazy', 1), ('Spark', 1), ('execution', 1), ('actions', 1), ('RDD', 3), ('is', 1), ('transformations', 1)]


In [7]:
# Key/value sample data. It gets its own names instead of rebinding the
# notebook-wide `data` / `rdd`, so the earlier numeric cells (map, filter,
# count, sum, ...) still operate on the integer RDD if they are re-run
# after this cell.
kv_data = [("a", 1), ("b", 2), ("a", 3), ("b", 4)]
kv_rdd = sc.parallelize(kv_data)

# groupByKey: gather all values for each key; mapValues(list) turns the
# grouped iterable into a plain list so it prints readably.
grouped = kv_rdd.groupByKey().mapValues(list)
print("Grouped:", grouped.collect())

# reduceByKey (better than groupByKey for aggregations): values are merged
# with the given function per key rather than first materializing every
# value of a key as a group.
summed = kv_rdd.reduceByKey(lambda a, b: a + b)
print("Summed:", summed.collect())


Grouped: [('a', [1, 3]), ('b', [2, 4])]
Summed: [('a', 4), ('b', 6)]


In [8]:
# Order the per-key totals by key (ascending is sortByKey's default).
print("Sorted:", summed.sortByKey().collect())

# Order by the total instead: the sort key is the second field of each
# (key, value) pair, largest first.
print(
    "Sort by Value:",
    summed.sortBy(lambda kv: kv[1], ascending=False).collect(),
)


Sorted: [('a', 4), ('b', 6)]
Sort by Value: [('b', 6), ('a', 4)]


In [None]:
# Two keyed RDDs: "a" and "b" occur in both, "c" only in rdd2.
rdd1 = sc.parallelize([("a", 1), ("b", 2)])
rdd2 = sc.parallelize([("a", 3), ("b", 4), ("c", 5)])

# Inner join: only keys present on both sides.
print(
    "Join:",
    rdd1.join(rdd2).collect(),
)

# Left outer join: every key of rdd1 is kept.
print(
    "Left Outer Join:",
    rdd1.leftOuterJoin(rdd2).collect(),
)

# Right outer join: every key of rdd2 is kept; a key missing from rdd1
# gets None on the left side of its value pair.
print(
    "Right Outer Join:",
    rdd1.rightOuterJoin(rdd2).collect(),
)
