### 🎃 reduceByKey vs groupByKey (🍎 The Fruit Counting Story)

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("ReduceByKey vs GroupByKey")
    .master("yarn")
    .getOrCreate()
)
# The SparkContext is the entry point used below for the RDD API.
sc = spark.sparkContext


25/06/24 18:18:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# 🍎 IMAGINE: a basket of fruit that we want to tally up!
# Every piece is stored as a (name, 1) pair so the 1s can be added up per key.
print("🍎 Original Fruit Basket:")
fruits = [
    (name, 1)
    for name in (
        "apple", "banana", "apple",
        "orange", "banana", "apple",
        "orange", "banana",
    )
]
print(fruits)

🍎 Original Fruit Basket:
[('apple', 1), ('banana', 1), ('apple', 1), ('orange', 1), ('banana', 1), ('apple', 1), ('orange', 1), ('banana', 1)]


In [4]:
# Hand the local list of (fruit, 1) pairs to the SparkContext so it becomes an
# RDD; the cells below run reduceByKey / groupByKey against it.
fruit_rdd = sc.parallelize(fruits)

In [5]:
# Banner announcing the first approach (blank line, rule, title, rule).
print(
    "\n" + "=" * 50,
    "METHOD 1: Using reduceByKey() - THE SMART WAY 🧠",
    "=" * 50,
    sep="\n",
)


METHOD 1: Using reduceByKey() - THE SMART WAY 🧠


In [6]:
# reduceByKey: like a smart helper who keeps a running tally while collecting.
#   first apple  -> apple = 1
#   second apple -> apple = 1 + 1 = 2
#   third apple  -> apple = 2 + 1 = 3
# The lambda says how to fold two counts for the same fruit into one.
result_reduce = fruit_rdd.reduceByKey(
    lambda running_total, next_count: running_total + next_count
)
print("Result with reduceByKey:")
# collect() brings the (fruit, total) pairs back to the driver for printing.
for fruit, total in result_reduce.collect():
    print(f"  {fruit}: {total} pieces")

Result with reduceByKey:


[Stage 1:>                                                          (0 + 1) / 2]

  apple: 3 pieces
  banana: 3 pieces
  orange: 2 pieces


                                                                                

In [7]:
# Banner announcing the second approach (blank line, rule, title, rule).
print(
    "\n" + "=" * 50,
    "METHOD 2: Using groupByKey() - THE SIMPLE WAY 👶",
    "=" * 50,
    sep="\n",
)


METHOD 2: Using groupByKey() - THE SIMPLE WAY 👶


In [8]:
# groupByKey: like sorting every fruit into its own box first, counting later.
#   apple box:  [1, 1, 1]
#   banana box: [1, 1, 1]
#   orange box: [1, 1]
# mapValues(sum) then adds up the contents of each box separately.
result_group = (
    fruit_rdd
    .groupByKey()
    .mapValues(sum)
)
print("Result with groupByKey + sum:")
# collect() brings the (fruit, total) pairs back to the driver for printing.
for fruit, total in result_group.collect():
    print(f"  {fruit}: {total} pieces")

Result with groupByKey + sum:




  banana: 3 pieces
  apple: 3 pieces
  orange: 2 pieces


                                                                                

In [9]:
# Banner for the comparison section (blank line, decorated rule, title, rule).
print(
    "\n🎯 " + "=" * 48,
    "WHY reduceByKey IS BETTER - REAL EXAMPLE",
    "=" * 50,
    sep="\n",
)


WHY reduceByKey IS BETTER - REAL EXAMPLE


In [10]:
# Peek at the intermediate result of groupByKey, before any summing happens:
# every key arrives with the full collection of its individual 1s.
print("\nWhat groupByKey creates internally:")
grouped = fruit_rdd.groupByKey()
for key, values in grouped.collect():
    # Materialise the grouped values once, then reuse for display and total.
    box = list(values)
    print(f"  {key}: {box} -> then we sum to get {sum(box)}")


What groupByKey creates internally:
  apple: [1, 1, 1] -> then we sum to get 3
  banana: [1, 1, 1] -> then we sum to get 3
  orange: [1, 1] -> then we sum to get 2


In [11]:
# Side-by-side summary of the two approaches. An empty string in the argument
# list produces the blank line between the two bullet groups.
print(
    "\n💡 THE KEY DIFFERENCE:",
    "=" * 30,
    "🧠 reduceByKey: Counts WHILE collecting (efficient)",
    "   - Less data movement between computers",
    "   - Uses less memory",
    "   - Much faster for big data!",
    "",
    "👶 groupByKey: Collects ALL first, THEN counts (inefficient)",
    "   - Moves ALL data between computers first",
    "   - Uses more memory",
    "   - Slower for big data!",
    sep="\n",
)

# The same idea restated as an everyday analogy.
print(
    "\n🏠 HOUSE CLEANING ANALOGY:",
    "=" * 35,
    "reduceByKey = Clean each room as you go",
    "groupByKey = Dump everything in living room, then sort",
    "",
    "Which is smarter? Clean as you go! 🧠",
    sep="\n",
)



💡 THE KEY DIFFERENCE:
🧠 reduceByKey: Counts WHILE collecting (efficient)
   - Less data movement between computers
   - Uses less memory
   - Much faster for big data!

👶 groupByKey: Collects ALL first, THEN counts (inefficient)
   - Moves ALL data between computers first
   - Uses more memory
   - Slower for big data!

🏠 HOUSE CLEANING ANALOGY:
reduceByKey = Clean each room as you go
groupByKey = Dump everything in living room, then sort

Which is smarter? Clean as you go! 🧠


In [12]:
# Demo finished: stop the Spark session obtained at the top of the notebook.
# NOTE(review): getOrCreate() may have returned a session shared with other
# work — confirm that stopping it here is intended.
spark.stop()