In [16]:
# =====================================
# FOR GOOGLE DATAPROC 
# =====================================

In [17]:
from pyspark.sql import SparkSession
import time

In [18]:
# 🔧 SOLUTION 1: Check if Spark session exists, create if needed
def get_spark_session():
    """Return the already-active Spark session, or obtain one from the builder.

    The function first asks ``SparkSession.getActiveSession()`` for an existing
    session. If that yields ``None`` or the lookup fails with an ordinary
    exception, it falls back to ``SparkSession.builder...getOrCreate()``.

    Note: the app name below is only requested through the builder; when an
    existing session is reused (e.g. one pre-created by the notebook
    environment) that session keeps whatever name it already has.

    Returns:
        The active session if there is one, otherwise the session returned by
        ``getOrCreate()``.
    """
    try:
        active = SparkSession.getActiveSession()
    except Exception:
        # Best-effort lookup: any ordinary failure just means "no usable active
        # session". Catch Exception (not a bare ``except:``) so that
        # KeyboardInterrupt / SystemExit still propagate to the user.
        active = None

    if active is not None:
        return active

    # Single builder path for both "no active session" and "lookup failed".
    return (
        SparkSession.builder
        .appName("Repartition vs Coalesce - Fixed")
        .getOrCreate()
    )

In [19]:
# Get or create the Spark session, then keep a handle on its SparkContext
# (`sc`) — the RDD examples further down create their data via `sc.parallelize`.
spark = get_spark_session()
sc = spark.sparkContext

In [20]:
# Report which session we ended up with: app name, master, version, and
# whether the underlying context is still running.
print(
    "✅ Spark Session Status:",
    f"   App Name: {spark.sparkContext.appName}",
    f"   Master: {spark.sparkContext.master}",
    f"   Spark Version: {spark.version}",
    f"   Active: {not spark.sparkContext._jsc.sc().isStopped()}",
    sep="\n",
)

✅ Spark Session Status:
   App Name: PySparkShell
   Master: yarn
   Spark Version: 3.5.3
   Active: True


In [7]:
# Section banner introducing the toys-in-boxes analogy for partitions.
print(
    "\n".join(
        [
            "\n🏠 IMAGINE: You have toys in different boxes (partitions)",
            "=" * 55,
        ]
    )
)


🏠 IMAGINE: You have toys in different boxes (partitions)


In [8]:
# Sample data for the demo: 20 "toys", numbered 1 through 20.
toys = [*range(1, 21)]
print(f"📦 We have {len(toys)} toys: {toys}")

📦 We have 20 toys: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [21]:
# Create RDD with 8 partitions (8 boxes) and compare coalesce() vs repartition()

def show_partitions(rdd, name):
    """Print the contents of every non-empty partition of ``rdd``.

    Args:
        rdd: RDD to inspect. Its full contents are pulled to the driver with
            ``glom().collect()``, so this is only meant for small demo data.
        name: Heading printed above the partition listing.

    Returns:
        The same ``rdd`` object that was passed in.
    """
    print(f"\n{name}:")
    print(f"Number of boxes: {rdd.getNumPartitions()}")
    partitions = rdd.glom().collect()  # glom() shows what's in each partition
    for box_number, partition in enumerate(partitions, start=1):
        if partition:  # Only show non-empty boxes
            print(f"  Box {box_number}: {list(partition)}")
    return rdd


def _timed_count(rdd):
    """Run ``rdd.count()`` and return ``(count, elapsed_seconds)``.

    Uses ``time.perf_counter()`` because it is monotonic; ``time.time()`` can
    jump when the wall clock is adjusted, which would corrupt an interval.
    """
    started = time.perf_counter()
    result = rdd.count()
    return result, time.perf_counter() - started


try:
    toy_rdd = sc.parallelize(toys, 8)
    print(f"\n📦 Initially stored in {toy_rdd.getNumPartitions()} boxes (partitions)")

    # Let's see what's in each box
    print("\n🔍 What's in each box initially:")
    toy_rdd = show_partitions(toy_rdd, "📦 ORIGINAL BOXES")

    print("\n" + "="*60)
    print("SCENARIO 1: REDUCE boxes from 8 to 3")
    print("="*60)

    print("\n🔄 METHOD 1: coalesce(3) - THE LAZY WAY")
    print("-" * 45)
    print("💭 Thinking: 'I'm too lazy to move toys around properly!'")
    print("   Just combines nearby boxes without reshuffling")

    coalesced_rdd = toy_rdd.coalesce(3)
    show_partitions(coalesced_rdd, "📦 AFTER coalesce(3)")

    print("\n🔄 METHOD 2: repartition(3) - THE PROPER WAY")
    print("-" * 45)
    print("💭 Thinking: 'Let me distribute toys evenly across all boxes!'")
    print("   Shuffles ALL toys to distribute them evenly")

    repartitioned_rdd = toy_rdd.repartition(3)
    show_partitions(repartitioned_rdd, "📦 AFTER repartition(3)")

    print("\n" + "="*60)
    print("SCENARIO 2: INCREASE boxes from 8 to 12")
    print("="*60)

    print("\n🔄 METHOD 1: coalesce(12) - CAN'T REALLY INCREASE!")
    print("-" * 50)
    print("💭 coalesce says: 'I can only reduce boxes efficiently!'")

    # coalesce() is called without shuffle here, so asking for more partitions
    # than the RDD already has leaves the partition count unchanged.
    cant_increase = toy_rdd.coalesce(12)
    show_partitions(cant_increase, "📦 AFTER coalesce(12)")
    print("⚠️  Notice: Didn't really increase efficiently! Still limited by original partitions!")

    print("\n🔄 METHOD 2: repartition(12) - CAN DO IT PROPERLY!")
    print("-" * 52)
    print("💭 repartition says: 'I can increase or decrease boxes properly!'")

    increased_rdd = toy_rdd.repartition(12)
    show_partitions(increased_rdd, "📦 AFTER repartition(12)")

    print("\n" + "🎯 " + "="*58)
    print("PERFORMANCE COMPARISON")
    print("="*60)

    print("\n⚡ SPEED TEST: Reducing 8 boxes to 3 boxes")
    print("-" * 45)

    # Single-run timings on 20 elements are indicative only: job scheduling
    # overhead dominates at this size. count() is used instead of collect() so
    # the measurement does not include shipping the data back to the driver.
    coalesce_result, coalesce_time = _timed_count(toy_rdd.coalesce(3))
    repartition_result, repartition_time = _timed_count(toy_rdd.repartition(3))

    print(f"🚀 coalesce(3) took: {coalesce_time:.4f} seconds")
    print(f"🏃 repartition(3) took: {repartition_time:.4f} seconds")

    if coalesce_time < repartition_time:
        print("✅ coalesce was faster! (as expected)")
    else:
        print("✅ Both were fast on small data!")

except Exception as e:
    print(f"❌ Error occurred: {e}")
    print("\n🔧 TRYING TO RESTART SPARK...")
    # Stop existing context if any
    try:
        spark.stop()
    except Exception:
        # Deliberate best-effort cleanup: a failure to stop (or a missing
        # session) must not prevent creating a fresh one below. Narrowed from a
        # bare ``except:`` so KeyboardInterrupt / SystemExit are not swallowed.
        pass

    # Create fresh session. RDDs built on the previous context are not reusable
    # afterwards, so re-run this cell once the restart succeeds.
    spark = SparkSession.builder \
        .appName("Repartition vs Coalesce - Restarted") \
        .getOrCreate()
    sc = spark.sparkContext
    print("✅ Spark restarted successfully!")


📦 Initially stored in 8 boxes (partitions)

🔍 What's in each box initially:

📦 ORIGINAL BOXES:
Number of boxes: 8


                                                                                

  Box 1: [1, 2]
  Box 2: [3, 4]
  Box 3: [5, 6]
  Box 4: [7, 8, 9, 10]
  Box 5: [11, 12]
  Box 6: [13, 14]
  Box 7: [15, 16]
  Box 8: [17, 18, 19, 20]

SCENARIO 1: REDUCE boxes from 8 to 3

🔄 METHOD 1: coalesce(3) - THE LAZY WAY
---------------------------------------------
💭 Thinking: 'I'm too lazy to move toys around properly!'
   Just combines nearby boxes without reshuffling

📦 AFTER coalesce(3):
Number of boxes: 3
  Box 1: [1, 2, 3, 4]
  Box 2: [5, 6, 7, 8, 9, 10, 11, 12]
  Box 3: [13, 14, 15, 16, 17, 18, 19, 20]

🔄 METHOD 2: repartition(3) - THE PROPER WAY
---------------------------------------------
💭 Thinking: 'Let me distribute toys evenly across all boxes!'
   Shuffles ALL toys to distribute them evenly

📦 AFTER repartition(3):
Number of boxes: 3


                                                                                

  Box 1: [1, 2, 7, 8, 9, 10, 15, 16]
  Box 2: [3, 4]
  Box 3: [5, 6, 11, 12, 13, 14, 17, 18, 19, 20]

SCENARIO 2: INCREASE boxes from 8 to 12

🔄 METHOD 1: coalesce(12) - CAN'T REALLY INCREASE!
--------------------------------------------------
💭 coalesce says: 'I can only reduce boxes efficiently!'

📦 AFTER coalesce(12):
Number of boxes: 8


                                                                                

  Box 1: [1, 2]
  Box 2: [3, 4]
  Box 3: [5, 6]
  Box 4: [7, 8, 9, 10]
  Box 5: [11, 12]
  Box 6: [13, 14]
  Box 7: [15, 16]
  Box 8: [17, 18, 19, 20]
⚠️  Notice: Didn't really increase efficiently! Still limited by original partitions!

🔄 METHOD 2: repartition(12) - CAN DO IT PROPERLY!
----------------------------------------------------
💭 repartition says: 'I can increase or decrease boxes properly!'

📦 AFTER repartition(12):
Number of boxes: 12


                                                                                

  Box 1: [7, 8, 9, 10]
  Box 3: [11, 12, 13, 14]
  Box 4: [15, 16]
  Box 5: [3, 4]
  Box 6: [5, 6, 17, 18, 19, 20]
  Box 10: [1, 2]

PERFORMANCE COMPARISON

⚡ SPEED TEST: Reducing 8 boxes to 3 boxes
---------------------------------------------




🚀 coalesce(3) took: 0.8597 seconds
🏃 repartition(3) took: 2.9608 seconds
✅ coalesce was faster! (as expected)


                                                                                

In [11]:
# Summary of repartition() trade-offs, emitted with a single print call.
print(
    "\n💡 KEY DIFFERENCES:",
    "=" * 25,
    "🔄 repartition():",
    "   ✅ Can increase OR decrease partitions",
    "   ✅ Creates balanced partitions (even distribution)",
    "   ❌ Slower (full shuffle required)",
    "   ❌ More network traffic",
    "   🎯 Use when: You need even distribution or more partitions",
    sep="\n",
)


💡 KEY DIFFERENCES:
🔄 repartition():
   ✅ Can increase OR decrease partitions
   ✅ Creates balanced partitions (even distribution)
   ❌ Slower (full shuffle required)
   ❌ More network traffic
   🎯 Use when: You need even distribution or more partitions


In [12]:
# Summary of coalesce() trade-offs, emitted as one newline-joined block.
coalesce_summary = "\n".join(
    [
        "\n🤝 coalesce():",
        "   ✅ Faster (minimal shuffle)",
        "   ✅ Less network traffic",
        "   ❌ Can only decrease partitions effectively",
        "   ❌ May create unbalanced partitions",
        "   🎯 Use when: You only need fewer partitions and speed matters",
    ]
)
print(coalesce_summary)


🤝 coalesce():
   ✅ Faster (minimal shuffle)
   ✅ Less network traffic
   ❌ Can only decrease partitions effectively
   ❌ May create unbalanced partitions
   🎯 Use when: You only need fewer partitions and speed matters


In [13]:
# House-moving analogy; empty strings reproduce the blank separator lines.
print(
    "\n🏠 HOUSE ANALOGY:",
    "=" * 20,
    "🏠 You have stuff in 8 rooms, want to use only 3 rooms:",
    "",
    "🤝 coalesce = Lazy moving:",
    "   - Just push stuff from 5 rooms into remaining 3 rooms",
    "   - Quick but rooms might be uneven",
    "   - Can't effectively create new rooms",
    "",
    "🔄 repartition = Professional moving:",
    "   - Take ALL stuff out, redistribute evenly in 3 rooms",
    "   - Takes time but perfectly organized",
    "   - Can create new rooms properly",
    sep="\n",
)


🏠 HOUSE ANALOGY:
🏠 You have stuff in 8 rooms, want to use only 3 rooms:

🤝 coalesce = Lazy moving:
   - Just push stuff from 5 rooms into remaining 3 rooms
   - Quick but rooms might be uneven
   - Can't effectively create new rooms

🔄 repartition = Professional moving:
   - Take ALL stuff out, redistribute evenly in 3 rooms
   - Takes time but perfectly organized
   - Can create new rooms properly


In [14]:
# Quick decision guide: which call to reach for in which situation.
print(
    "\n🎯 WHEN TO USE WHAT:",
    "=" * 25,
    "📉 Reducing partitions + Speed matters → coalesce()",
    "📈 Increasing partitions → repartition()",
    "⚖️  Need balanced partitions → repartition()",
    "🏃 Quick and dirty reduction → coalesce()",
    "💾 Before writing to files → coalesce() (fewer files)",
    "🔄 Before heavy processing → repartition() (better parallelism)",
    sep="\n",
)


🎯 WHEN TO USE WHAT:
📉 Reducing partitions + Speed matters → coalesce()
📈 Increasing partitions → repartition()
⚖️  Need balanced partitions → repartition()
🏃 Quick and dirty reduction → coalesce()
💾 Before writing to files → coalesce() (fewer files)
🔄 Before heavy processing → repartition() (better parallelism)


In [22]:
# Stops the Spark session. Run this cell only when you are completely done with
# the notebook: cells that use `spark` / `sc` need a new session after this.
spark.stop()  # NOTE: this call is live (it is NOT commented out) and will end the session