In [1]:
import pyspark

In [2]:
# Reuse the active SparkContext if one exists, otherwise start a new one,
# so re-running this cell does not try to create a second context.
sc = pyspark.SparkContext.getOrCreate()

In [3]:
# Load the input as an RDD of ints: one number per line of the text file,
# asking Spark for (at least) 5 partitions.
numbers = (
    sc.textFile("Data/numbers.txt", minPartitions=5)
    .map(int)
)

## What is the difference between .map() and .mapPartitions()?

In [4]:
import time
class Power:
    """Raise numbers to a fixed exponent.

    Construction is deliberately slow: the two-second sleep stands in for an
    expensive setup step, so the cost of building an instance is clearly
    visible in the timings.
    """

    def __init__(self, p):
        """Store the exponent ``p`` after a simulated two-second setup."""
        time.sleep(2)  # simulated expensive initialisation
        self.p = p

    def applyPower(self, x):
        """Return ``x`` raised to the power ``self.p``."""
        return pow(x, self.p)
        
# mapPartitions        
def power_map_partitions(nums):
    """Yield the 5th power of every item in the iterable ``nums``.

    A single ``Power`` instance is built per call and shared by all items, so
    its slow setup is paid once per invocation (once per partition when this
    is passed to ``mapPartitions``) rather than once per element.
    """
    power = Power(5)
    # map() is lazy, so results are still produced one at a time on demand.
    yield from map(power.applyPower, nums)
        
# map
def power_map(num):
    """Return ``num`` raised to the 5th power.

    A fresh ``Power`` instance is constructed on every call, so the slow setup
    in ``Power.__init__`` is paid for each element this function is applied to.
    """
    return Power(5).applyPower(num)

In [5]:
# map(): power_map runs once per element, so Power's setup cost is paid for
# every number in the RDD.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
numbers.map(power_map).collect()
time.perf_counter() - start

42.905746936798096

In [6]:
# mapPartitions(): the generator is invoked once per partition, so Power's
# setup cost is paid once per partition instead of once per element.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
numbers.mapPartitions(power_map_partitions).collect()
time.perf_counter() - start

2.0692760944366455

## Without Persisting/Caching

In [7]:
# No cache(): the RDD only records the transformation, so each action on it
# re-runs the mapPartitions stage.
power_num = numbers.mapPartitions(power_map_partitions)
# perf_counter() is the monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
power_num.collect()
print("first ", time.perf_counter() - start)

first  2.0597262382507324


In [8]:
# Second action on the un-cached RDD: the work is recomputed, so this takes
# about as long as the first collect().
# perf_counter() is the monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
power_num.collect()
print("second ", time.perf_counter() - start)

second  2.0593881607055664


## With Persisting/Caching

In [9]:
# cache() only marks the RDD for persistence; the first action below still
# computes it and stores the resulting partitions.
power_num = numbers.mapPartitions(power_map_partitions).cache()
# perf_counter() is the monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
power_num.collect()
print("first ", time.perf_counter() - start)

first  2.078521966934204


In [10]:
# Second action on the cached RDD: partitions are served from the cache, so
# power_map_partitions (and its slow setup) is not run again.
# perf_counter() is the monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
power_num.collect()
print("second ", time.perf_counter() - start)

second  0.06660175323486328


In [11]:
# Shut down the SparkContext now that the experiments are finished.
sc.stop()