### Finding Prime Numbers

In [1]:
# Sieve of Eratosthenes: remove every proper multiple from the candidates.
n = 500000
allnumbers = sc.parallelize(range(2, n), 8).cache()


def _proper_multiples(x):
    """Return the multiples 2x, 3x, ... of x that are below n."""
    return range(2 * x, n, x)


composite = allnumbers.flatMap(_proper_multiples)
# subtract() has to match equal elements of the two RDDs, so it needs a partitioner.
prime = allnumbers.subtract(composite)
first_primes = prime.take(10)
print(first_primes)

# This takes a long time because the partitions of `composite` are not balanced.

[17, 97, 113, 193, 241, 257, 337, 353, 401, 433]


In [2]:
# Find the number of elements in each partition
def partitionsize(it):
    """Yield one value: the number of items produced by the iterator `it`.

    Intended for use with `mapPartitions`, where `it` iterates over a single
    partition. The items are counted as they stream by, so the partition is
    never copied into a list just to measure its length.
    """
    yield sum(1 for _ in it)

# Per-partition element counts, in order: allnumbers, composite, prime.
# The counts printed for `composite` are the unbalanced ones.
for each_rdd in (allnumbers, composite, prime):
    print(each_rdd.mapPartitions(partitionsize).collect())

# First few primes stored in partition 1 of `prime`.
partition_one = prime.glom().collect()[1]
print(partition_one[0:4])

[62499, 62500, 62500, 62500, 62499, 62500, 62500, 62500]
[5216986, 254759, 104166, 62499, 0, 0, 0, 0]
[0, 5169, 1, 5219, 0, 5206, 0, 5189, 0, 5165, 0, 5199, 0, 5191, 0, 5199]
[17, 97, 113, 193]


In [3]:
# Same sieve as in the first cell, except that the composites are
# redistributed over 8 partitions before the subtraction.
n = 500000
allnumbers = sc.parallelize(range(2, n), 8).cache()
multiples = allnumbers.flatMap(lambda x: range(2 * x, n, x))
composite = multiples.repartition(8)  # make it balanced
prime = allnumbers.subtract(composite)
print(prime.take(10))

[17, 97, 113, 193, 241, 257, 337, 353, 401, 433]


In [4]:
# Find the number of elements in each partition
def partitionsize(it):
    """Yield one value: the number of items produced by the iterator `it`.

    Intended for use with `mapPartitions`, where `it` iterates over a single
    partition. The items are counted as they stream by, so the partition is
    never copied into a list just to measure its length.
    """
    yield sum(1 for _ in it)

print(allnumbers.mapPartitions(partitionsize).collect())
print(composite.mapPartitions(partitionsize).collect())  # balanced now, thanks to repartition(8)
print(prime.mapPartitions(partitionsize).collect())

# Materialise the partitions of `prime` once and reuse the result, instead of
# running glom().collect() (a full Spark job each time) for every print below.
prime_partitions = prime.glom().collect()

# `prime` has 16 partitions (see the 16 sizes printed above), and the numbers
# are hash partitioned, so partition i holds the primes p with p % 16 == i.
print(prime_partitions[1][0:4])  # p % 16 == 1
print(prime_partitions[2][:4])   # p % 16 == 2 (only the prime 2 itself)
print(prime_partitions[3][0:4])  # p % 16 == 3

[62499, 62500, 62500, 62500, 62499, 62500, 62500, 62500]
[704805, 704790, 704800, 704800, 704800, 704799, 704800, 704816]
[0, 5169, 1, 5219, 0, 5206, 0, 5189, 0, 5165, 0, 5199, 0, 5191, 0, 5199]
[17, 97, 113, 193]
[2]
[3, 19, 67, 83]


### Data Partitioning

In [5]:
# Walk through which RDD operations set, keep or drop the partitioner.
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]
rdd = sc.parallelize(zip(data, data),4)
print(rdd.partitioner)  # a freshly created RDD has no partitioner (prints None)
# map() may change the keys, so the result still has no partitioner.
rdd = rdd.map(lambda t: (t[0], t[1]+1))
print(rdd.partitioner)
print(rdd.glom().collect())
# reduceByKey() shuffles by key; the result carries a hash partitioner.
rdd = rdd.reduceByKey(lambda x,y: x+y)
print(rdd.glom().collect())
print(rdd.partitioner)
print(rdd.partitioner.partitionFunc)

# map() on a partitioned RDD leaves the data where it is but drops the partitioner.
rdd1 = rdd.map(lambda x: (x[0], x[1]+1))
print(rdd1.glom().collect())
print(rdd1.partitioner)

# mapValues() cannot change the keys, so the partitioner is preserved.
rdd2 = rdd.mapValues(lambda x: x+1) # keys are not changed
print(rdd2.partitioner.partitionFunc)

# sortByKey() re-partitions the data with a range partitioner.
rdd = rdd.sortByKey()
print(rdd.glom().collect())
print(rdd.partitioner.partitionFunc)
rdd3 = rdd.mapValues(lambda x: x+1) # keeps the same (range) partitioner
print(rdd3.partitioner.partitionFunc)

None
None
[[(8, 9), (8, 9)], [(1, 2), (96, 97)], [(240, 241), (400, 401)], [(1, 2), (800, 801), (4, 5), (12, 13)]]
[[(8, 18), (96, 97), (240, 241), (400, 401), (800, 801), (12, 13), (4, 5)], [(1, 4)], [], []]
<pyspark.rdd.Partitioner object at 0x7ff8202cc040>
<function portable_hash at 0x7ff80140b8b0>
[[(8, 19), (96, 98), (240, 242), (400, 402), (800, 802), (12, 14), (4, 6)], [(1, 5)], [], []]
None
<function portable_hash at 0x7ff80140b8b0>
[[(1, 4), (4, 5), (8, 18)], [(12, 13), (96, 97)], [(240, 241), (400, 401)], [(800, 801)]]
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7ff810613d30>
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7ff810613d30>


In [6]:
def partitionsize(it):
    """Yield one value: the number of items produced by the iterator `it`.

    Intended for use with `mapPartitions`, where `it` iterates over a single
    partition. The items are counted as they stream by, so the partition is
    never copied into a list just to measure its length.
    """
    yield sum(1 for _ in it)
    
# Exclusive upper bound of the key and value ranges generated in this cell.
n = 40000

def f(x):
    """User-defined partition function: the remainder of x divided by 9."""
    remainder = x % 9
    return remainder

def _show_sizes(some_rdd):
    """Print the number of elements held by each partition of `some_rdd`."""
    print(some_rdd.mapPartitions(partitionsize).collect())


# Every key is a multiple of 16 and appears twice; values are multiples of 8.
multiples_of_16 = list(range(0, n, 16))
data1 = multiples_of_16 + multiples_of_16
data2 = range(0, n, 8)

rdd1 = sc.parallelize(zip(data1, data2), 8)
_show_sizes(rdd1)

# Default partitioning after the shuffle: all keys end up in one partition.
rdd2 = rdd1.reduceByKey(lambda left, right: left + right)
_show_sizes(rdd2)

# Re-partition with the user-defined function f.
rdd3 = rdd2.partitionBy(8, f)
_show_sizes(rdd3)

# Or hand f to reduceByKey directly and get the same layout in one step.
rdd4 = rdd1.reduceByKey(lambda left, right: left + right, partitionFunc=f)
_show_sizes(rdd4)

[625, 625, 625, 625, 625, 625, 625, 625]
[2500, 0, 0, 0, 0, 0, 0, 0]
[556, 278, 277, 278, 277, 278, 278, 278]
[556, 278, 277, 278, 277, 278, 278, 278]


In [7]:
# Two identical key/value RDDs, neither of which has a partitioner yet.
identity_pairs = [(k, k) for k in range(10000)]
a = sc.parallelize(identity_pairs, 8)
b = sc.parallelize(identity_pairs, 8)

# Joining RDDs without partitioners.
c = a.join(b)
print(c.getNumPartitions())
print(a.partitioner)

# reduceByKey gives each side a partitioner.
a = a.reduceByKey(lambda left, right: left + right)
print(a.partitioner.partitionFunc)
b = b.reduceByKey(lambda left, right: left + right)
print(b.partitioner.partitionFunc)

# a and b are now co-partitioned with the same partitioner,
# so the number of partitions remains the same after the join.
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
first_partition = c.glom().first()
print(first_partition[0:4])

16
None
<function portable_hash at 0x7ff80140b8b0>
<function portable_hash at 0x7ff80140b8b0>
8
<function portable_hash at 0x7ff80140b8b0>
[(0, (0, 0)), (8, (8, 8)), (16, (16, 16)), (24, (24, 24))]


### Partitioning in DataFrames

In [8]:
# Build a small two-column DataFrame and look at how its rows are laid out.
data1 = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4]
data2 = [2, 2, 3, 4, 5, 3, 1, 1, 2, 3]
row_pairs = zip(data1, data2)
df = spark.createDataFrame(row_pairs)

underlying_rdd = df.rdd
print(underlying_rdd.getNumPartitions())
print(underlying_rdd.glom().collect())

8
[[Row(_1=1, _2=2)], [Row(_1=1, _2=2)], [Row(_1=1, _2=3)], [Row(_1=2, _2=4), Row(_1=2, _2=5)], [Row(_1=2, _2=3)], [Row(_1=3, _2=1)], [Row(_1=3, _2=1)], [Row(_1=3, _2=2), Row(_1=4, _2=3)]]


In [9]:
import pyspark.sql.functions

# Hash-partition the DataFrame into 6 partitions on column _1.
df1 = df.repartition(6, df._1)
print(df1.rdd.glom().collect())

# SparkSQL uses MurmurHash to make generating adversarial data more difficult.
# Call SparkSQL's hash function on the partitioning column and show it next to
# its remainder modulo 6. Note that `%` can give a negative remainder, while
# partition indices are non-negative: e.g. the rows with remainder -4 sit in
# partition 2 and those with remainder -1 sit in partition 5.
key_hash = pyspark.sql.functions.hash(df1._1)
df1.select('*', key_hash, key_hash % 6).show()

# For comparison, call Python's hash function
print(hash(1))

[[], [], [Row(_1=2, _2=4), Row(_1=2, _2=5), Row(_1=2, _2=3), Row(_1=4, _2=3)], [Row(_1=3, _2=1), Row(_1=3, _2=1), Row(_1=3, _2=2)], [], [Row(_1=1, _2=2), Row(_1=1, _2=2), Row(_1=1, _2=3)]]
+---+---+-----------+--------------+
| _1| _2|   hash(_1)|(hash(_1) % 6)|
+---+---+-----------+--------------+
|  2|  4| -797927272|            -4|
|  2|  5| -797927272|            -4|
|  2|  3| -797927272|            -4|
|  4|  3| 1344313940|             2|
|  3|  1|  519220707|             3|
|  3|  1|  519220707|             3|
|  3|  2|  519220707|             3|
|  1|  2|-1712319331|            -1|
|  1|  2|-1712319331|            -1|
|  1|  3|-1712319331|            -1|
+---+---+-----------+--------------+

1


In [10]:
# A 'real' example from SF Express
# Prepare three relational tables

from pyspark.sql.functions import *

num_waybills = 1000
num_customers = 100

# waybills: one random (waybill, customer) pair per input row,
# repartitioned on the waybill column and cached.
rdd = sc.parallelize((i, ) for i in range(num_waybills))
waybills = (
    spark.createDataFrame(rdd)
    .select(
        floor(rand() * num_waybills).alias('waybill'),
        floor(rand() * num_customers).alias('customer'),
    )
    .repartition('waybill')
    .cache()
)
waybills.show()
print(waybills.count())

# customers: one row per customer id, with the phone equal to the id.
rdd = sc.parallelize((i, i) for i in range(num_customers))
customers = spark.createDataFrame(rdd, ['customer', 'phone']).cache()
customers.show()
print(customers.count())

# waybill_status: random (waybill, version) pairs, reduced to the
# maximum version seen for each waybill.
rdd = sc.parallelize((i, ) for i in range(num_waybills))
status_versions = spark.createDataFrame(rdd).select(
    floor(rand() * num_waybills).alias('waybill'),
    floor(rand() * 10).alias('version'),
)
waybill_status = status_versions.groupBy('waybill').max('version').cache()
waybill_status.show()
print(waybill_status.count())

+-------+--------+
|waybill|customer|
+-------+--------+
|     26|      82|
|    474|      75|
|     26|      79|
|     26|      94|
|    418|      28|
|    541|      81|
|    191|      75|
|    191|      92|
|    270|      43|
|    222|      44|
|    730|      23|
|    222|      36|
|    293|      31|
|    938|      41|
|    222|      87|
|    938|      15|
|    730|      11|
|    222|      48|
|    938|      29|
|    730|      16|
+-------+--------+
only showing top 20 rows

1000
+--------+-----+
|customer|phone|
+--------+-----+
|       0|    0|
|       1|    1|
|       2|    2|
|       3|    3|
|       4|    4|
|       5|    5|
|       6|    6|
|       7|    7|
|       8|    8|
|       9|    9|
|      10|   10|
|      11|   11|
|      12|   12|
|      13|   13|
|      14|   14|
|      15|   15|
|      16|   16|
|      17|   17|
|      18|   18|
|      19|   19|
+--------+-----+
only showing top 20 rows

100
+-------+------------+
|waybill|max(version)|
+-------+------------+
|    9

In [16]:
# We want to join 3 tables together.
# Knowing how each table is partitioned helps optimize the join order.

# Alternative order, kept for comparison:
#   waybills.join(customers, 'customer').join(waybill_status, 'waybill').show()
# Order used here: join on 'waybill' first, then on 'customer'.
waybills_with_status = waybills.join(waybill_status, 'waybill')
waybills_with_status.join(customers, 'customer').show()

+--------+-------+------------+-----+
|customer|waybill|max(version)|phone|
+--------+-------+------------+-----+
|      28|    418|           5|   28|
|      75|    191|           1|   75|
|      92|    191|           1|   92|
|      43|    270|           7|   43|
|      44|    222|           5|   44|
|      23|    730|           4|   23|
|      36|    222|           5|   36|
|      31|    293|           8|   31|
|      87|    222|           5|   87|
|      11|    730|           4|   11|
|      48|    222|           5|   48|
|      16|    730|           4|   16|
|      89|    278|           0|   89|
|      81|    926|           5|   81|
|      11|     19|           4|   11|
|      77|    926|           5|   77|
|      39|    277|           9|   39|
|      58|    277|           9|   58|
|      31|    277|           9|   31|
|      41|    415|           1|   41|
+--------+-------+------------+-----+
only showing top 20 rows



### Threading

In [17]:
import threading
import random

partitions = 12
n = 500000 * partitions

# Use different seeds in different threads and different partitions.
# mapPartitionsWithIndex only passes (index, iterator) to its function, so the
# per-thread seed offset is bound through a closure instead of being
# copy-pasted into five nearly identical functions.
def _make_sampler(seed_offset):
    """Build a partition function that samples points in the square [-1, 1)^2.

    The returned generator function takes (index, it), seeds its own random
    generator with `index + seed_offset`, and yields, for every element of
    `it`, 1 if a freshly drawn point lies strictly inside the unit circle
    and 0 otherwise.

    A private `random.Random` instance is used, so the module-level random
    state is left untouched; the stream of samples is the same as the one
    obtained by seeding the `random` module with the same value.
    """
    def sampler(index, it):
        rng = random.Random(index + seed_offset)
        for _ in it:
            x = rng.random() * 2 - 1
            y = rng.random() * 2 - 1
            yield 1 if x ** 2 + y ** 2 < 1 else 0
    return sampler


f1 = _make_sampler(987231)
f2 = _make_sampler(987232)
f3 = _make_sampler(987233)
f4 = _make_sampler(987234)
# NOTE: this offset breaks the 98723x pattern; it is kept unchanged so the
# samples stay the same, and it is still distinct from the other four.
f5 = _make_sampler(987245)

# One sampler per worker thread, indexed by the worker number.
f = [f1, f2, f3, f4, f5]
    
# the function executed in each thread/job
def dojob(i):
    """Run one Monte Carlo estimate of Pi as its own Spark job and print it.

    `i` selects the sampling function f[i]. The job covers the numbers
    1..n split into `partitions` partitions, sums the 0/1 values produced by
    the sampler, and reports 4 * count / n.
    """
    numbers = sc.parallelize(range(1, n + 1), partitions)
    hits = numbers.mapPartitionsWithIndex(f[i])
    count = hits.reduce(lambda a, b: a + b)
    estimate = 4.0 * count / n
    print("Worker", i, "reports: Pi is roughly", estimate)

# Launch five worker threads, each running dojob with its own worker number.
threads = []
for worker_id in range(5):
    worker = threading.Thread(target=dojob, args=(worker_id,))
    worker.start()
    threads.append(worker)

# Block until every worker has finished.
for worker in threads:
    worker.join()

Worker 3 reports: Pi is roughly 3.142332
Worker 1 reports: Pi is roughly 3.1421246666666667
Worker 2 reports: Pi is roughly 3.142312
Worker 0 reports: Pi is roughly 3.1423186666666667
Worker 4 reports: Pi is roughly 3.141875333333333
