## **Distributed Processing Challenges: Handling Data Skew in RDD PySpark**





In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

# Spark configuration: local mode using every available core.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("Dealing with skewd data")
)
# Memory tuning left disabled by the author:
#   .setExecutorEnv("spark.driver.memory","2g")
#   .setExecutorEnv("spark.executor.memory","4g")

# Reuse a running session when one exists, otherwise start one with `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Low-level RDD entry point; the bare name displays it as the cell output.
sc = spark.sparkContext
sc

In [3]:
import random
import numpy as np

## (1) Run a shuffle `groupByKey()` to see how the skew affects computation resources.

In [4]:
def salting(val):
    """Return `val` as a string with a random salt suffix.

    The result has the form "<val>_<n>", where n is an integer drawn
    uniformly from 0..5 (inclusive) via the `random` module.
    """
    return f"{val!s}_{random.randint(0, 5)}"

In [5]:
# Ten (key, value) pairs; key 2 occurs six times, so it dominates the data.
data_sample = [
    (1, 4), (2, 2), (2, 1), (3, 5), (2, 5),
    (2, 10), (2, 7), (3, 4), (2, 1), (4, 4),
]
# Distribute the pairs over 3 partitions.
rdd_smaple = sc.parallelize(data_sample, numSlices=3)

# glom() turns each partition into a list, exposing the partition layout.
rdd_smaple.glom().collect()

[[(1, 4), (2, 2), (2, 1)],
 [(3, 5), (2, 5), (2, 10)],
 [(2, 7), (3, 4), (2, 1), (4, 4)]]

In [6]:
# Fetch every pair back to the driver as one flat list (no partition boundaries).
rdd_smaple.collect()

[(1, 4),
 (2, 2),
 (2, 1),
 (3, 5),
 (2, 5),
 (2, 10),
 (2, 7),
 (3, 4),
 (2, 1),
 (4, 4)]

In [41]:
# groupByKey() shuffles all values that share a key into a single record.
rdd_sample_grouped = rdd_smaple.groupByKey()

# The grouped values arrive as an iterable; materialise them for printing.
for key, values in rdd_sample_grouped.collect():
    print(key, list(values))

# Partition layout after the shuffle: one (key, iterable) record per key.
rdd_sample_grouped.glom().collect()

3 [5, 4]
1 [4]
4 [4]
2 [2, 1, 5, 10, 7, 1]


[[(3, <pyspark.resultiterable.ResultIterable at 0x1e7331deeb8>)],
 [(1, <pyspark.resultiterable.ResultIterable at 0x1e7331de358>),
  (4, <pyspark.resultiterable.ResultIterable at 0x1e7331ea240>)],
 [(2, <pyspark.resultiterable.ResultIterable at 0x1e7331ea4e0>)]]

In [42]:
# Replace each key with its salted form; the values are left untouched.
rdd_sample_salting = rdd_smaple.map(lambda kv: (salting(kv[0]), kv[1]))

In [43]:
# Salting only rewrites keys, so the record count should match the input.
rdd_sample_salting.count()

10

In [44]:
# Print one salted (key, value) pair per line.
salted_pairs = rdd_sample_salting.collect()
for item in salted_pairs:
    print(item)

('1_2', 4)
('2_5', 2)
('2_2', 1)
('3_5', 5)
('2_4', 5)
('2_5', 10)
('2_2', 7)
('3_0', 4)
('2_5', 1)
('4_5', 4)


In [47]:
# NOTE(review): `rdd` is not created in any cell visible here — confirm it is
# defined before this cell so the notebook survives Restart & Run All.
# Group every value under its key and cache the grouped result for reuse.
grouped_rdd = rdd.groupByKey().cache()

# Add 10 to each grouped value; collect() pulls the full result to the driver.
grouped_rdd.map(lambda kv: (kv[0], [value + 10 for value in kv[1]])).collect()

[('b',
  [60,
   97,
   48,
   101,
   108,
   53,
   17,
   74,
   19,
   27,
   83,
   23,
   50,
   32,
   80,
   95,
   54,
   79,
   53,
   84,
   84,
   47,
   14,
   64,
   50,
   107,
   46,
   59,
   73,
   47,
   47,
   72,
   32,
   40,
   52,
   95,
   104,
   109,
   14,
   82,
   40,
   42,
   22,
   83,
   55,
   29,
   106,
   69,
   99,
   47,
   12,
   66,
   55,
   16,
   107,
   22,
   89,
   67,
   11,
   21,
   97,
   28,
   69,
   36,
   32,
   91,
   33,
   53,
   38,
   70,
   92,
   74,
   87,
   16,
   67,
   57,
   45,
   71,
   28,
   48,
   14,
   27,
   57,
   93,
   75,
   65,
   75,
   100,
   77,
   81,
   104,
   95,
   68,
   70,
   28,
   98,
   38,
   105,
   92,
   105,
   86,
   47,
   68,
   66,
   19,
   39,
   38,
   106,
   23,
   57,
   18,
   29,
   23,
   51,
   89,
   27,
   93,
   36,
   66,
   96,
   91,
   43,
   61,
   15,
   71,
   75,
   44,
   13,
   47,
   35,
   94,
   94,
   34,
   93,
   27,
   12,
   37,
   100,
   26,
   48,


In [46]:
# One record per key; the values show up as lazy ResultIterable objects.
grouped_rdd.collect()

[('b', <pyspark.resultiterable.ResultIterable at 0x1e707e1f358>),
 ('c', <pyspark.resultiterable.ResultIterable at 0x1e7079cef60>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x1e6f5c021d0>)]

## Mitigate data skewness: SALTING

In [25]:
def salting(val):
    """Build a salted key of the form "<val>_<n>".

    `val` is converted with str(); n is a random integer between 0 and 5,
    both ends included.
    """
    return "_".join((str(val), str(random.randint(0, 5))))

In [55]:
# Peek at the first five records instead of collecting the whole RDD.
rdd.take(5)

[('b', 32), ('b', 49), ('b', 97), ('b', 9), ('b', 75)]

In [56]:
# Salt every key so records sharing a hot key are spread over several sub-keys.
# NOTE(review): salting() draws from an unseeded `random`, and rdd_salting is
# not cached, so each recomputation may assign different salts.
rdd_salting = rdd.map(lambda kv: (salting(kv[0]), kv[1]))

# Group on the salted keys; this rebinds `grouped_rdd` from the unsalted run.
grouped_rdd = rdd_salting.groupByKey().cache()

# count() triggers the job and returns the number of grouped records.
grouped_rdd.map(lambda kv: (kv[0], [value + 10 for value in kv[1]])).count()

24

## (2) Run a shuffle `sortByKey()` to see how the skew affects computation resources.

In [7]:
# Sort the unsalted RDD by key, descending, into 4 output partitions.
# NOTE(review): `rdd` is not defined in the cells visible here — confirm it exists.
rdd_sort = rdd.sortByKey(ascending=False, numPartitions=4)
# count() is the action that actually runs the sort job.
rdd_sort.count()

6010810

## Mitigate data skewness: SALTING

In [23]:
def salting(val):
    """Return `val` with a random salt suffix, as "<val>_<n>".

    Args:
        val: The original key. It is converted with str(), so non-string
            keys (for example integers) are accepted.

    Returns:
        A string made of str(val), an underscore, and a random integer
        between 0 and 5 inclusive.
    """
    # str() keeps this consistent with the earlier definitions in the notebook
    # and avoids a TypeError when the key is not already a string.
    return str(val) + '_' + str(random.randint(0, 5))

In [8]:
# Same descending sort into 4 partitions, this time on the salted keys.
# Rebinds `rdd_sort`, replacing the unsalted result.
rdd_sort = rdd_salting.sortByKey(ascending=False, numPartitions=4)
# count() is the action that actually runs the sort job.
rdd_sort.count()

6010810

## (3) Run a shuffle `join()` to see how the skew affects computation resources.

In [9]:
# Two small pair RDDs with integer keys: 3 partitions and 2 partitions.
small_rdd1 = sc.parallelize(
    [(2, 3), (1, 3), (1, 4), (3, 1), (5, 1)],
    numSlices=3,
)
small_rdd2 = sc.parallelize(
    [(4, 3), (0, 1), (1, 2), (2, 1)],
    numSlices=2,
)

# Show the contents of each RDD, one per line.
for small_rdd in (small_rdd1, small_rdd2):
    print(small_rdd.collect())

[(2, 3), (1, 3), (1, 4), (3, 1), (5, 1)]
[(4, 3), (0, 1), (1, 2), (2, 1)]


In [10]:
# Inner join on key: only keys present in both RDDs survive, and each result
# is (key, (value_from_rdd1, value_from_rdd2)).
join1 = small_rdd1.join(small_rdd2)
join1.collect()

[(1, (3, 2)), (1, (4, 2)), (2, (3, 1))]

In [11]:
# Number of partitions in the joined RDD.
join1.getNumPartitions()

5

In [12]:
# Per-partition view of the join result; empty lists are partitions with no rows.
join1.glom().collect()

[[], [(1, (3, 2)), (1, (4, 2))], [(2, (3, 1))], [], []]

In [38]:
def salting(val):
    """Return `val` with a random salt suffix, as "<val>_<n>".

    Args:
        val: The original key. It is converted with str(), so the integer
            keys used by the join RDDs in this section are accepted.

    Returns:
        A string made of str(val), an underscore, and a random integer
        between 0 and 5 inclusive.
    """
    # Without str(), `val + '_'` raises TypeError for integer keys.
    return str(val) + '_' + str(random.randint(0, 5))