# **Working with RDD (Resilient Distributed Dataset)**

**`Udemy Course: Best Hands-on Big Data Practices and Use Cases using PySpark`**

**`Author: Amin Karami (PhD, FHEA)`**

---

**Resilient Distributed Dataset (RDD)**: The RDD is the fundamental data structure of Spark. It is a fault-tolerant (resilient), immutable, distributed collection of objects of any type.

source: https://spark.apache.org/docs/latest/rdd-programming-guide.html

source: https://spark.apache.org/docs/latest/api/python/reference/

In [15]:
########## ONLY in Colab ##########
!pip3 install pyspark
########## ONLY in Colab ##########

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
########## ONLY in Ubuntu Machine ##########
# Load Spark engine
!pip3 install -q findspark
import findspark
findspark.init()
########## ONLY in Ubuntu Machine ##########

In [5]:
# Linking with Spark
from pyspark import SparkContext, SparkConf

In [6]:
# Initializing Spark
# getOrCreate() reuses the active SparkContext when this cell is re-run;
# calling SparkContext(conf=conf) a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("RDD_practice").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
print(sc)

<SparkContext master=local[*] appName=RDD_practice>


# **Part 1: Create RDDs and Basic Operations**
# **There are two ways to create RDDs:**

1.   Parallelizing an existing collection in your driver program
2.   Referencing a dataset in an external storage system, such as a shared filesystem, HDFS, HBase, or any data source offering a Hadoop InputFormat.

In [2]:
# Generate sample data: the integers 1..100 in order (sequential, not random).
data = list(range(1, 101))

In [7]:
# Create RDD: distribute the driver-side list across 10 partitions.
rdd_data = sc.parallelize(data, numSlices=10)
rdd_data.getNumPartitions()

10

In [8]:
# collect(): bring every element of the RDD back to the driver as a Python list.
datacoll=rdd_data.collect()

In [9]:
# Data distribution in partitions: redistribute the RDD into 5 partitions.
rdd_data = rdd_data.repartition(numPartitions=5)
rdd_data.getNumPartitions()

5

In [10]:
# glom(): group the elements of each partition into a list, so collect()
# returns one inner list per partition (5 after the repartition above).
rdd_data.glom().collect()

[[61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100],
 [21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60]]

In [11]:
# Print last partition: glom().collect() returns one list per partition,
# so index -1 selects the final one.
rdd_data.glom().collect()[-1]

[41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60]

In [12]:
# count(): number of elements in the RDD.
rdd_data.count()

100

In [13]:
# first(): the first element of the RDD (in the recorded run, the first item
# of the first partition).
rdd_data.first()

61

In [14]:
# top(n): the n largest elements, returned in descending order.
rdd_data.top(15)

[100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86]

In [18]:
# distinct(): RDD of the unique elements; as the recorded output shows,
# the result does not keep the input order.
rdd_data.distinct().collect()

[65,
 70,
 85,
 90,
 5,
 10,
 95,
 100,
 25,
 30,
 35,
 40,
 75,
 80,
 15,
 20,
 45,
 50,
 55,
 60,
 61,
 66,
 81,
 86,
 1,
 6,
 91,
 96,
 21,
 26,
 31,
 36,
 71,
 76,
 11,
 16,
 41,
 46,
 51,
 56,
 62,
 67,
 82,
 87,
 2,
 7,
 92,
 97,
 22,
 27,
 32,
 37,
 72,
 77,
 12,
 17,
 42,
 47,
 52,
 57,
 63,
 68,
 83,
 88,
 3,
 8,
 93,
 98,
 23,
 28,
 33,
 38,
 73,
 78,
 13,
 18,
 43,
 48,
 53,
 58,
 64,
 69,
 84,
 89,
 4,
 9,
 94,
 99,
 24,
 29,
 34,
 39,
 74,
 79,
 14,
 19,
 44,
 49,
 54,
 59]

In [21]:
# map(): turn every element into an (element, 1) pair.
my_rdd = rdd_data.map(lambda value: (value, 1))
my_rdd.collect()

[(61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1)]

In [22]:
# filter(): keep only the elements for which the predicate returns True.
# The predicate must return a boolean; a tuple such as (x, 1) is always truthy
# and would let every element through. Here only the even numbers are kept.
my_rdd1 = rdd_data.filter(lambda x: x % 2 == 0)
print(my_rdd1.collect())

[62, 64, 66, 68, 70, 82, 84, 86, 88, 90, 2, 4, 6, 8, 10, 92, 94, 96, 98, 100, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 72, 74, 76, 78, 80, 12, 14, 16, 18, 20, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60]


In [23]:
# flatMap(): map each element to an iterable and flatten the results into one RDD.
# The lambda returns a flat list, so every x contributes x, x+1 and x+2 as
# separate elements (a nested [[...]] would yield one list per x, like map()).
my_rdd2 = rdd_data.flatMap(lambda x: [x, x + 1, x + 2])
# Transformation only: nothing is computed until an action such as collect() runs.

In [25]:
# Descriptive statistics: stats() reports count, mean, stdev, max and min
# in a single result.
rdd_data.stats()

(count: 100, mean: 50.5, stdev: 28.86607004772212, max: 100.0, min: 1.0)

In [None]:
# mapPartitions():


# **Part 2: Advanced RDD Transformations and Actions**

In [26]:
# union(): combine rdd_data with itself into a new RDD.
# Only the transformation is defined here; no action is run on data1.
data1 = rdd_data.union(rdd_data)

In [28]:
# intersection(): elements present in both RDDs. Intersecting rdd_data with
# itself gives each of its values once, in no particular order.
rdd_data.intersection(rdd_data).collect()

[70,
 90,
 10,
 100,
 30,
 40,
 80,
 20,
 50,
 60,
 61,
 81,
 1,
 91,
 21,
 31,
 71,
 11,
 41,
 51,
 62,
 82,
 2,
 92,
 22,
 32,
 72,
 12,
 42,
 52,
 63,
 83,
 3,
 93,
 23,
 33,
 73,
 13,
 43,
 53,
 64,
 84,
 4,
 94,
 24,
 34,
 74,
 14,
 44,
 54,
 65,
 85,
 5,
 95,
 25,
 35,
 75,
 15,
 45,
 55,
 66,
 86,
 6,
 96,
 26,
 36,
 76,
 16,
 46,
 56,
 67,
 87,
 7,
 97,
 27,
 37,
 77,
 17,
 47,
 57,
 68,
 88,
 8,
 98,
 28,
 38,
 78,
 18,
 48,
 58,
 69,
 89,
 9,
 99,
 29,
 39,
 79,
 19,
 49,
 59]

In [41]:
# Find empty partitions: glom() gives one list per partition, so count
# the lists that contain no elements.
partitions = rdd_data.glom().collect()
empty = sum(1 for partition in partitions if not partition)
print(empty)

0


In [32]:
# Show the per-partition layout again (one inner list per partition).
rdd_data.glom().collect()

[[61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100],
 [21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80],
 [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
 [41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60]]

In [44]:
# coalesce(numPartitions): returns a new RDD with the requested number of partitions.
# NOTE: the result is not assigned, so rdd_data itself is left unchanged.
rdd_data.coalesce(2)

CoalescedRDD[52] at coalesce at NativeMethodAccessorImpl.java:0

In [45]:
# takeSample(withReplacement, num, [seed]): 10 random elements drawn with
# replacement, using a fixed seed of 12.
rdd_data.takeSample(withReplacement=True, num=10, seed=12)

[72, 92, 85, 35, 16, 3, 39, 5, 67, 48]

In [46]:
# takeOrdered(n, [ordering]): the first 6 elements when ordered by the key;
# negating each value yields the 6 largest.
rdd_data.takeOrdered(6, key=lambda value: -value)

[100, 99, 98, 97, 96, 95]

In [47]:
# reduce(): fold all elements into one value with a binary function (here: the sum).
rdd_data.reduce(lambda total, value: total + value)

5050

In [48]:
# Small key-value (pair) RDD used by the *ByKey examples that follow.
new = sc.parallelize([("a", 1), ("b", 2), ("a", 1)])

In [52]:
# reduceByKey(): merge the values of each key with the given function (sum per key).
new20 = new.reduceByKey(lambda left, right: left + right)
new20.collect()

[('b', 2), ('a', 2)]

In [53]:
# sortByKey(): sort a pair RDD by key.
# Raises an error if the elements are not (key, value) pairs.
sorted_rdd=new.sortByKey()
sorted_rdd.collect()

[('a', 1), ('a', 1), ('b', 2)]

In [54]:
# countByKey(): count the elements for each key.
# NOTE: despite the variable name, the result is a dict (a defaultdict),
# not an RDD.
count_rdd=new.countByKey()
count_rdd

defaultdict(int, {'a': 2, 'b': 1})

In [55]:
# groupByKey(): group the values of each key.
# The grouped values are ResultIterable objects, so collect() shows their
# repr rather than the values; use e.g. group_rdd.mapValues(list).collect()
# to display them.
group_rdd=new.groupByKey()
group_rdd.collect()

[('b', <pyspark.resultiterable.ResultIterable at 0x7f126cd58150>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x7f126cd58650>)]

In [56]:
# lookup(key): list of the values stored under the given key.
new.lookup("b")

[2]

In [57]:
# cache():
# By default, each transformed RDD may be recomputed each time you run an action on it.
# However, you may also persist an RDD in memory using the persist (or cache) method,
# in which case Spark will keep the elements around on the cluster for much faster
# access the next time you query it.
rdd_data.cache()

MapPartitionsRDD[4] at coalesce at NativeMethodAccessorImpl.java:0

In [58]:
# Persistence (https://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence)
# persist() marks the RDD for storage and returns the same RDD;
# is_cached then reports whether it is marked as persisted.
rdd_data.persist()
caching = rdd_data.is_cached
caching

True