# ***PySpark RDD (Resilient Distributed Datasets)***

In [None]:
import pyspark

In [None]:
from pyspark import SparkContext
# SparkContext is the central entry point for creating Resilient Distributed Datasets (RDDs) and running other Spark operations. It serves as the interface between your application and the Spark cluster.

In [None]:
# Obtain the SparkContext used by every example below; getOrCreate() hands back
# an already-instantiated context when one exists instead of always building a
# new one, so this cell can be re-run safely.
sc = SparkContext.getOrCreate()

### ***Operations with PySpark RDD***

**Transformation**






**1. map()**





In [None]:
# map() is a transformation: it applies the given function to every element
# of the RDD and returns a new RDD made of the results (here: the squares).
ex_rdd = sc.parallelize([1, 2, 3, 4, 5])
squares_rdd = ex_rdd.map(lambda value: value ** 2)
print(squares_rdd.collect())

[1, 4, 9, 16, 25]


**2. filter()**

In [None]:
# filter() is a transformation: it builds a new RDD containing only those
# elements of the original RDD that satisfy the given condition
# (here: the even numbers).
filter_rdd = sc.parallelize([11, 12, 13, 14, 15])
evens_rdd = filter_rdd.filter(lambda number: number % 2 == 0)
print(evens_rdd.collect())

[12, 14]


**3. union()**

In [None]:
# union() is a transformation: it combines two RDDs into a single RDD by
# stacking their elements. Duplicates are kept, so an element present in both
# inputs shows up twice (99 is a multiple of both 3 and 9 here).
# The source RDD is called input_rdd so the built-in input() is not shadowed.
input_rdd = sc.parallelize([24, 42, 53, 68, 71, 86, 99])
u_rdd1 = input_rdd.filter(lambda x: x % 3 == 0)  # multiples of 3
u_rdd2 = input_rdd.filter(lambda x: x % 9 == 0)  # multiples of 9
print(u_rdd1.union(u_rdd2).collect())

[24, 42, 99, 99]


**4. flatMap()**

In [None]:
# flatMap() is a transformation similar to map(), but the function may return
# several output elements for each input element; all of them are flattened
# into one RDD. Here each sentence is split on spaces into its words.
fmap_rdd = sc.parallelize(["Hello World", "Welcome to PySpark RDD"])
# Printed explicitly (like the other examples) so the result is shown even
# when this code is run outside an interactive notebook cell.
print(fmap_rdd.flatMap(lambda x: x.split(" ")).collect())

['Hello', 'World', 'Welcome', 'to', 'PySpark', 'RDD']

### ***Operations with PySpark RDD***

## **Actions**









**1. collect()**

In [None]:
# collect() is an action: it retrieves all elements of the RDD from the Spark
# cluster and returns them to the driver program.
coll_rdd = sc.parallelize([15, 24, 33, 42, 51])
collected_elements = coll_rdd.collect()
print(collected_elements)

[15, 24, 33, 42, 51]


**2. count()**

In [None]:
# count() is an action: it returns the number of elements in the RDD.
cnt_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])
element_count = cnt_rdd.count()
print(element_count)

9


**3. take()**

In [None]:
# take(n) is an action: it retrieves the first n elements of the RDD and
# returns them as a list (here: the first 4 elements).
tk_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])
first_four = tk_rdd.take(4)
print(first_four)

[1, 2, 3, 4]


**4. reduce()**

In [None]:
# reduce() is an action: it aggregates the elements of the RDD using the
# given commutative and associative binary operator (here: addition, so the
# result is the sum of all elements).
red_rdd = sc.parallelize([1, 2, 3, 4, 5])
total = red_rdd.reduce(lambda left, right: left + right)
print(total)

15


**5. saveAsTextFile()**

In [None]:
# saveAsTextFile() is an action: it writes the elements of the RDD as text
# to the given path, e.g. a directory in the Hadoop Distributed File System
# (HDFS).
output_dir = '/path/to/output/our_directory'
sv_rdd = sc.parallelize([1, 2, 3, 4, 5])
sv_rdd.saveAsTextFile(output_dir)

**Operations with PySpark Pair RDD**

**Transformation**






**1. reduceByKey()**

In [None]:
# reduceByKey() is a transformation on Pair RDDs: for every distinct key it
# reduces all values of that key with the given function.
# Example: ('Amilia', 43) and ('Amilia', 38) become ('Amilia', 43 + 38).
mks_rdd = sc.parallelize([
    ('Jesse', 38), ('Frank', 49), ('Amilia', 43),
    ('Marie', 33), ('Gomez', 48), ('Jesse', 23),
    ('Frank', 45), ('Amilia', 38), ('Marie', 36),
])
totals_rdd = mks_rdd.reduceByKey(lambda left, right: left + right)
print(totals_rdd.collect())


[('Amilia', 81), ('Marie', 69), ('Gomez', 48), ('Jesse', 61), ('Frank', 94)]


**2. groupByKey()**

In [None]:
# groupByKey() is a transformation on Pair RDDs: it groups the values of each
# key into an iterable, which is turned into a list below for printing.
mks_rdd = sc.parallelize([
    ('Jesse', 38), ('Frank', 49), ('Amilia', 43),
    ('Marie', 33), ('Gomez', 48), ('Jesse', 23),
    ('Frank', 45), ('Amilia', 38), ('Marie', 36),
])
grouped_rdd = mks_rdd.groupByKey()
dct_rdd = grouped_rdd.collect()

for key, value in dct_rdd:
    print(key, list(value))

Amilia [43, 38]
Marie [33, 36]
Gomez [48]
Jesse [38, 23]
Frank [49, 45]


**3. sortByKey()**

In [None]:
# sortByKey() is a transformation on Pair RDDs: it sorts the elements by
# their keys, in ascending or descending order (ascending when called
# without arguments, as here).
srt_rdd = sc.parallelize([(1, 5), (1, 10), (2, 4), (3, 1), (2, 6)])
sorted_rdd = srt_rdd.sortByKey()
print(sorted_rdd.collect())

[(1, 5), (1, 10), (2, 4), (2, 6), (3, 1)]


**Operations with PySpark Pair RDD**

**Action**





**1. countByKey()**

In [None]:
# countByKey() is an action on Pair RDDs: it counts how many times each
# unique key occurs and returns the counts as a dictionary-like mapping.
mks_rdd = sc.parallelize([
    ('Jesse', 38), ('Frank', 49), ('Amilia', 43),
    ('Marie', 33), ('Gomez', 48), ('Jesse', 23),
    ('Frank', 45), ('Amilia', 38), ('Marie', 36),
])
key_counts = mks_rdd.countByKey()
dct_rdd = key_counts.items()

for key, value in dct_rdd:
    print(key, value)

Jesse 2
Frank 2
Amilia 2
Marie 2
Gomez 1
