In [1]:
from __future__ import print_function
import findspark
import os
import sys
# Tell findspark where the Spark distribution lives. setdefault() keeps a
# SPARK_HOME that is already exported in the environment, so the path below is
# only a fallback and the notebook is not tied to one machine's directory layout.
os.environ.setdefault('SPARK_HOME', r'/Users/subham/Downloads/spark-3.0.0-bin-hadoop2.7')
# Kept ahead of the pyspark imports that follow in this cell.
findspark.init()
findspark.find()
import pyspark

from pyspark import SparkContext
from pyspark import SparkConf

## Creating Spark Context

In [2]:
# Application configuration: the app name and the master URL.
conf = SparkConf().setAppName('SparkRDD').setMaster('local')
# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel; calling the SparkContext constructor a
# second time would fail because only one context may be active at once.
sc = SparkContext.getOrCreate(conf=conf)

- `conf` is the configuration object for a Spark application
- we set the application name (`AppName`) and the master URL in it
- `sc` is the `SparkContext` instance

## Creating RDD using list

In [4]:
# Turn a local Python list into an RDD.
values = [1, 2, 3, 4, 5]
rdd = sc.parallelize(values)

In [7]:
# Show the first five elements of the RDD.
rdd.take(num=5)

[1, 2, 3, 4, 5]

## Loading text file to Spark

In [9]:
# Read Spark.txt (path relative to the working directory) into an RDD.
rdd = sc.textFile(name='Spark.txt')

### Print the RDD data

In [10]:
# collect() returns every element of the RDD as a Python list.
rdd.collect()

['Apache Spark with python is Pyspark']

## RDD Persistence

In [13]:
# Odd numbers 1, 3, ..., 9999 as an RDD.
aba = sc.parallelize(range(1, 10000, 2))
# Mark the RDD as persisted; the cell displays the RDD that persist() returns.
aba.persist()

PythonRDD[9] at RDD at PythonRDD.scala:53

In [15]:
# First five elements of the persisted RDD.
aba.take(num=5)

[1, 3, 5, 7, 9]

## RDD Caching

In [16]:
# Load the text file and mark the resulting RDD as cached.
textFile = sc.textFile(name='Spark.txt')
textFile.cache()

Spark.txt MapPartitionsRDD[12] at textFile at NativeMethodAccessorImpl.java:0

## Transformations

## Map

In [18]:
x = sc.parallelize(["spark", "rdd", "example", "sample", "example"])
# Pair every word with the number 1.
y = x.map(lambda word: (word, 1))
y.collect()

[('spark', 1), ('rdd', 1), ('example', 1), ('sample', 1), ('example', 1)]

## Filter

In [79]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 8, 24])
# Keep only the even numbers.
(
    rdd
    .filter(lambda n: n % 2 == 0)
    .collect()
)

[2, 4, 6, 8, 24]

b'(2) PythonRDD[254] at RDD at PythonRDD.scala:53 []\n |  MapPartitionsRDD[252] at mapPartitions at PythonRDD.scala:133 []\n |  ShuffledRDD[251] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(2) PairwiseRDD[250] at sortBy at <ipython-input-83-546d9f556f11>:6 []\n    |  PythonRDD[249] at sortBy at <ipython-input-83-546d9f556f11>:6 []\n    |  UnionRDD[246] at union at NativeMethodAccessorImpl.java:0 []\n    |  PythonRDD[244] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[241] at readRDDFromFile at PythonRDD.scala:262 []\n    |  PythonRDD[245] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[243] at readRDDFromFile at PythonRDD.scala:262 []'

## Union

In [20]:
# Two overlapping ranges: 1..8 and 5..14.
a = sc.parallelize(range(1, 9))
b = sc.parallelize(range(5, 15))
# union() appends b's elements after a's; the shared values 5..8 appear twice.
(
    a.union(b)
    .collect()
)

[1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

## Intersection

In [21]:
# Two overlapping ranges: 1..8 and 5..14.
a = sc.parallelize(range(1, 9))
b = sc.parallelize(range(5, 15))
# Only the values present in both RDDs survive; the result is not sorted.
(
    a.intersection(b)
    .collect()
)

[6, 8, 5, 7]

## Distinct

In [24]:
# Two overlapping ranges: 1..8 and 5..14.
a = sc.parallelize(range(1, 9))
b = sc.parallelize(range(5, 15))
# distinct() drops the duplicates that union() keeps; sort locally for display.
sorted(
    a.union(b)
    .distinct()
    .collect()
)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

## SortBy

In [27]:
y = sc.parallelize([1, 3, 2, 1, 4])
# ascending=True sorts low-to-high, ascending=False sorts high-to-low.
y.sortBy(lambda value: value, ascending=False).collect()

[4, 3, 2, 1, 1]

In [29]:
y = sc.parallelize([("H", 10), ("A", 1), ("Z", 3)])
# The key function returns the whole tuple, so elements are ordered by the
# (letter, number) pair, not by the letter alone. ascending=True sorts
# low-to-high, ascending=False high-to-low.
y.sortBy(lambda pair: pair, ascending=True).collect()

[('A', 1), ('H', 10), ('Z', 3)]

## GroupBy

In [34]:
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
# Group by parity: the key is True for even numbers and False for odd ones.
result = rdd.groupBy(lambda n: n % 2 == 0).collect()
# Sort each group's members, and the groups themselves, for display.
sorted((is_even, sorted(members)) for is_even, members in result)

[(False, [1, 1, 3, 5]), (True, [2, 8])]

## Zip

In [35]:
# Two RDDs of equal length: 1..4 and 1001..1004.
a = sc.parallelize(range(1, 5))
b = sc.parallelize(range(1001, 1005))
# zip() pairs the elements position by position.
(
    a.zip(b)
    .collect()
)

[(1, 1001), (2, 1002), (3, 1003), (4, 1004)]

## Repartition (choosing the number of partitions)

In [37]:
# The second argument of parallelize() is the number of partitions requested.
a = sc.parallelize([1, 2, 3, 4, 5, 6, 7], numSlices=4)
# glom() turns each partition into a list, making the split visible.
sorted(a.glom().collect())

[[1], [2, 3], [4, 5], [6, 7]]

In [38]:
# Eight elements spread over four partitions.
a = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8], numSlices=4)
# One list per partition.
sorted(a.glom().collect())

[[1, 2], [3, 4], [5, 6], [7, 8]]

## Coalesce 
- Reduce the number of partitions in RDD


In [40]:
# Baseline before coalescing: seven elements in 3 partitions, one list each.
(
    sc.parallelize([1, 2, 3, 4, 5, 6, 7], numSlices=3)
    .glom()
    .collect()
)

[[1, 2], [3, 4], [5, 6, 7]]

In [44]:
# coalesce(2) reduces the RDD from 3 partitions to 2.
(
    sc.parallelize([1, 2, 3, 4, 5, 6, 7], numSlices=3)
    .coalesce(numPartitions=2)
    .glom()
    .collect()
)

[[1, 2], [3, 4, 5, 6, 7]]

## Actions

## Reduce

In [46]:
from operator import add, mul

# Fold all elements together with addition.
(
    sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])
    .reduce(add)
)

36

## First

In [48]:
# first() returns the leading element of the RDD.
(
    sc.parallelize([10, 2, 3, 4, 5, 6, 7, 8])
    .first()
)

10

## Take

In [52]:
a = sc.parallelize([10, 12, 3, 4, 155, 6, 37, 58])
# take() returns the elements in their original order.
a.take(num=8)

[10, 12, 3, 4, 155, 6, 37, 58]

## takeOrdered

In [51]:
a = sc.parallelize([10, 12, 3, 4, 155, 6, 37, 58])
# takeOrdered() returns the elements sorted in ascending order.
a.takeOrdered(num=8)

[3, 4, 6, 10, 12, 37, 58, 155]

## Count

In [53]:
# Number of elements in the RDD.
a = sc.parallelize([10, 12, 3, 4, 155, 6, 37, 58])
a.count()

8

## Collect

In [55]:
x = sc.parallelize(["spark", "rdd", "example", "sample", "example"])
# Remove the repeated word, then bring the remaining elements back as a list.
(
    x.distinct()
    .collect()
)

['spark', 'rdd', 'example', 'sample']

## Few Mathematical operations


In [71]:
a = sc.parallelize([10, 12, 3, 4, 155, 6, 37, 58])
# Summary statistics shown together as one tuple:
# sum, maximum, mean, variance, standard deviation.
(
    a.sum(),
    a.max(),
    a.mean(),
    a.variance(),
    a.stdev(),
)

(285, 155, 35.62499999999999, 2363.734375, 48.61825145971418)

## CountByValue
- similar to `collections.Counter`: maps each distinct value to the number of times it occurs

In [72]:
a = sc.parallelize([10, 58, 12, 6, 4, 155, 10, 6, 37, 58, 10])
# Maps each distinct value to the number of times it occurs.
a.countByValue()

defaultdict(int, {10: 3, 58: 2, 12: 1, 6: 2, 4: 1, 155: 1, 37: 1})

## Creating paired RDDs

In [77]:
rdd = sc.parallelize([
    ('a1', 'b1', 'c1', 'd1'),
    ('a2', 'b2', 'c2', 'd2', 'e2', 'f2'),
])
# First field becomes the key; the remaining fields become the value list.
result = rdd.map(lambda row: (row[0], list(row[1:])))
result.collect()

[('a1', ['b1', 'c1', 'd1']), ('a2', ['b2', 'c2', 'd2', 'e2', 'f2'])]

## reduceByKey

In [76]:
from operator import add, mul

# Values that share a key are combined with addition: (3, 4) and (3, 6) -> (3, 10).
(
    sc.parallelize([(1, 2), (3, 4), (3, 6)])
    .reduceByKey(add)
    .collect()
)

[(1, 2), (3, 10)]

## Lineage Graph

In [None]:
# Build a small multi-step pipeline whose lineage is inspected in the next
# cell: parallelize -> union -> sortBy (descending).
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 8, 24])
b = sc.parallelize(range(100, 106))
unions = rdd.union(b)
# ascending=False sorts high-to-low.
a = unions.sortBy(lambda value: value, ascending=False)
# glom() exposes the per-partition contents of the sorted RDD.
sorted(a.glom().collect())

In [90]:
# toDebugString() comes back as bytes here, which the notebook would show as a
# single escaped b'...\n...' line. Decode and print it so the lineage renders
# as an indented tree; the isinstance guard tolerates a str return as well.
lineage = a.toDebugString()
print(lineage.decode('utf-8') if isinstance(lineage, bytes) else lineage)

b'(2) PythonRDD[254] at RDD at PythonRDD.scala:53 []\n |  MapPartitionsRDD[252] at mapPartitions at PythonRDD.scala:133 []\n |  ShuffledRDD[251] at partitionBy at NativeMethodAccessorImpl.java:0 []\n +-(2) PairwiseRDD[250] at sortBy at <ipython-input-83-546d9f556f11>:6 []\n    |  PythonRDD[249] at sortBy at <ipython-input-83-546d9f556f11>:6 []\n    |  UnionRDD[246] at union at NativeMethodAccessorImpl.java:0 []\n    |  PythonRDD[244] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[241] at readRDDFromFile at PythonRDD.scala:262 []\n    |  PythonRDD[245] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[243] at readRDDFromFile at PythonRDD.scala:262 []'