# 1. RDD Operations

#### Import findspark to use Spark in a Jupyter notebook

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext

#### Initialize SparkContext

In [2]:
sc=SparkContext.getOrCreate()

#### Create Resilient Distributed Dataset (RDD)

In [37]:
# Three-field string records; the first field of each record is a country name.
# Duplicate rows are kept on purpose.
data = sc.parallelize([
    ['France', '50M', '3T'],
    ['India', '30M', '30T'],
    ['Kenya', '70M', '25T'],
    ['Nigeria', '90M', '60T'],
    ['China', '20M', '2T'],
    ['USA', '80M', '30T'],
    ['UK', '70M', '25T'],
    ['USA', '20M', '30T'],
    ['China', '70M', '25T'],
    ['France', '50M', '3T'],
    ['China', '70M', '25T'],
])

# Plain integers (with repeats) used by the numeric cells.
data2 = sc.parallelize([23, 42, 78, 90, 12, 9, 42, 89, 42, 12, 50, 60])

# Two (key, value) pair RDDs with partially overlapping keys, used by the
# join and set-operation cells.
a = sc.parallelize([('USA', 35), ('Canada', 24), ('Mexico', 27), ('Kenya', 23)])
b = sc.parallelize([('Kenya', 30), ('USA', 35), ('South Africa', 23), ('Rwanda', 23)])

# Printing an RDD shows its description only; nothing is computed or collected here.
print('data\n', data, '\ndata2\n', data2, '\na\n', a, '\nb\n', b)

data
 ParallelCollectionRDD[78] at readRDDFromFile at PythonRDD.scala:262 
data2
 ParallelCollectionRDD[79] at readRDDFromFile at PythonRDD.scala:262 
a
 ParallelCollectionRDD[80] at readRDDFromFile at PythonRDD.scala:262 
b
 ParallelCollectionRDD[81] at readRDDFromFile at PythonRDD.scala:262


#### Show data from RDD

In [4]:
data.collect()

[['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['China', '20M', '2T'],
 ['USA', '80M', '30T'],
 ['UK', '70M', '25T'],
 ['USA', '20M', '30T'],
 ['China', '70M', '25T'],
 ['France', '50M', '3T'],
 ['China', '70M', '25T']]

## i. Actions
Spark action functions return a value to the Spark driver program

Persist the RDD and check that it is marked as cached

In [5]:
data.persist().is_cached

True

Get number of partitions

In [6]:
data.getNumPartitions()

8

Show all data

In [7]:
data.collect()

[['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['China', '20M', '2T'],
 ['USA', '80M', '30T'],
 ['UK', '70M', '25T'],
 ['USA', '20M', '30T'],
 ['China', '70M', '25T'],
 ['France', '50M', '3T'],
 ['China', '70M', '25T']]

Get number of items

In [8]:
len(data.collect())

11

Get distinct Records

In [9]:
sorted(data2.collect())

[9, 12, 12, 23, 42, 42, 42, 50, 60, 78, 89, 90]

In [10]:
sorted(data2.distinct().collect())

[9, 12, 23, 42, 50, 60, 78, 89, 90]

Get max value from the list

In [11]:
data2.max()

90

Get min value from the list

In [12]:
data2.min()

9

Get sum value from the list

In [13]:
data2.sum()

549

Get average

In [14]:
data2.mean()

45.75

Get variance value from the list

In [15]:
data2.variance()

773.1875000000001

Get standard deviation value from the list

In [16]:
data2.stdev()

27.80624929759496

Get count, mean, standard deviation, max and min values in a single call

In [17]:
data2.stats()

(count: 12, mean: 45.75, stdev: 27.80624929759496, max: 90.0, min: 9.0)

Show first record

In [18]:
data.first()

['France', '50M', '3T']

Get random record

In [19]:
# Signature is takeSample(withReplacement, num, seed=None); keyword arguments
# keep the flag and the sample size from being swapped.
data.takeSample(withReplacement=False, num=1)

[['France', '50M', '3T']]

Count records

In [20]:
data.count()

11

Count occurrences of each key (the first field of each record)

data.countByKey().items()

## ii. Spark Transformation Functions
Spark transformation functions produce a new Resilient Distributed Dataset (RDD)

In [21]:
tran_data=sc.parallelize([4,6,8,2,2,6])
tran_data.collect()

[4, 6, 8, 2, 2, 6]

Draw a random sample with replacement (fraction 1.0, so the expected sample size equals the RDD size and items may repeat)

In [22]:
# Signature is sample(withReplacement, fraction, seed=None). `fraction` is the
# expected share of the RDD to draw, not an item count, so the result size varies.
tran_data.sample(withReplacement=True, fraction=1.0).collect()

[4, 6, 6, 6, 2, 6]

Use map to multiply each item with 2

In [23]:
tran_data.map(lambda x : x*2).collect()

[8, 12, 16, 4, 4, 12]

Use flatMap to duplicate the items

In [24]:
tran_data.flatMap(lambda x : [x,x]).collect()

[4, 4, 6, 6, 8, 8, 2, 2, 2, 2, 6, 6]

Filter from data where item is China

In [25]:
data.filter(lambda x : "China" in x).collect()

[['China', '20M', '2T'], ['China', '70M', '25T'], ['China', '70M', '25T']]

Return unique items

In [26]:
tran_data.distinct().collect()

[8, 2, 4, 6]

Sorting

In [27]:
# Signature is sortByKey(ascending=True, numPartitions=None, keyfunc=...).
# Sorts ascending on the first field of each record, using a single partition.
data.sortByKey(ascending=True, numPartitions=1).collect()

[['China', '20M', '2T'],
 ['China', '70M', '25T'],
 ['China', '70M', '25T'],
 ['France', '50M', '3T'],
 ['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['UK', '70M', '25T'],
 ['USA', '80M', '30T'],
 ['USA', '20M', '30T']]

Join two RDDs

In [28]:
print("a : ",a.collect())
print("b : ",b.collect())

a :  [('USA', 35), ('Canada', 24), ('Mexico', 27), ('Kenya', 23)]
b :  [('Kenya', 30), ('USA', 35), ('South Africa', 23), ('Rwanda', 23)]


In [29]:
a.join(b).collect()

[('USA', (35, 35)), ('Kenya', (23, 30))]

Left outer join

In [30]:
a.leftOuterJoin(b).collect()

[('USA', (35, 35)),
 ('Mexico', (27, None)),
 ('Canada', (24, None)),
 ('Kenya', (23, 30))]

Right outer join

In [31]:
a.rightOuterJoin(b).collect()

[('USA', (35, 35)),
 ('South Africa', (None, 23)),
 ('Kenya', (23, 30)),
 ('Rwanda', (None, 23))]

Union

In [32]:
a.union(b).collect()

[('USA', 35),
 ('Canada', 24),
 ('Mexico', 27),
 ('Kenya', 23),
 ('Kenya', 30),
 ('USA', 35),
 ('South Africa', 23),
 ('Rwanda', 23)]

Difference

In [33]:
a.subtract(b).collect()

[('Kenya', 23), ('Canada', 24), ('Mexico', 27)]

Intersection

In [34]:
a.intersection(b).collect()

[('USA', 35)]

Cartesian

In [35]:
a.cartesian(b).collect()

[(('USA', 35), ('Kenya', 30)),
 (('USA', 35), ('USA', 35)),
 (('USA', 35), ('South Africa', 23)),
 (('USA', 35), ('Rwanda', 23)),
 (('Canada', 24), ('Kenya', 30)),
 (('Canada', 24), ('USA', 35)),
 (('Canada', 24), ('South Africa', 23)),
 (('Canada', 24), ('Rwanda', 23)),
 (('Mexico', 27), ('Kenya', 30)),
 (('Mexico', 27), ('USA', 35)),
 (('Mexico', 27), ('South Africa', 23)),
 (('Mexico', 27), ('Rwanda', 23)),
 (('Kenya', 23), ('Kenya', 30)),
 (('Kenya', 23), ('USA', 35)),
 (('Kenya', 23), ('South Africa', 23)),
 (('Kenya', 23), ('Rwanda', 23))]