Import libraries

In [25]:
import findspark
findspark.init()
from pyspark import SparkContext

Initialize SparkContext

In [26]:
sc=SparkContext.getOrCreate()

Create data

In [27]:
data=sc.parallelize([['France','50M','3T'],['India','30M','30T'],['Kenya','70M','25T'],['Nigeria','90M','60T'],['China','20M','2T'],['USA','80M','30T'],['UK','70M','25T'],['USA','20M','30T'],['China','70M','25T']])

#### i. Sparks Action Functions
Sparks action functions produce a value back to the Spark driver program

Get number of partitions

In [28]:
data.getNumPartitions()

8

Show all data

In [29]:
data.collect()

[['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['China', '20M', '2T'],
 ['USA', '80M', '30T'],
 ['UK', '70M', '25T'],
 ['USA', '20M', '30T'],
 ['China', '70M', '25T']]

Show first column

In [30]:
data.first()

['France', '50M', '3T']

Get random record

In [31]:
data.takeSample(1,True)

[['China', '20M', '2T']]

Count records

In [32]:
data.count()

9

Count occurence of the items

In [33]:
data.countByKey().items()

dict_items([('France', 1), ('India', 1), ('Kenya', 1), ('Nigeria', 1), ('China', 2), ('USA', 2), ('UK', 1)])

#### ii. Sparks Transformation Functions
Sparks transformation functions produce a new Resilient Distributed Dataset (RDD)

In [34]:
tran_data=sc.parallelize([4,6,8,2,2,6])
tran_data.collect()

[4, 6, 8, 2, 2, 6]

Select two random items

In [35]:
tran_data.sample(2,True).collect()

[4, 6, 6, 6, 2]

Use map to multiply each item with 2

In [36]:
tran_data.map(lambda x : x*2).collect()

[8, 12, 16, 4, 4, 12]

Use filterMap to duplicate the items

In [37]:
tran_data.flatMap(lambda x : [x,x]).collect()

[4, 4, 6, 6, 8, 8, 2, 2, 2, 2, 6, 6]

Filter from data where item is China

In [38]:
data.filter(lambda x : "China" in x).collect()

[['China', '20M', '2T'], ['China', '70M', '25T']]

Return unique items

In [39]:
tran_data.distinct().collect()

[8, 2, 4, 6]

Sorting

In [40]:
data.sortByKey(1, True).collect()

[['China', '20M', '2T'],
 ['China', '70M', '25T'],
 ['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['UK', '70M', '25T'],
 ['USA', '80M', '30T'],
 ['USA', '20M', '30T']]

In [41]:
a=sc.parallelize([('USA',35),('Canada',24),('Mexico',27),('Kenya',23)])
b=sc.parallelize([('Kenya',30),('USA',35),('South Africa',23),('Rwanda',23)])
print("a : ",a.collect())
print("b : ",b.collect())

a :  [('USA', 35), ('Canada', 24), ('Mexico', 27), ('Kenya', 23)]
b :  [('Kenya', 30), ('USA', 35), ('South Africa', 23), ('Rwanda', 23)]


join

In [42]:
a.join(b).collect()

[('USA', (35, 35)), ('Kenya', (23, 30))]

Left outer join

In [43]:
a.leftOuterJoin(b).collect()

[('USA', (35, 35)),
 ('Mexico', (27, None)),
 ('Canada', (24, None)),
 ('Kenya', (23, 30))]

Right outer join

In [44]:
a.rightOuterJoin(b).collect()

[('USA', (35, 35)),
 ('South Africa', (None, 23)),
 ('Kenya', (23, 30)),
 ('Rwanda', (None, 23))]

Union

In [45]:
a.union(b).collect()

[('USA', 35),
 ('Canada', 24),
 ('Mexico', 27),
 ('Kenya', 23),
 ('Kenya', 30),
 ('USA', 35),
 ('South Africa', 23),
 ('Rwanda', 23)]

Difference

In [46]:
a.subtract(b).collect()

[('Kenya', 23), ('Canada', 24), ('Mexico', 27)]

Intersection

In [47]:
a.intersection(b).collect()

[('USA', 35)]

Cartesian

In [48]:
a.cartesian(b).collect()

[(('USA', 35), ('Kenya', 30)),
 (('USA', 35), ('USA', 35)),
 (('USA', 35), ('South Africa', 23)),
 (('USA', 35), ('Rwanda', 23)),
 (('Canada', 24), ('Kenya', 30)),
 (('Canada', 24), ('USA', 35)),
 (('Canada', 24), ('South Africa', 23)),
 (('Canada', 24), ('Rwanda', 23)),
 (('Mexico', 27), ('Kenya', 30)),
 (('Mexico', 27), ('USA', 35)),
 (('Mexico', 27), ('South Africa', 23)),
 (('Mexico', 27), ('Rwanda', 23)),
 (('Kenya', 23), ('Kenya', 30)),
 (('Kenya', 23), ('USA', 35)),
 (('Kenya', 23), ('South Africa', 23)),
 (('Kenya', 23), ('Rwanda', 23))]