### 1. Import Libraries

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext

### 2. Initialize SparkContext

In [2]:
sc=SparkContext.getOrCreate()

### 3.Create data

In [3]:
data=sc.parallelize([['France','50M','3T'],['India','30M','30T'],['Kenya','70M','25T'],['Nigeria','90M','60T'],['China','20M','2T'],['USA','80M','30T'],['UK','70M','25T'],['USA','20M','30T'],['China','70M','25T']])

### 4. Basic Operations

#### i. Sparks Action Functions
##### Sparks action functions produce a value back to the Spark driver program

In [4]:
# Get number of partitions
data.getNumPartitions()

8

In [5]:
# Show all data
data.collect()

[['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['China', '20M', '2T'],
 ['USA', '80M', '30T'],
 ['UK', '70M', '25T'],
 ['USA', '20M', '30T'],
 ['China', '70M', '25T']]

In [6]:
# Show first column
data.first()

['France', '50M', '3T']

In [7]:
# Get random record
data.takeSample(1,True)

[['Kenya', '70M', '25T']]

In [8]:
# Count records
data.count()

9

In [9]:
# Count occurence of the items
data.countByKey().items()

dict_items([('France', 1), ('India', 1), ('Kenya', 1), ('Nigeria', 1), ('China', 2), ('USA', 2), ('UK', 1)])

#### ii. Sparks Transformation Functions
##### Sparks transformation functions produce a new Resilient Distributed Dataset (RDD)

In [10]:
tran_data=sc.parallelize([4,6,8,2,2,6])
tran_data.collect()

[4, 6, 8, 2, 2, 6]

In [11]:
# Select two random items
tran_data.sample(2,True).collect()

[4, 4, 4, 6, 8, 2, 2, 2]

In [12]:
# Use map to multiply each item with 2
tran_data.map(lambda x : x*2).collect()

[8, 12, 16, 4, 4, 12]

In [13]:
# Use filterMap to duplicate the items
tran_data.flatMap(lambda x : [x,x]).collect()

[4, 4, 6, 6, 8, 8, 2, 2, 2, 2, 6, 6]

In [14]:
# Filter from data where item is China
data.filter(lambda x : "China" in x).collect()

[['China', '20M', '2T'], ['China', '70M', '25T']]

In [15]:
# Return unique items
tran_data.distinct().collect()

[8, 2, 4, 6]

In [16]:
# Sorting
data.sortByKey(1, True).collect()

[['China', '20M', '2T'],
 ['China', '70M', '25T'],
 ['France', '50M', '3T'],
 ['India', '30M', '30T'],
 ['Kenya', '70M', '25T'],
 ['Nigeria', '90M', '60T'],
 ['UK', '70M', '25T'],
 ['USA', '80M', '30T'],
 ['USA', '20M', '30T']]

In [17]:
a=sc.parallelize([('USA',35),('Canada',24),('Mexico',27),('Kenya',23)])
b=sc.parallelize([('Kenya',30),('USA',35),('South Africa',23),('Rwanda',23)])
print("a : ",a.collect())
print("b : ",b.collect())

a :  [('USA', 35), ('Canada', 24), ('Mexico', 27), ('Kenya', 23)]
b :  [('Kenya', 30), ('USA', 35), ('South Africa', 23), ('Rwanda', 23)]


In [18]:
# join
a.join(b).collect()

[('USA', (35, 35)), ('Kenya', (23, 30))]

In [19]:
# Left outer join
a.leftOuterJoin(b).collect()

[('USA', (35, 35)),
 ('Mexico', (27, None)),
 ('Canada', (24, None)),
 ('Kenya', (23, 30))]

In [20]:
# Right outer join
a.rightOuterJoin(b).collect()

[('USA', (35, 35)),
 ('South Africa', (None, 23)),
 ('Kenya', (23, 30)),
 ('Rwanda', (None, 23))]

In [21]:
# Union
a.union(b).collect()

[('USA', 35),
 ('Canada', 24),
 ('Mexico', 27),
 ('Kenya', 23),
 ('Kenya', 30),
 ('USA', 35),
 ('South Africa', 23),
 ('Rwanda', 23)]

In [22]:
# Difference
a.subtract(b).collect()

[('Kenya', 23), ('Canada', 24), ('Mexico', 27)]

In [23]:
# Intersection
a.intersection(b).collect()

[('USA', 35)]

In [24]:
# Cartesian
a.cartesian(b).collect()

[(('USA', 35), ('Kenya', 30)),
 (('USA', 35), ('USA', 35)),
 (('USA', 35), ('South Africa', 23)),
 (('USA', 35), ('Rwanda', 23)),
 (('Canada', 24), ('Kenya', 30)),
 (('Canada', 24), ('USA', 35)),
 (('Canada', 24), ('South Africa', 23)),
 (('Canada', 24), ('Rwanda', 23)),
 (('Mexico', 27), ('Kenya', 30)),
 (('Mexico', 27), ('USA', 35)),
 (('Mexico', 27), ('South Africa', 23)),
 (('Mexico', 27), ('Rwanda', 23)),
 (('Kenya', 23), ('Kenya', 30)),
 (('Kenya', 23), ('USA', 35)),
 (('Kenya', 23), ('South Africa', 23)),
 (('Kenya', 23), ('Rwanda', 23))]