In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

### Map and FlatMap

In [2]:
# map(): derive a full_name column by transforming every Row of a DataFrame.
data = [('Bob', 'Cathy'), ('David', 'Frank')]
name_columns = ['1st_name', '2nd_name']

df = spark.createDataFrame(data, schema=name_columns)
df.show()

# A Row can be concatenated with a tuple, so appending a 1-tuple yields a
# three-field record: (first name, second name, "first second").
rdd1 = df.rdd.map(lambda row: row + (row[0] + ' ' + row[1],))
df1 = rdd1.toDF(name_columns + ['full_name'])

df1.show()

  from pandas.core import (


+--------+--------+
|1st_name|2nd_name|
+--------+--------+
|     Bob|   Cathy|
|   David|   Frank|
+--------+--------+

+--------+--------+-----------+
|1st_name|2nd_name|  full_name|
+--------+--------+-----------+
|     Bob|   Cathy|  Bob Cathy|
|   David|   Frank|David Frank|
+--------+--------+-----------+



In [3]:
# map() vs. flatMap() on an RDD of strings

data = ['Bob Cathy', 'David Frank']
rdd = spark.sparkContext.parallelize(data)

# Source records: one full-name string per element.
print(*rdd.collect(), sep='\n')

# map(): one output element per input element, so each name becomes a list of words.
rdd1 = rdd.map(lambda full_name: full_name.split(' '))
print(*rdd1.collect(), sep='\n')

# flatMap(): the per-record word lists are flattened, so each word is its own element.
rdd2 = rdd.flatMap(lambda full_name: full_name.split(' '))
print(*rdd2.collect(), sep='\n')

Bob Cathy
David Frank
['Bob', 'Cathy']
['David', 'Frank']
Bob
Cathy
David
Frank


### mapValues

In [12]:
# (key, value) pairs with unique keys, used to demonstrate the value-only transforms.
data = [
    ("apple", 1),
    ("banana", 2),
    ("orange", 5),
]

rdd = spark.sparkContext.parallelize(data)

# Bring the pairs back to the driver to inspect them.
rdd.collect()

[('apple', 1), ('banana', 2), ('orange', 5)]

In [14]:
# mapValues() transforms only the value of each pair; the key is left as it is.
rdd1 = rdd.mapValues(lambda value: value + 10)
rdd1.collect()


[('apple', 11), ('banana', 12), ('orange', 15)]

In [15]:
# Same idea with a different function: scale every value by 10.
rdd2 = rdd.mapValues(lambda value: value * 10)
rdd2.collect()

[('apple', 10), ('banana', 20), ('orange', 50)]

In [16]:
# Returning a list from mapValues() keeps it as ONE value per key: (key, [v, v]).
rdd_map = rdd.mapValues(lambda value: [value, value])
rdd_map.collect()

[('apple', [1, 1]), ('banana', [2, 2]), ('orange', [5, 5])]

### flatMapValues

In [20]:
# flatMapValues() flattens the returned list: each element becomes its own
# (key, element) pair, so every key appears twice here.
rdd_map = rdd.flatMapValues(lambda value: [value, value])
rdd_map.collect()

[('apple', 1),
 ('apple', 1),
 ('banana', 2),
 ('banana', 2),
 ('orange', 5),
 ('orange', 5)]

In [21]:
# Expand each pair into one pair per integer from 1 up to (and including) its value.
rdd_flatmap = rdd.flatMapValues(lambda count: range(1, count + 1))
rdd_flatmap.collect()

[('apple', 1),
 ('banana', 1),
 ('banana', 2),
 ('orange', 1),
 ('orange', 2),
 ('orange', 3),
 ('orange', 4),
 ('orange', 5)]

### reduceByKey

In [27]:
# Pairs with repeated keys ("apple" and "banana" appear twice) to aggregate per key.
data = [
    ("apple", 1),
    ("banana", 2),
    ("apple", 3),
    ("banana", 4),
    ("orange", 5),
]

rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[('apple', 1), ('banana', 2), ('apple', 3), ('banana', 4), ('orange', 5)]

In [28]:
# Sum the values that share a key (e.g. apple: 1 + 3 = 4).
rdd_reduce = rdd.reduceByKey(lambda total, value: total + value)
rdd_reduce.collect()

[('apple', 4), ('banana', 6), ('orange', 5)]

In [29]:
# Several values per key, used for the per-key maximum and the grouping examples.
data = [
    ("apple", 5),
    ("banana", 10),
    ("apple", 8),
    ("banana", 3),
    ("orange", 7),
    ("apple", 12),
]
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[('apple', 5),
 ('banana', 10),
 ('apple', 8),
 ('banana', 3),
 ('orange', 7),
 ('apple', 12)]

In [31]:
# Keep the largest value seen for each key; the builtin max already takes two arguments.
rdd_max = rdd.reduceByKey(max)
rdd_max.collect()

[('apple', 12), ('banana', 10), ('orange', 7)]

### groupByKey

In [34]:
# groupByKey() gathers all values of a key into a single ResultIterable,
# so collect() shows iterable objects rather than plain lists.
rdd_grp = rdd.groupByKey()
rdd_grp.collect()

[('apple', <pyspark.resultiterable.ResultIterable at 0x26d8ff89210>),
 ('banana', <pyspark.resultiterable.ResultIterable at 0x26d8ff8b890>),
 ('orange', <pyspark.resultiterable.ResultIterable at 0x26d8ff91810>)]

In [36]:
# Group the values of each key and materialise every group as a list so the
# collected result is readable. Building from `rdd` (instead of re-assigning
# rdd_grp from itself) lets this cell run on its own.
rdd_grp = rdd.groupByKey().mapValues(list)
rdd_grp.collect()

[('apple', [5, 8, 12]), ('banana', [10, 3]), ('orange', [7])]

### sortByKey

In [39]:
# String-keyed pairs in arbitrary order (with a duplicate "apple" key) to sort by key.
data = [
    ("apple", 3),
    ("grapes", 1),
    ("apple", 1),
    ("banana", 2),
    ("orange", 5),
]
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[('apple', 3), ('grapes', 1), ('apple', 1), ('banana', 2), ('orange', 5)]

In [40]:
# Ascending key order (sortByKey's default direction).
rdd_sort = rdd.sortByKey()
rdd_sort.collect()

[('apple', 3), ('apple', 1), ('banana', 2), ('grapes', 1), ('orange', 5)]

In [41]:
# Descending key order: keys come back from "orange" down to "apple".
rdd_sort = rdd.sortByKey(
    ascending=False
)
rdd_sort.collect()

[('orange', 5), ('grapes', 1), ('banana', 2), ('apple', 3), ('apple', 1)]

In [42]:
# Integer keys this time (7 appears twice) to show numeric key ordering.
data = [
    (7, 'A'),
    (3, 'B'),
    (10, 'C'),
    (1, 'D'),
    (7, 'E'),
]
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[(7, 'A'), (3, 'B'), (10, 'C'), (1, 'D'), (7, 'E')]

In [43]:
# Integer keys are ordered numerically, so 10 sorts after 7 (not as the text "10").
rdd_sort = rdd.sortByKey(
    ascending=True
)
rdd_sort.collect()

[(1, 'D'), (3, 'B'), (7, 'A'), (7, 'E'), (10, 'C')]