In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Parallelize").getOrCreate()

In [2]:
spark

In [3]:
words = ("Big","data","is ","Super","Technology","Big","data","is","Trending","now")
words

('Big',
 'data',
 'is ',
 'Super',
 'Technology',
 'Big',
 'data',
 'is',
 'Trending',
 'now')

In [4]:
words_rdd = spark.sparkContext.parallelize(words)

In [5]:
words_rdd.take(2)

['Big', 'data']

In [6]:
words_normalized = words_rdd.map(lambda x : x.lower())

In [7]:
words_normalized.take(5)

['big', 'data', 'is ', 'super', 'technology']

In [9]:
mapped_words = words_normalized.map(lambda x: (x,1))

In [10]:
mapped_words.collect()

[('big', 1),
 ('data', 1),
 ('is ', 1),
 ('super', 1),
 ('technology', 1),
 ('big', 1),
 ('data', 1),
 ('is', 1),
 ('trending', 1),
 ('now', 1)]

In [11]:
final_result = mapped_words.reduceByKey(lambda x,y : x+y)

In [13]:
final_result.collect()

[('big', 2),
 ('is ', 1),
 ('technology', 1),
 ('super', 1),
 ('is', 1),
 ('trending', 1),
 ('data', 2),
 ('now', 1)]

Chaining of functions

In [None]:
spark.sparkContext.parallelize(words).map(lambda x : x.lower()).map(lambda x : (x,1)).reduceByKey(lambda x,y : x+y).collect()

[('big', 2),
 ('is ', 1),
 ('technology', 1),
 ('super', 1),
 ('is', 1),
 ('trending', 1),
 ('data', 2),
 ('now', 1)]

In [None]:
spark.sparkContext. \
    parallelize(words). \
        map(lambda x : x.lower()). \
            map(lambda x : (x,1)). \
                reduceByKey(lambda x,y : x+y). \
                    collect() 

[('big', 2),
 ('is ', 1),
 ('technology', 1),
 ('super', 1),
 ('is', 1),
 ('trending', 1),
 ('data', 2),
 ('now', 1)]

In [None]:
result=spark.sparkContext. \
        parallelize(words). \
            map(lambda x : x.lower()). \
                map(lambda x : (x,1)). \
                    reduceByKey(lambda x,y : x+y)

In [None]:
result.collect()

[('big', 2),
 ('is ', 1),
 ('technology', 1),
 ('super', 1),
 ('is', 1),
 ('trending', 1),
 ('data', 2),
 ('now', 1)]

Number of Partitions

In [None]:
words_rdd.getNumPartitions()

4

In [15]:
spark.sparkContext.defaultParallelism

4

In [16]:
spark.sparkContext.defaultMinPartitions

2

countByValue

In [21]:
words

('Big',
 'data',
 'is ',
 'Super',
 'Technology',
 'Big',
 'data',
 'is',
 'Trending',
 'now')

In [23]:
spark.sparkContext. \
    parallelize(words). \
        map( lambda x : x.lower()). \
            countByValue()

defaultdict(int,
            {'big': 2,
             'data': 2,
             'is ': 1,
             'super': 1,
             'technology': 1,
             'is': 1,
             'trending': 1,
             'now': 1})

In [24]:
orders_rdd = spark.sparkContext.textFile("hdfs://localhost:9000/user/tkm/data/orders.csv")

In [26]:
orders_rdd.take(3)

[',order_id,date,customer_id,status',
 '0,1,09/11/2020,3371,CLOSED',
 '1,2,12/01/2020,3902,PENDING']

In [28]:
header = orders_rdd.first()

In [29]:
orders_rdd. \
    filter(lambda x : (x != header)). \
        map(lambda x : x.split(",")[4]). \
            countByValue()

defaultdict(int, {'CLOSED': 341, 'PENDING': 318, 'COMPLETE': 341})

In [30]:
orders_rdd.getNumPartitions()

2

map vs flatMap

In [31]:
spark.sparkContext.parallelize([1,2,3,4,5]).map(lambda x : x*2).collect()

[2, 4, 6, 8, 10]

In [34]:
spark.sparkContext.parallelize([
    "Hello world",
    "This is a Spark example",
    "We're using map and flatMap"]).flatMap(lambda x : x.split(" ")).collect()

['Hello',
 'world',
 'This',
 'is',
 'a',
 'Spark',
 'example',
 "We're",
 'using',
 'map',
 'and',
 'flatMap']

In [35]:
spark.sparkContext.parallelize([
    "Hello world",
    "This is a Spark example",
    "We're using map and flatMap"]).map(lambda x : x.split(" ")).collect()

[['Hello', 'world'],
 ['This', 'is', 'a', 'Spark', 'example'],
 ["We're", 'using', 'map', 'and', 'flatMap']]

In [36]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("job_stage_task").getOrCreate()

somato_rdd = spark.sparkContext.textFile("hdfs://localhost:9000/user/tkm/data/somato.txt")


menu_rdd = somato_rdd.map(lambda x : (x.split(",")[4] , 1))

result = menu_rdd.reduceByKey(lambda x,y : x+y)

result.collect()

[('Butter Chicken', 1),
 ('Pepperoni Pizzza', 2),
 ('Cheeseburger', 1),
 ('Chicken Biryani', 5),
 ('Hawaiian Pizza', 2),
 ('Paneer Tikka', 2),
 ('Margherita Pizza', 2),
 ('Pepperoni Pizza', 1),
 ('Double Cheeseburger', 2),
 ('Veggie Burger', 1),
 ('Chana Masala', 1)]