[RDD Transformation Example](https://sparkbyexamples.com/pyspark/pyspark-rdd-transformations/)

In [3]:
import shutil

from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs locally with 5 worker threads.
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("RDD_Transformations")
    .getOrCreate()
)

# Show the session summary as the cell output.
spark

In [4]:
# Read the input text file into an RDD of lines.
rdd = spark.sparkContext.textFile(name='DataFiles/word_count.txt')

# Show the RDD handle as the cell output.
rdd

DataFiles/word_count.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
# Bring every line back to the driver and print it.
for line in rdd.collect():
    print(line)

Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
Alice’s Adventures in Wonderland
by Lewis Carroll
This eBook is for the use
of anyone anywhere
at no cost and with
This eBook is for the use
of anyone anywhere
at no cost and with
Project Gutenberg’s
Alice’s Adventures in Wonderland
by

In [6]:
# flatMap - applies a function that returns a sequence for each element and
# flattens the results, so every line is expanded into its individual words.
#
# split() with no argument splits on any run of whitespace and never yields
# empty strings, so blank lines, tabs and repeated spaces do not end up as
# bogus '' "words" in the count (which split(" ") would produce).
rdd2 = rdd.flatMap(lambda x : x.split())

for element in rdd2.collect():
    print(element)

[Stage 1:>                                                          (0 + 2) / 2]

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Alice’s
Adventures
in
Wonderland
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
by

                                                                                

In [7]:
# map() - applies a function to every element and returns a new RDD with
# one output element per input; here each word becomes a (word, 1) pair.
rdd3 = rdd2.map(lambda word: (word, 1))

for pair in rdd3.collect():
    print(pair)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and', 1)
('with', 1)
('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('by', 1)
('Lewis', 1)
('Carroll', 1)
('This', 1)
('eBook', 1)
('is', 1)
('for', 1)
('the', 1)
('use', 1)
('of', 1)
('anyone', 1)
('anywhere', 1)
('at', 1)
('no', 1)
('cost', 1)
('and'

In [8]:
# reduceByKey - combines all values that share a key using the given function;
# adding up the 1s gives the number of occurrences of each word.
rdd4 = rdd3.reduceByKey(lambda left, right: left + right)

for word_count in rdd4.collect():
    print(word_count)

[Stage 3:>                                                          (0 + 2) / 2]

('Project', 9)
('Gutenberg’s', 9)
('Alice’s', 18)
('in', 18)
('Lewis', 18)
('Carroll', 18)
('is', 27)
('use', 27)
('of', 27)
('anyone', 27)
('anywhere', 27)
('at', 27)
('no', 27)
('Adventures', 18)
('Wonderland', 18)
('by', 18)
('This', 27)
('eBook', 27)
('for', 27)
('the', 27)
('cost', 27)
('and', 27)
('with', 27)


                                                                                

In [9]:
# sortByKey - orders the (word, count) pairs by their key (the word),
# ascending, which is the default direction.
rdd5 = rdd4.sortByKey(ascending=True)

for word_count in rdd5.collect():
    print(word_count)

('Adventures', 18)
('Alice’s', 18)
('Carroll', 18)
('Gutenberg’s', 9)
('Lewis', 18)
('Project', 9)
('This', 27)
('Wonderland', 18)
('and', 27)
('anyone', 27)
('anywhere', 27)
('at', 27)
('by', 18)
('cost', 27)
('eBook', 27)
('for', 27)
('in', 18)
('is', 27)
('no', 27)
('of', 27)
('the', 27)
('use', 27)
('with', 27)


In [12]:
# Order the words by how often they occur.
# sortByKey only looks at the key, so first swap each (word, count) pair
# into (count, word) with map() and then sort.
rdd6 = (
    rdd4
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)

for count_word in rdd6.collect():
    print(count_word)

(9, 'Project')
(9, 'Gutenberg’s')
(18, 'Alice’s')
(18, 'in')
(18, 'Lewis')
(18, 'Carroll')
(18, 'Adventures')
(18, 'Wonderland')
(18, 'by')
(27, 'is')
(27, 'use')
(27, 'of')
(27, 'anyone')
(27, 'anywhere')
(27, 'at')
(27, 'no')
(27, 'This')
(27, 'eBook')
(27, 'for')
(27, 'the')
(27, 'cost')
(27, 'and')
(27, 'with')


In [13]:
# filter() - keeps only the elements for which the predicate returns True;
# here, the (count, word) pairs whose word contains a lowercase 'a'.
rdd7 = rdd6.filter(lambda pair: 'a' in pair[1])

for count_word in rdd7.collect():
    print(count_word)

(18, 'Carroll')
(18, 'Wonderland')
(27, 'anyone')
(27, 'anywhere')
(27, 'at')
(27, 'and')


In [18]:
# Saves the (count, word) pairs as text under the output directory.
# saveAsTextFile refuses to write into a directory that already exists, so
# remove whatever a previous run left behind first; this keeps the cell
# (and Restart & Run All) re-runnable.
# NOTE(review): assumes the path resolves to the local filesystem (the session
# uses master local[5]) — confirm if another default filesystem is configured.
output_dir = 'OutputFiles/word_count_output'
shutil.rmtree(output_dir, ignore_errors=True)
rdd6.saveAsTextFile(output_dir)

In [19]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()