In [1]:
from pyspark import SparkConf, SparkContext

# SparkConf holds the configuration (application name, master URL) that is used to create the SparkContext.
# Note that SparkConf and SparkContext are imported from the pyspark module, not from pyspark.sql (where SparkSession lives).

# Application name "Exam", master "local".
conf=SparkConf().setAppName("Exam").setMaster("local")
# getOrCreate returns the already-running SparkContext if there is one; otherwise it creates one from conf.
# NOTE: despite its name, `spark` is a SparkContext here, not a SparkSession.
spark= SparkContext.getOrCreate(conf)

# Creating RDD

In [2]:
# parallelize turns a local Python collection into an RDD.
myrdd= spark.parallelize(["Shreyas",24,"Jayaram", 60])
# Displaying the RDD itself shows only its description, not its contents.
display(myrdd)

# take(n) retrieves the first n elements of the RDD.
myrdd.take(2)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

['Shreyas', 24]

In [3]:
# Creating an RDD from an external file
# The textFile method of SparkContext turns each line of the text file into one record of the RDD.
# The same method can also be used to read CSV and other text-based files, which we will see later.

rdd1=spark.textFile('sample.txt')
rdd1.collect()

# collect is an action that returns all the records of the RDD as a list

['Hello Spark, this is a sample file.',
 'This file is for practicing Spark RDD operations.',
 'Spark is powerful and Spark is fast.',
 'We will count all the words in this file.',
 'This is a simple example.',
 'A simple, simple example.',
 'Hello again, Spark.']

In [4]:
# wholeTextFiles creates an RDD in which the entire content of a file is a single record.

rdd2=spark.wholeTextFiles('sample.txt')
rdd2.collect()

# Each file becomes ONE record: a (file path, file content) tuple.

[('file:/home/jupyter/data/sample.txt',
  'Hello Spark, this is a sample file.\nThis file is for practicing Spark RDD operations.\nSpark is powerful and Spark is fast.\nWe will count all the words in this file.\nThis is a simple example.\nA simple, simple example.\nHello again, Spark.')]

# Actions on RDD

In [5]:
# Let's look at some basic actions on an RDD.
# Only the value of the last expression in a cell is displayed, so only the result of first() shows up below.

myrdd.take(3) # returns the first 3 records
myrdd.collect() # returns all records
myrdd.count() # returns the number of records in the RDD
# myrdd.min(), myrdd.max() # return the smallest / largest element of the RDD
# However, this RDD mixes integers and strings, so min/max would raise an error here.
myrdd.first() # returns the first record

'Shreyas'

In [6]:
# reduce - this action "reduces" an RDD of any kind of values to a single value
# by combining elements two at a time with the given two-argument function.
# NOTE: RDD1 (upper case) is a different variable from rdd1 used in other cells.
RDD1 = spark.parallelize([1,3,2,4])
add = lambda x,y: x + y
RDD1.reduce(add)

10

In [7]:
# saveAsTextFile - saves the RDD as text, with each record on its own line.
# NOTE: 'sample.txt' is the input file read in the cells above; choose a different output path before
# uncommenting, because Spark writes the output as a directory and will not overwrite an existing path.
#myrdd.saveAsTextFile('sample.txt')

# getNumPartitions() - returns the number of partitions of the RDD
myrdd.getNumPartitions()

1

# Transformations

In [8]:
# map() - a transformation that applies the given function to each element of the RDD.
# Because the function is called once per element, it must take exactly one argument (one record) and return one record.
# A two-argument lambda such as lambda x,y: x+y would raise an error here.
# NOTE: this rebinds myrdd to a new RDD of integers; the mixed-type RDD from the earlier cells is no longer available under this name.
myrdd = spark.parallelize([1,2,3,4,5])
myrdd.map(lambda x:x+1).collect() # result is not displayed: only the cell's last expression is echoed

# or

def addone(x):
    """Return ``x`` increased by one (named-function equivalent of ``lambda x: x+1``)."""
    incremented = x + 1
    return incremented
# Passing the named function to map gives the same result as the lambda version.
myrdd.map(addone).collect()

[2, 3, 4, 5, 6]

In [27]:
# flatMap - a transformation that applies the function to each element and then flattens the results.
# To flatten means to reduce the dimensionality, e.g. turning a list of lists into a single flat list.
# First, let's see what plain map does with the example below.

# NOTE: rdd1 is rebound here; it no longer refers to the RDD read from sample.txt.
rdd1= spark.parallelize(["Hi this","Hi is shreyas"])
rdd1.map(lambda x: x.split(" ")).collect()

# map splits each string and returns a two-level result: one list of words per input element
# (two words for the first element, three for the second).

[['Hi', 'this'], ['Hi', 'is', 'shreyas']]

In [28]:
# NOTE: rdd2 is rebound here; it no longer refers to the wholeTextFiles RDD.
rdd2 = rdd1.flatMap(lambda x: x.split(" "))
rdd2.collect()
# flatMap, in contrast, flattens the result: the two lists of words become a single flat list of 5 words.
# flatMap also takes a one-argument function, but that function must return an iterable (e.g. a list); if it does not, Spark raises an error.
# flatMap then flattens the iterable. In the example above, split returns a list, and each of its items becomes a separate record.

['Hi', 'this', 'Hi', 'is', 'shreyas']

# reduce transformations
So far we have learnt about the map and flatMap transformations; now let's look at the by-key reduce and group operations (reduceByKey and groupByKey).
These by-key operations cannot be run on the simple RDDs we created above. They need a pair RDD, that is, an RDD of (key, value) tuples.

In [29]:
# Recap: rdd2 is the flattened RDD of words produced by flatMap above.
rdd2.collect()

['Hi', 'this', 'Hi', 'is', 'shreyas']

In [30]:
# Let's create a pair RDD from rdd2 (the flattened words) by mapping each word to a (word, 1) tuple.

rdd_pair =  rdd2.map(lambda x:(x,1))
rdd_pair.collect()

[('Hi', 1), ('this', 1), ('Hi', 1), ('is', 1), ('shreyas', 1)]

In [31]:
# keys() / values() give RDDs holding only the keys / only the values of a pair RDD

print(rdd_pair.keys().collect())
print(rdd_pair.values().collect())

# lookup(key) returns the list of all values associated with that key
print(rdd_pair.lookup('Hi'))

['Hi', 'this', 'Hi', 'is', 'shreyas']
[1, 1, 1, 1, 1]
[1, 1]


# groupByKey() vs reduceByKey(fun)

In PySpark, groupByKey groups values into an iterable collection for each key without aggregation, while reduceByKey groups values and applies a reduction function to combine them into a single value per key. reduceByKey is more efficient for aggregation tasks as it performs local aggregations (combiner logic) before shuffling data, significantly reducing the amount of data transferred over the network. groupByKey performs a full shuffle of all values, which can be less performant and memory-intensive for large datasets, and is only suitable when the full list of values is required for further processing.

In [32]:
# groupByKey()
# It takes an RDD of (K, V) pairs, groups the values by key K, and produces an RDD of (K, iterable of V) pairs.

rdd5= rdd_pair.groupByKey()

print(rdd5.collect())

# Each record looks like (key, iterable(val1, val2, ..., valn)); the iterable is printed as a ResultIterable object.

[('Hi', <pyspark.resultiterable.ResultIterable object at 0x772b4dd835d0>), ('this', <pyspark.resultiterable.ResultIterable object at 0x772b4dd839d0>), ('is', <pyspark.resultiterable.ResultIterable object at 0x772b4dd83990>), ('shreyas', <pyspark.resultiterable.ResultIterable object at 0x772b4dd83a10>)]


In [33]:
# Now we can apply a function to each key's iterable, here to count the occurrences of each word.
rdd5.mapValues(len).collect()

# len returns the length of the iterable, i.e. the number of occurrences of the key
# (since every value is 1, this count equals the sum of the values).

[('Hi', 2), ('this', 1), ('is', 1), ('shreyas', 1)]

In [34]:
# Now let's see how to get the same counts using reduceByKey().
# The two-argument function combines the values of each key two at a time (here: adds them up).

rdd6 = rdd_pair.reduceByKey(lambda x,y: x+y)
rdd6.collect()

[('Hi', 2), ('this', 1), ('is', 1), ('shreyas', 1)]

With reduceByKey(), Spark first aggregates the data within each source partition, then performs the shuffle, and then aggregates again on the shuffled partial results. Thus the amount of data that gets shuffled is smaller.

In [35]:
# Stop the SparkContext that was created in the first cell.
spark.stop()