# Spark Context

In [3]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# We need to import sparkconf which will set the configurations related to sparkcontext and then use it to create sparkcontext
# note that here we are importing those methods from pyspark module not pyspark.sql module unlike sparksession

In [4]:
conf=SparkConf().setAppName("Exam").setMaster("local")
spark= SparkContext.getOrCreate(conf)

# Creating RDD

In [7]:
myrdd= spark.parallelize(["Shreyas",24,"Jayaram", 60])
display(myrdd)

myrdd.take(2)
#using take we can retrive the number of objects we want from an RDD.

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:289

['Shreyas', 24]

In [13]:
# Creating RDD from external files
#we use textFile method of sparkcontext, this create each line in the text file as a record in RDD.
#we can also use this method to import csv and other files, which we will see later.

rdd1=spark.textFile('sample3.txt')
rdd1.collect()

#collect action returns all the data in rdd

['HI', 'MY Name is Shreyas', 'I am unemployed', 'save me']

In [14]:
# we can also create RDD which considers all the content in a file as one record using wholeTextFiles method

rdd2=spark.wholeTextFiles('sample3.txt')
rdd2.collect()

#here everytime two records will be created, one is name of the file and second is all the content inside the file

[('file:/C:/Users/shrey/Music/sample3.txt',
  'HI\nMY Name is Shreyas\nI am unemployed\nsave me')]

# Actions on RDD

In [18]:
#lets look at some basic actions on rdd

myrdd.take(3) # gives first 3 records
myrdd.collect() # gives all records
myrdd.count() # gives count of records in rdd
# myrdd.min(), myrdd.max() # used to return the min, max elements in RDD
#however in this example where rdd contains both integer and string, min/max will throw error
myrdd.first()

'Shreyas'

In [19]:
#reduce - this action will “reduce” an RDD of any kind of value to one value.
RDD1 = spark.parallelize([1,3,2,4])
add = lambda x,y: x + y
RDD1.reduce(add)

10

In [None]:
#fold 

In [21]:
# saveAsTextFile - this will save the rdd as text file.with each record in different lines.
#myrdd.saveAsTextFile('sample.txt')

#getNumPartitions() — returns the number of Partitions
myrdd.getNumPartitions()

1

# Transformations

In [25]:
# map() — transformation takes in an anonymous function and applies this function to each of the elements in the RDD
myrdd = spark.parallelize([1,2,3,4,5])
myrdd.map(lambda x:x+1).collect()

# or

def addone(x):
    return x+1
myrdd.map(addone).collect()

[2, 3, 4, 5, 6]

In [30]:
#flatmap - it is a tranformation that flattens the RDD after applying the function to each element
#To flatten means to reduce the dimensionality. In simpler terms, it means reducing a multidimensional to specific dimension.
#lets look at working of map on below example

rdd1= spark.parallelize(["Hi this","is shreyas"])
rdd1.map(lambda x: x.split(" ")).collect()

#we can see that map split both of them and gave 2 dimensional rdd, which has two words in each element.

[['Hi', 'this'], ['is', 'shreyas']]

In [31]:
rdd1.flatMap(lambda x: x.split(" ")).collect()
#however we can see that this function has flatten the rdd, that is converted 2d to 1d and returned 4 words.

['Hi', 'this', 'is', 'shreyas']

In [33]:
# repartition(n) — makes n number of partitions on RDD

print(rdd1.getNumPartitions())
rdd1.repartition(2)
print(rdd1.getNumPartitions())

#we can see that even after repatitioning we are getting 1 as patitions. 
#this is because of AQE of spark which sees that data is very less hence no need of patitioning and reduce the partitions to 1.
# we can disable the AQE to repartition as per our needs

1
1


In [34]:
#

AttributeError: 'SparkContext' object has no attribute 'conf'