#### Creating a Spark Session

In [1]:
# Import findspark to locate Spark in your Python Env
import findspark
findspark.init()
findspark.find()

# Import SparkSession

from pyspark.sql import SparkSession

# Create SparkSession

spark = SparkSession.builder \
        .master("local[4]") \
        .appName("Spark Practice") \
        .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x00000236CA70CDF0>


In [11]:
# Create RDD from Parallelize
data_list = [('Emp1', 'DE', 20000), ('Emp2', 'PE', 25000)]

rdd_list = spark.sparkContext.parallelize(data_list)

print(rdd_list)
print("Num Partitions:", rdd_list.getNumPartitions())

# In order to see the content of RDD, we have to 1st collect it to the Driver, then we can loop through it to see the data
rdd_list_data = rdd_list.collect()

for row in rdd_list_data:
    print(row)

ParallelCollectionRDD[19] at readRDDFromFile at PythonRDD.scala:287
Num Partitions: 4
('Emp1', 'DE', 20000)
('Emp2', 'PE', 25000)


In [12]:
# Create RDD from External Data Source
rdd_text = spark.sparkContext.textFile("Data_Resources/test.txt")

print(rdd_text)
print("\n")
rdd_text_data = rdd_text.collect()

for row in rdd_text_data:
    print(row)

Data_Resources/test.txt MapPartitionsRDD[21] at textFile at NativeMethodAccessorImpl.java:0


Alice’s Adventures in Wonderland
by Lewis Carroll


In [13]:
# Whole Text File: Creates a pair where Key = File Path and Value = Whole Text File Content
rdd_pair = spark.sparkContext.wholeTextFiles("Data_Resources/test.txt")

print(rdd_pair)
print("\n")
rdd_pair_data = rdd_pair.collect()

for row in rdd_pair_data:
    print(row)

org.apache.spark.api.java.JavaPairRDD@a2cca44


('file:/c:/Users/PRATIK/Documents/Practice/PySpark_Practice/Data_Resources/test.txt', 'Alice’s Adventures in Wonderland\nby Lewis Carroll')


In [14]:
# Creating a Empty RDD

rdd_empty = spark.sparkContext.emptyRDD

print(rdd_empty)

# Empty RDD with 4 partitions (If the provided list had data, 
# then it will create a RDD with that data divided into 4 partitions)

rdd_empty_partitioned = spark.sparkContext.parallelize([], 4)

print(rdd_empty_partitioned)

# Get Num of Partitions

print("Number of RDD Partitions:", rdd_empty_partitioned.getNumPartitions())

<bound method SparkContext.emptyRDD of <SparkContext master=local[4] appName=Spark Practice>>
ParallelCollectionRDD[24] at readRDDFromFile at PythonRDD.scala:287
Number of RDD Partitions: 4


In [15]:
# Repartition vs Coalesce

# PySpark provides two ways to repartition
# 1. repartition() method shuffles data from all nodes also called full shuffle
# - repartition() is used to increase or decrease the RDD/DataFrame partitions

# 2. coalesce() method shuffle data from minimum nodes
# - coalesce() is used to only decrease the number of partitions in an efficient way

# Note: Both are very expensive operations as they shuffle the data across many partitions hence try to minimize using these as much as possible

import shutil, os

rdd_big = spark.sparkContext.parallelize(range(0,100, 2), 4)

num_partitions = rdd_big.getNumPartitions()
print("Original RDD Partitions:",num_partitions)

# glom() to coalesce all elements within each partition into a list
data = rdd_big.glom().collect()
print("glom() Collect:",data)

# Data from each partition
part_data_map = enumerate(data)
for row in part_data_map:
    print("Partition {}: {}".format(row[0], row[1]))

# Writing as Text files
output_dir = "Data_Resources/tmp/partition"

# To Remove the directory before writing (as if dir already exists, it fails)
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

rdd_big.saveAsTextFile("Data_Resources/tmp/partition")

Original RDD Partitions: 4
glom() Collect: [[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22], [24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48], [50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72], [74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]]
Partition 0: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]
Partition 1: [24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48]
Partition 2: [50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72]
Partition 3: [74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]


In [16]:
# Repartition Example (Used to Reduce or Increase partitions)
# Data is shuffled from all nodes across all partitions (Full Shuffle)

rdd_repartitioned = rdd_big.repartition(8)

print("Repartitioned RDD Partitions:",rdd_repartitioned.getNumPartitions())

data = rdd_repartitioned.glom().collect()

# Data from each partition
part_data_map = enumerate(data)
for row in part_data_map:
    print("Partition {}: {}".format(row[0], row[1]))

Repartitioned RDD Partitions: 8
Partition 0: [50, 52, 54, 56, 58, 60, 62, 64, 66, 68]
Partition 1: [70, 72]
Partition 2: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42]
Partition 3: [20, 22, 44, 46, 48]
Partition 4: []
Partition 5: []
Partition 6: [74, 76, 78, 80, 82, 84, 86, 88, 90, 92]
Partition 7: [94, 96, 98]


In [17]:
# Coalesce Example (Used to only Reduce partitions) Note: If tried to increase, it takes Current partition as Max and keeps the RDD same
# Optimized Version of Repartition - Data from minimum nodes and paritions is shuffled

rdd_coalesced = rdd_repartitioned.coalesce(3)

print("Coalesced RDD Partitions:",rdd_coalesced.getNumPartitions())

data = rdd_coalesced.glom().collect()

# Data from each partition
part_data_map = enumerate(data)
for row in part_data_map:
    print("Partition {}: {}".format(row[0], row[1]))

Coalesced RDD Partitions: 3
Partition 0: [50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72]
Partition 1: [94, 96, 98, 20, 22, 44, 46, 48]
Partition 2: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92]
