#### Creating a Spark Session

In [9]:
# Import findspark to locate Spark in your Python Env
import findspark
findspark.init()
findspark.find()

# Import SparkSession

from pyspark.sql import SparkSession

# Create SparkSession

spark = SparkSession.builder \
        .master("local[2]") \
        .appName("Spark Practice") \
        .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x00000269BA8CCAF0>


In [2]:
# Create RDD from Parallelize
data_list = [('Pratik', 'DE', 20000), ('Tina', 'PE', 25000)]

rdd_list = spark.sparkContext.parallelize(data_list)

print(rdd_list)

# In order to see the content of RDD, we have to 1st collect it to the Driver, then we can loop throw it to see the data
rdd_list_data = rdd_list.collect()

for row in rdd_list_data:
    print(row)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287
('Pratik', 'DE', 20000)
('Tina', 'PE', 25000)


In [7]:
# Create RDD from External Data Source
rdd_text = spark.sparkContext.textFile("Data_Resources/test.txt")

print(rdd_text)
print("\n")
rdd_text_data = rdd_text.collect()

for row in rdd_text_data:
    print(row)

Data_Resources/test.txt MapPartitionsRDD[7] at textFile at NativeMethodAccessorImpl.java:0


Alice’s Adventures in Wonderland
by Lewis Carroll


In [8]:
# Whole Text File: Creates a pair where Key = File Path and Value = Whole Text File Content
rdd_pair = spark.sparkContext.wholeTextFiles("Data_Resources/test.txt")

print(rdd_pair)
print("\n")
rdd_pair_data = rdd_pair.collect()

for row in rdd_pair_data:
    print(row)

org.apache.spark.api.java.JavaPairRDD@595dde36


('file:/C:/Users/PRATIK/Documents/Practice/PySpark_Practice/Data_Resources/test.txt', 'Alice’s Adventures in Wonderland\nby Lewis Carroll')


In [5]:
# Creating a Empty RDD

rdd_empty = spark.sparkContext.emptyRDD

print(rdd_empty)

# Empty RDD with 4 partitions (If the provided list had data, 
# then it will create a RDD with that data divided into 4 partitions)

rdd_empty_partitioned = spark.sparkContext.parallelize([], 4)

print(rdd_empty_partitioned)

<bound method SparkContext.emptyRDD of <SparkContext master=local[2] appName=Spark Practice>>
ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:287


In [6]:
# Get Num of Partitions

print("Number of RDD Partitions:", rdd_empty_partitioned.getNumPartitions())

Number of RDD Partitions: 4


In [49]:
# Repartition vs Coalesce
"""
PySpark provides two ways to repartition
1. repartition() method shuffles data from all nodes also called full shuffle
- repartition() is used to increase or decrease the RDD/DataFrame partitions

2. coalesce() method shuffle data from minimum nodes
- coalesce() is used to only decrease the number of partitions in an efficient way
"""
import shutil

rdd_big = spark.sparkContext.parallelize(range(1,21), 6)

print("Original RDD Partitions:",rdd_big.getNumPartitions())

# output_dir = "/Data_Resources/tmp/partition"

# # To Remove the directory before writing (if already exists, it fails)
# shutil.rmtree(output_dir)

# rdd_big.saveAsTextFile(output_dir)

# rdd_data = spark.read.text(output_dir).rdd
# # Get the partition count
num_partitions = rdd_big.getNumPartitions()
# print(num_partitions)
# # Collect and print the data
data = rdd_big.glom().collect()

for i in range(num_partitions):
    partition_data = ''.join(str(data[i]))
    print(f"Partition {i + 1} : {partition_data}")
# rdd_data = spark.read.text(output_dir).rdd
# data = rdd_data.collect()
# for row in data:
#     print(row[0])

Original RDD Partitions: 6
Partition 1 : [1, 2, 3]
Partition 2 : [4, 5, 6]
Partition 3 : [7, 8, 9, 10]
Partition 4 : [11, 12, 13]
Partition 5 : [14, 15, 16]
Partition 6 : [17, 18, 19, 20]


In [None]:
# rdd_repartitioned = rdd_big.repartition(3)

# print("Repartitioned RDD Partitions:",rdd_repartitioned.getNumPartitions())

# rdd_coalesced = rdd_big.coalesce(2)

# print("Coalesced RDD Partitions:",rdd_coalesced.getNumPartitions())