In [1]:
# Build (or reuse) the Spark session used by every cell below
from pyspark.sql import SparkSession

# NOTE(review): the master is local[*]; confirm whether the spark.executor.*
# settings below have any effect when running in local mode.
spark = (
    SparkSession.builder
    .appName("Factor of cores")
    .master("local[*]")
    .config("spark.executor.instances", "4")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "1G")
    .config("spark.driver.memory", "4G")
    .getOrCreate()
)
spark

In [None]:
# Look up the memory settings of the running session; the second argument is
# the placeholder text used when the key has not been set.
executor_memory = spark.conf.get(
    "spark.executor.memory",
    "Not set (local mode uses driver memory)",
)
driver_memory = spark.conf.get(
    "spark.driver.memory",
    "Default (e.g., 1G)",
)

In [None]:
# Display the executor memory value looked up in the previous cell
executor_memory

In [4]:
# Determine the degree of parallelism: the default level of parallelism Spark
# uses when the user does not specify one
spark.sparkContext.defaultParallelism

8

In [7]:
# Show the current cap on how many bytes are packed into a single partition
# when reading files
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [12]:
# Lower the per-partition read cap to 70 MiB, written as a byte count with a "b" suffix
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{70 * 1024 * 1024}b")

In [2]:
# Report the on-disk size of the file that we are going to import
import os

file_size = os.path.getsize('./Input/sample_data.csv')

# Each output line is spelled out as its own literal so the trailing spaces
# and the 12-space indentation of the report are explicit.
print(
    "Data File Size: \n"
    f"            {file_size} in bytes \n"
    f"            {file_size / 1024 / 1024} in MB\n"
    f"            {file_size / 1024 / 1024 / 1024} in GB"
)

Data File Size: 
            2898932284 in bytes 
            2764.637264251709 in MB
            2.6998410783708096 in GB


In [3]:
# Specify the file path
file_path = "./Input/sample_data.csv"

# Maximum number of lines to preview (including the header if needed)
n = 5

# Iterate the file lazily and stop after n lines. Pairing each line with
# range(n) also stops at end-of-file, so a file shorter than n lines yields
# only the lines it actually has instead of being padded with empty strings.
# range(n) comes first in zip() so no extra line is consumed once n is reached.
with open(file_path, 'r') as file:
    lines = [line.strip() for _, line in zip(range(n), file)]

# Print the sampled lines
for line in lines:
    print(line)

Order ID,Order Date,Customer ID,Qty,Price,Amount,Sales Region,Country
ORD00001,2024-06-15,C1,128,422,54016,South,Uruguay
ORD00002,2021-06-16,C2,198,359,71082,East,China
ORD00003,2023-10-08,C5,93,305,28365,North,Bermuda
ORD00004,2021-10-07,C4,19,564,10716,North,Greenland


In [5]:
# Let's create a simple Python decorator - {get_time} - to get the execution timings
# If you don't know about Python decorators, check out: https://www.geeksforgeeks.org/decorators-in-python/
import time

def get_time(func):
    """Run ``func`` once, immediately, and print how long the call took.

    Intended to be used as a decorator on a zero-argument benchmark function:
    the function body executes at decoration time, and the elapsed time is
    printed in milliseconds between two 80-character separator lines.

    Args:
        func: Zero-argument callable to execute and time.

    Returns:
        ``func`` itself, unchanged. Returning it keeps the decorated name
        bound to a callable (instead of ``None``), so the benchmark can be
        invoked again later.

    Raises:
        Whatever ``func`` raises; the exception propagates and no timing is
        printed.
    """
    def inner_get_time() -> str:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        print("-"*80)
        return f"Execution time: {(end_time - start_time)*1000} ms"

    print(inner_get_time())
    print("-"*80)
    return func

In [13]:
# Code for benchmarking
from pyspark.sql.functions import count, lit
@get_time
def x():
    """Benchmark one full pass over the sample CSV.

    Loads the file with a header row, prints how many partitions the
    resulting DataFrame has, then writes it using the "noop" format.
    """
    csv_reader = spark.read.format("csv").option("header", True)
    sales_df = csv_reader.load("./Input/sample_data.csv")

    partition_count = sales_df.rdd.getNumPartitions()
    print(f"Number of Partition -> {partition_count}")

    # Writing with the "noop" format triggers the job so the read is actually
    # executed and can be timed.
    noop_writer = sales_df.write.format("noop").mode("overwrite")
    noop_writer.save()

Number of Partition -> 40
--------------------------------------------------------------------------------
Execution time: 53682.899713516235 ms
--------------------------------------------------------------------------------


In [None]:
# Set the number of shuffle partitions to 4
# NOTE(review): the stated intent is a count that is "not a factor of cores";
# defaultParallelism printed 8 earlier in this notebook — confirm 4 is the intended value.
spark.conf.set("spark.sql.shuffle.partitions", 4)

In [None]:
# Raise the per-partition read cap to 3 x 128 MiB, written as a byte count with a "b" suffix
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{128 * 3 * 1024 * 1024}b")

In [None]:
# Set the per-partition read cap to 462 MiB, written as a byte count with a "b" suffix
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{462 * 1024 * 1024}b")