In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the notebook's Spark session (an existing session is reused if one is already running).
spark = (
    SparkSession.builder
    .appName('variables')
    .getOrCreate()
)

In [3]:
from pyspark.sql import Window
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [63]:
# Load the sample CSV: first row is the header, column types are inferred by Spark.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('C:/Users/acer/Downloads/test1.csv')
)

In [64]:
# Preview the loaded rows as a text table.
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [22]:
# Two integer accumulators, both starting at 0:
#   agcnt   - used as a row counter
#   sum_val - used as a running sum
agcnt, sum_val = (spark.sparkContext.accumulator(0) for _ in range(2))

In [23]:
def num(x):
    """Count one row towards the ``agcnt`` accumulator when its ``age`` exceeds 25.

    Meant to be passed to ``DataFrame.foreach``. Rows whose ``age`` is null
    are skipped instead of raising ``TypeError`` on the comparison.

    Args:
        x: A row-like object exposing an ``age`` attribute.
    """
    # No ``global`` statement needed: the accumulator is mutated through a
    # method call and the name is never rebound.
    if x.age is not None and x.age > 25:
        agcnt.add(1)


def total(x):
    """Add the row's ``Salary`` to the ``sum_val`` accumulator.

    Meant to be passed to ``DataFrame.foreach``. Rows whose ``Salary`` is null
    are skipped so a single missing value does not fail the whole job.

    Args:
        x: A row-like object exposing a ``Salary`` attribute.
    """
    # ``.add`` is what ``+=`` does on an accumulator; using it keeps this
    # function consistent with ``num`` and avoids rebinding a global.
    if x.Salary is not None:
        sum_val.add(x.Salary)

In [24]:
# Run num on every row; it increments the agcnt accumulator as a side effect.
# NOTE: agcnt is not reset in this cell, so re-running it adds to the previous count.
df.foreach(num)
# Read the accumulated count (displayed as the cell output).
agcnt.value

3

In [25]:
# Run total on every row; it adds each row's Salary to the sum_val accumulator.
# NOTE: sum_val is not reset in this cell, so re-running it adds to the previous sum.
df.foreach(total)
# Read the accumulated sum (displayed as the cell output).
sum_val.value

128000

In [26]:
# Broadcast a small list, then add the list's total to every element of an RDD.
broad_list = spark.sparkContext.broadcast([1, 2, 3])

rdd = spark.sparkContext.parallelize([10, 20, 30])


def _add_broadcast_total(value):
    """Return ``value`` plus the sum of the broadcast list."""
    return value + sum(broad_list.value)


res = rdd.map(_add_broadcast_total).collect()
print(res)

[16, 26, 36]


In [30]:
# Broadcast join: hint Spark to broadcast the two-row lookup table when joining
# it against the 100000-row table on "id".
large_df = spark.createDataFrame(
    [(idx, f"Value_{idx}") for idx in range(100000)],
    schema=["id", "value"],
)
small_df = spark.createDataFrame(
    [(1, "X"), (2, "Y")],
    schema=["id", "code"],
)

result = large_df.join(F.broadcast(small_df), on="id")
result.show()

+---+-------+----+
| id|  value|code|
+---+-------+----+
|  1|Value_1|   X|
|  2|Value_2|   Y|
+---+-------+----+



Partitioning and repartitioning

In [32]:
# Partitioning while creating an RDD: ask for 3 slices up front.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rdd1 = sc.parallelize([1, 2, 3, 4, 5], 3)

# Print the partition count, then display the elements as the cell output.
print(rdd1.getNumPartitions())
rdd1.collect()

3


[1, 2, 3, 4, 5]

In [33]:
# glom() groups the elements of each partition into a list, so the output shows the per-partition layout.
rdd1.glom().collect()

[[1], [2, 3], [4, 5]]

In [25]:
# Partitioning key-value pairs: spread the pairs over 2 partitions by key.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 2),
    ("a", 3),
    ("b", 4),
])
# No partition function given, so the default hash-based one is used.
part_rdd = rdd.partitionBy(numPartitions=2)
part_rdd.glom().collect()

[[('b', 2), ('b', 4)], [('a', 1), ('a', 3)]]

In [26]:
# Issue with hash partitioning: with 3 keys and 3 partitions the keys are not
# guaranteed to spread evenly — in the output below two keys share a partition
# and one partition stays empty.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 2),
    ("a", 3),
    ("b", 4),
    ("c", 6),
])
part_rdd = rdd.partitionBy(numPartitions=3)
part_rdd.glom().collect()

[[('b', 2), ('b', 4)], [('a', 1), ('a', 3), ('c', 6)], []]

In [29]:
# Custom partitioning: choose the partition from the key's first character.
def _first_char_partition(key):
    """Map a key to a partition index in 0..2 using the code point of its first character."""
    return ord(key[0]) % 3


custom_rdd = rdd.partitionBy(3, _first_char_partition)
custom_rdd.glom().collect()

[[('c', 6)], [('a', 1), ('a', 3)], [('b', 2), ('b', 4)]]

In [34]:
# Redistribute rdd1 into 2 partitions and show the resulting per-partition layout.
rdd2 = rdd1.repartition(numPartitions=2)
rdd2.glom().collect()

[[4, 5], [1, 2, 3]]

In [38]:
# Reduce rdd1 to 2 partitions without a shuffle and show the resulting layout.
rdd3 = rdd1.coalesce(numPartitions=2, shuffle=False)
rdd3.glom().collect()

[[1], [2, 3, 4, 5]]

In [40]:
# Conversion between RDD and DataFrame: start by showing the RDD's contents.
rdd.collect()

[('a', 1), ('b', 2), ('a', 3), ('b', 4), ('c', 6)]

In [41]:
# RDD -> DataFrame: name the two tuple fields "id" and "value".
# NOTE(review): this rebinds `df`, which earlier held the CSV data.
df = spark.createDataFrame(rdd, schema=['id', 'value'])
df.show()

+---+-----+
| id|value|
+---+-----+
|  a|    1|
|  b|    2|
|  a|    3|
|  b|    4|
|  c|    6|
+---+-----+



In [53]:
# DataFrame -> RDD: join each row's first field with the string form of its second field.
df.rdd.map(lambda row: row[0] + str(row[1])).collect()

['a1', 'b2', 'a3', 'b4', 'c6']

In [55]:
# Activity: build a DataFrame of first/last names.
data = [
    ("Rahul", "Sharma"),
    ("Priya", "Patel"),
    ("Amit", "Kumar"),
    ("Anjali", "Desai"),
    ("Vikram", "Singh"),
    ("Sneha", "Rai"),
    ("Ajay", "Verma"),
    ("Deepa", "Chaudhary"),
    ("Manish", "Jha"),
    ("Neha", "Gupta"),
]
colm = ["FirstName", "LastName"]

# NOTE(review): this rebinds `df` again; later cells that read `df` see this names table.
df = spark.createDataFrame(data, schema=colm)
df.show()

+---------+---------+
|FirstName| LastName|
+---------+---------+
|    Rahul|   Sharma|
|    Priya|    Patel|
|     Amit|    Kumar|
|   Anjali|    Desai|
|   Vikram|    Singh|
|    Sneha|      Rai|
|     Ajay|    Verma|
|    Deepa|Chaudhary|
|   Manish|      Jha|
|     Neha|    Gupta|
+---------+---------+



In [61]:
# Append a "<first> <last>" field to every row through the RDD API.
# Note: df2 is an RDD of tuples here; it only becomes a DataFrame via toDF below.
df2 = df.rdd.map(lambda row: tuple(row) + (" ".join((row[0], row[1])),))

full_name_columns = ['FirstName', 'LastName', 'FullName']
df2.toDF(schema=full_name_columns).show()

+---------+---------+---------------+
|FirstName| LastName|       FullName|
+---------+---------+---------------+
|    Rahul|   Sharma|   Rahul Sharma|
|    Priya|    Patel|    Priya Patel|
|     Amit|    Kumar|     Amit Kumar|
|   Anjali|    Desai|   Anjali Desai|
|   Vikram|    Singh|   Vikram Singh|
|    Sneha|      Rai|      Sneha Rai|
|     Ajay|    Verma|     Ajay Verma|
|    Deepa|Chaudhary|Deepa Chaudhary|
|   Manish|      Jha|     Manish Jha|
|     Neha|    Gupta|     Neha Gupta|
+---------+---------+---------------+



Cache and persist

In [65]:
# `df` was rebound to other tables earlier in the notebook, so reload the
# CSV here. Without this, a top-to-bottom run would show the wrong table and
# the Salary aggregations in the following cells would fail on a missing column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = spark.read.csv('C:/Users/acer/Downloads/test1.csv', header=True, inferSchema=True)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [104]:
import time

# Baseline: time three separate actions (row count, max salary, min salary) on the uncached DataFrame.
start_time = time.time()

count = df.count()
high = df.select(F.max(F.col("Salary"))).first()[0]
low = df.select(F.min(F.col("Salary"))).first()[0]

without_cache = time.time() - start_time
print(f"Time: {without_cache:.2f} seconds")

Time: 0.47 seconds


In [105]:
# Mark the DataFrame for caching. The name must be `df_cached` because the
# timing cells that follow read it under that name.
df_cached = df.cache()

In [106]:
# First timed pass over the cached DataFrame: same three actions as the baseline.
start_time = time.time()

count = df_cached.count()
high = df_cached.select(F.max(F.col("Salary"))).first()[0]
low = df_cached.select(F.min(F.col("Salary"))).first()[0]

time_with_cache = time.time() - start_time
print(f"Time: {time_with_cache:.2f} seconds")

Time: 0.45 seconds


In [108]:
# Second timed pass over the cached DataFrame, repeating the same three actions.
start_time = time.time()

count = df_cached.count()
high = df_cached.select(F.max(F.col("Salary"))).first()[0]
low = df_cached.select(F.min(F.col("Salary"))).first()[0]

time_with_cache = time.time() - start_time
print(f"2nd Time: {time_with_cache:.2f} seconds")

2nd Time: 0.40 seconds


In [109]:
# Drop the cached data; the call returns the DataFrame, which is shown as the cell output.
df_cached.unpersist()

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [111]:
# Persist with an explicit storage level: MEMORY_AND_DISK.
# NOTE: persist() is lazy — no action runs in this cell, so nothing is stored yet.
from pyspark.storagelevel import StorageLevel

df.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [112]:
# Remove the persistence mark set in the previous cell before trying another storage level.
df.unpersist()

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [114]:
# Persist with an explicit storage level: MEMORY_ONLY.
# NOTE: persist() is lazy — no action runs in this cell, so nothing is stored yet.
from pyspark.storagelevel import StorageLevel

df.persist(storageLevel=StorageLevel.MEMORY_ONLY)

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [115]:
# Remove the persistence mark set in the previous cell before trying another storage level.
df.unpersist()

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [116]:
# Persist with an explicit storage level: DISK_ONLY.
# NOTE: persist() is lazy — no action runs in this cell, so nothing is stored yet.
from pyspark.storagelevel import StorageLevel

df.persist(storageLevel=StorageLevel.DISK_ONLY)

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [117]:
# Remove the persistence mark set in the previous cell.
df.unpersist()

DataFrame[Name: string, age: int, Experience: int, Salary: int]