In [0]:
# Column names for the employee DataFrame, in the same order as each tuple below.
employee_schema = [
    "employee_id",
    "first_name",
    "last_name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

# Sample employees. Note that `doj` and `employee_dept_id` are strings here,
# and one row deliberately has a missing (None) gender.
employee_data = [
    (10, "Srishti", "Shetty", "1998", "100", "F", 8000),
    (20, "Aish", "Rai", "2002", "200", "M", 2000),
    (30, "Rishabh", "Alva", "2010", "100", None, 6000),
    (40, "Aarvi", "Bhandhary", "1996", "400", "F", 7000),
    (50, "Pooja", "Hegde", "2008", "500", "F", 5000),
    (60, "Sunil", "Shetty", "1997", "400", "M", 3000),
    (70, "Sid", "Rai", "2010", "600", "M", 5000),
]

# Build the DataFrame from the in-memory rows and render it.
empDF = spark.createDataFrame(data=employee_data, schema=employee_schema)
display(empDF)

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Srishti,Shetty,1998,100,F,8000
20,Aish,Rai,2002,200,M,2000
30,Rishabh,Alva,2010,100,,6000
40,Aarvi,Bhandhary,1996,400,F,7000
50,Pooja,Hegde,2008,500,F,5000
60,Sunil,Shetty,1997,400,M,3000
70,Sid,Rai,2010,600,M,5000


In [0]:
# Column names for the department DataFrame, in the same order as each tuple below.
department_schema = ["dept_name", "dept_id"]

# Sample departments; `dept_id` is an integer here.
department_data = [
    ("HR", 100),
    ("Supply", 200),
    ("Sales", 300),
    ("Stock", 400),
]

# Build the DataFrame from the in-memory rows and render it.
deptDF = spark.createDataFrame(data=department_data, schema=department_schema)
display(deptDF)


dept_name,dept_id
HR,100
Supply,200
Sales,300
Stock,400


### Inner Join

In [0]:
# Import Spark's aggregate `sum` under an alias: the bare name `sum` is Python's
# builtin, and builtin sum("salary") raises TypeError instead of building a Column.
from pyspark.sql.functions import col, sum as spark_sum

# Inner-join employees to departments on the department id, add a derived
# `bonus` column, then total the salaries per department.
# `bonus` is never used by the aggregation; it is kept so the explain() output
# below shows it in the parsed plan but not in the optimized/physical plans.
dfJoin = (
    empDF.join(deptDF, empDF.employee_dept_id == deptDF.dept_id, "inner")
    .withColumn("bonus", col("salary") * 0.1)
    .groupBy("dept_name")
    .agg(spark_sum("salary").alias("total_salary"))
    # .groupBy("dept_name").sum("salary") -- works too
)

dfJoin.show()

+---------+------------+
|dept_name|total_salary|
+---------+------------+
|    Stock|       10000|
|       HR|       14000|
|   Supply|        2000|
+---------+------------+



### `explain()` with no arguments prints only the physical plan (the plan Catalyst selected for execution)

In [0]:
# "simple" is the default mode, so this is the same as dfJoin.explain():
# it prints the physical plan only.
dfJoin.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(4) HashAggregate(keys=[dept_name#246], functions=[finalmerge_sum(merge sum#868L) AS sum(salary#186L)#857L])
   +- AQEShuffleRead coalesced
      +- ShuffleQueryStage 2, Statistics(sizeInBytes=96.0 B, rowCount=3, isRuntime=true)
         +- Exchange hashpartitioning(dept_name#246, 200), ENSURE_REQUIREMENTS, [plan_id=2402]
            +- *(3) HashAggregate(keys=[dept_name#246], functions=[partial_sum(salary#186L) AS sum#868L])
               +- *(3) Project [salary#186L, dept_name#246]
                  +- *(3) SortMergeJoin [cast(employee_dept_id#184 as bigint)], [dept_id#247L], Inner
                     :- Sort [cast(employee_dept_id#184 as bigint) ASC NULLS FIRST], false, 0
                     :  +- AQEShuffleRead coalesced
                     :     +- ShuffleQueryStage 0, Statistics(sizeInBytes=224.0 B, rowCount=7, isRuntime=true)
                     :        +- Exchange hashpartitioning(cast(employee

In [0]:
# Same as dfJoin.explain(extended=True): prints the logical plans
# (starting with the parsed plan) in addition to the physical plan.
dfJoin.explain(mode="extended")

== Parsed Logical Plan ==
'Aggregate ['dept_name], ['dept_name, sum('salary) AS total_salary#858]
+- Project [employee_id#180L, first_name#181, last_name#182, doj#183, employee_dept_id#184, gender#185, salary#186L, dept_name#246, dept_id#247L, (cast(salary#186L as double) * 0.1) AS bonus#836]
   +- Join Inner, (cast(employee_dept_id#184 as bigint) = dept_id#247L)
      :- LogicalRDD [employee_id#180L, first_name#181, last_name#182, doj#183, employee_dept_id#184, gender#185, salary#186L], false
      +- LogicalRDD [dept_name#246, dept_id#247L], false

== Analyzed Logical Plan ==
dept_name: string, total_salary: bigint
Aggregate [dept_name#246], [dept_name#246, sum(salary#186L) AS total_salary#858L]
+- Project [employee_id#180L, first_name#181, last_name#182, doj#183, employee_dept_id#184, gender#185, salary#186L, dept_name#246, dept_id#247L, (cast(salary#186L as double) * 0.1) AS bonus#836]
   +- Join Inner, (cast(employee_dept_id#184 as bigint) = dept_id#247L)
      :- LogicalRDD [empl

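- Parsed Logical Plan (unresolved logical plan: column names and types are not yet checked) --> 
- Analyzed Logical Plan (names and types resolved against the schema catalog) --> 
- Optimized Logical Plan (Catalyst optimization rules applied, e.g. predicate pushdown and column pruning) --> 
- Physical Plan (the operators Spark actually executes)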

In [0]:
# "formatted" mode prints the physical plan as a numbered operator tree.
dfJoin.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (30)
+- == Final Plan ==
   * HashAggregate (20)
   +- AQEShuffleRead (19)
      +- ShuffleQueryStage (18), Statistics(sizeInBytes=96.0 B, rowCount=3, isRuntime=true)
         +- Exchange (17)
            +- * HashAggregate (16)
               +- * Project (15)
                  +- * SortMergeJoin Inner (14)
                     :- Sort (7)
                     :  +- AQEShuffleRead (6)
                     :     +- ShuffleQueryStage (5), Statistics(sizeInBytes=224.0 B, rowCount=7, isRuntime=true)
                     :        +- Exchange (4)
                     :           +- * Project (3)
                     :              +- * Filter (2)
                     :                 +- * Scan ExistingRDD (1)
                     +- Sort (13)
                        +- AQEShuffleRead (12)
                           +- ShuffleQueryStage (11), Statistics(sizeInBytes=128.0 B, rowCount=4, isRuntime=true)
                              +- Exchange (10)
     

In [0]:
# "cost" mode shows only the optimized logical plan (annotated with size
# statistics) followed by the physical plan.
dfJoin.explain(mode="cost")

== Optimized Logical Plan ==
Aggregate [dept_name#246], [dept_name#246, sum(salary#186L) AS total_salary#970L], Statistics(sizeInBytes=6.86E+35 B)
+- Project [salary#186L, dept_name#246], Statistics(sizeInBytes=6.86E+35 B)
   +- Join Inner, (cast(employee_dept_id#184 as bigint) = dept_id#247L), Statistics(sizeInBytes=1.22E+36 B)
      :- Project [employee_dept_id#184, salary#186L], Statistics(sizeInBytes=2.3 EiB)
      :  +- Filter isnotnull(employee_dept_id#184), Statistics(sizeInBytes=8.0 EiB)
      :     +- LogicalRDD [employee_id#180L, first_name#181, last_name#182, doj#183, employee_dept_id#184, gender#185, salary#186L], false, Statistics(sizeInBytes=8.0 EiB)
      +- Filter isnotnull(dept_id#247L), Statistics(sizeInBytes=8.0 EiB)
         +- LogicalRDD [dept_name#246, dept_id#247L], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(4) HashAggregate(keys=[dept_name#246], functions=[finalmerge_sum(merge sum#980L)