In [0]:
# Show the DBFS mount points; the cells below read and write under /mnt/j2dadlscontainer.
dbutils.fs.mounts()

In [0]:
%fs
ls "/mnt/j2dadlscontainer/raw_datasets"

In [0]:
# Predicate Push Down
# Load the employee CSV from the mounted ADLS container: the first row is the
# header and Spark infers the column types from the data.
emp_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("dbfs:/mnt/j2dadlscontainer/raw_datasets/Employee.csv")
)
emp_df.display()

In [0]:
from pyspark.sql import functions as F

In [0]:
# Turn the numeric LeaveOrNot flag into a real boolean column.
# A direct cast keeps the earlier mapping (0 -> False, any other number -> True)
# but leaves missing values as NULL instead of silently reporting them as True
# (a NULL condition in when() falls through to otherwise()). The cast is also a
# no-op when this cell is re-run on an already-boolean column.
# NOTE(review): assumes inferSchema read LeaveOrNot as a numeric column - confirm.
emp_df = (
    emp_df
    .withColumn("LeaveOrNot", F.col("LeaveOrNot").cast("boolean"))
)
emp_df.display()

In [0]:
# Turn the EverBenched text flag into a boolean column:
# "No" -> False, any other value -> True, missing value -> NULL.
# The plain comparison is used instead of when(...).otherwise(True) because the
# latter reports NULL inputs as True.
# The dtype guard keeps the cell safe to re-run: once the column is boolean,
# comparing it with the string "No" again would no longer identify the "No" rows.
if dict(emp_df.dtypes)["EverBenched"] != "boolean":
    emp_df = (
        emp_df
        .withColumn("EverBenched", F.col("EverBenched") != "No")
    )
emp_df.display()

In [0]:
# Display only the rows whose Education value is exactly "PHD".
(
    emp_df
    .where(F.col("Education") == "PHD")
    .display()
)

In [0]:
# Write the employee data as Parquet, partitioned by JoiningYear, replacing
# anything already stored at the target path.
(
    emp_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("JoiningYear")
    .save("dbfs:/mnt/j2dadlscontainer/processed_datasets/employee")
)

In [0]:
# partition pruning
# Read the Parquet dataset back and filter on JoiningYear, the column the
# dataset was partitioned by when it was written.
emp_df2 = (
    spark.read
    .format("parquet")
    .load("dbfs:/mnt/j2dadlscontainer/processed_datasets/employee")
)
emp_df2_temp = emp_df2.where(F.col("JoiningYear") == 2015)
emp_df2_temp.display()

In [0]:
# Keep rows that joined in 2015 AND are older than 35: one condition on the
# JoiningYear column plus one on the Age column.
emp_df2_temp = (
    emp_df2
    .filter(F.col("JoiningYear") == 2015)
    .filter(F.col("Age") > 35)
)
emp_df2_temp.display()

In [0]:
# Broadcast Joins

# Sample employee rows. Note that emp_dept_id is held as a string here while
# dept_id below is an integer, and that department "50" has no entry in dept.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]

empDF = spark.createDataFrame(emp, empColumns)


# Sample department rows: (dept_name, dept_id).
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]
deptDF = spark.createDataFrame(dept, deptColumns)

In [0]:
# Show the session's current value of the adaptive query execution setting.
spark.conf.get("spark.sql.adaptive.enabled")

In [0]:
# Switch adaptive query execution off for this Spark session.
# NOTE(review): presumably done so the join cells below are planned exactly as written - confirm.
spark.conf.set("spark.sql.adaptive.enabled", False)

In [0]:
# Read the setting back to verify the value now in effect.
spark.conf.get("spark.sql.adaptive.enabled")

In [0]:
# normal join
# Left join of employees to departments on the department id, with no
# broadcast hint; employees without a matching department are kept.
emp_dep_df = empDF.join(
    deptDF,
    on=empDF["emp_dept_id"] == deptDF["dept_id"],
    how="left",
)
emp_dep_df.display()

In [0]:
# Same left join as the normal-join cell, but the department DataFrame is
# wrapped in F.broadcast() to mark it for a broadcast join.
emp_dep_df = empDF.join(
    F.broadcast(deptDF),
    on=empDF["emp_dept_id"] == deptDF["dept_id"],
    how="left",
)
emp_dep_df.display()