In [0]:
# Read data from a CSV file into a DataFrame

In [0]:
from pyspark.sql import functions as F

In [0]:
# Load the employees CSV, treating its first line as the header row.
# No schema inference is requested here, so the columns keep Spark's
# default CSV type (string).
path = "/FileStore/tables/raw_data/employees.csv"
emp_df = spark.read.csv(path, header=True)
emp_df.display()

In [0]:
# Reload the same CSV, this time asking Spark to infer each column's type
# from the data instead of leaving everything as strings.
path = "/FileStore/tables/raw_data/employees.csv"
emp_df = spark.read.options(header=True, inferSchema=True).csv(path)
emp_df.display()

In [0]:
# Head-count per gender.
# Rows are counted with count(lit(1)) rather than count(col("Gender")):
# counting the grouping column itself skips NULLs, so a NULL-gender group
# would be reported with 0 employees instead of its real size.
agg_df = (
    emp_df
    .groupBy("Gender")
    .agg(F.count(F.lit(1)).alias("noOfEmp"))
)
agg_df.display()

In [0]:
# Fetch only the first row of emp_df; the result is shown as the cell output.
emp_df.first()

In [0]:
# Bring the first 10 rows of emp_df back to the driver as a list of rows.
emp_df.take(10)

In [0]:
# head(10) likewise returns the first 10 rows of emp_df as a list of rows.
emp_df.head(10)

In [0]:
# Bring the last 10 rows of emp_df back to the driver as a list of rows.
emp_df.tail(10)

In [0]:
# For every department, gather the Location values of its rows into one
# array column. collect_list keeps duplicates.
collected_df = (
    emp_df
    .groupBy("Department")
    .agg(F.collect_list("Location").alias("Locations"))
)
collected_df.display()

In [0]:
# Same grouping as a list-based collection, but collect_set removes
# duplicate Location values within each department.
collected_df = (
    emp_df
    .groupBy("Department")
    .agg(F.collect_set("Location").alias("Locations"))
)
collected_df.display()

In [0]:
# Flatten the per-department array back into one row per (Department, location).
# The result is bound to a new name instead of overwriting collected_df:
# explode() only accepts array/map columns, so reassigning collected_df made
# this cell fail when it was run a second time (Locations was no longer an array).
exploded_df = collected_df.withColumn("Locations", F.explode(F.col("Locations")))
exploded_df.display()

In [0]:
# Window functions

In [0]:
from pyspark.sql.window import Window

In [0]:
# Rank rows inside each department by salary, highest salary first.
windowSpec = Window.partitionBy("Department").orderBy(F.desc("Salary"))

# dense_rank gives tied salaries the same rank without gaps, so rk == 2 marks
# the second-highest distinct salary of a department. The rank is computed over
# all employees; the gender filter is applied afterwards. The helper column
# "rk" is removed from the final result.
ranked_df = emp_df.withColumn("rk", F.dense_rank().over(windowSpec))
emp_df1 = (
    ranked_df
    .where(F.col("rk") == 2)
    .where(F.col("Gender") == "Male")
    .drop("rk")
)
emp_df1.display()

In [0]:
# Order rows inside each department by age, oldest first.
windowSpec = Window.partitionBy("Department").orderBy(F.desc("Age"))

# Number the rows of each department and keep the first two, i.e. up to two
# of the oldest rows per department. row_number assigns distinct numbers even
# when ages tie, so which tied row is kept is not controlled here.
row_position = F.row_number().over(windowSpec)
emp_df2 = (
    emp_df
    .withColumn("rn", row_position)
    .where(F.col("rn") <= 2)
    .drop("rn")
)
emp_df2.display()

In [0]:
# Joins

In [0]:
# Small in-memory department lookup: ids 100..600 paired with department names.
dep_ids = range(100, 700, 100)
dep_names = ["Engineering", "Finance", "HR", "Marketing", "Product", "Sales"]
dep_data = list(zip(dep_ids, dep_names))

# The name column is called "Department" so it can be joined by column name.
dep_columns = ["DepartmentId", "Department"]
dep_df = spark.createDataFrame(dep_data, dep_columns)

dep_df.display()

In [0]:
# Inner join on the shared "Department" column. Joining by column name keeps
# a single "Department" column in the result.
emp_df3 = emp_df.join(dep_df, on="Department", how="inner")

emp_df3.display()

In [0]:
# Rebuild the department lookup, this time naming the second column
# "DepartmentName" so it no longer matches the column name used in emp_df.
dep_data = [
    (100, "Engineering"), (200, "Finance"), (300, "HR"),
    (400, "Marketing"), (500, "Product"), (600, "Sales"),
]
dep_columns = ["DepartmentId", "DepartmentName"]

dep_df = spark.createDataFrame(dep_data, dep_columns)

dep_df.display()

In [0]:
# Inner join on an explicit condition because the key columns have different
# names in the two DataFrames. Both key columns survive the join, so the
# "Department" column is dropped afterwards and "DepartmentName" is kept.
same_department = emp_df["Department"] == dep_df["DepartmentName"]
emp_df3 = emp_df.join(dep_df, same_department, "inner").drop("Department")

emp_df3.display()