In [0]:
# Session-level setting: number of partitions Spark uses when it shuffles data
# (joins, aggregations). Kept in a named value so it is easy to spot and tune.
shuffle_partition_count = 50
spark.conf.set("spark.sql.shuffle.partitions", shuffle_partition_count)

In [0]:
# Load the employee attrition CSV from the /Volumes path. The first line of the
# file is treated as the header and column types are inferred from the data.
attrition_csv_path = "/Volumes/external-catalog/default/test-volumne/Employee_Attrition.csv"

csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", True)
csv_reader = csv_reader.option("inferSchema", True)

df = csv_reader.load(attrition_csv_path)
display(df)

In [0]:
# Columns carried over into the high-risk table, in output order.
high_risk_columns = [
    "EmployeeNumber",
    "Age",
    "Department",
    "JobRole",
    "JobSatisfaction",
    "OverTime",
    "MonthlyIncome",
    "YearsAtCompany",
    "DistanceFromHome",
    "EnvironmentSatisfaction",
    "WorkLifeBalance",
]

# Keep rows where Attrition is "No" and the JobSatisfaction score is below 3.
no_attrition = df["Attrition"] == "No"
low_job_satisfaction = df["JobSatisfaction"] < 3
high_risk_df = df.filter(no_attrition & low_job_satisfaction).select(*high_risk_columns)

# Persist the selection as a Delta table.
# NOTE(review): the write mode is "append", so re-running this cell adds the same
# rows to the table again — confirm that is intended.
(
    high_risk_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("`external-catalog`.default.high_risk_attrition_employees")
)

In [0]:
# Fetch the commit history of the high-risk table and show only the columns
# needed to pick a snapshot: version number, commit timestamp and operation.
history_query = """
  DESCRIBE HISTORY `external-catalog`.default.high_risk_attrition_employees
"""
versions_df = spark.sql(history_query)

history_columns = ["version", "timestamp", "operation"]
display(versions_df.select(*history_columns))

In [0]:
# Read the latest state of the high-risk Delta table and render it.
delta_df = (
    spark.read
    .format("delta")
    .table("`external-catalog`.default.high_risk_attrition_employees")
)
display(delta_df)

In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Column layout of the existing high-risk table as (name, Spark type) pairs;
# every column is declared nullable.
dummy_column_types = [
    ("EmployeeNumber", IntegerType),
    ("Age", IntegerType),
    ("Department", StringType),
    ("JobRole", StringType),
    ("JobSatisfaction", IntegerType),
    ("OverTime", StringType),
    ("MonthlyIncome", IntegerType),
    ("YearsAtCompany", IntegerType),
    ("DistanceFromHome", IntegerType),
    ("EnvironmentSatisfaction", IntegerType),
    ("WorkLifeBalance", IntegerType),
]
schema = StructType(
    [StructField(column_name, column_type(), True) for column_name, column_type in dummy_column_types]
)

# A single placeholder row (EmployeeNumber -1), with values in schema order.
dummy_row = (-1, 0, "Dummy", "Dummy", 0, "No", 0, 0, 0, 0, 0)
dummy_data = [dummy_row]

# Appending the placeholder row adds another append operation to the table.
dummy_df = spark.createDataFrame(dummy_data, schema)
(
    dummy_df.write
    .format("delta")
    .mode("append")
    .saveAsTable("`external-catalog`.default.high_risk_attrition_employees")
)

In [0]:
# Re-read the high-risk Delta table so the display reflects its current contents.
delta_table_reader = spark.read.format("delta")
delta_df = delta_table_reader.table("`external-catalog`.default.high_risk_attrition_employees")
display(delta_df)

In [0]:
# Query the table history again and show version, timestamp and operation per commit.
versions_df = spark.sql(
    """
  DESCRIBE HISTORY `external-catalog`.default.high_risk_attrition_employees
"""
)
version_summary = versions_df.select("version", "timestamp", "operation")
display(version_summary)

In [0]:
# Time travel by version number: load the table as of version 0.
delta_df_v = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .table("`external-catalog`.default.high_risk_attrition_employees")
)
display(delta_df_v)

In [0]:
# Time travel by timestamp: load the table as it was at the given instant.
# NOTE(review): this timestamp is hard-coded, so the cell is tied to one table's
# commit history — confirm it still falls inside the table's available history
# before re-running elsewhere.
snapshot_timestamp = "2025-09-25T14:32:00.000+00:00"

delta_df_ts = (
    spark.read
    .format("delta")
    .option("timestampAsOf", snapshot_timestamp)
    .table("`external-catalog`.default.high_risk_attrition_employees")
)
display(delta_df_ts)

In [0]:
# Create the employee_transformed_data volume in `external-catalog`.default
# unless it already exists. The call stays last so its result is the cell output.
create_volume_sql = """
  CREATE VOLUME IF NOT EXISTS `external-catalog`.default.employee_transformed_data
"""
spark.sql(create_volume_sql)

In [0]:
# Derive two boolean flag columns from the raw attrition data:
#   IsHighIncome  - MonthlyIncome greater than 10000
#   IsLongTenure  - YearsAtCompany greater than 5
high_income_flag = df["MonthlyIncome"] > 10000
long_tenure_flag = df["YearsAtCompany"] > 5

transformed_df = df.withColumn("IsHighIncome", high_income_flag)
transformed_df = transformed_df.withColumn("IsLongTenure", long_tenure_flag)

# Write the result in Delta format to the volume path, one partition directory
# per Department. "overwrite" replaces whatever was previously stored there.
(
    transformed_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Department")
    .save("/Volumes/external-catalog/default/employee_transformed_data")
)