In [None]:
# databricks_cleaning.ipynb (simplified local example)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, round, when

# Obtain the Spark session used by every step of this cleaning job.
spark = (
    SparkSession.builder
    .appName("IKODataCleaning")
    .getOrCreate()
)

# Load the raw CSV extracts. All three files are read with the same options:
# the first row supplies the column names and Spark infers the column types.
csv_read_options = {"header": True, "inferSchema": True}
sales = spark.read.csv("../01_raw_data/sales.csv", **csv_read_options)
products = spark.read.csv("../01_raw_data/products.csv", **csv_read_options)
# NOTE: regions is loaded here but is not referenced again in the visible code.
regions = spark.read.csv("../01_raw_data/regions.csv", **csv_read_options)

# Normalise SaleDate: parse it with the yyyy-MM-dd pattern and overwrite the
# original column with the resulting date value.
parsed_sale_date = to_date(col("SaleDate"), "yyyy-MM-dd")
sales = sales.withColumn("SaleDate", parsed_sale_date)

# Enrich each sale with product attributes, then derive profitability metrics.
# The left join on ProductID keeps every row of sales.
sales_with_products = sales.join(products, on="ProductID", how="left")

# `round` below is pyspark.sql.functions.round (imported at the top of the
# file), which operates on columns -- not Python's builtin round.
# Per-unit margin: unit price minus production cost, rounded to 2 decimals.
profit_per_unit = round(col("UnitPrice") - col("ProductionCost"), 2)
# Line total: quantity sold times the already-rounded per-unit margin.
total_profit = round(col("QuantitySold") * col("ProfitPerUnit"), 2)

sales_enriched = sales_with_products.withColumn(
    "ProfitPerUnit", profit_per_unit
).withColumn(
    "TotalProfit", total_profit
)

# Substitute explicit placeholder labels for missing values in the
# categorical columns (column name -> replacement value).
null_placeholders = {"SalesChannel": "Unknown", "Region": "Unspecified"}
sales_enriched = sales_enriched.fillna(null_placeholders)

# Persist the transformed dataset as CSV with a header row (simulates the
# Data Lake output). "overwrite" mode replaces any existing output at the path.
(
    sales_enriched.write
    .mode("overwrite")
    .option("header", True)
    .csv("../03_data_warehouse/cleaned_sales.csv")
)
