In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum, avg, round

# Obtain a Spark session for this notebook (getOrCreate returns an existing
# session when there is one instead of building a second one).
spark = (
    SparkSession.builder
    .appName("SalesDataPractice")
    .getOrCreate()
)

# Field names for each sample row, in tuple order.
columns = ["date", "region", "product", "quantity", "unit_price"]

# Sample sales data: one tuple per sale, laid out as described by `columns`.
data = [
    ("2025-08-01", "North", "Laptop", 2, 70000),
    ("2025-08-01", "South", "Tablet", 5, 15000),
    ("2025-08-02", "East", "Smartphone", 3, 25000),
    ("2025-08-02", "West", "Laptop", 1, 70000),
    ("2025-08-03", "North", "Smartphone", 4, 25000),
    ("2025-08-03", "South", "Tablet", 2, 15000),
    ("2025-08-04", "East", "Laptop", 3, 70000),
    ("2025-08-04", "West", "Smartphone", 2, 25000),
]

# Only column names are given, so the column types are left for Spark to
# infer from the Python values in `data`.
sales_df = spark.createDataFrame(data, schema=columns)
sales_df.show()

In [0]:
# Derive the per-row sale value: quantity multiplied by unit_price.
row_total = col("quantity") * col("unit_price")

# Attach it as the "total_sales" column; the later aggregation cells read it.
sales_df = sales_df.withColumn("total_sales", row_total)
sales_df.show()

In [0]:
# Total sales per region.
# `F.sum` is Spark's column aggregate; qualifying it with the module alias
# keeps it from being confused with Python's builtin `sum`.
region_sales = (
    sales_df
    .groupBy("region")
    .agg(F.sum("total_sales").alias("region_total_sales"))
)
region_sales.show()

In [0]:
# Total sales per day, sorted by date.
# `F.sum` is Spark's column aggregate (module-qualified so it is not mistaken
# for Python's builtin `sum`).
# The sample dates are "YYYY-MM-DD" strings, so sorting the string column
# also yields chronological order.
daily_sales = (
    sales_df
    .groupBy("date")
    .agg(F.sum("total_sales").alias("daily_total_sales"))
    .orderBy("date")
)
daily_sales.show()

In [0]:
# Save sales_df as the table "sales_data_delta" using the Delta format.
# "overwrite" mode replaces whatever the table already holds.
# NOTE(review): assumes the Delta data source is available in this Spark
# environment — confirm before running outside the original cluster.
delta_writer = sales_df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("sales_data_delta")