# **Assignment-1**

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [0]:
# Obtain the session for this notebook (reuses an existing one when present).
spark = (
    SparkSession.builder
    .appName("dbshell-01")
    .getOrCreate()
)

# **Loading Dataset**

In [0]:
# Read the three source CSV files. The first row of each file is treated as
# the header and column types are inferred by Spark.
dfOrd = spark.read.options(header=True, inferSchema=True).csv("/FileStore/tables/orders.csv")
dfCus = spark.read.options(header=True, inferSchema=True).csv("/FileStore/tables/customers.csv")
dfPro = spark.read.options(header=True, inferSchema=True).csv("/FileStore/tables/products.csv")

# **Pyspark + Delta**

In [0]:
# 1. Ingest all 3 CSVs as Delta Tables.
# Make sure the target database exists and is the current one, so later cells
# can refer to the tables by their unqualified names.
spark.sql("CREATE DATABASE IF NOT EXISTS sales_info")
spark.sql("USE sales_info")

# Each source frame overwrites the contents of its Delta table.
for sourceDf, tableName in (
    (dfOrd, "sales_info.orders"),
    (dfCus, "sales_info.customers"),
    (dfPro, "sales_info.products"),
):
    sourceDf.write.format("delta").mode("overwrite").saveAsTable(tableName)

In [0]:
# 2. Write SQL to get the total revenue per Product.
# Revenue is Quantity * Price of every order, summed per product name.
revenuePerProductSql = """
          SELECT p.ProductName, SUM(o.Quantity * o.Price) AS totalRevenue FROM products p
          INNER JOIN orders o
          ON o.ProductID = p.ProductID
          GROUP BY p.ProductName
          """
spark.sql(revenuePerProductSql).show()

+-----------+------------+
|ProductName|totalRevenue|
+-----------+------------+
|      Phone|      150000|
|     Laptop|       75000|
|     Tablet|       30000|
|   Keyboard|       30000|
+-----------+------------+



In [0]:
# 3. Join Orders + Customers to find revenue by Region.
# Delta handles and DataFrame views for the three tables; they are reused by
# later cells.
dlOrd2 = DeltaTable.forName(spark, "orders")
dfOrd2 = dlOrd2.toDF()

dlCus2 = DeltaTable.forName(spark, "customers")
dfCus2 = dlCus2.toDF()

dlPro2 = DeltaTable.forName(spark, "products")
dfPro2 = dlPro2.toDF()

# Revenue of a single order line.
lineRevenue = F.col("Quantity") * F.col("Price")

ordersWithCustomers = dfOrd2.join(dfCus2, on="CustomerID", how="inner")
(
    ordersWithCustomers
    .groupBy("Region")
    .agg(F.sum(lineRevenue).alias("regionRevenue"))
    .show()
)

+------+-------------+
|Region|regionRevenue|
+------+-------------+
| South|       100000|
|  East|        30000|
|  West|        30000|
| North|       125000|
+------+-------------+



In [0]:
# 4. Update the Status of Pending orders to 'Cancelled'.
# This builds a new DataFrame only; rows whose Status is not 'Pending' keep
# their current value.
statusIsPending = F.col("Status") == "Pending"
dfOrdUpdated = dfOrd2.withColumn(
    "Status",
    F.when(statusIsPending, "Cancelled").otherwise(F.col("Status")),
)
dfOrdUpdated.show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|Cancelled|
+-------+----------+---------+--------+-----+----------+---------+



In [0]:
# 5. Merge a new return record into Orders.
# The merge source is dfOrdUpdated. Rows are matched on OrderID: matched rows
# take every column from the source, unmatched source rows are inserted.
ordersMerge = dlOrd2.alias("target").merge(
    dfOrdUpdated.alias("source"),
    'target.OrderID = source.OrderID',
)
ordersMerge.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

# Show the table contents after the upsert.
upserted = dlOrd2.toDF()
upserted.show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|Cancelled|
+-------+----------+---------+--------+-----+----------+---------+



# **DLT Pipeline**

In [0]:
# 6. Create raw → cleaned → aggregated tables:
# Clean: Remove rows with NULLs
# Aggregated: Total revenue per Category
import dlt 

@dlt.table
def extract_1():
  """Raw layer: the orders Delta table, read without any transformation."""
  # The ingest cell of this notebook writes its tables to the sales_info
  # database; no sales_data database is created anywhere in the notebook.
  return spark.read.table("sales_info.orders")

@dlt.table
def extract_2():
  """Raw layer: the customers Delta table, read without any transformation."""
  # The ingest cell of this notebook writes its tables to the sales_info
  # database; no sales_data database is created anywhere in the notebook.
  return spark.read.table("sales_info.customers")

@dlt.table
def extract_3():
  """Raw layer: the products Delta table, read without any transformation."""
  # The ingest cell of this notebook writes its tables to the sales_info
  # database; no sales_data database is created anywhere in the notebook.
  return spark.read.table("sales_info.products")

@dlt.table
def cleaned_1():
  """Cleaned layer: rows of extract_1 that contain no NULL in any column."""
  rawOrders = dlt.read("extract_1")
  return rawOrders.na.drop()

@dlt.table
def cleaned_2():
  """Cleaned layer: rows of extract_2 that contain no NULL in any column."""
  rawCustomers = dlt.read("extract_2")
  return rawCustomers.na.drop()

@dlt.table
def cleaned_3():
  """Cleaned layer: rows of extract_3 that contain no NULL in any column."""
  rawProducts = dlt.read("extract_3")
  return rawProducts.na.drop()

@dlt.table
def agg_final():
  """Aggregated layer: total revenue per product Category.

  Joins the cleaned orders to the cleaned products on ProductID and sums
  Quantity * Price of every order line within each Category.

  Returns:
    DataFrame with the columns Category and totalRevenue.
  """
  orders = dlt.read("cleaned_1")
  products = dlt.read("cleaned_3")
  # Take both factors from the orders frame so the expression stays
  # unambiguous even if products also carries a Price column
  # (products schema is not visible here -- TODO confirm).
  lineRevenue = orders["Quantity"] * orders["Price"]
  return (
    orders.join(products, on="ProductID", how="inner")
    .groupBy("Category")
    # Explicit alias: the auto-generated name "sum((Quantity * Price))"
    # contains spaces and parentheses, which Delta does not accept as a
    # column name unless column mapping is enabled.
    .agg(F.sum(lineRevenue).alias("totalRevenue"))
  )

# **Time Travel**

In [0]:
# 7. View data before the Status update.
# List the commits of the orders table so the version to inspect can be chosen.
history = dlOrd2.history()
history.select(["version", "operation", "operationParameters"]).show(truncate=False)

# NOTE(review): version number carried over from the original cell; confirm it
# against the history printed above, since it changes whenever the notebook is
# re-run.
VERSION_BEFORE_UPDATE = 5

# Read-only time travel: load the old snapshot instead of restoring the table,
# so merely viewing past data does not rewrite the current table or add a
# RESTORE commit to its history.
beforeUpdate = spark.read.option("versionAsOf", VERSION_BEFORE_UPDATE).table("orders")
beforeUpdate.show()

+-------+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|version|operation                        |operationParameters                                                                                                                                                                    |
+-------+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|8      |RESTORE                          |{version -> 5, timestamp -> null}                                                                                                                                                      |
|7      |RESTORE                          |{version -> 5, timestamp -> null}            

In [0]:
# 8. Restore to an older version of the orders table.
# restoreToVersion returns a DataFrame of restore metrics; leaving it as the
# last expression keeps it displayed as the cell output.
restoreMetrics = dlOrd2.restoreToVersion(5)
restoreMetrics

Out[41]: DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]

# **Vacuum + Retention**

In [0]:
# 9. Run VACUUM after changing default retention.
# Shorten both the transaction-log retention and the retention of files that
# are no longer referenced by the table to 14 days.
spark.sql("""
          ALTER TABLE orders SET TBLPROPERTIES (
              'delta.logRetentionDuration' = '14 days',
              'delta.deletedFileRetentionDuration' = '14 days'
          )
          """)

dlOrd2 = DeltaTable.forName(spark, "orders")
# No explicit hours argument: VACUUM then uses the table's own
# delta.deletedFileRetentionDuration (the 14 days set above). The previous
# hard-coded 1290 hours (~54 days) overrode the retention just configured.
dlOrd2.vacuum()

Out[46]: DataFrame[]

# **Expectations**

In [0]:
# 10. Quantity > 0 , Price > 0 , OrderDate is not null
@dlt.table
def raw():
  """Source dataset for the expectation check: the rows of the orders table."""
  deltaReader = spark.read.format("delta")
  return deltaReader.table("orders")

# @dlt.table registers the function as a pipeline dataset; an expectation
# decorator on its own does not define one, so the check would never run.
@dlt.table
@dlt.expect_or_drop("check_qty_price_date", "Quantity > 0 AND Price > 0 AND OrderDate IS NOT NULL")
def silver():
  """Validated orders read from the raw dataset.

  Rows that violate the expectation (Quantity > 0, Price > 0 and a non-NULL
  OrderDate) are dropped from the result.
  """
  return dlt.read("raw")

# **Bonus**

In [0]:
# 11. Use when-otherwise to create a new column: OrderType = "Return" if Status == 'Returned'
# Every other Status value is labelled "NotReturn".
orderType = F.when(F.col("Status") == "Returned", "Return").otherwise("NotReturn")
ordersWithType = dfOrd2.withColumn("OrderType", orderType)
ordersWithType.show()

+-------+----------+---------+--------+-----+----------+---------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|OrderType|
+-------+----------+---------+--------+-----+----------+---------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|NotReturn|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|   Return|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|NotReturn|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|NotReturn|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|NotReturn|
+-------+----------+---------+--------+-----+----------+---------+---------+

