In [0]:

import dlt
from pyspark.sql.functions import col, expr, count, avg, desc

#1: Load customers.csv
@dlt.table(name="customers_raw")
def load_customers():
    # Raw customer records from the uploaded CSV. Only the header option is
    # set on the reader, so no schema inference or explicit schema is applied.
    source_path = "dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/customers.csv"
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(source_path)

Name,Type
CustomerID,string
Name,string
City,string
Age,string


In [0]:
#2: Load orders.csv
@dlt.table(name="orders_raw")
def load_orders():
    # Raw order records from the uploaded CSV. Only the header option is set
    # on the reader, so no schema inference or explicit schema is applied.
    source_path = "dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/orders.csv"
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(source_path)

Name,Type
OrderID,string
CustomerID,string
Product,string
Quantity,string
Price,string
OrderDate,string


In [0]:
#3: Transform orders with TotalAmount
@dlt.table(name="orders_with_amount")
def add_total_amount():
    # Adds TotalAmount = Quantity * Price to every row of orders_raw.
    #
    # Quantity and Price are string columns in orders_raw, so they are cast to
    # double explicitly instead of leaving the conversion to Spark's implicit
    # string coercion inside the multiplication. The result column is double.
    # Values that cannot be parsed as numbers become null under Spark's default
    # (non-ANSI) cast behavior.
    orders = dlt.read("orders_raw")
    quantity = col("Quantity").cast("double")
    unit_price = col("Price").cast("double")
    return orders.withColumn("TotalAmount", quantity * unit_price)

Name,Type
OrderID,string
CustomerID,string
Product,string
Quantity,string
Price,string
OrderDate,string
TotalAmount,double


In [0]:
#4: Join customers and orders
@dlt.table(name="customer_orders")
def join_customers_orders():
    # Inner join on CustomerID: only customers with at least one order, and
    # only orders whose CustomerID matches a customer, appear in the result.
    customers = dlt.read("customers_raw")
    orders = dlt.read("orders_with_amount")
    joined = customers.join(orders, "CustomerID", "inner")
    return joined

Name,Type
CustomerID,string
Name,string
City,string
Age,string
OrderID,string
Product,string
Quantity,string
Price,string
OrderDate,string
TotalAmount,double


In [0]:
#5: Filter orders with TotalAmount > 20000
@dlt.table(name="high_value_orders")
def filter_high_value_orders():
    # Keeps the joined customer/order rows whose TotalAmount is strictly
    # greater than 20000.
    is_high_value = col("TotalAmount") > 20000
    return dlt.read("customer_orders").filter(is_high_value)

Name,Type
CustomerID,string
Name,string
City,string
Age,string
OrderID,string
Product,string
Quantity,string
Price,string
OrderDate,string
TotalAmount,double


In [0]:
#6: Customers who placed more than 1 order
@dlt.table(name="multiple_orders")
def multiple_orders():
    # One row per (CustomerID, Name) with the number of non-null OrderID
    # values, restricted to customers having more than one order.
    per_customer = dlt.read("customer_orders").groupBy("CustomerID", "Name")
    order_counts = per_customer.agg(count("OrderID").alias("OrderCount"))
    repeat_customers = order_counts.filter(col("OrderCount") > 1)
    return repeat_customers

Name,Type
CustomerID,string
Name,string
OrderCount,bigint


In [0]:
#7: Average order value by city
@dlt.table(name="avg_order_by_city")
def avg_order_by_city():
    # One row per City with the mean TotalAmount of that city's orders.
    by_city = dlt.read("customer_orders").groupBy("City")
    mean_amount = avg("TotalAmount").alias("AvgOrderValue")
    return by_city.agg(mean_amount)


Name,Type
City,string
AvgOrderValue,double


In [0]:
#8: Sorted orders by OrderDate descending
@dlt.table(name="orders_sorted_desc")
def orders_sorted():
    # Joined customer/order rows ordered by OrderDate, descending.
    # NOTE(review): OrderDate is a string column here, so the ordering is
    # lexicographic; it matches chronological order only if the dates use a
    # sortable format such as yyyy-MM-dd. Confirm against the CSV.
    newest_first = col("OrderDate").desc()
    return dlt.read("customer_orders").orderBy(newest_first)

Name,Type
CustomerID,string
Name,string
City,string
Age,string
OrderID,string
Product,string
Quantity,string
Price,string
OrderDate,string
TotalAmount,double


In [0]:
#9: Write result as partitioned Parquet by City
@dlt.table(name="partitioned_orders", partition_cols=["City"])
def write_parquet_partitioned():
    # Publishes the joined customer/order rows as a table partitioned by City.
    #
    # partition_cols on the decorator is what makes the stored table physically
    # partitioned by City. repartition() by itself only redistributes rows
    # across Spark tasks and does not change the layout of the written table.
    # NOTE(review): if this table has already been materialized without
    # partitioning, a full refresh may be needed for the new layout to apply.
    joined = dlt.read("customer_orders")
    # Group each city's rows together before the write so that every City
    # partition is produced by as few tasks (and files) as possible.
    return joined.repartition("City")

Name,Type
CustomerID,string
Name,string
City,string
Age,string
OrderID,string
Product,string
Quantity,string
Price,string
OrderDate,string
TotalAmount,double
