In [1]:
# Google Colab Setup (Run this only in Colab)
!pip install pyspark==3.4.1
!pip install delta-spark==2.4.0


Collecting pyspark==3.4.1
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 4.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285391 sha256=bb6172fde0625f63f3767a1a89736db25419c86024f68a184f21cd130e243d38
  Stored in directory: /root/.cache/pip/wheels/e9/b4/d8/38accc42606f6675165423e9f0236f8e825f6b6b6048d6743e
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0
    Uninstalling pyspark-3.5.0:
      Successfully uninstalled pyspark-3.5.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [14]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import col, when, sum as _sum

# Session settings that register the Delta SQL extension and make the default
# catalog a Delta catalog.
delta_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("ECommercePipeline")
for conf_key, conf_value in delta_conf.items():
    builder = builder.config(conf_key, conf_value)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [8]:

# Locations of the three Delta tables written by this cell and read by the
# cells that follow.
orders_path = "orders_delta"
customers_path = "customers_delta"
products_path = "products_delta"

# Each CSV is read with its header row and inferred column types, then written
# out as a Delta table, replacing whatever is already stored at that path.
orders_df = spark.read.options(header=True, inferSchema=True).csv("orders.csv")
orders_df.write.save(orders_path, format="delta", mode="overwrite")

customers_df = spark.read.options(header=True, inferSchema=True).csv("customers.csv")
customers_df.write.save(customers_path, format="delta", mode="overwrite")

products_df = spark.read.options(header=True, inferSchema=True).csv("products.csv")
products_df.write.save(products_path, format="delta", mode="overwrite")


In [17]:

# Load the orders Delta table and expose it to Spark SQL under the name "orders".
orders_df = spark.read.load(orders_path, format="delta")
orders_df.createOrReplaceTempView("orders")

# Revenue per product (Quantity * Price), restricted to delivered orders.
delivered_revenue_sql = """
    SELECT ProductID, SUM(Quantity * Price) AS TotalRevenue
    FROM orders
    WHERE Status = 'Delivered'
    GROUP BY ProductID
"""
spark.sql(delivered_revenue_sql).show()


+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|       50000|
|    P1003|       30000|
+---------+------------+



In [18]:
# Fresh reads of both Delta tables for the per-region revenue breakdown.
customers = spark.read.load(customers_path, format="delta")
orders = spark.read.load(orders_path, format="delta")

# Revenue contributed by a single order row.
line_revenue = col("Quantity") * col("Price")

# Join orders to customers on CustomerID and keep delivered orders only.
delivered_with_customer = orders.join(customers, "CustomerID").filter(
    col("Status") == "Delivered"
)

# Sum the row-level revenue within each Region.
region_revenue = delivered_with_customer.groupBy("Region").agg(
    _sum(line_revenue).alias("RegionRevenue")
)
region_revenue.show()


+------+-------------+
|Region|RegionRevenue|
+------+-------------+
|  West|        30000|
| North|       125000|
+------+-------------+



In [19]:
from delta.tables import DeltaTable

# Handle on the orders Delta table; later cells reuse it for MERGE and VACUUM.
delta_orders = DeltaTable.forPath(spark, orders_path)

# Rows to rewrite: every order whose Status is still "Pending".
pending_orders = col("Status") == "Pending"

# String values in `set` are SQL expressions, so the new status is written as
# a quoted SQL string literal.
cancelled_status = {"Status": "'Cancelled'"}

delta_orders.update(condition=pending_orders, set=cancelled_status)


In [20]:
from pyspark.sql import Row

# One extra order row, flagged as returned, to be merged into the orders table.
returned_order = Row(
    OrderID=3006,
    CustomerID='C003',
    ProductID='P1003',
    Quantity=1,
    Price=30000,
    OrderDate='2024-05-06',
    Status='Returned',
)
new_return = spark.createDataFrame([returned_order])

# Insert-only merge keyed on OrderID: a source row is inserted only when no
# target row has the same OrderID; matching rows are left untouched.
(
    delta_orders.alias("target")
    .merge(new_return.alias("source"), "target.OrderID = source.OrderID")
    .whenNotMatchedInsertAll()
    .execute()
)


In [24]:
# Current state of the orders table, with every row that contains a null in
# any column removed.
orders_current = spark.read.format("delta").load(orders_path)
cleaned_orders = orders_current.dropna()

# Persist the cleaned rows as a separate Delta table, replacing earlier content.
cleaned_orders.write.save("cleaned_orders_delta", format="delta", mode="overwrite")


In [25]:
# Inputs: the product table and the cleaned orders table written earlier.
products = spark.read.load(products_path, format="delta")
cleaned_orders = spark.read.load("cleaned_orders_delta", format="delta")

# Attach product attributes to each order and compute its revenue.
# No Status filter is applied here, so orders of every status are counted.
orders_with_revenue = cleaned_orders.join(products, "ProductID").withColumn(
    "Revenue", col("Quantity") * col("Price")
)

# Total revenue per product Category.
category_revenue = orders_with_revenue.groupBy("Category").agg(
    _sum("Revenue").alias("TotalRevenue")
)
category_revenue.show()


+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|      285000|
|Accessories|       30000|
+-----------+------------+



In [26]:
# Time travel: read the orders table as of version 0 of its history and show it.
orders_v0 = spark.read.load(orders_path, format="delta", versionAsOf=0)
orders_v0.show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [27]:
# Snapshot of the orders table at version 0, kept under its original name so
# any later cell that refers to old_df keeps working.
old_df = spark.read.format("delta").option("versionAsOf", 0).load(orders_path)

# Roll the table back with Delta's dedicated restore API instead of re-reading
# version 0 and overwriting the table with it. The restore is recorded as its
# own entry in the table history and yields the same table content and schema
# without a read-then-overwrite of the path being read.
# restoreToVersion returns a DataFrame of restore metrics; binding it to a name
# keeps it out of the cell output.
restore_metrics = DeltaTable.forPath(spark, orders_path).restoreToVersion(0)


In [28]:
# WARNING: vacuuming with a zero-hour retention deletes every data file the
# current table version no longer references, so time travel to earlier
# versions stops working afterwards. Delta blocks such short retention windows
# unless its retention-duration safety check is switched off.
retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"

# Remember the current setting ("true" is Delta's default) so the override can
# be undone once the vacuum has run.
previous_retention_check = spark.conf.get(retention_check_key, "true")
spark.conf.set(retention_check_key, "false")
try:
    vacuum_result = delta_orders.vacuum(retentionHours=0)
finally:
    # Re-arm the safety check even if the vacuum fails, so the rest of the
    # session is protected against accidental short-retention vacuums.
    spark.conf.set(retention_check_key, previous_retention_check)

# Keep the vacuum result as the cell's displayed value.
vacuum_result


DataFrame[]

In [29]:
# Data-quality check on the current orders table: list rows that break any of
# the three rules below. An empty result means no violations were found.
orders = spark.read.load(orders_path, format="delta")

non_positive_quantity = col("Quantity") <= 0
non_positive_price = col("Price") <= 0
missing_order_date = col("OrderDate").isNull()

orders.where(non_positive_quantity | non_positive_price | missing_order_date).show()


+-------+----------+---------+--------+-----+---------+------+
|OrderID|CustomerID|ProductID|Quantity|Price|OrderDate|Status|
+-------+----------+---------+--------+-----+---------+------+
+-------+----------+---------+--------+-----+---------+------+



In [30]:
# Derived label: "Return" when Status is "Returned", "Normal" for everything else.
order_type_label = when(col("Status") == "Returned", "Return").otherwise("Normal")

# Add the label as OrderType and show it next to the fields it derives from.
labelled_orders = orders.withColumn("OrderType", order_type_label)
labelled_orders.select("OrderID", "Status", "OrderType").show()


+-------+---------+---------+
|OrderID|   Status|OrderType|
+-------+---------+---------+
|   3001|Delivered|   Normal|
|   3002| Returned|   Return|
|   3003|Delivered|   Normal|
|   3004|Delivered|   Normal|
|   3005|  Pending|   Normal|
+-------+---------+---------+

