# Day 4: Delta Lake Introduction

In [0]:
%sql
-- Make `workspace` the current catalog so unqualified names below resolve inside it
USE CATALOG workspace;
-- Create the `ecommerce` schema in the current catalog; a no-op if it already exists
CREATE SCHEMA IF NOT EXISTS ecommerce;

-- Create the `delta_storage` Volume (fully qualified name) where the path-based
-- Delta files will live; a no-op if it already exists
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta_storage;

In [0]:
# Source file: the raw October 2019 event export stored as CSV in a Volume
path_oct = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

# The first line of the file is a header; column types are inferred from the data
df = spark.read.options(header=True, inferSchema=True).csv(path_oct)

# Destination directory for the path-based Delta copy (inside the delta_storage Volume)
delta_volume_path = "/Volumes/workspace/ecommerce/delta_storage/ecommerce_events_files"

# 1) Path-based copy: write the DataFrame as Delta files, replacing any previous content
df.write.mode("overwrite").format("delta").save(delta_volume_path)

# 2) Catalog copy: register the same data as a table that SQL cells can query by name.
#    A fresh `df.write` is used on purpose so the path from step 1 is not carried over.
df.write.saveAsTable(
    "workspace.ecommerce.oct_events_delta",
    format="delta",
    mode="overwrite",
)

print("Successfully converted CSV to Delta and registered the table!")

Successfully converted CSV to Delta and registered the table!


In [0]:
from pyspark.errors import AnalysisException
from pyspark.sql import Row

# Build a single record that carries an extra column, 'coupon_code', which the
# target table was not created with.
bad_data = [
    Row(
        event_time="2019-10-01",
        event_type="view",
        product_id=123,
        category_id=456,
        category_code="electronics",
        brand="apple",
        price=999.0,
        user_id=789,
        user_session="abc",
        coupon_code="DISCOUNT10",
    )
]

bad_df = spark.createDataFrame(bad_data)

try:
    # Expected to be rejected: the appended schema does not match the table's schema
    bad_df.write.format("delta").mode("append").saveAsTable("workspace.ecommerce.oct_events_delta")
except AnalysisException as e:
    # Only an analysis-time rejection counts as a pass. Anything else (lost
    # session, permissions, I/O failure, ...) propagates instead of being
    # reported as a successful test.
    print("-" * 30)
    print("✅ SCHEMA ENFORCEMENT TEST SUCCESSFUL")
    print("The system blocked the write as expected. Error caught!")
    # Show the first line of the error so the reader can confirm *why* it was blocked
    print(f"Reason: {str(e).strip().split(chr(10), 1)[0]}")
    print("-" * 30)
else:
    # Reaching this branch means saveAsTable returned normally, i.e. the row was appended
    print("-" * 30)
    print("❌ SCHEMA ENFORCEMENT TEST FAILED")
    print("The write was NOT blocked: the mismatched row was appended to the table.")
    print("-" * 30)

------------------------------
✅ SCHEMA ENFORCEMENT TEST SUCCESSFUL
The system blocked the write as expected. Error caught!
------------------------------


In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col

# 1. Access the Delta Table
deltaTable = DeltaTable.forName(spark, "workspace.ecommerce.oct_events_delta")

# Columns used to pair a source row with a target row
merge_keys = ["product_id", "event_time"]

# 2. Create a "Batch" of updates (simulating up to 5 rows with a price change)
# We use the existing data but modify the price to see the 'Update' in action.
# The batch is de-duplicated on the merge keys: MERGE aborts when more than one
# source row matches the same target row, and nothing guarantees that the
# sampled rows are unique on (product_id, event_time).
updates_df = (
    df.limit(5)
    .withColumn("price", col("price") + 5.0)
    .dropDuplicates(merge_keys)
)

# Join condition derived from the same key list, so the de-duplication and the
# match predicate cannot drift apart
merge_condition = " AND ".join(f"target.{key} = source.{key}" for key in merge_keys)

# 3. Perform the Merge logic: update rows that match, insert rows that do not
(deltaTable.alias("target")
  .merge(updates_df.alias("source"), merge_condition)
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute())

print("MERGE (Upsert) operation completed successfully!")

MERGE (Upsert) operation completed successfully!


In [0]:
%sql
-- Show the table's transaction log: one row per committed version, newest first,
-- with the operation that produced it and its metrics
DESCRIBE HISTORY workspace.ecommerce.oct_events_delta;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2026-01-12T18:25:39.000Z,76058570651149,gomesrohit92@gmail.com,MERGE,"Map(predicate -> [""((product_id#14436 = product_id#13850) AND (event_time#14434 = event_time#13848))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(2217144857195530),0112-181520-515by3hp-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 3034, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 5, executionTimeMs -> 7225, materializeSourceTimeMs -> 909, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 3013, numTargetRowsUpdated -> 5, numOutputRows -> 5, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 5, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 3100)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-12T18:25:27.000Z,76058570651149,gomesrohit92@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2217144857195530),0112-181520-515by3hp-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 43, numRemovedFiles -> 43, numRemovedBytes -> 1405244778, numDeletionVectorsRemoved -> 0, numOutputRows -> 42448764, numOutputBytes -> 1405244778)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-12T18:21:41.000Z,76058570651149,gomesrohit92@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2217144857195530),0112-181520-515by3hp-v2n,,WriteSerializable,True,"Map(numFiles -> 43, numOutputRows -> 42448764, numOutputBytes -> 1405244778)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
