In [0]:
# Load data (October sample)
# The first CSV row supplies the column names; column types are inferred
# from the data, which costs an additional pass over the file.
events = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dbfs:/Volumes/workspace/default/kaggle_volume/2019-Oct.csv")
)


In [0]:
# Print the column names and types of the loaded DataFrame.
events.printSchema()
# Row count; as the cell's last expression it is shown as the cell result.
# This is an action, so it triggers a full scan of the source data.
events.count()


root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



42448764

In [0]:
# Convert CSV to Delta format
# Target directory for the path-based Delta table; later cells reuse this name.
delta_path = "dbfs:/Volumes/workspace/default/kaggle_volume/delta/events"

# "overwrite" replaces whatever the path held before, so this cell can be
# re-run safely.
(
    events.write
    .mode("overwrite")
    .format("delta")
    .save(delta_path)
)

print("CSV converted to Delta format")


CSV converted to Delta format


In [0]:
# List the contents of the Delta directory written to `delta_path`
# (the _delta_log/ folder plus the Parquet part files) as a rendered table.
display(dbutils.fs.ls(delta_path))


path,name,size,modificationTime
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/_delta_log/,_delta_log/,0,1768196095657
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00000-9cb19ee3-b503-4afa-8800-fd1af25082d7.c000.snappy.parquet,part-00000-9cb19ee3-b503-4afa-8800-fd1af25082d7.c000.snappy.parquet,33620248,1768195025000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00001-0ed5d0e4-b61f-4025-80f1-f594832fa147.c000.snappy.parquet,part-00001-0ed5d0e4-b61f-4025-80f1-f594832fa147.c000.snappy.parquet,31245950,1768195026000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00002-7681003f-b539-4cd1-aefb-297e4ef14b5b.c000.snappy.parquet,part-00002-7681003f-b539-4cd1-aefb-297e4ef14b5b.c000.snappy.parquet,31464271,1768195025000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00003-b7f2827f-fead-4040-8357-15e95004f2b7.c000.snappy.parquet,part-00003-b7f2827f-fead-4040-8357-15e95004f2b7.c000.snappy.parquet,32185677,1768195025000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00004-0ee390fa-922d-4935-a56d-a5d9b80015ec.c000.snappy.parquet,part-00004-0ee390fa-922d-4935-a56d-a5d9b80015ec.c000.snappy.parquet,33289295,1768195025000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00005-b468f679-0cd7-4f73-990d-dbc3dd234e07.c000.snappy.parquet,part-00005-b468f679-0cd7-4f73-990d-dbc3dd234e07.c000.snappy.parquet,33123887,1768195029000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00006-f8f73220-3b5b-4fd3-8e18-e84807064b40.c000.snappy.parquet,part-00006-f8f73220-3b5b-4fd3-8e18-e84807064b40.c000.snappy.parquet,31359346,1768195029000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00007-7ae287eb-64f9-4281-a682-0d2ad1dd9eb5.c000.snappy.parquet,part-00007-7ae287eb-64f9-4281-a682-0d2ad1dd9eb5.c000.snappy.parquet,32543030,1768195030000
dbfs:/Volumes/workspace/default/kaggle_volume/delta/events/part-00008-f23f4b83-b1bb-4f3b-9c3c-e0de1724f7cf.c000.snappy.parquet,part-00008-f23f4b83-b1bb-4f3b-9c3c-e0de1724f7cf.c000.snappy.parquet,32601180,1768195030000


In [0]:
# Create Delta table using PySpark
# Registers the DataFrame as workspace.default.events_table; overwrite mode
# replaces any existing table of that name, so the cell is re-runnable.
events.write.saveAsTable(
    "workspace.default.events_table",
    format="delta",
    mode="overwrite",
)

print("Managed Delta table created")


Managed Delta table created


In [0]:
# Create Delta table using SQL
# CREATE OR REPLACE (instead of plain CREATE) keeps this cell idempotent:
# a plain CREATE TABLE fails with "table already exists" the second time the
# notebook is run, whereas the other write cells overwrite on re-run.
spark.sql("""
CREATE OR REPLACE TABLE workspace.default.events_delta
USING DELTA
AS
SELECT * FROM workspace.default.events_table
""")



DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:

%sql
-- Verify table creation: list the tables registered in the workspace.default
-- schema; events_table and events_delta are expected to be among them.
SHOW TABLES IN workspace.default;


database,tableName,isTemporary
default,events_delta,False
default,events_table,False
default,movies,False


In [0]:
# Test Schema Enforcement (Try inserting wrong schema)
# Appending a DataFrame whose columns (x, y, z) do not match the Delta table
# at `delta_path` must be rejected by Delta.
from pyspark.sql import Row
from pyspark.sql.utils import AnalysisException

# Build the mismatching DataFrame outside the try block so that a failure
# here is not mistaken for a schema-enforcement rejection.
wrong_schema_df = spark.createDataFrame(
    [Row(x="a", y="b", z="c")]
)

try:
    wrong_schema_df.write.format("delta").mode("append").save(delta_path)
except AnalysisException as e:
    # Catch only the analysis-time rejection; unrelated failures (I/O,
    # permissions, cluster errors) propagate instead of being reported as
    # a successful enforcement check.
    print("Schema enforcement working!")
    print(e)
else:
    # The write went through: enforcement did NOT happen and the table now
    # contains rows with the wrong shape. Fail loudly rather than pass silently.
    raise RuntimeError(
        "Schema enforcement failed: mismatching DataFrame was appended to "
        f"{delta_path}"
    )


Schema enforcement working!
[_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 3a9dd800-b948-4845-98ac-6969039c7490).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- event_time: timestamp (nullable = true)
-- event_type: string (nullable = true)
-- product_id: integer (nullable = true)
-- category_id: long (nullable = true)
-- category_code: string (nullable = true)
-- brand: string (nullable = true)
-- price: double (nullable = true)
-- user_id: integer (nullable = true)
-- user_session: string (nullable = true)


Data schema:
root
-- x: string (nullable = true)
-- y: string (nullable = true)
-- z: string (nullable = true)

         
Table ACLs are enabled in this cluster, so 

In [0]:
# Handle Duplicate Inserts (Problem Example)
# Appends the same `events` DataFrame that was already written to `delta_path`,
# so every row is now stored twice. This is the problem being demonstrated.
# Note: append mode is not idempotent; each re-run of this cell adds another copy.
events.write.format("delta").mode("append").save(delta_path)


In [0]:
# Remove duplicates
# Two rows count as duplicates when they share user, timestamp and product.
# NOTE(review): event_type and user_session are not part of the key, so
# distinct events for the same user/product/second collapse into one row —
# confirm this is intended.
deduped_events = events.dropDuplicates(
    ["user_id", "event_time", "product_id"]
)

# Overwrite, not append: the table at `delta_path` already holds the earlier
# copies of `events`, so appending the deduplicated rows would add yet another
# batch instead of removing anything. Overwriting replaces the table contents
# with exactly the deduplicated rows and keeps the cell idempotent.
deduped_events.write.format("delta").mode("overwrite").save(delta_path)

print("Duplicates removed correctly")


Duplicates removed correctly
