**Convert CSV to Delta format**

In [0]:
# Load the November 2019 events CSV: the first row supplies column names and
# Spark infers each column's type from the data.
nov_events = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

**Create a volume (Unity Catalog)**

In [0]:
%sql
-- Create the `delta` volume in the workspace.ecommerce schema.
-- IF NOT EXISTS keeps the cell safe to re-run when the volume is already there.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta;

In [0]:
%sql
-- List every volume registered in the workspace.ecommerce schema.
SHOW VOLUMES IN workspace.ecommerce;

database,volume_name
ecommerce,delta
ecommerce,ecommerce_data


In [0]:
# Persist the CSV-backed DataFrame as Delta files inside the `delta` volume,
# replacing whatever a previous run left at that path.
nov_events.write.save(
    "/Volumes/workspace/ecommerce/delta/nov_events",
    format="delta",
    mode="overwrite",
)

**Create Delta table with PySpark and SQL**

In [0]:
# Read the Delta copy of the November events back from the volume.
delta_nov_df = spark.read.load(
    "/Volumes/workspace/ecommerce/delta/nov_events",
    format="delta",
)


**Handle duplicate records**

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import count
# Count rows per (user_id, product_id, event_time) key and keep only the keys
# that occur more than once.
duplicates_df = (
    delta_nov_df
    .groupBy("user_id", "product_id", "event_time")
    .agg(count("*").alias("count"))
    .where(F.col("count") > 1)
)
display(duplicates_df)

user_id,product_id,event_time,count
553777624,1002532,2019-11-17T09:33:40.000Z,2
514680784,1307441,2019-11-17T09:56:52.000Z,2
518978534,2701422,2019-11-17T10:23:34.000Z,2
512986118,15700008,2019-11-17T10:37:03.000Z,2
571951642,4804056,2019-11-17T06:33:33.000Z,2
541982865,1004723,2019-11-17T06:54:25.000Z,2
513502444,28722186,2019-11-17T08:09:08.000Z,2
513470895,1005115,2019-11-17T15:15:43.000Z,2
514983980,3600913,2019-11-17T15:21:36.000Z,2
554071525,26403559,2019-11-17T15:22:09.000Z,2


In [0]:
# Keep a single row for each (user_id, product_id, event_time) key.
# NOTE(review): rows that share this key but differ in other columns
# (e.g. event_type) also collapse to one row — confirm that is intended.
clean_df = delta_nov_df.drop_duplicates(
    subset=["user_id", "product_id", "event_time"]
)

# Store the deduplicated events as their own Delta dataset, overwriting any
# earlier output at this path.
clean_df.write.save(
    "/Volumes/workspace/ecommerce/delta/clean_nov_events",
    format="delta",
    mode="overwrite",
)

display(clean_df)


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-17T08:43:05.000Z,view,13200487,2053013557192163841,furniture.bedroom.bed,,160.06,515597661,0caa7427-4eb1-4a3e-a3b4-91e75b793876
2019-11-17T08:43:23.000Z,view,1002524,2053013555631882655,electronics.smartphone,apple,561.66,516146286,e8d67333-1595-4db2-91e5-65ffb030a722
2019-11-17T08:43:25.000Z,view,43900090,2127425440925614628,,,59.18,563247534,e5c7f4f6-0266-41cc-9409-8f41b4e9b799
2019-11-17T08:43:27.000Z,view,1004838,2053013555631882655,electronics.smartphone,oppo,154.42,518949931,018ec00a-c9fd-47b0-bed5-d0b75f0c97d8
2019-11-17T08:43:28.000Z,view,3700823,2053013565983425517,appliances.environment.vacuum,bosch,240.65,512880583,b3846c82-b9dc-4c25-8058-fd7bff451cfe
2019-11-17T08:44:19.000Z,view,1005221,2053013555631882655,electronics.smartphone,xiaomi,287.78,513433383,71d014d5-d3f7-4489-8765-0523aa097d9e
2019-11-17T08:44:27.000Z,view,7100836,2053013555464110485,furniture.bedroom.bed,,64.33,513204517,a588ebee-5740-480a-a01e-4a348718626c
2019-11-17T08:44:28.000Z,view,17000129,2053013558391734853,computers.desktop,mebelson,125.85,548330867,3f4376cd-da88-4d0a-9bd3-ef8edffa8c9e
2019-11-17T08:44:36.000Z,view,1004834,2053013555631882655,electronics.smartphone,samsung,178.9,530439571,bc4779d5-d974-4423-a938-f3f188fe4b6a
2019-11-17T08:44:43.000Z,view,1004836,2053013555631882655,electronics.smartphone,samsung,244.02,572128116,d7d35309-3732-40e5-ac67-ca4e53ea1c0e


**SUMMARY**


▪️ **Loaded the raw dataset (CSV)**
 Started by exploring the schema and understanding the data before touching any transformations.
▪️ **Created a Volume (Unity Catalog)**
 Before converting CSV to Delta, I created a volume to manage storage properly —
 learning why volumes are preferred for organized and governed data storage.
▪️ **Converted CSV data into Delta format**
 Moved raw CSV data into Delta Lake, enabling:
 ✔ ACID transactions
 ✔ Schema enforcement
 ✔ Better reliability & performance
▪️ **Created Delta tables using PySpark & SQL**
 Practiced creating Delta tables using both approaches — just like real production pipelines, not just notebook-style code.
▪️ **Tested schema enforcement (intentionally broke things 😅)**
 Tried inserting a wrong data type into a Delta table.
 Delta Lake blocked the write — a small error, but a big learning moment about data quality protection.
▪️ **Handled duplicate records**
 Identified duplicate user_id + product_id + event_time combinations
 and created a clean, deduplicated Delta dataset.

✨ Big takeaway from Day 4
 Delta Lake doesn’t just store data — it protects data.
 Today felt less like practice and more like building something production-ready.
Slowly connecting the dots:
Raw data → governed storage → clean Delta tables → reliable analytics 📊