In [0]:
# Copy orders.csv from the shared workspace folder into DBFS FileStore.
dbutils.fs.cp(
    "file:/Workspace/Shared/orders.csv",
    "dbfs:/FileStore/orders.csv",
)

True

In [0]:
# Load the staged CSV (header row present, column types inferred) ...
orders_df = spark.read.load(
    "dbfs:/FileStore/orders.csv",
    format="csv",
    header="true",
    inferSchema="true",
)
# ... and persist it as a Delta table, replacing whatever is already at that path.
orders_df.write.save("/delta/orders_converted", format="delta", mode="overwrite")

In [0]:
from pyspark.sql.functions import col

# Load the source orders CSV: the first row is the header and column types are inferred.
df_orders = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("dbfs:/FileStore/orders.csv")
)

In [0]:
# Derive TotalAmount = Quantity * Price for every order.
df_transformed = df_orders.withColumn(
    "TotalAmount",
    col("Quantity") * col("Price"),
)

# Keep only orders with more than one unit.
df_filtered = df_transformed.where(col("Quantity") > 1)

# Overwrite the Delta table with the filtered rows; mergeSchema lets the new
# TotalAmount column be added to the existing table schema.
(
    df_filtered.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save("/delta/orders_converted")
)

In [0]:
# Import only the name this cell uses instead of a wildcard import, so it is
# clear where DeltaTable comes from and the notebook namespace is not polluted.
from delta.tables import DeltaTable

# Handle on the Delta table stored at /delta/orders_converted.
delta_table = DeltaTable.forPath(spark, "/delta/orders_converted")

# Upsert df_filtered into the table keyed on OrderID:
# matching rows have all columns updated, non-matching rows are inserted.
(
    delta_table.alias("tgt")
    .merge(df_filtered.alias("src"), "tgt.OrderID = src.OrderID")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)


In [0]:
%sql
-- Ingest orders.csv with Auto Loader. cloud_files() is a streaming source, so the
-- target must be declared as a STREAMING live table rather than a plain live table.
-- NOTE(review): 'inferSchema' is passed through unchanged; confirm Auto Loader honours it
-- for CSV (its own type-inference option is cloudFiles.inferColumnTypes).
CREATE OR REFRESH STREAMING LIVE TABLE orders_raw
AS SELECT * FROM cloud_files(
  'dbfs:/FileStore/orders.csv',
  'csv',
  map('header', 'true', 'inferSchema', 'true')
);

In [0]:
%sql
-- Live table of multi-unit orders (Quantity > 1) from LIVE.orders_raw,
-- with TotalAmount derived as Quantity * Price.
CREATE OR REFRESH LIVE TABLE orders_transformed AS
SELECT OrderID, OrderDate, CustomerID, Product, Quantity, Price,
       Quantity * Price AS TotalAmount
  FROM LIVE.orders_raw
 WHERE Quantity > 1;

In [0]:
%sql
-- As written: declares live table orders_incremental whose body is a MERGE that upserts
-- LIVE.orders_transformed rows into orders_final, matched on OrderID.
-- NOTE(review): the AS clause of CREATE ... LIVE TABLE normally takes a query (SELECT);
-- MERGE INTO is a DML statement, so confirm this statement actually parses and runs.
-- NOTE(review): orders_final is not created anywhere in the visible notebook — verify it exists.

CREATE OR REFRESH LIVE TABLE orders_incremental
AS
MERGE INTO orders_final tgt
USING LIVE.orders_transformed src
ON tgt.OrderID = src.OrderID
-- Existing order: overwrite every column from the source row.
WHEN MATCHED THEN 
  UPDATE SET *
-- New order: insert the source row as-is.
WHEN NOT MATCHED THEN 
  INSERT *;

In [0]:
%sql
-- Show every row currently in the orders_final Delta table.
SELECT *
  FROM orders_final;


In [0]:
# Load the Delta table stored at /delta/orders_converted and print its rows.
df_orders = spark.read.load("/delta/orders_converted", format="delta")
df_orders.show()

+-------+----------+----------+-------+--------+-----+-----------+
|OrderID| OrderDate|CustomerID|Product|Quantity|Price|TotalAmount|
+-------+----------+----------+-------+--------+-----+-----------+
|    101|2024-01-01|      C001| Laptop|       2| 1000|       2000|
|    103|2024-01-03|      C003| Tablet|       3|  300|        900|
|    105|2024-01-05|      C005|  Mouse|       5|   20|        100|
+-------+----------+----------+-------+--------+-----+-----------+



In [0]:
%sql
-- Increase the price of laptops by 10% and keep the derived TotalAmount consistent
-- with the new price (previously TotalAmount was left at the old Quantity * Price).
-- Right-hand sides of SET are evaluated against the pre-update row, so TotalAmount
-- is computed from the old Price multiplied by the same 1.10 factor.
UPDATE orders_final
SET Price = Price * 1.10,
    TotalAmount = Quantity * Price * 1.10
WHERE Product = 'Laptop';


In [0]:
%sql
-- Remove orders with fewer than two units from orders_final.
DELETE FROM orders_final WHERE Quantity < 2;


In [0]:
%sql
-- Append a single new order (ID 106, Keyboard) to orders_final.
INSERT INTO orders_final
  (OrderID, OrderDate, CustomerID, Product, Quantity, Price, TotalAmount)
VALUES
  (106, '2024-01-06', 'C006', 'Keyboard', 3, 100, 300);

In [0]:
%sql
-- Stage two incoming order rows (IDs 101 and 106) as a temporary view.
CREATE OR REPLACE TEMP VIEW updated_orders AS
SELECT *
  FROM VALUES
         (101, '2024-01-10', 'C001', 'Laptop', 2, 1200),
         (106, '2024-01-12', 'C006', 'Keyboard', 3, 50)
       AS v(OrderID, OrderDate, CustomerID, Product, Quantity, Price);

-- Upsert the staged rows into orders_final, matched on OrderID.
-- Matched rows: refresh Quantity and Price and recompute TotalAmount.
-- Unmatched rows: insert the full row with TotalAmount = Quantity * Price.
MERGE INTO orders_final dst
USING updated_orders upd
   ON dst.OrderID = upd.OrderID
WHEN MATCHED THEN UPDATE SET
     dst.Quantity    = upd.Quantity,
     dst.Price       = upd.Price,
     dst.TotalAmount = upd.Quantity * upd.Price
WHEN NOT MATCHED THEN INSERT
     (OrderID, OrderDate, CustomerID, Product, Quantity, Price, TotalAmount)
     VALUES
     (upd.OrderID, upd.OrderDate, upd.CustomerID, upd.Product, upd.Quantity, upd.Price, upd.Quantity * upd.Price);



In [0]:
# Show the commit history of orders_final. The query result must be rendered
# explicitly: a bare spark.sql(...) only builds a DataFrame, and the trailing
# semicolon previously suppressed even that, so no history was ever displayed.
display(spark.sql("DESCRIBE HISTORY orders_final"))


In [0]:
%sql
-- Time travel: query orders_final as it was at table version 1.
SELECT *
  FROM orders_final VERSION AS OF 1;

In [0]:
%sql
-- Compact orders_final and Z-order its data files by the Product column.
OPTIMIZE orders_final ZORDER BY (Product);

In [0]:
# Copy orders.parquet from the shared workspace folder into DBFS FileStore.
dbutils.fs.cp(
    "file:/Workspace/Shared/orders.parquet",
    "dbfs:/FileStore/orders.parquet",
)

True

In [0]:
# Round-trip the orders data CSV -> Parquet -> Delta, then display the Delta table.
#
# inferSchema is enabled here, as in the other CSV reads in this notebook; without it
# every column (including Quantity and Price) is written to Parquet/Delta as a string.
csv_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/FileStore/orders.csv")
)

# NOTE(review): this overwrites dbfs:/FileStore/orders.parquet, the same path the
# preceding cell copied orders.parquet to — confirm that clobbering it is intended.
csv_df.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/orders.parquet")
parquet_df = spark.read.format("parquet").load("dbfs:/FileStore/orders.parquet")

# overwriteSchema lets the overwrite succeed even if an earlier run already created
# the table at this path with all-string columns.
(
    parquet_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/parquet_files")
)

delta_df = spark.read.format("delta").load("/delta/parquet_files")
display(delta_df)

OrderID,OrderDate,CustomerID,Product,Quantity,Price
101,2024-01-01,C001,Laptop,2,1000
102,2024-01-02,C002,Phone,1,500
103,2024-01-03,C003,Tablet,3,300
104,2024-01-04,C004,Monitor,1,150
105,2024-01-05,C005,Mouse,5,20
