# Delta CRUD
* Delta comes pre-installed on managed platform like Databricks, otherwise follow the instructions to install it
### Pre-requisites
* https://docs.delta.io/latest/quick-start.html

### Quickstart

In [0]:
#dbutils is a databricks utility function 
dbutils.fs.rm('/tmp/ch3', True)

In [0]:
data = spark.range(0, 5)
data.write.format("delta").mode("overwrite").save("/tmp/ch3/delta-table")

df = spark.read.format("delta").load("/tmp/ch3/delta-table")
df.show()

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "/tmp/ch3/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge( newData.alias("newData"), "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show(5)

In [0]:
df = spark.read.format("delta").option("versionAsOf", 0).load("/tmp/ch3/delta-table")
df.show()

In [0]:
streamingDf = spark.readStream.format("rate").load()
stream = streamingDf.selectExpr("value as id").writeStream.format("delta").option("checkpointLocation", "/tmp/ch3/checkpoint").start("/tmp/ch3/delta-table")

In [0]:
stream2 = spark.readStream.format("delta").load("/tmp/ch3/delta-table").writeStream.format("console").start()

## Create a Delta Table

In [0]:
display(dbutils.fs.ls('/databricks-datasets/learning-spark-v2'))

path,name,size,modificationTime
dbfs:/databricks-datasets/learning-spark-v2/.DS_Store,.DS_Store,6148,1602174521000
dbfs:/databricks-datasets/learning-spark-v2/README/,README/,0,1655562813466
dbfs:/databricks-datasets/learning-spark-v2/SPARK_README.md,SPARK_README.md,3359,1577997727000
dbfs:/databricks-datasets/learning-spark-v2/blogs.json,blogs.json,915,1587070414000
dbfs:/databricks-datasets/learning-spark-v2/cctvVideos/,cctvVideos/,0,1655562813466
dbfs:/databricks-datasets/learning-spark-v2/flights/,flights/,0,1655562813466
dbfs:/databricks-datasets/learning-spark-v2/iot-devices/,iot-devices/,0,1655562813466
dbfs:/databricks-datasets/learning-spark-v2/loans/,loans/,0,1655562813466
dbfs:/databricks-datasets/learning-spark-v2/mnm_dataset.csv,mnm_dataset.csv,1284872,1587070414000
dbfs:/databricks-datasets/learning-spark-v2/people/,people/,0,1655562813466


In [0]:
dataPath = '/databricks-datasets/learning-spark-v2/people/people-10m.delta'
# dbutils.fs.ls(dataPath)

In [0]:

spark.sql(""" 
    CREATE TABLE IF NOT EXISTS some_delta_table  
    USING delta  
    OPTIONS (path = '{}') 
  """.format(dataPath)) 

## Read from Delta

In [0]:
%sql
SELECT COUNT(*) FROM some_delta_table 

count(1)
10000000


In [0]:
%sql
SELECT COUNT(*) FROM delta.`/databricks-datasets/learning-spark-v2/people/people-10m.delta`

count(1)
10000000


## Write to Delta

In [0]:
df.write.format("delta").saveAsTable("some_other_delta_table") 

## Update Delta Table

## Delete from Delta Table

## Convert to Delta

### From CSV

In [0]:
%fs head '/databricks-datasets/learning-spark-v2/mnm_dataset.csv'

In [0]:
csvDataPath = '/tmp/chapter3/dataset.csv'

deltaDataPath = '/tmp/chapter3/csvToDelta'
parquetDataPath = '/tmp/chapter3/csvToParquet'

In [0]:
columns = ["State","Color", "Count"]
data = [("TX","Red",20), ("NV","Blue",66), ("CO","Blue",79),("CA","Yellow",53), ("WY","Orange",45)]

color_df = spark.sparkContext.parallelize(data).toDF(columns)
color_df.printSchema()

(color_df.write.option("header",True)
        .option("delimiter",",")
        .mode("overwrite")
        .csv(csvDataPath))

In [0]:

(spark.read  
  .option('header', True)
  .option('inferSchema', True)
  .csv(csvDataPath)
  .write 
  .mode("overwrite") 
  .format("delta") 
  .partitionBy("Color") 
  .save(deltaDataPath) ) 

In [0]:
spark.sql("SELECT count(*) FROM delta.`{}`".format(deltaDataPath)).show()

In [0]:
%sql
SHOW PARTITIONS delta.`/tmp/chapter3/csvToDelta`

Color
Orange
Blue
Yellow
Red


### From Parquet
<p><b>Step 1:</b> Convert Parquet to Delta  <br>
CONVERT TO DELTA <parquet path or table> <br>
[PARTITIONED BY (<partition columns if any>)] <br>

  <p><b>Step 2: </b>Optimize layout for fast queries: in addition to above conversion <br>
OPTIMIZE <delta_table> <br>
WHERE date >= current_timestamp() - INTERVAL 1 day <br>
ZORDER BY (<z order column/s>) <br>

In [0]:
%fs ls '/tmp/chapter3/csvToParquet'

path,name,size,modificationTime
dbfs:/tmp/chapter3/csvToParquet/Color=Blue/,Color=Blue/,0,1655566595388
dbfs:/tmp/chapter3/csvToParquet/Color=Orange/,Color=Orange/,0,1655566595388
dbfs:/tmp/chapter3/csvToParquet/Color=Red/,Color=Red/,0,1655566595388
dbfs:/tmp/chapter3/csvToParquet/Color=Yellow/,Color=Yellow/,0,1655566595388
dbfs:/tmp/chapter3/csvToParquet/_SUCCESS,_SUCCESS,0,1655566567000
dbfs:/tmp/chapter3/csvToParquet/_committed_721177383267740285,_committed_721177383267740285,35,1655566345000


In [0]:
parquet_df = (spark.read  
  .option('header', True)
  .option('inferSchema', True)
  .csv(csvDataPath)
  .write 
  .mode("overwrite") 
  .format("parquet") 
  .partitionBy("Color") 
  .save(parquetDataPath) ) 

In [0]:
spark.sql(""" 
    CREATE TABLE IF NOT EXISTS some_parquet_table  
    USING parquet  
    OPTIONS (path = '{}') 
  """.format(parquetDataPath)) 

In [0]:
%sql
select * from some_parquet_table

State,Count,Color


In [0]:
%sql
CONVERT TO DELTA some_parquet_table PARTITIONED BY (Color)

In [0]:
%sql
OPTIMIZE color_delta_table 
WHERE date >= current_timestamp() - INTERVAL 1 day 
ZORDER BY (State) 