(A) Read data from Data Lake

In [0]:
%python

 #Create Schema for green line taxi

 from pyspark.sql.functions import *
 from pyspark.sql.types import *

 yellowTaxiSchema = (
   StructType()
   .add("RideId", "integer")
   .add("VendorId", "integer")
   .add("PickupTime", "timestamp")
   .add("DropTime", "timestamp")
   .add("PickupLocationId", "integer")
   .add("DropLocationId", "integer")
   .add("CabNumber", "string")
   .add("DriverLicenseNumber", "string")
   .add("PassengerCount", "integer")
   .add("TripDistance", "double")
   .add("RatecodeId", "integer")
   .add("PaymentType", "integer")
   .add("TotalAmount", "double")
   .add("FareAmount", "double")
   .add("Extra", "double")
   .add("MtaTax", "double")
   .add("TripAmount", "double")
   .add("TollsAmount", "double")
   .add("ImprovementSurcharge", "double")
 )

In [0]:
%python

yellowTaxisDF = (
    spark
    .read
    .option("header", "true")
    .schema(yellowTaxiSchema)
    .csv("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis/YellowTaxis1.csv")

)

yellowTaxisDF.count()

9999995

In [0]:
%python

display(yellowTaxisDF)

RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
1,1,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3
2,1,2022-03-01T00:00:00Z,2022-03-01T00:10:56Z,161,68,T489328C,5076150,1,1.1,1,2,9.8,8.5,0.5,0.5,0.0,0.0,0.3
3,1,2022-03-01T00:00:00Z,2022-03-01T00:11:20Z,141,170,T509308C,5067782,0,1.7,1,1,12.35,9.0,0.5,0.5,2.05,0.0,0.3
4,2,2022-03-01T00:00:00Z,2022-03-01T00:20:01Z,161,68,VG354,5012911,2,2.86,1,1,18.17,14.5,0.5,0.5,2.37,0.0,0.3
5,2,2022-03-01T00:00:00Z,2022-03-01T00:00:00Z,239,239,T517119C,429996,1,0.0,2,1,66.0,52.0,0.0,0.5,13.2,0.0,0.3
6,2,2022-03-01T00:00:00Z,2022-03-01T00:00:00Z,48,48,T523479C,469162,1,0.1,5,2,62.8,62.0,0.0,0.5,0.0,0.0,0.3
7,2,2022-03-01T00:00:00Z,2022-03-01T00:00:00Z,48,230,T526628C,370606,1,0.89,1,2,14.8,14.0,0.0,0.5,0.0,0.0,0.3
8,2,2022-03-01T00:00:00Z,2022-03-01T00:00:00Z,234,230,T489378C,460439,5,2.22,1,2,22.8,22.0,0.0,0.5,0.0,0.0,0.3
9,2,2022-03-01T00:00:00Z,2022-03-01T00:00:00Z,113,68,T602840C,460019,1,2.87,1,1,17.38,14.5,0.5,0.5,1.58,0.0,0.3
10,2,2022-03-01T00:00:01Z,2022-03-01T00:11:15Z,137,79,T515369C,493727,4,1.69,1,1,12.88,9.0,0.5,0.5,2.58,0.0,0.3


In [0]:
%python
# Writing without delta

(
  yellowTaxisDF
  .write
  .mode("overwrite")
  .partitionBy("VendorId")
  .format("parquet")
  .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.parquet")
)

In [0]:
%python
# Writing in delta format

(
  yellowTaxisDF
  .write
  .mode("overwrite")
  .partitionBy("VendorId")
  .format("delta")
  .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta")
)

Options to create Delta Tables

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS TaxisDB

In [0]:
%sql
-- Create table based on Parquet data

CREATE TABLE IF NOT EXISTS TaxisDB.YellowTaxisParquet
USING PARQUET
LOCATION "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.parquet"

In [0]:
%sql
-- Create table based on Delta data

CREATE TABLE IF NOT EXISTS TaxisDB.YellowTaxisDelta
USING DELTA
LOCATION "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta"

In [0]:
%sql
Select count(*) as A
from TaxisDB.YellowTaxisParquet
union all
Select count(*) as B 
from TaxisDB.YellowTaxisDelta

A
9999995
9999995


In [0]:
%sql
-- Describe table
DESCRIBE TABLE EXTENDED TaxisDB.YellowTaxisParquet

-- Same as FORMATTED TABLE command, shows schema, partition column, other metadata (storage loc, provider = delta)

col_name,data_type,comment
RideId,int,
PickupTime,timestamp,
DropTime,timestamp,
PickupLocationId,int,
DropLocationId,int,
CabNumber,string,
DriverLicenseNumber,string,
PassengerCount,int,
TripDistance,double,
RatecodeId,int,


In [0]:
%sql
-- Audit history for delta table

DESCRIBE HISTORY TaxisDB.YellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-05-12T04:35:27Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,False,"Map(numFiles -> 37, numOutputRows -> 9999995, numOutputBytes -> 247650820)",,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
-- Drop Delta table
DROP TABLE TaxisDB.YellowTaxisDelta

In [0]:
%python
# Remove files used for Delta table
dbutils.fs.rm("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta", True)

True

In [0]:
%python
# Save dataframe as Delta file and Delta table together

(
    yellowTaxisDF
    .write
    .mode("overwrite")
    .partitionBy("vendorId")
    .format("delta")
    .option("path", "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta")
    .saveAsTable("TaxisDB.YellowTaxisDelta")
)

In [0]:
%sql
-- Audit history for delta table again (previous was write operation this is create or replace operation)

DESCRIBE HISTORY TaxisDB.YellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-05-12T05:13:40Z,593534500119395,shaurya.rawat@ukg.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""VendorId""], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,False,"Map(numFiles -> 23, numOutputRows -> 9999995, numOutputBytes -> 246502475)",,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%python
# Rerun the overwrite command 

(
    yellowTaxisDF
    .write
    .mode("overwrite")
    .partitionBy("vendorId")
    .format("delta")
    .option("path", "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta")
    .saveAsTable("TaxisDB.YellowTaxisDelta")
)

In [0]:
%sql
-- Audit history for delta table again 

DESCRIBE HISTORY TaxisDB.YellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2024-05-12T05:19:55Z,593534500119395,shaurya.rawat@ukg.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""VendorId""], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,False,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 248751037)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:13:40Z,593534500119395,shaurya.rawat@ukg.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""VendorId""], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,False,"Map(numFiles -> 23, numOutputRows -> 9999995, numOutputBytes -> 246502475)",,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
DROP TABLE TaxisDB.yellowTaxisDelta

In [0]:
%python
dbutils.fs.rm("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta", True)

True

In [0]:
%sql
-- Create Table definition and add data
-- Create table using DDL command 
-- Add data  
CREATE TABLE TaxisDB.yellowTaxisDelta (
  RideId INT COMMENT "This is the primary key column",
  VendorId INT,

  PickupTime TIMESTAMP,
  DropTime TIMESTAMP,

  PickupLocationId INT,
  DropLocationId INT,

  CabNumber STRING,
  DriverLicenseNumber STRING,

  PassengerCount INT,

  TripDistance DOUBLE,
  RatecodeId INT,
  PaymentType INT,
  
  TotalAmount DOUBLE,
  FareAmount DOUBLE,
  Extra DOUBLE,
  MtaTax DOUBLE,
  TripAmount DOUBLE,

  TollsAmount DOUBLE,
  ImprovementSurcharge DOUBLE,

  PickUpYear INT GENERATED ALWAYS AS (YEAR(PickupTime)) COMMENT 'Auto generated year from PickUpTime',
  PickUpMonth INT GENERATED ALWAYS AS (MONTH(PickupTime)) COMMENT 'Auto generated month from PickUpTime',
  PickUpDay INT GENERATED ALWAYS AS (DAY(PickupTime)) COMMENT 'Auto generated day from PickUpTime'
)
USING DELTA -- Default is delta
LOCATION "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta"
PARTITIONED BY (VendorId) -- optional
COMMENT  "This table stores rides information for yellow taxi"

In [0]:
%sql
DESCRIBE TABLE EXTENDED taxisDB.yellowTaxisDelta

col_name,data_type,comment
RideId,int,This is the primary key column
VendorId,int,
PickupTime,timestamp,
DropTime,timestamp,
PickupLocationId,int,
DropLocationId,int,
CabNumber,string,
DriverLicenseNumber,string,
PassengerCount,int,
TripDistance,double,


Options to insert data
1. Insert command (SQL)
2. Append dataframe (Pyspark, scala)
3. Copy command (SQL)

In [0]:
%sql
-- SQL Insert
INSERT INTO taxisDB.yellowTaxisDelta 
(
  RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RateCodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
)
VALUES
(
  '1','1','2022-03-01T00:00:00.000Z','2022-03-01T00:15:34.000Z','170','140','TAC399','5131685','1','2.9','1','1','15.3','13','0.5','0.5','1','0','0.3'
)

num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql
Select * from taxisDB.yellowTaxisDelta

RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge,PickUpYear,PickUpMonth,PickUpDay
1,1,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3,2022,3,1


In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%python
# Append a dataframe 

yellowTaxisAppendDF = (
    spark
    .read 
    .option("header", "true")
    .schema(yellowTaxiSchema)
    .csv("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis_Additional/YellowTaxis_append.csv")
)
display(yellowTaxisAppendDF)

RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
9999997,3,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3
9999998,3,2022-03-01T00:00:00Z,2022-03-01T00:10:56Z,161,68,T489328C,5076150,1,1.1,1,2,9.8,8.5,0.5,0.5,0.0,0.0,0.3
9999999,3,2022-03-01T00:00:00Z,2022-03-01T00:11:20Z,141,170,T509308C,5067782,0,1.7,1,1,12.35,9.0,0.5,0.5,2.05,0.0,0.3


In [0]:
%python
# Append to deltalake in delta format
(
    yellowTaxisAppendDF
    .write 
    .mode("append")
    .partitionBy("VendorId")
    .format("delta")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta")
)

In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
Select * from taxisDB.yellowTaxisDelta

RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge,PickUpYear,PickUpMonth,PickUpDay
1,1,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3,2022,3,1
9999997,3,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3,2022,3,1
9999998,3,2022-03-01T00:00:00Z,2022-03-01T00:10:56Z,161,68,T489328C,5076150,1,1.1,1,2,9.8,8.5,0.5,0.5,0.0,0.0,0.3,2022,3,1
9999999,3,2022-03-01T00:00:00Z,2022-03-01T00:11:20Z,141,170,T509308C,5067782,0,1.7,1,1,12.35,9.0,0.5,0.5,2.05,0.0,0.3,2022,3,1


In [0]:
%sql
-- Copy command 
COPY INTO taxisDB.yellowTaxisDelta
FROM "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis/YellowTaxis1.csv"
-- If folder is provided in FROM clause, files can be provided eg: FILES = ('f1.csv', 'f2.csv')
FILEFORMAT = CSV
VALIDATE ALL 
FORMAT_OPTIONS ('header' = 'true')
-- It failes as csv has string data and table has int and double. Validation fails

In [0]:
%sql
-- Write Copy Command
COPY INTO taxisDB.yellowTaxisDelta
FROM (
  SELECT RideId::Int,
  VendorId::Int,
  PickupTime::Timestamp,
  DropTime::Timestamp,
  PickupLocationId::Int,
  DropLocationId::Int,
  CabNumber::String,
  DriverLicenseNumber::String,
  PassengerCount::Int,
  TripDistance::Double,
  RateCodeId::Int as RatecodeId,
  PaymentType::Int,
  TotalAmount::Double,
  FareAmount::Double,
  Extra::Double,
  MtaTax::Double,
  TipAmount::Double as TripAmount,
  TollsAmount::Double,
  ImprovementSurcharge::Double
  FROM "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis/YellowTaxis1.csv"
)
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
0,0,0


In [0]:
%sql
-- Rerun again - No data inserted
-- Copy command keeps track of loaded records
COPY INTO taxisDB.yellowTaxisDelta
FROM (
  SELECT RideId::Int,
  VendorId::Int,
  PickupTime::Timestamp,
  DropTime::Timestamp,
  PickupLocationId::Int,
  DropLocationId::Int,
  CabNumber::String,
  DriverLicenseNumber::String,
  PassengerCount::Int,
  TripDistance::Double,
  RateCodeId::Int as RatecodeId,
  PaymentType::Int,
  TotalAmount::Double,
  FareAmount::Double,
  Extra::Double,
  MtaTax::Double,
  TipAmount::Double as TripAmount,
  TollsAmount::Double,
  ImprovementSurcharge::Double
  FROM "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis/YellowTaxis1.csv"
)
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
0,0,0


In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2024-05-12T06:31:53Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,2.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247363347, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
-- Copy command with force option (duplicates the data)
COPY INTO taxisDB.yellowTaxisDelta
FROM (
  SELECT RideId::Int,
  VendorId::Int,
  PickupTime::Timestamp,
  DropTime::Timestamp,
  PickupLocationId::Int,
  DropLocationId::Int,
  CabNumber::String,
  DriverLicenseNumber::String,
  PassengerCount::Int,
  TripDistance::Double,
  RateCodeId::Int as RatecodeId,
  PaymentType::Int,
  TotalAmount::Double,
  FareAmount::Double,
  Extra::Double,
  MtaTax::Double,
  TipAmount::Double as TripAmount,
  TollsAmount::Double,
  ImprovementSurcharge::Double
  FROM "abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis/YellowTaxis1.csv"
)
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true')
COPY_OPTIONS ('force' = 'true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
9999995,9999995,0


In [0]:
%sql
DESCRIBE HISTORY TaxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2024-05-12T06:36:59Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,3.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247983488, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
3,2024-05-12T06:31:53Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,2.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247363347, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
-- Finding out the files names
Select _metadata.file_path, *
from TaxisDB.yellowTaxisDelta

file_path,RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge,PickUpYear,PickUpMonth,PickUpDay
abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta/VendorId=1/part-00000-19e288d6-9ee0-4e9d-9321-0e953c505362.c000.snappy.parquet,1,1,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3,2022,3,1
abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta/VendorId=3/part-00000-f8957112-b65b-43f0-8713-bd298ae6cf9a.c000.snappy.parquet,9999997,3,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,1,15.3,13.0,0.5,0.5,1.0,0.0,0.3,2022,3,1
abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta/VendorId=3/part-00000-f8957112-b65b-43f0-8713-bd298ae6cf9a.c000.snappy.parquet,9999998,3,2022-03-01T00:00:00Z,2022-03-01T00:10:56Z,161,68,T489328C,5076150,1,1.1,1,2,9.8,8.5,0.5,0.5,0.0,0.0,0.3,2022,3,1
abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/YellowTaxis.delta/VendorId=3/part-00000-f8957112-b65b-43f0-8713-bd298ae6cf9a.c000.snappy.parquet,9999999,3,2022-03-01T00:00:00Z,2022-03-01T00:11:20Z,141,170,T509308C,5067782,0,1.7,1,1,12.35,9.0,0.5,0.5,2.05,0.0,0.3,2022,3,1


In [0]:
%sql
update taxisDB.yellowTaxisDelta 
SET passengerCount = 2
where RideId = 9999997

num_affected_rows
1


In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta 

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
8,2024-05-12T07:00:21Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#27874 = 9999997)""])",,List(726288301620293),0906-134047-2tr7h7co,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5232, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1433, scanTimeMs -> 544, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5389, rewriteTimeMs -> 853)",,Databricks-Runtime/13.3.x-photon-scala2.12
7,2024-05-12T06:51:58Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,6.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
6,2024-05-12T06:51:37Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,5.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
5,2024-05-12T06:50:33Z,593534500119395,shaurya.rawat@ukg.com,TRUNCATE,Map(),,,,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 8, numDeletedRows -> 19999994, executionTimeMs -> 185, numRemovedBytes -> 495356843)",,Databricks-Runtime/15.1.x-photon-scala2.12
4,2024-05-12T06:36:59Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,3.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247983488, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
3,2024-05-12T06:31:53Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,2.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247363347, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
DELETE FROM taxisDB.yellowTaxisDelta
where RideId = 9999999

num_affected_rows
1


In [0]:
%sql
DESCRIBE HISTORY taxisDB.YellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
9,2024-05-12T07:07:27Z,593534500119395,shaurya.rawat@ukg.com,DELETE,"Map(predicate -> [""(RideId#29789 = 9999999)""])",,List(726288301620293),0906-134047-2tr7h7co,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5389, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1008, numDeletedRows -> 1, scanTimeMs -> 300, numAddedFiles -> 1, numAddedBytes -> 5300, rewriteTimeMs -> 708)",,Databricks-Runtime/13.3.x-photon-scala2.12
8,2024-05-12T07:00:21Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#27874 = 9999997)""])",,List(726288301620293),0906-134047-2tr7h7co,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5232, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1433, scanTimeMs -> 544, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5389, rewriteTimeMs -> 853)",,Databricks-Runtime/13.3.x-photon-scala2.12
7,2024-05-12T06:51:58Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,6.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
6,2024-05-12T06:51:37Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,5.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
5,2024-05-12T06:50:33Z,593534500119395,shaurya.rawat@ukg.com,TRUNCATE,Map(),,,,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 8, numDeletedRows -> 19999994, executionTimeMs -> 185, numRemovedBytes -> 495356843)",,Databricks-Runtime/15.1.x-photon-scala2.12
4,2024-05-12T06:36:59Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,3.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247983488, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
3,2024-05-12T06:31:53Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,2.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247363347, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
1,2024-05-12T05:54:18Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
0,2024-05-12T05:43:06Z,593534500119395,shaurya.rawat@ukg.com,CREATE TABLE,"Map(partitionBy -> [""VendorId""], description -> This table stores rides information for yellow taxi, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(726288301620293),0906-134047-2tr7h7co,,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12


MERGE COMMAND

In [0]:
%python
# Create a new dataframe
# extract changed records from data lake
yellowTaxiChangesDF = (
    spark
    .read
    .option("header", "true")
    .schema(yellowTaxiSchema)
    .csv("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/YellowTaxis_Additional/YellowTaxis_changes.csv")
)
display(yellowTaxiChangesDF)


RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RatecodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
9999996,3,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,1,2.9,1,3,15.3,13.0,0.5,0.5,1.0,0.0,0.3
9999997,3,2022-03-01T00:00:00Z,2022-03-01T00:15:34Z,170,140,TAC399,5131685,2,2.9,1,3,15.3,13.0,0.5,0.5,1.0,0.0,0.3
9999998,3,2022-03-01T00:00:00Z,2022-03-01T00:10:56Z,161,68,T489328C,5076150,1,1.1,1,3,9.8,8.5,0.5,0.5,0.0,0.0,0.3
10000000,3,2022-03-01T00:00:00Z,2022-03-01T00:11:20Z,141,170,T509308C,5067782,0,1.7,1,1,12.35,9.0,0.5,0.5,2.05,0.0,0.3


In [0]:
%python
# create temporary view in sql
yellowTaxiChangesDF.createOrReplaceTempView("yellowTaxiChanges")


In [0]:
%sql
-- Merge Command
MERGE INTO TaxisDB.yellowTaxisDelta tgt 
USING yellowTaxiChanges src
  ON tgt.VendorId = src.VendorId 
  AND tgt.RideId = src.RideId

-- Update rows when join conditions match
WHEN MATCHED
  THEN
  UPDATE SET tgt.paymentType = src.paymentType

-- Insert rows if row is not present in target table
WHEN NOT MATCHED 
AND PickupTime >= '2022-03-01'
  THEN 
  INSERT(
    RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RateCodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
  )
  VALUES(
    RideId,VendorId,PickupTime,DropTime,PickupLocationId,DropLocationId,CabNumber,DriverLicenseNumber,PassengerCount,TripDistance,RateCodeId,PaymentType,TotalAmount,FareAmount,Extra,MtaTax,TripAmount,TollsAmount,ImprovementSurcharge
  )

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
4,2,0,2


Schema Enforcement and Evolution

Schema Enforcement:
1. At commit time, schema of new data is checked with delta table schema
2. Mismatch occurs when 
   - Source contains additional columns
   - data types are different
3. On schema mismatch:
   - No entry is added to transaction log
   - Exception is raised
   - Incorrect partition files are still present in folder

Schema Evolution
1. Delta lake allows adding more columns to the table by defining it explicitly
2. Merge command supports auto schema evolution

In [0]:
%python
# DataLake write operation will cause issues 
# read data with 2 columns
rateCodes1DF = (
    spark
    .read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/RateCodes/RateCodes1.csv")
)
# read data with 3 columns
rateCodes2DF = (
    spark
    .read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/raw/RateCodes/RateCodes2.csv")
)
# write in parquet format
(
    rateCodes1DF
    .write
    .mode("overwrite")
    .format("parquet")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.parquet")
)
# write append in parquet format
(
    rateCodes2DF
    .write
    .mode("append")
    .format("parquet")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.parquet")
)


In [0]:
%python
# Shows only 2 column in Output, depending on which one it reads first, which is an anamoly
rateCodesDF = (
  spark
  .read
  .format("parquet")
  .load("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.parquet")
)
display(rateCodesDF)

RateCodeId,RateCode,IsApproved
1,Standard rate,
2,JFK,
3,Newark,
4,Westchester,Yes
5,Negotiated fare,No
6,Group ride,Yes


In [0]:
%python
# DeltaLake format append in schema mismatch, not allowed
(
    rateCodes1DF
    .write
    .mode("append")
    .format("delta")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta")
)
(
    rateCodes2DF
    .write
    .mode("append")
    .format("delta")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta")
)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1044439974593582>, line 14[0m
[1;32m      1[0m [38;5;66;03m# DeltaLake format append in schema mismatch[39;00m
[1;32m      2[0m (
[1;32m      3[0m     rateCodes1DF
[1;32m      4[0m     [38;5;241m.[39mwrite
[0;32m   (...)[0m
[1;32m      7[0m     [38;5;241m.[39msave([38;5;124m"[39m[38;5;124mabfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta[39m[38;5;124m"[39m)
[1;32m      8[0m )
[1;32m      9[0m (
[1;32m     10[0m     rateCodes2DF
[1;32m     11[0m     [38;5;241m.[39mwrite
[1;32m     12[0m     [38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m)
[1;32m     13[0m     [38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)
[0;32m---> 14[0m     [38;5;241m.[39msave([38;5;

In [0]:
%python
# Schema Evolution: Merge Schema
(
    rateCodes1DF
    .write
    .mode("append")
    .format("delta")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta")
)
(
    rateCodes2DF
    .write
    .mode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .save("abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta")
)

In [0]:
%python
# Read the data
rateCodesDF = (
    spark 
    .read 
    .format('delta')
    .load('abfss://datalake@mue10dadls01.dfs.core.windows.net/ShauryaRawat/Output/RateCodes.delta')
)
display(rateCodesDF)

RateCodeId,RateCode,IsApproved
4,Westchester,Yes
5,Negotiated fare,No
6,Group ride,Yes
1,Standard rate,
2,JFK,
3,Newark,
1,Standard rate,
2,JFK,
3,Newark,


Applying Table Constraints
1. NOT NULL
2. CHECK

In [0]:
%sql
ALTER TABLE taxisDB.yellowTaxisDelta
ALTER COLUMN RideId SET NOT NULL 

In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
11,2024-05-12T09:52:16Z,593534500119395,shaurya.rawat@ukg.com,CHANGE COLUMN,"Map(column -> {""name"":""RideId"",""type"":""integer"",""nullable"":false,""metadata"":{""comment"":""This is the primary key column""}})",,List(726288301620293),0906-134047-2tr7h7co,10.0,WriteSerializable,False,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
10,2024-05-12T07:22:34Z,593534500119395,shaurya.rawat@ukg.com,MERGE,"Map(predicate -> [""((VendorId#31500 = VendorId#30675) AND (RideId#31499 = RideId#30674))""], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""predicate"":""(PickupTime#30676 >= 2022-03-01 00:00:00)"",""actionType"":""insert""}])",,List(726288301620293),0906-134047-2tr7h7co,9.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 5421, numTargetBytesRemoved -> 5300, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 2, executionTimeMs -> 4097, numTargetRowsInserted -> 2, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 2371, numTargetRowsUpdated -> 2, numOutputRows -> 4, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 4, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1228)",,Databricks-Runtime/13.3.x-photon-scala2.12
9,2024-05-12T07:07:27Z,593534500119395,shaurya.rawat@ukg.com,DELETE,"Map(predicate -> [""(RideId#29789 = 9999999)""])",,List(726288301620293),0906-134047-2tr7h7co,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5389, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1008, numDeletedRows -> 1, scanTimeMs -> 300, numAddedFiles -> 1, numAddedBytes -> 5300, rewriteTimeMs -> 708)",,Databricks-Runtime/13.3.x-photon-scala2.12
8,2024-05-12T07:00:21Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#27874 = 9999997)""])",,List(726288301620293),0906-134047-2tr7h7co,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5232, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1433, scanTimeMs -> 544, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5389, rewriteTimeMs -> 853)",,Databricks-Runtime/13.3.x-photon-scala2.12
7,2024-05-12T06:51:58Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,6.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
6,2024-05-12T06:51:37Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,5.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
5,2024-05-12T06:50:33Z,593534500119395,shaurya.rawat@ukg.com,TRUNCATE,Map(),,,,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 8, numDeletedRows -> 19999994, executionTimeMs -> 185, numRemovedBytes -> 495356843)",,Databricks-Runtime/15.1.x-photon-scala2.12
4,2024-05-12T06:36:59Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,3.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247983488, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
3,2024-05-12T06:31:53Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,2.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247363347, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12
2,2024-05-12T06:04:49Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
ALTER TABLE taxisDB.yellowTaxisDelta
ADD CONSTRAINT PassengerCheck CHECK (PassengerCount <= 5)

In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
13,2024-05-12T09:56:32Z,593534500119395,shaurya.rawat@ukg.com,DROP CONSTRAINT,"Map(name -> PassengerCheck, expr -> PassengerCount <= 5, existed -> true)",,List(726288301620293),0906-134047-2tr7h7co,12.0,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
12,2024-05-12T09:55:17Z,593534500119395,shaurya.rawat@ukg.com,ADD CONSTRAINT,"Map(name -> PassengerCheck, expr -> PassengerCount <= 5)",,List(726288301620293),0906-134047-2tr7h7co,11.0,WriteSerializable,False,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
11,2024-05-12T09:52:16Z,593534500119395,shaurya.rawat@ukg.com,CHANGE COLUMN,"Map(column -> {""name"":""RideId"",""type"":""integer"",""nullable"":false,""metadata"":{""comment"":""This is the primary key column""}})",,List(726288301620293),0906-134047-2tr7h7co,10.0,WriteSerializable,False,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
10,2024-05-12T07:22:34Z,593534500119395,shaurya.rawat@ukg.com,MERGE,"Map(predicate -> [""((VendorId#31500 = VendorId#30675) AND (RideId#31499 = RideId#30674))""], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""predicate"":""(PickupTime#30676 >= 2022-03-01 00:00:00)"",""actionType"":""insert""}])",,List(726288301620293),0906-134047-2tr7h7co,9.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 5421, numTargetBytesRemoved -> 5300, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 2, executionTimeMs -> 4097, numTargetRowsInserted -> 2, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 2371, numTargetRowsUpdated -> 2, numOutputRows -> 4, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 4, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1228)",,Databricks-Runtime/13.3.x-photon-scala2.12
9,2024-05-12T07:07:27Z,593534500119395,shaurya.rawat@ukg.com,DELETE,"Map(predicate -> [""(RideId#29789 = 9999999)""])",,List(726288301620293),0906-134047-2tr7h7co,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5389, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1008, numDeletedRows -> 1, scanTimeMs -> 300, numAddedFiles -> 1, numAddedBytes -> 5300, rewriteTimeMs -> 708)",,Databricks-Runtime/13.3.x-photon-scala2.12
8,2024-05-12T07:00:21Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#27874 = 9999997)""])",,List(726288301620293),0906-134047-2tr7h7co,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5232, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1433, scanTimeMs -> 544, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5389, rewriteTimeMs -> 853)",,Databricks-Runtime/13.3.x-photon-scala2.12
7,2024-05-12T06:51:58Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,6.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
6,2024-05-12T06:51:37Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,5.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
5,2024-05-12T06:50:33Z,593534500119395,shaurya.rawat@ukg.com,TRUNCATE,Map(),,,,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 8, numDeletedRows -> 19999994, executionTimeMs -> 185, numRemovedBytes -> 495356843)",,Databricks-Runtime/15.1.x-photon-scala2.12
4,2024-05-12T06:36:59Z,593534500119395,shaurya.rawat@ukg.com,COPY INTO,Map(),,List(726288301620293),0906-134047-2tr7h7co,3.0,WriteSerializable,True,"Map(numFiles -> 3, numOutputRows -> 9999995, numOutputBytes -> 247983488, numSkippedCorruptFiles -> 0)",,Databricks-Runtime/13.3.x-photon-scala2.12


In [0]:
%sql
ALTER TABLE  taxisDB.yellowTaxisDelta
DROP CONSTRAINT PassengerCheck

Accessing data with Time travel
- Time travel allows you to restore and access previous snapshots of data, even if data has been modified or deleted
- Log retension period is 30 days by default

In [0]:
%sql
Update taxisDB.yellowTaxisDelta
SET PassengerCount = 3
where RideId = 10000000

num_affected_rows
1


In [0]:
%sql
-- Checking the Versions using time travel: Version Number
Select RideId, PassengerCount from taxisDB.yellowTaxisDelta
where RideId = 10000000
union all 
Select RideId, PassengerCount from taxisDB.yellowTaxisDelta VERSION AS OF 13
where RideId = 10000000

RideId,PassengerCount
10000000,3
10000000,0


In [0]:
%sql
-- Checking the Versions using time travel: TIMESTAMP 
Select RideId, PassengerCount from taxisDB.yellowTaxisDelta TIMESTAMP AS OF '2024-05-12T09:56:32.000+00:00'
where RideId = 10000000

RideId,PassengerCount
10000000,0


In [0]:
%sql
-- RESTORE Table to older version
RESTORE TABLE taxisDB.yellowTaxisDelta TO VERSION AS OF 13

table_size_after_restore,num_of_files_after_restore,num_removed_files,num_restored_files,removed_files_size,restored_files_size
10197,2,1,1,5596,5421


In [0]:
%sql
DESCRIBE HISTORY taxisDB.yellowTaxisDelta

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
15,2024-05-12T10:10:45Z,593534500119395,shaurya.rawat@ukg.com,RESTORE,"Map(version -> 13, timestamp -> null)",,List(726288301620293),0906-134047-2tr7h7co,14.0,Serializable,False,"Map(numRestoredFiles -> 1, removedFilesSize -> 5596, numRemovedFiles -> 1, restoredFilesSize -> 5421, numOfFilesAfterRestore -> 2, tableSizeAfterRestore -> 10197)",,Databricks-Runtime/13.3.x-photon-scala2.12
14,2024-05-12T10:05:35Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#3770 = 10000000)""])",,List(726288301620293),0906-134047-2tr7h7co,13.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5421, numCopiedRows -> 3, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 2018, scanTimeMs -> 583, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5596, rewriteTimeMs -> 1390)",,Databricks-Runtime/13.3.x-photon-scala2.12
13,2024-05-12T09:56:32Z,593534500119395,shaurya.rawat@ukg.com,DROP CONSTRAINT,"Map(name -> PassengerCheck, expr -> PassengerCount <= 5, existed -> true)",,List(726288301620293),0906-134047-2tr7h7co,12.0,WriteSerializable,True,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
12,2024-05-12T09:55:17Z,593534500119395,shaurya.rawat@ukg.com,ADD CONSTRAINT,"Map(name -> PassengerCheck, expr -> PassengerCount <= 5)",,List(726288301620293),0906-134047-2tr7h7co,11.0,WriteSerializable,False,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
11,2024-05-12T09:52:16Z,593534500119395,shaurya.rawat@ukg.com,CHANGE COLUMN,"Map(column -> {""name"":""RideId"",""type"":""integer"",""nullable"":false,""metadata"":{""comment"":""This is the primary key column""}})",,List(726288301620293),0906-134047-2tr7h7co,10.0,WriteSerializable,False,Map(),,Databricks-Runtime/13.3.x-photon-scala2.12
10,2024-05-12T07:22:34Z,593534500119395,shaurya.rawat@ukg.com,MERGE,"Map(predicate -> [""((VendorId#31500 = VendorId#30675) AND (RideId#31499 = RideId#30674))""], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""predicate"":""(PickupTime#30676 >= 2022-03-01 00:00:00)"",""actionType"":""insert""}])",,List(726288301620293),0906-134047-2tr7h7co,9.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 5421, numTargetBytesRemoved -> 5300, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 2, executionTimeMs -> 4097, numTargetRowsInserted -> 2, numTargetRowsMatchedDeleted -> 0, scanTimeMs -> 2371, numTargetRowsUpdated -> 2, numOutputRows -> 4, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 4, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1228)",,Databricks-Runtime/13.3.x-photon-scala2.12
9,2024-05-12T07:07:27Z,593534500119395,shaurya.rawat@ukg.com,DELETE,"Map(predicate -> [""(RideId#29789 = 9999999)""])",,List(726288301620293),0906-134047-2tr7h7co,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5389, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1008, numDeletedRows -> 1, scanTimeMs -> 300, numAddedFiles -> 1, numAddedBytes -> 5300, rewriteTimeMs -> 708)",,Databricks-Runtime/13.3.x-photon-scala2.12
8,2024-05-12T07:00:21Z,593534500119395,shaurya.rawat@ukg.com,UPDATE,"Map(predicate -> [""(RideId#27874 = 9999997)""])",,List(726288301620293),0906-134047-2tr7h7co,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 5232, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1433, scanTimeMs -> 544, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 5389, rewriteTimeMs -> 853)",,Databricks-Runtime/13.3.x-photon-scala2.12
7,2024-05-12T06:51:58Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [""VendorId""])",,List(726288301620293),0906-134047-2tr7h7co,6.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 5232)",,Databricks-Runtime/13.3.x-photon-scala2.12
6,2024-05-12T06:51:37Z,593534500119395,shaurya.rawat@ukg.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(726288301620293),0906-134047-2tr7h7co,5.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 4776)",,Databricks-Runtime/13.3.x-photon-scala2.12
