In [0]:
%python
# Creating a table
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

schema = StructType([
  StructField("id", IntegerType(), True),
  StructField("firstName", StringType(), True),
  StructField("middleName", StringType(), True),
  StructField("lastName", StringType(), True),
  StructField("gender", StringType(), True),
  StructField("birthDate", TimestampType(), True),
  StructField("ssn", StringType(), True),
  StructField("salary", IntegerType(), True)
])

df = spark.read.format("csv").option("header", True).schema(schema).load("/FileStore/tables/export.csv")

# # Create the table if it does not exist. Otherwise, replace the existing table.
df.writeTo("hive_metastore.default.people_10m").createOrReplace()

# # If you know the table does not already exist, you can call this instead:
# df.write.saveAsTable("hive_metastore.default.people_10m")

In [0]:
%python
# Creating an empty table using DeltaTableBuilder API
from delta.tables import DeltaTable
DeltaTable.createIfNotExists(spark)\
  .tableName("people_10m")\
  .addColumn("id", "INT")\
  .addColumn("firstName", "STRING")\
  .addColumn("middleName", "STRING")\
  .addColumn("lastName", "STRING", comment = "surname")\
  .addColumn("gender", "STRING")\
  .addColumn("birthDate", "TIMESTAMP")\
  .addColumn("ssn", "STRING")\
  .addColumn("salary", "INT")\
  .execute()

<delta.tables.DeltaTable at 0x7f41a750ec00>

In [0]:
%python
# Upsert to a table
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import date

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("firstName", StringType(), True),
    StructField("middleName", StringType(), True),
    StructField("lastName", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("birthDate", DateType(), True),
    StructField("ssn", StringType(), True),
    StructField("salary", IntegerType(), True),
])

data = [
  (9999998, 'Billy', 'Tommie', 'Luppitt', 'M', date.fromisoformat('1992-09-17'), '953-38-9452', 55250),
  (9999999, 'Elias', 'Cyril', 'Leadbetter', 'M', date.fromisoformat('1984-05-22'), '906-51-2137', 48500),
  (10000000, 'Joshua', 'Chas', 'Broggio', 'M', date.fromisoformat('1968-07-22'), '988-61-6247', 90000),
  (20000001, 'John', '', 'Doe', 'M', date.fromisoformat('1978-01-14'), '345-67-8901', 55500),
  (20000002, 'Mary', '', 'Smith', 'F', date.fromisoformat('1982-10-29'), '456-78-9012', 98250),
  (20000003, 'Jane', '', 'Doe', 'F', date.fromisoformat('1981-06-25'), '567-89-0123', 89900)
]

people_10m_updates = spark.createDataFrame(data = data, schema = schema)
people_10m_updates.createTempView("people_10m_updates")


from delta.tables import DeltaTable

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")

(deltaTable.alias("people_10m")
  .merge(
    people_10m_updates.alias("people_10m_updates"),
    "people_10m.id = people_10m_updates.id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute()
)

In [0]:
%python
# Querying a table
df = spark.read.table("hive_metastore.default.people_10m")
df_filtered = df.filter(df["id"] >= 9999998)
display(df_filtered)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
9999999,Elias,Cyril,Leadbetter,M,1984-05-22T00:00:00Z,906-51-2137,48500
10000000,Joshua,Chas,Broggio,M,1968-07-22T00:00:00Z,988-61-6247,90000
20000002,Mary,,Smith,F,1982-10-29T00:00:00Z,456-78-9012,98250
20000003,Jane,,Doe,F,1981-06-25T00:00:00Z,567-89-0123,89900
9999998,Billy,Tommie,Luppitt,M,1992-09-17T00:00:00Z,953-38-9452,55250
20000001,John,,Doe,M,1978-01-14T00:00:00Z,345-67-8901,55500


In [0]:
%python
# Reading a table
people_df = spark.read.table("hive_metastore.default.people_10m")
display(people_df)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
1,Pennie,Carry,Hirschmann,F,1955-07-02T04:00:00Z,981-43-9345,56172
2,An,Amira,Cowper,F,1992-02-08T05:00:00Z,978-97-8086,40203
3,Quyen,Marlen,Dome,F,1970-10-11T04:00:00Z,957-57-8246,53417
4,Coralie,Antonina,Marshal,F,1990-04-11T04:00:00Z,963-39-4885,94727
5,Terrie,Wava,Bonar,F,1980-01-16T05:00:00Z,964-49-8051,79908
6,Chassidy,Concepcion,Bourthouloume,F,1990-11-24T05:00:00Z,954-59-9172,64652
7,Geri,Tambra,Mosby,F,1970-12-19T05:00:00Z,968-16-4020,38195
8,Patria,Nancy,Arstall,F,1985-01-02T05:00:00Z,984-76-3770,102053
9,Terese,Alfredia,Tocque,F,1967-11-17T05:00:00Z,967-48-7309,91294
10,Wava,Lyndsey,Jeandon,F,1963-12-30T05:00:00Z,997-82-2946,56521


In [0]:
%python
# Append mode
df.write.mode("append").saveAsTable("hive_metastore.default.people_10m")

In [0]:
%python
# Overwrite mode
df.write.mode("overwrite").saveAsTable("hive_metastore.default.people_10m")

In [0]:
%python
# Update a table
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")

deltaTable.update(
    condition = "gender = 'F'",
    set = {"gender" : "'Female'"}
)

deltaTable.update(
    condition = col('gender') == 'M',
    set = {'gender': lit('Male')}
)

In [0]:
%python
# Delete from a table
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")
deltaTable.delete("birthDate < '1955-01-01'")
deltaTable.delete(col('birthDate') < '1960-01-01')

In [0]:
%python
# Display table history
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")
display(deltaTable.history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
11,2025-08-11T11:41:59Z,146386952150351,azuser4030_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(1766915037631076),0806-091614-zhfbr6hu,10.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 5, numRemovedBytes -> 81436, p25FileSize -> 67989, numDeletionVectorsRemoved -> 1, minFileSize -> 67989, numAddedFiles -> 1, maxFileSize -> 67989, p75FileSize -> 67989, p50FileSize -> 67989, numAddedBytes -> 67989)",,Databricks-Runtime/16.4.x-photon-scala2.12
10,2025-08-11T11:41:57Z,146386952150351,azuser4030_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#4583 < 1960-01-01 00:00:00)""])",,List(1766915037631076),0806-091614-zhfbr6hu,9.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 1, numAddedChangeFiles -> 0, executionTimeMs -> 1132, numDeletionVectorsUpdated -> 1, numDeletedRows -> 416, scanTimeMs -> 654, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 477)",,Databricks-Runtime/16.4.x-photon-scala2.12
9,2025-08-11T11:41:55Z,146386952150351,azuser4030_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#4583 < 1955-01-01 00:00:00)""])",,List(1766915037631076),0806-091614-zhfbr6hu,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 1, numAddedChangeFiles -> 0, executionTimeMs -> 1444, numDeletionVectorsUpdated -> 1, numDeletedRows -> 252, scanTimeMs -> 896, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 544)",,Databricks-Runtime/16.4.x-photon-scala2.12
8,2025-08-11T11:36:49Z,146386952150351,azuser4030_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(1766915037631076),0806-091614-zhfbr6hu,6.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 8, numRemovedBytes -> 369210, p25FileSize -> 72435, numDeletionVectorsRemoved -> 4, conflictDetectionTimeMs -> 390, minFileSize -> 72435, numAddedFiles -> 1, maxFileSize -> 72435, p75FileSize -> 72435, p50FileSize -> 72435, numAddedBytes -> 72435)",,Databricks-Runtime/16.4.x-photon-scala2.12
7,2025-08-11T11:36:47Z,146386952150351,azuser4030_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#2925 = M)""])",,List(1766915037631076),0806-091614-zhfbr6hu,6.0,WriteSerializable,False,"Map(numRemovedFiles -> 4, numRemovedBytes -> 184833, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 4, numAddedChangeFiles -> 0, executionTimeMs -> 2045, numDeletionVectorsUpdated -> 0, scanTimeMs -> 986, numAddedFiles -> 4, numUpdatedRows -> 16, numAddedBytes -> 9001, rewriteTimeMs -> 1055)",,Databricks-Runtime/16.4.x-photon-scala2.12
6,2025-08-11T11:36:45Z,146386952150351,azuser4030_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#2925 = F)""])",,List(1766915037631076),0806-091614-zhfbr6hu,5.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 4, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3606, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1606, numAddedFiles -> 4, numUpdatedRows -> 4008, numAddedBytes -> 184377, rewriteTimeMs -> 1991)",,Databricks-Runtime/16.4.x-photon-scala2.12
5,2025-08-11T11:12:08Z,146386952150351,azuser4030_mml.local@techademy.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(1766915037631076),0806-091614-zhfbr6hu,4.0,WriteSerializable,False,"Map(numFiles -> 4, numRemovedFiles -> 12, numRemovedBytes -> 201259, numOutputRows -> 4024, numOutputBytes -> 184833)",,Databricks-Runtime/16.4.x-photon-scala2.12
4,2025-08-11T10:58:11Z,146386952150351,azuser4030_mml.local@techademy.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(1766915037631076),0806-091614-zhfbr6hu,3.0,WriteSerializable,False,"Map(numFiles -> 4, numOutputRows -> 2012, numOutputBytes -> 96658)",,Databricks-Runtime/16.4.x-photon-scala2.12
3,2025-08-11T10:57:50Z,146386952150351,azuser4030_mml.local@techademy.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(1766915037631076),0806-091614-zhfbr6hu,2.0,WriteSerializable,False,"Map(numFiles -> 3, numOutputRows -> 1006, numOutputBytes -> 50338)",,Databricks-Runtime/16.4.x-photon-scala2.12
2,2025-08-11T09:42:56Z,146386952150351,azuser4030_mml.local@techademy.com,MERGE,"Map(predicate -> [""(id#1122 = id#1098)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(1766915037631076),0806-091614-zhfbr6hu,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 4, numTargetBytesAdded -> 8237, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 6100, materializeSourceTimeMs -> 1348, numTargetRowsInserted -> 6, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 3645, numTargetRowsUpdated -> 0, numOutputRows -> 6, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 6, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1015)",,Databricks-Runtime/16.4.x-photon-scala2.12


In [0]:
%python
# Query an earlier version of the table (time travel)
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")
deltaHistory = deltaTable.history()

display(deltaHistory.where("version == 0"))
# Or:
display(deltaHistory.where("timestamp == '2024-05-15T22:43:15.000+00:00'"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2025-08-11T06:54:03Z,146386952150351,azuser4030_mml.local@techademy.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(3653885903763628),0806-091614-zhfbr6hu,,WriteSerializable,True,"Map(numFiles -> 4, numOutputRows -> 10000000, numOutputBytes -> 218583722)",,Databricks-Runtime/16.4.x-photon-scala2.12


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo


In [0]:
%python
# Creating a dataframe from a delta table fixed to a specific version
df = spark.read.option('versionAsOf', 0).table("hive_metastore.default.people_10m")
df = spark.read.option('timestampAsOf', '2025-08-11T07:00:00.000+00:00').table("hive_metastore.default.people_10m")

display(df)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
2517288,Lianne,Izola,Perrington,F,1966-11-24T05:00:00Z,986-19-7817,109409
2517289,Clara,Bernadine,Josephy,F,1961-08-05T04:00:00Z,916-21-6604,84625
2517290,Romana,Giuseppina,Wurz,F,1984-01-12T05:00:00Z,916-42-2805,62122
2517291,Renda,Leonore,Filintsev,F,1984-02-22T05:00:00Z,923-51-9239,68815
2517292,Muriel,Jerrie,Penchen,F,1972-12-03T05:00:00Z,904-72-2213,86823
2517293,Malisa,Meryl,Morrilly,F,1965-10-08T04:00:00Z,908-82-9745,67032
2517294,Ofelia,Rebbecca,Luck,F,1992-10-20T04:00:00Z,991-22-1854,98319
2517295,Dodie,Natalie,Prudham,F,1997-05-09T04:00:00Z,942-46-1261,57133
2517296,Heide,Rashida,Vaz,F,1974-12-02T05:00:00Z,923-59-1639,82054
2517297,Maxima,Dagny,Muttock,F,1983-12-18T05:00:00Z,958-67-6101,86218


In [0]:
%python
# Optimize a table
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")

deltaTable.optimize().executeCompaction()

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
%python
# Z-order by columns
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")
deltaTable.optimize().executeZOrderBy("gender")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
%python
# Clean up snapshots with VACUUM
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "hive_metastore.default.people_10m")
deltaTable.vacuum()

DataFrame[]