In [0]:
from delta.tables import *

DeltaTable.create(spark)\
    .tableName("employees_demo")\
    .addColumn("ID", "INT")\
    .addColumn("Name", "STRING")\
    .addColumn("Gender", "STRING")\
    .addColumn("Salary", "INT")\
    .addColumn("Department", "STRING")\
    .property("Description", "Schema Evolution Demo")\
    .location("FileStore/tables/delta/path_employees_demo")\
    .execute()

Out[1]: <delta.tables.DeltaTable at 0x7faf90045850>

In [0]:
%sql
SELECT * FROM employees_demo

ID,Name,Gender,Salary,Department


In [0]:
%sql
INSERT INTO employees_demo VALUES(100, "Stephe", "M", 2000, "IT")

num_affected_rows,num_inserted_rows
1,1


In [0]:
_sqldf.display()  # SELECT * FROM employees_demo

ID,Name,Gender,Salary,Department
100,Stephe,M,2000,IT


### Schema Evolution

In [0]:
from pyspark.sql.types import IntegerType, StringType
data = [(200, "Philip", "M", 8000, "HR", "Test Data")]

schema = StructType([StructField("ID", IntegerType(), False),
                    StructField("Name", StringType(), True),
                    StructField("Gender", StringType(), True),
                    StructField("Salary", IntegerType(), True),
                    StructField("Department", StringType(), True),
                    StructField("additionalcol1", StringType(), True)]) # this will alter the original schema

df = spark.createDataFrame(data, schema)
df.display()

ID,Name,Gender,Salary,Department,additionalcol1
200,Philip,M,8000,HR,Test Data


In [0]:
df.write.format("delta").mode("append").saveAsTable("employees_demo")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8121041784125140>:1[0m
[0;32m----> 1[0m [43mdf[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mdelta[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43mmode[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mappend[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241;43m.[39;49m[43msaveAsTable[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43memployees_demo[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     

The above execution will throw due to schema mismatch. Use `mergeSchema` to handle this

In [0]:
df.write.format("delta").option("mergeSchema", True).mode("append").saveAsTable("employees_demo")

In [0]:
%sql
SELECT * FROM employees_demo

ID,Name,Gender,Salary,Department,additionalcol1
200,Philip,M,8000,HR,Test Data
100,Stephe,M,2000,IT,


In [0]:
data = [(300, "David", "M", 8000, "HR", "Dummy Data")]

schema = StructType([StructField("ID", IntegerType(), False),
                    StructField("Name", StringType(), True),
                    StructField("Gender", StringType(), True),
                    StructField("Salary", IntegerType(), True),
                    StructField("Department", StringType(), True),
                    StructField("additionalcol2", StringType(), True)]) # this will also alter the original schema

df = spark.createDataFrame(data, schema)
df.display()

ID,Name,Gender,Salary,Department,additionalcol2
300,David,M,8000,HR,Dummy Data


In [0]:
df.write.format("delta").option("mergeSchema", True).mode("append").saveAsTable("employees_demo")

In [0]:
%sql
SELECT * FROM employees_demo

ID,Name,Gender,Salary,Department,additionalcol1,additionalcol2
200,Philip,M,8000,HR,Test Data,
300,David,M,8000,HR,,Dummy Data
100,Stephe,M,2000,IT,,


In [0]:
data = [(400, "Sam", "M", 5000, "Dummy")]

schema = StructType([StructField("ID", IntegerType(), False),
                    StructField("Name", StringType(), True),
                    StructField("Gender", StringType(), True),
                    StructField("Salary", IntegerType(), True),
                    StructField("additionalcol2", StringType(), True)]) # this will also alter the original schema

df = spark.createDataFrame(data, schema)
df.display()

ID,Name,Gender,Salary,additionalcol2
400,Sam,M,5000,Dummy


In [0]:
df.write.format("delta").option("mergeSchema", True).mode("append").saveAsTable("employees_demo")

In [0]:
%sql
SELECT * FROM employees_demo

ID,Name,Gender,Salary,Department,additionalcol1,additionalcol2
200,Philip,M,8000,HR,Test Data,
300,David,M,8000,HR,,Dummy Data
400,Sam,M,5000,,,Dummy
100,Stephe,M,2000,IT,,
