In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Schema Comparison")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 18:43:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/02/26 18:43:17 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


- Create 1st Dataframe

In [2]:
# Column names for the first employee frame; only names are supplied here,
# so the column types come from the row values.
empSchema1 = ["Id", "FirstName", "LastName", "salary"]

# One tuple per employee, laid out in the same order as empSchema1.
empData1 = [
    (111, "Stephen", "King", 2000),
    (222, "Philipp", "Larkin", 8000),
    (333, "John", "Smith", 6000),
]

df1 = spark.createDataFrame(empData1, empSchema1)
# truncate=False prints every cell value in full.
df1.show(truncate=False)

                                                                                

+---+---------+--------+------+
|Id |FirstName|LastName|salary|
+---+---------+--------+------+
|111|Stephen  |King    |2000  |
|222|Philipp  |Larkin  |8000  |
|333|John     |Smith   |6000  |
+---+---------+--------+------+



- Create the 2nd dataframe with the same schema as the 1st dataframe

In [4]:
# Second employee frame: same four column names as the first frame,
# different rows.
empSchema2 = ["Id", "FirstName", "LastName", "salary"]

empData2 = [
    (444, "Thomas", "Frank", 4000),
    (555, "Stephen", "Fleming", 3000),
    (666, "William", "Pending", 7000),
]

df2 = spark.createDataFrame(empData2, empSchema2)
df2.show(truncate=False)

+---+---------+--------+------+
|Id |FirstName|LastName|salary|
+---+---------+--------+------+
|444|Thomas   |Frank   |4000  |
|555|Stephen  |Fleming |3000  |
|666|William  |Pending |7000  |
+---+---------+--------+------+



- Create the 3rd dataframe with a **different** schema from the 1st dataframe

In [5]:
# Third employee frame: a single "Name" column replaces the
# "FirstName"/"LastName" pair used by the other two frames.
empSchema3 = ["Id", "Name", "salary"]

empData3 = [
    (777, "David", 4000),
    (888, "Mike", 3000),
    (999, "Winston", 3000),
]

df3 = spark.createDataFrame(empData3, empSchema3)
df3.show(truncate=False)

+---+-------+------+
|Id |Name   |salary|
+---+-------+------+
|777|David  |4000  |
|888|Mike   |3000  |
|999|Winston|3000  |
+---+-------+------+



- compare schema of 1st and 2nd dataframe

In [6]:
# Report whether the first and second frames have equal schemas.
print("Schema Matched" if df1.schema == df2.schema else "Schema does not matched")

Schema Matched


- compare schema of 1st and 3rd dataframe

In [7]:
# Report whether the first and third frames have equal schemas.
print("Schema Matched" if df1.schema == df3.schema else "Schema does not matched")

Schema does not matched


- List the columns of the second dataframe that are missing in the third dataframe

In [8]:
# Columns of df2 that df3 lacks, listed in df2's own column order so the
# printed list is identical on every run (a set difference has no stable order).
print([name for name in df2.columns if name not in df3.columns])

['FirstName', 'LastName']


- List the columns of the third dataframe that are missing in the second dataframe

In [9]:
# Columns of df3 that df2 lacks, listed in df3's own column order so the
# printed list is identical on every run (a set difference has no stable order).
print([name for name in df3.columns if name not in df2.columns])

['Name']


- Collect all possible columns in a list

In [10]:
# Every column name from both frames, shared names appearing twice.
allColumns = [*df1.columns, *df3.columns]
# De-duplicate through a set; the order of the resulting list is arbitrary.
uniqueColumns = [*set(allColumns)]
uniqueColumns

['FirstName', 'Name', 'LastName', 'Id', 'salary']

- Add missing columns

In [11]:
from pyspark.sql.functions import lit

In [12]:
# Pad df1 and df3 with all-null columns so both end up with every name in
# uniqueColumns.
# The names are walked in sorted order so the appended columns land in the same
# position on every run; uniqueColumns was built from a set, whose order is
# arbitrary.
# The loop variable is deliberately not called `col`, which would shadow
# pyspark.sql.functions.col if that function is ever imported into this notebook.
for columnName in sorted(uniqueColumns):
    if columnName not in df1.columns:
        df1 = df1.withColumn(columnName, lit(None))
    if columnName not in df3.columns:
        df3 = df3.withColumn(columnName, lit(None))

df1.show(truncate=False)
df3.show(truncate=False)

+---+---------+--------+------+----+
|Id |FirstName|LastName|salary|Name|
+---+---------+--------+------+----+
|111|Stephen  |King    |2000  |NULL|
|222|Philipp  |Larkin  |8000  |NULL|
|333|John     |Smith   |6000  |NULL|
+---+---------+--------+------+----+

+---+-------+------+---------+--------+
|Id |Name   |salary|FirstName|LastName|
+---+-------+------+---------+--------+
|777|David  |4000  |NULL     |NULL    |
|888|Mike   |3000  |NULL     |NULL    |
|999|Winston|3000  |NULL     |NULL    |
+---+-------+------+---------+--------+



In [13]:
def addMissingColumns(df1, df2):
    """Pad two DataFrames so that each one has every column of the other.

    A column that exists in only one of the inputs is appended to the other
    input as an all-null column created with ``lit(None)``. Columns that are
    already present are left untouched, and existing column positions are
    not rearranged.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        A ``(df1, df2)`` tuple holding the padded versions of the two
        inputs, in argument order.
    """
    # Combine the columns of the two *arguments* only (no notebook globals).
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # missing columns are appended in the same order on every run.
    uniqueColumns = list(dict.fromkeys(df1.columns + df2.columns))
    for name in uniqueColumns:
        # NOTE(review): lit(None) adds an untyped null column; consider casting
        # it to the counterpart's type if the frames are later unioned or saved.
        if name not in df1.columns:
            df1 = df1.withColumn(name, lit(None))
        if name not in df2.columns:
            df2 = df2.withColumn(name, lit(None))
    return df1, df2

In [14]:
# Pad the first and third frames against each other.
# NOTE: this rebinds df2 to the padded copy of df3, replacing the second
# employee frame created earlier in the notebook.
df1, df2 = addMissingColumns(df1=df1, df2=df3)
df1.show()
df2.show()

+---+---------+--------+------+----+
| Id|FirstName|LastName|salary|Name|
+---+---------+--------+------+----+
|111|  Stephen|    King|  2000|NULL|
|222|  Philipp|  Larkin|  8000|NULL|
|333|     John|   Smith|  6000|NULL|
+---+---------+--------+------+----+

+---+-------+------+---------+--------+
| Id|   Name|salary|FirstName|LastName|
+---+-------+------+---------+--------+
|777|  David|  4000|     NULL|    NULL|
|888|   Mike|  3000|     NULL|    NULL|
|999|Winston|  3000|     NULL|    NULL|
+---+-------+------+---------+--------+

