In [1]:
from pyspark.sql import SparkSession

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [3]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

builder = SparkSession.builder \
    .appName("jsondata") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
path='C:/Users/acer/Downloads/printermessages.json'
df=spark.read.option("multiline","true").format("json").load(path)
df.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                Body|     EnqueuedTimeUtc|          Properties|    SystemProperties|
+--------------------+--------------------+--------------------+--------------------+
|{mn=TVg4MDkwTg==:...|2024-12-06T08:02:...|{rp.mib/dev/2be00...|{{"scope":"device...|
+--------------------+--------------------+--------------------+--------------------+



In [5]:
from pyspark.sql.types import *

schema = StructType([
    StructField("EnqueuedTimeUtc", StringType(), True),
    StructField("Properties", StructType([
        StructField("appTopic", StringType(), True)
    ]), True),
    StructField("SystemProperties", StructType([
        StructField("connectionDeviceId", StringType(), True),
        StructField("connectionAuthMethod", StringType(), True),
        StructField("connectionDeviceGenerationId", StringType(), True),
        StructField("contentType", StringType(), True),
        StructField("contentEncoding", StringType(), True),
        StructField("enqueuedTime", StringType(), True)
    ]), True),
    StructField("Body", StructType([
        StructField("type", StringType(), True),
        StructField("mnsn", StringType(), True),
        StructField("timestamp", LongType(), True),
        StructField("scheduleName", StringType(), True),
    ]), True)
])

In [41]:
path='C:/Users/acer/Downloads/printermessages.json'
schema_df = spark.read.schema(schema).json(path)

In [42]:
schema_df.show()

+--------------------+--------------------+--------------------+--------------------+
|     EnqueuedTimeUtc|          Properties|    SystemProperties|                Body|
+--------------------+--------------------+--------------------+--------------------+
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{a0ba43aa-f3c1-42...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{5b98aa47-2d23-4b...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{a0ba43aa-f3c1-42...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{5b98aa47-2d23-4b...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e

In [13]:
schema_df.select("EnqueuedTimeUtc").show(truncate=False)

+----------------------------+
|EnqueuedTimeUtc             |
+----------------------------+
|2024-12-06T08:02:16.8400000Z|
|2024-12-18T23:56:20.9470000Z|
|2024-12-18T23:56:21.1970000Z|
|2024-11-21T07:21:38.2250000Z|
|2024-11-21T07:21:38.4600000Z|
|2024-11-21T07:22:08.1800000Z|
+----------------------------+



In [16]:
schema_df.printSchema()

root
 |-- EnqueuedTimeUtc: string (nullable = true)
 |-- Properties: struct (nullable = true)
 |    |-- appTopic: string (nullable = true)
 |-- SystemProperties: struct (nullable = true)
 |    |-- connectionDeviceId: string (nullable = true)
 |    |-- connectionAuthMethod: string (nullable = true)
 |    |-- connectionDeviceGenerationId: string (nullable = true)
 |    |-- contentType: string (nullable = true)
 |    |-- contentEncoding: string (nullable = true)
 |    |-- enqueuedTime: string (nullable = true)
 |-- Body: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- mnsn: string (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |    |-- scheduleName: string (nullable = true)



In [17]:
schema_df.select('Body').show(truncate=False)

+-----------------------------------------------------------------------------+
|Body                                                                         |
+-----------------------------------------------------------------------------+
|{smsProps, mn=TVg4MDkwTg==:sn=NjUwNDYyMFgwMA==, 1733472136665, manualUpdate} |
|{offline, mn=QlA1MEMyNg==:sn=MjUxMjc1NTAxMg==, 1734566180749, deviceStatus}  |
|{offline, mn=QlA1MEMyNg==:sn=MjUxMjc1NTAxMg==, 1734566180757, suppliesUpdate}|
|{smsProps, mn=TVgyNjYx:sn=MTUwMDQ2NjQwMA==, 1732173697951, deviceStatus}     |
|{smsProps, mn=TVgyNjYx:sn=MTUwMDQ2NjQwMA==, 1732173698195, suppliesUpdate}   |
|{offline, mn=QlBDMTMxV0Q=:sn=NDAxMDAzODUwMA==, 1732173727911, deviceStatus}  |
+-----------------------------------------------------------------------------+



In [24]:
schema_df.select("SystemProperties.connectionDeviceId","SystemProperties.connectionAuthMethod",
                 "SystemProperties.connectionDeviceGenerationId",
                 "SystemProperties.contentEncoding","SystemProperties.enqueuedTime").show()

+--------------------+--------------------+----------------------------+---------------+--------------------+
|  connectionDeviceId|connectionAuthMethod|connectionDeviceGenerationId|contentEncoding|        enqueuedTime|
+--------------------+--------------------+----------------------------+---------------+--------------------+
|2be00f09-5d33-47a...|{"scope":"device"...|          638658819029400530|          UTF-8|2024-12-06T08:02:...|
|c3566d1b-4d59-47b...|{"scope":"device"...|          638657983158498990|          UTF-8|2024-12-18T23:56:...|
|c3566d1b-4d59-47b...|{"scope":"device"...|          638657983158498990|          UTF-8|2024-12-18T23:56:...|
|f6ee0457-9923-4c6...|{"scope":"device"...|          638587804616854813|          UTF-8|2024-11-21T07:21:...|
|f6ee0457-9923-4c6...|{"scope":"device"...|          638587804616854813|          UTF-8|2024-11-21T07:21:...|
|f6ee0457-9923-4c6...|{"scope":"device"...|          638587804616854813|          UTF-8|2024-11-21T07:22:...|
+---------

In [20]:
schema_df.select("Body.scheduleName").show()

+--------------+
|  scheduleName|
+--------------+
|  manualUpdate|
|  deviceStatus|
|suppliesUpdate|
|  deviceStatus|
|suppliesUpdate|
|  deviceStatus|
+--------------+



In [21]:
schema_df.select("SystemProperties.enqueuedTime").show(truncate=False)

+----------------------------+
|enqueuedTime                |
+----------------------------+
|2024-12-06T08:02:16.8400000Z|
|2024-12-18T23:56:20.9470000Z|
|2024-12-18T23:56:21.1970000Z|
|2024-11-21T07:21:38.2250000Z|
|2024-11-21T07:21:38.4600000Z|
|2024-11-21T07:22:08.1800000Z|
+----------------------------+



# JSON-2

In [5]:
path='C:/Users/acer/Downloads/2024-7-20-15-26-1.json'
df=spark.read.format("json").load(path)
df.show()

+--------------------+--------------------+--------------------+--------------------+
|                Body|     EnqueuedTimeUtc|          Properties|    SystemProperties|
+--------------------+--------------------+--------------------+--------------------+
|{mn=TVgwMDAx:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAx:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAx:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAx:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAy:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAy:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAy:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAy:sn=M...|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{{"scope":"device...|
|{mn=TVgwMDAz:sn=M...|2024-08-20T09:56:...|{rp.mib/dev

In [6]:
from pyspark.sql.types import *

schema = StructType([
    StructField("EnqueuedTimeUtc", StringType(), True),
    StructField("Properties", StructType([
        StructField("appTopic", StringType(), True)
    ]), True),
    StructField("SystemProperties", StructType([
        StructField("connectionDeviceId", StringType(), True),
        StructField("connectionAuthMethod", StringType(), True),
        StructField("connectionDeviceGenerationId", StringType(), True),
        StructField("contentType", StringType(), True),
        StructField("contentEncoding", StringType(), True),
        StructField("enqueuedTime", StringType(), True)
    ]), True),
    StructField("Body", StructType([
        StructField("type", StringType(), True),
        StructField("mnsn", StringType(), True),
        StructField("timestamp", LongType(), True),
        StructField("scheduleName", StringType(), True),
    ]), True)
])

In [84]:
path2='C:/Users/acer/Downloads/2024-7-20-15-26-1.json'
schema_df2 = spark.read.schema(schema).json(path2)

In [85]:
schema_df2.show()

+--------------------+--------------------+--------------------+--------------------+
|     EnqueuedTimeUtc|          Properties|    SystemProperties|                Body|
+--------------------+--------------------+--------------------+--------------------+
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{a0ba43aa-f3c1-42...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{5b98aa47-2d23-4b...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{a0ba43aa-f3c1-42...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{5b98aa47-2d23-4b...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e

In [9]:
schema_df2.printSchema()

root
 |-- EnqueuedTimeUtc: string (nullable = true)
 |-- Properties: struct (nullable = true)
 |    |-- appTopic: string (nullable = true)
 |-- SystemProperties: struct (nullable = true)
 |    |-- connectionDeviceId: string (nullable = true)
 |    |-- connectionAuthMethod: string (nullable = true)
 |    |-- connectionDeviceGenerationId: string (nullable = true)
 |    |-- contentType: string (nullable = true)
 |    |-- contentEncoding: string (nullable = true)
 |    |-- enqueuedTime: string (nullable = true)
 |-- Body: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- mnsn: string (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |    |-- scheduleName: string (nullable = true)



In [10]:
schema_df2.select("Body.scheduleName").show()

+--------------+
|  scheduleName|
+--------------+
|   basicUpdate|
| counterUpdate|
|  deviceStatus|
|suppliesUpdate|
|   basicUpdate|
| counterUpdate|
|  deviceStatus|
|suppliesUpdate|
|   basicUpdate|
| counterUpdate|
|  deviceStatus|
|suppliesUpdate|
|   basicUpdate|
| counterUpdate|
|  deviceStatus|
|suppliesUpdate|
|   basicUpdate|
| counterUpdate|
|  deviceStatus|
|suppliesUpdate|
+--------------+
only showing top 20 rows



In [11]:
schema_df2.select('Body').show(truncate=False)

+--------------------------------------------------------------------------+
|Body                                                                      |
+--------------------------------------------------------------------------+
|{smsProps, mn=TVgwMDAx:sn=MDUxMTMxMzAwMDE=, 1724147812588, basicUpdate}   |
|{smsProps, mn=TVgwMDAx:sn=MDUxMTMxMzAwMDE=, 1724147812589, counterUpdate} |
|{smsProps, mn=TVgwMDAx:sn=MDUxMTMxMzAwMDE=, 1724147812589, deviceStatus}  |
|{smsProps, mn=TVgwMDAx:sn=MDUxMTMxMzAwMDE=, 1724147812589, suppliesUpdate}|
|{smsProps, mn=TVgwMDAy:sn=MDUxMTMxMzAwMDI=, 1724147812591, basicUpdate}   |
|{smsProps, mn=TVgwMDAy:sn=MDUxMTMxMzAwMDI=, 1724147812608, counterUpdate} |
|{smsProps, mn=TVgwMDAy:sn=MDUxMTMxMzAwMDI=, 1724147812608, deviceStatus}  |
|{smsProps, mn=TVgwMDAy:sn=MDUxMTMxMzAwMDI=, 1724147812609, suppliesUpdate}|
|{smsProps, mn=TVgwMDAz:sn=MDUxMTMxMzAwMDM=, 1724147812610, basicUpdate}   |
|{smsProps, mn=TVgwMDAz:sn=MDUxMTMxMzAwMDM=, 1724147812610, counterUpdate} |

In [12]:
schedule_names = [row["scheduleName"] for row in schema_df2.select("Body.scheduleName").distinct().collect()]
schedule_names

['deviceStatus', 'basicUpdate', 'counterUpdate', 'suppliesUpdate']

In [13]:
from delta.tables import DeltaTable
from pyspark.sql.functions import *

In [55]:
schedule_df = schema_df2.filter(col("Body.scheduleName") == 'basicUpdate')
schedule_df.show()

+--------------------+--------------------+--------------------+--------------------+
|     EnqueuedTimeUtc|          Properties|    SystemProperties|                Body|
+--------------------+--------------------+--------------------+--------------------+
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e

In [64]:
flatschedule_df = schedule_df.select(
    F.col("Body.type").alias("Body_type"),
    F.col("Body.mnsn").alias("Body_mnsn"),
    F.col("Body.timestamp").alias("Body_timestamp"),
    F.col("Body.scheduleName").alias("Body_scheduleName"),
    F.col("EnqueuedTimeUtc").alias("EnqueuedTimeUtc"),
    F.col("Properties.appTopic").alias("Properties_appTopic"),
    F.col("SystemProperties.connectionDeviceId").alias("SystemProperties_connectionDeviceId"),
    F.col("SystemProperties.connectionAuthMethod").alias("SystemProperties_connectionAuthMethod"),
    F.col("SystemProperties.connectionDeviceGenerationId").alias("SystemProperties_connectionDeviceGenerationId"),
    F.col("SystemProperties.contentType").alias("SystemProperties_contentType"),
    F.col("SystemProperties.contentEncoding").alias("SystemProperties_contentEncoding"),
    F.col("SystemProperties.enqueuedTime").alias("SystemProperties_enqueuedTime")
).orderBy("Body_timestamp")

In [65]:
flatschedule_df.show()
flatschedule_df.count()

+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|           Body_mnsn|Body_timestamp|Body_scheduleName|     EnqueuedTimeUtc| Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
| smsProps|mn=TVgwMDAx:sn=MD...| 1724147

500

In [66]:
latest_data=flatschedule_df.sort(flatschedule_df.Body_timestamp.desc()).limit(1)
latest_data.show()

+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|           Body_mnsn|Body_timestamp|Body_scheduleName|     EnqueuedTimeUtc| Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
| smsProps|mn=TVgwNTAw:sn=MD...| 1724147

In [67]:
remaining_data=flatschedule_df.filter(~(flatschedule_df['Body_timestamp']==1724147813655))
remaining_data.show()
remaining_data.count()

+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|           Body_mnsn|Body_timestamp|Body_scheduleName|     EnqueuedTimeUtc| Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
| smsProps|mn=TVgwMDAx:sn=MD...| 1724147

499

In [86]:
# remaining_data2 = schedule_df.exceptAll(latest_data)
# remaining_data2.show()
# remaining_data2.count()

In [68]:
table_path=f"./message/basicUpdate"

In [69]:
latest_data.write.format('delta').mode('overwrite').save(table_path)

In [52]:
from delta.tables import *

In [70]:
dt=DeltaTable.forPath(spark, tabe_path)
check=dt.toDF()
check.show()

+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|           Body_mnsn|Body_timestamp|Body_scheduleName|     EnqueuedTimeUtc| Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
| smsProps|mn=TVgwNTAw:sn=MD...| 1724147

In [88]:
remaining_data.filter(remaining_data['Body_mnsn'].isNull()).show()

+---------+---------+--------------+-----------------+---------------+-------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|Body_mnsn|Body_timestamp|Body_scheduleName|EnqueuedTimeUtc|Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+---------+--------------+-----------------+---------------+-------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
+---------+---------+--------------+-----------------+---------------+-------------------+-

In [63]:
cond = (F.col("t.Body.mnsn") == F.col("s.Body.mnsn"))
dt.alias("t").merge(remaining_data.alias("s"),cond).whenMatchedUpdate(
condition=(F.col("t.Body_timestamp") < F.col("s.Body_timestamp")),,
set={
    "Body.timestamp":"s.Body.timestamp",
    "EnqueuedTimeUtc":"s.EnqueuedTimeUtc",
    "Properties.appTopic":"s.Properties.appTopic",
    "SystemProperties.connectionDeviceId":"s.SystemProperties.connectionDeviceId",
    "SystemProperties.connectionAuthMethod":"s.SystemProperties.connectionAuthMethod",
    "SystemProperties.connectionDeviceGenerationId":"s.SystemProperties.connectionDeviceGenerationId",
    "SystemProperties.contentType":"s.SystemProperties.contentType",
    "SystemProperties.contentEncoding":"s.SystemProperties.contentEncoding",
    "SystemProperties.enqueuedTime":"s.SystemProperties.enqueuedTime",  
}).whenNotMatchedInsert(
values = {
    "Body.type":"s.Body.type",
    "Body.mnsn":"s.Body.mnsn",
    "Body.timestamp":"s.Body.timestamp",
    "Body.scheduleName":"s.Body.scheduleName",
    "EnqueuedTimeUtc":"s.EnqueuedTimeUtc",
    "Properties.appTopic":"s.Properties.appTopic",
    "SystemProperties.connectionDeviceId":"s.SystemProperties.connectionDeviceId",
    "SystemProperties.connectionAuthMethod":"s.SystemProperties.connectionAuthMethod",
    "SystemProperties.connectionDeviceGenerationId":"s.SystemProperties.connectionDeviceGenerationId",
    "SystemProperties.contentType":"s.SystemProperties.contentType",
    "SystemProperties.contentEncoding":"s.SystemProperties.contentEncoding",
    "SystemProperties.enqueuedTime":"s.SystemProperties.enqueuedTime",  
    }
).execute()

AnalysisException: [DELTA_UNSUPPORTED_NESTED_FIELD_IN_OPERATION] Nested field is not supported in the INSERT clause of MERGE operation (field = `SystemProperties`.`enqueuedTime`).

In [89]:
cond = (F.col("t.Body_mnsn") == F.col("s.Body_mnsn"))
dt.alias("t").merge(remaining_data.alias("s"),cond).whenMatchedUpdate(
set={
    "Body_timestamp":"s.Body_timestamp",
    "EnqueuedTimeUtc":"s.EnqueuedTimeUtc",
    "Properties_appTopic":"s.Properties_appTopic",
    "SystemProperties_connectionDeviceId":"s.SystemProperties_connectionDeviceId",
    "SystemProperties_connectionAuthMethod":"s.SystemProperties_connectionAuthMethod",
    "SystemProperties_connectionDeviceGenerationId":"s.SystemProperties_connectionDeviceGenerationId",
    "SystemProperties_contentType":"s.SystemProperties_contentType",
    "SystemProperties_contentEncoding":"s.SystemProperties_contentEncoding",
    "SystemProperties_enqueuedTime":"s.SystemProperties_enqueuedTime",  
}).whenNotMatchedInsert(
values = {
    "Body_type":"s.Body_type",
    "Body_mnsn":"s.Body_mnsn",
    "Body_timestamp":"s.Body_timestamp",
    "Body_scheduleName":"s.Body_scheduleName",
    "EnqueuedTimeUtc":"s.EnqueuedTimeUtc",
    "Properties_appTopic":"s.Properties_appTopic",
    "SystemProperties_connectionDeviceId":"s.SystemProperties_connectionDeviceId",
    "SystemProperties_connectionAuthMethod":"s.SystemProperties_connectionAuthMethod",
    "SystemProperties_connectionDeviceGenerationId":"s.SystemProperties_connectionDeviceGenerationId",
    "SystemProperties_contentType":"s.SystemProperties_contentType",
    "SystemProperties_contentEncoding":"s.SystemProperties_contentEncoding",
    "SystemProperties_enqueuedTime":"s.SystemProperties_enqueuedTime",  
    }
).execute()

In [90]:
df1=spark.read.format("delta").load("./message/basicUpdate")
df1.show()

+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
|Body_type|           Body_mnsn|Body_timestamp|Body_scheduleName|     EnqueuedTimeUtc| Properties_appTopic|SystemProperties_connectionDeviceId|SystemProperties_connectionAuthMethod|SystemProperties_connectionDeviceGenerationId|SystemProperties_contentType|SystemProperties_contentEncoding|SystemProperties_enqueuedTime|
+---------+--------------------+--------------+-----------------+--------------------+--------------------+-----------------------------------+-------------------------------------+---------------------------------------------+----------------------------+--------------------------------+-----------------------------+
| smsProps|mn=TVgwMDA0:sn=MD...| 1724147

In [80]:
df1.select('Body_type','Body_mnsn','Body_timestamp','Body_scheduleName').show(truncate=False)

+---------+-------------------------------+--------------+-----------------+
|Body_type|Body_mnsn                      |Body_timestamp|Body_scheduleName|
+---------+-------------------------------+--------------+-----------------+
|smsProps |mn=TVgwMTQ3:sn=MDUxMTMxMzAxNDc=|1724147812920 |basicUpdate      |
|smsProps |mn=TVgwMTQ4:sn=MDUxMTMxMzAxNDg=|1724147812922 |basicUpdate      |
|smsProps |mn=TVgwMTQ5:sn=MDUxMTMxMzAxNDk=|1724147812927 |basicUpdate      |
|smsProps |mn=TVgwMTUw:sn=MDUxMTMxMzAxNTA=|1724147812929 |basicUpdate      |
|smsProps |mn=TVgwMTUx:sn=MDUxMTMxMzAxNTE=|1724147812931 |basicUpdate      |
|smsProps |mn=TVgwMTUy:sn=MDUxMTMxMzAxNTI=|1724147812934 |basicUpdate      |
|smsProps |mn=TVgwMTUz:sn=MDUxMTMxMzAxNTM=|1724147812936 |basicUpdate      |
|smsProps |mn=TVgwMTU0:sn=MDUxMTMxMzAxNTQ=|1724147812937 |basicUpdate      |
|smsProps |mn=TVgwMTU1:sn=MDUxMTMxMzAxNTU=|1724147812938 |basicUpdate      |
|smsProps |mn=TVgwMTU2:sn=MDUxMTMxMzAxNTY=|1724147812940 |basicUpdate      |

In [77]:
df1.select('Body_mnsn').distinct().count()

500

In [83]:
path1='C:/Users/acer/Downloads/printermessages.json'
schema_df = spark.read.schema(schema).json(path1)

In [82]:
schema_df.select("Body.scheduleName").show()

+--------------+
|  scheduleName|
+--------------+
|  manualUpdate|
|  deviceStatus|
|suppliesUpdate|
|  deviceStatus|
|suppliesUpdate|
|  deviceStatus|
+--------------+



In [60]:
from pyspark.sql import Window
wind=Window.partitionBy("Body.mnsn").orderBy(F.col("Body.timestamp").desc())
latest=schedule_df.withColumn("Row_Num",F.row_number().over(wind)).filter(F.col("Row_Num")==1).drop("Row_Num")
latest.show()

+--------------------+--------------------+--------------------+--------------------+
|     EnqueuedTimeUtc|          Properties|    SystemProperties|                Body|
+--------------------+--------------------+--------------------+--------------------+
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e0b7-4e...|{smsProps, mn=TVg...|
|2024-08-20T09:56:...|{rp.mib/dev/96d1f...|{96d1f469-e

In [61]:
latest.write.format("delta").mode("overwrite").save(tabe_path)