<a href="https://colab.research.google.com/github/RajaSuhashKesari/MyDataEngineeringPractices/blob/main/IncremantalLoadusingHashColumn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, concat_ws, lit, sha2
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the Spark session for this demo (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("IncrementalLoadDemo")
    .getOrCreate()
)

# Column layout shared by both DataFrames: nullable integer id + nullable string name.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
)

# Incoming (source) rows: ids 1..7 paired with a name each.
source_data = list(
    enumerate(
        ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace"],
        start=1,
    )
)

# Rows that are already present in the target: ids 1..3.
target_data = list(zip((1, 2, 3), ("Alice", "Bob", "Charlie")))

# Materialise both row lists as DataFrames with the shared schema.
source_df = spark.createDataFrame(source_data, schema)
target_df = spark.createDataFrame(target_data, schema)

# Display the target first, then the source, each under its own heading.
for heading, frame in (
    ("🔵 Target Data:", target_df),
    ("🟢 Source Data:", source_df),
):
    print(heading)
    frame.show()


🔵 Target Data:
+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

🟢 Source Data:
+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
|  5|    Eve|
|  6|  Frank|
|  7|  Grace|
+---+-------+



In [1]:
from pyspark.sql.functions import sha2,concat_ws

In [5]:
# Placeholder hashed in place of NULL. concat_ws() silently skips NULL inputs,
# so without it (1, NULL, 'a') and (1, 'a', NULL) would hash identically.
NULL_TOKEN = "<NULL>"


def add_row_hash(df, hash_col="hash"):
    """Return `df` with a SHA-256 fingerprint of every data column in `hash_col`.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to fingerprint.
    hash_col : str, default "hash"
        Name of the fingerprint column. If `df` already has a column with this
        name it is excluded from the hash input and overwritten, so calling
        the function again (e.g. re-running this cell) gives the same hashes.

    Returns
    -------
    pyspark.sql.DataFrame
        `df` plus `hash_col`, the hex SHA-256 of the data columns joined
        with "||" in `df.columns` order.
    """
    data_cols = [c for c in df.columns if c != hash_col]
    # The explicit string cast matches what concat_ws does implicitly, so
    # hashes of fully non-null rows are the same as without the coalesce.
    parts = [coalesce(col(c).cast("string"), lit(NULL_TOKEN)) for c in data_cols]
    return df.withColumn(hash_col, sha2(concat_ws("||", *parts), 256))


# Fingerprint the incoming rows and the already-loaded rows the same way.
staging_df = add_row_hash(source_df)
staging_df.show()
target_df = add_row_hash(target_df)
target_df.show()

+---+-------+--------------------+
| id|   name|                hash|
+---+-------+--------------------+
|  1|  Alice|7b487f35c19464008...|
|  2|    Bob|47e43681306972ff1...|
|  3|Charlie|6c4c2863a1da2e2e0...|
|  4|  David|d1e2e0f0a17c16174...|
|  5|    Eve|3c50586e6386ea2fa...|
|  6|  Frank|8741e8d992c9a3f0c...|
|  7|  Grace|e48755cf95108e3a3...|
+---+-------+--------------------+

+---+-------+--------------------+
| id|   name|                hash|
+---+-------+--------------------+
|  1|  Alice|7b487f35c19464008...|
|  2|    Bob|47e43681306972ff1...|
|  3|Charlie|6c4c2863a1da2e2e0...|
+---+-------+--------------------+



In [6]:
# Keep the staging rows whose fingerprint has no match in the target.
# Anti-joining on "hash" (rather than on "id") catches both brand-new ids and
# existing ids whose attributes changed; joining on "id" alone would never
# notice a modified row, leaving the hash column unused.
# Because change_data can therefore hold updated versions of ids that already
# exist in the target, loading it must replace rows by id, not only append.
change_data = staging_df.join(target_df.select("hash"), on="hash", how="left_anti")
# The anti-join puts the join key first; restore the staging column order.
change_data = change_data.select(*staging_df.columns)
change_data.show()

+---+-----+--------------------+
| id| name|                hash|
+---+-----+--------------------+
|  6|Frank|8741e8d992c9a3f0c...|
|  5|  Eve|3c50586e6386ea2fa...|
|  4|David|d1e2e0f0a17c16174...|
|  7|Grace|e48755cf95108e3a3...|
+---+-----+--------------------+



In [7]:
# Upsert change_data into the target.
# 1. Remove target rows whose id reappears in change_data, so an updated row
#    replaces its old version instead of producing a duplicate id.
# 2. Drop the helper "hash" column on both sides before combining; dropping a
#    column that is absent is a no-op, which keeps this cell re-runnable.
# 3. Combine with unionByName so columns are matched by name; plain union()
#    matches by position and would silently misalign differently ordered frames.
unchanged_rows = target_df.join(change_data.select("id"), on="id", how="left_anti")
target_df = unchanged_rows.drop("hash").unionByName(change_data.drop("hash"))
target_df.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  6|  Frank|
|  5|    Eve|
|  4|  David|
|  7|  Grace|
+---+-------+

