# Basic Operation 

### **Creating Schema**
Creating Source and Target Schemas in Spark SQL

In [0]:
# Make sure both schemas used by this notebook exist; IF NOT EXISTS keeps the
# statements safe to run again.
schema_ddl = "CREATE SCHEMA IF NOT EXISTS workspace.{}"
spark.sql(schema_ddl.format("source"))  # schema holding the source copy
spark.sql(schema_ddl.format("target"))  # schema holding the merge target

DataFrame[]

### **Loading and Saving SCD1 Data Table**
Read from AccuWeather Source and Overwrite to Workspace Source Schema

In [0]:
# Pull the full sample forecast table and store it as this notebook's source
# table, replacing whatever the table held before.
scd1 = spark.sql("select * from samples.accuweather.forecast_hourly_metric")
scd1_writer = scd1.write.mode("overwrite")
scd1_writer.saveAsTable("workspace.source.scd1")

### **Read and Display SCD1 Source Table**
Load Data from workspace.source.scd1 and Visualize Contents

In [0]:
# Read the saved source table back into a DataFrame and show it.
source_table_name = 'workspace.source.scd1'
source = spark.read.table(source_table_name)
display(source)

### **Concatenate All Columns into 'ConCatValue'**
Transform Source Data by Merging All Columns into a Single Column

In [0]:
from pyspark.sql import functions as F

# Join the values of every column (empty separator) into one string column,
# 'ConCatValue'. It is the input for the row hash computed further down.
all_columns = source.columns
concatenated_values = F.concat_ws('', *all_columns)
source = source.withColumn('ConCatValue', concatenated_values)
display(source)

### Add Metadata Columns to Source Data
Include IndCurrent, CreatedDate, and ModifiedDate in the Dataset


> These columns are metadata: they describe the data rather than hold business values.
>
> - **IndCurrent**: Flags the current active record.
> - **CreatedDate**: Timestamp when the record was inserted.
> - **ModifiedDate**: Timestamp of the last update.
>
> They are used for tracking, versioning, and auditing.

In [0]:
# Attach the metadata columns: IndCurrent is the constant 1, and both date
# columns are set to the current timestamp.
load_timestamp = F.current_timestamp()
source = (
    source
    .withColumn("IndCurrent", F.lit(1))
    .withColumn("CreatedDate", load_timestamp)
    .withColumn("ModifiedDate", load_timestamp)
)
display(source)

### Add storage_id in Ascending Order
Assign Unique Row Numbers and Place storage_id as the First Column


In [0]:
from pyspark.sql.window import Window

# Number the rows 1..N, ordered by Spark's monotonically increasing id, and
# store the number as 'storage_id'.
row_order = Window.orderBy(F.monotonically_increasing_id())
source = source.withColumn("storage_id", F.row_number().over(row_order))

# Reorder the columns so that storage_id comes first and the rest keep their order.
leading_columns = ["storage_id"]
remaining_columns = [name for name in source.columns if name not in leading_columns]
source = source.select(leading_columns + remaining_columns)
display(source)

### Generate Row-Level Hash Using SHA-256
Create RowHash from Concatenated Values and Drop ConCatValue

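SHA-256 is a cryptographic hash function that generates a fixed 256-bit hash value, shown here as a 64-character hexadecimal string.
It acts as a compact digital fingerprint of the row: rows with different content produce, for all practical purposes, different hashes, which makes changes easy to detect.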

In [0]:
# Hash the concatenated string with SHA-256 into 'RowHash', then discard the
# intermediate 'ConCatValue' column.
row_hash = F.sha2(F.col("ConCatValue"), 256)
source = source.withColumn("RowHash", row_hash)
source = source.drop('ConCatValue')
display(source)

### Write to Target Schema and Display Data
Append Transformed Data to workspace.target.scd1 and View Contents



In [0]:
# Write the prepared rows to the target table. Append mode adds them to any
# rows the table already holds, so running this cell again adds them again.
target_writer = source.write.mode("append")
target_writer.saveAsTable("workspace.target.scd1")

# Read the target table back and show its contents.
target_df = spark.sql("SELECT * FROM workspace.target.scd1")
display(target_df)

# SCD_1

### Define Source and Target Table Names
Set SourceTable and TargetTable Variables for Reusability

In [0]:
# Fully qualified names of the tables used by the SCD1 section.
SourceTable = "workspace.source.scd1"  # incoming data
TargetTable = "workspace.target.scd1"  # table maintained by the merge

### Load Source and Target Tables into DataFrames

In [0]:
# Load both tables into DataFrames: SourceDf from SourceTable, TargetDf from TargetTable.
SourceDf, TargetDf = (
    spark.read.table(table_name) for table_name in (SourceTable, TargetTable)
)

In [0]:
# Show the source rows before any changes are applied.
display(SourceDf)

### Filter Rows by Latitude and Inspect City

In [0]:
from pyspark.sql.functions import col

# Show only the rows whose latitude equals 22.36851 so they can be inspected
# before the update below.
latitude_match = col("latitude") == "22.36851"
SourceDf.filter(latitude_match).display()

# Author's note: the city for the rows at this latitude is 'Pune'.

### Update City Name Based on Latitude

In [0]:
from pyspark.sql.functions import col, when

# Rewrite 'city_name': rows at latitude 22.36851 become 'Pune', every other
# row keeps the value it already has.
is_target_latitude = col("latitude") == "22.36851"
patched_city_name = when(is_target_latitude, "Pune").otherwise(col("city_name"))
SourceDf = SourceDf.withColumn("city_name", patched_city_name)

# Show the affected rows to confirm the new 'city_name' value.
SourceDf.filter(is_target_latitude).display()

### Create RowHash by Concatenating All Columns

In [0]:
# Build the change-detection key 'RowHash' for the incoming rows.
from pyspark.sql import functions as F

# The target's RowHash was written by the initial load as the SHA-256 of the
# ''-joined column values. The source side has to be hashed the same way:
# storing the raw concatenated string instead would make the merge's
# "src.RowHash != tgt.RowHash" test true for every matched row, so every row
# would be rewritten and the target would end up holding unhashed strings.
#
# Derived columns are left out of the hash input so that running this cell
# again (or after the metadata cell) still produces the same value; on a
# fresh run none of them exist yet, so every column is used.
derived_columns = {"RowHash", "IndCurrent", "CreatedDate", "ModifiedDate", "storage_id"}
hash_input_columns = [name for name in SourceDf.columns if name not in derived_columns]

# The empty separator matches the initial load so that unchanged rows hash identically.
SourceDf = SourceDf.withColumn(
    'RowHash',
    F.sha2(F.concat_ws('', *hash_input_columns), 256),
)

#### Add Metadata Columns to SourceDf

In [0]:
# Attach the metadata columns to SourceDf:
#   IndCurrent   - constant 1, marking the row as the current/active record
#   CreatedDate  - current timestamp
#   ModifiedDate - current timestamp
source_load_timestamp = F.current_timestamp()
SourceDf = (
    SourceDf
    .withColumn("IndCurrent", F.lit(1))
    .withColumn("CreatedDate", source_load_timestamp)
    .withColumn("ModifiedDate", source_load_timestamp)
)
display(SourceDf)

In [0]:
# Re-check the rows at latitude 22.36851 now that the hash and metadata columns are present.
display(SourceDf.filter(col("latitude") == "22.36851"))

#### Implement SCD Type 1 Using Delta Lake Merge
Update Target Table with Latest Data Based on storage_id and RowHash Comparison


In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.window import Window

# Static configuration.
# The merge has to run against the table filled by the initial load
# (TargetTable, i.e. workspace.target.scd1). The previous value,
# "workspace.target_table.sales", is never created anywhere in this notebook.
table_name = TargetTable
key_column = "storage_id"
timestamp_column = "ModifiedDate"
hash_column = "RowHash"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# SourceDf is read straight from the source table, which has no storage_id,
# so the join key (and the insert-all clause) could not be resolved. Derive
# the key exactly as the initial load did: row_number over
# monotonically_increasing_id.
# NOTE(review): this only lines up with the target's ids while the source
# table's row order is unchanged since the initial load - confirm, or switch
# to a real business key before relying on it.
merge_source = SourceDf
if key_column not in merge_source.columns:
    key_order = Window.orderBy(F.monotonically_increasing_id())
    merge_source = merge_source.withColumn(key_column, F.row_number().over(key_order))

# Aliases
src = merge_source.alias("src")
tgt = target_table.alias("tgt")

# Columns to update: everything except the key, ModifiedDate (set explicitly
# below) and CreatedDate (kept from the existing target row).
excluded_from_update = {key_column, timestamp_column, created_column}
columns_to_update = [
    col_name for col_name in merge_source.columns
    if col_name not in excluded_from_update
]

# Construct SET dictionary for update
set_dict = {col_name: col(f"src.{col_name}") for col_name in columns_to_update}
set_dict[timestamp_column] = current_timestamp()  # stamp the time of this update

# Perform SCD Type 1 MERGE: overwrite matched rows whose hash changed, insert
# rows whose key is not in the target yet.
tgt.merge(
    src,
    f"tgt.{key_column} = src.{key_column}"
).whenMatchedUpdate(
    condition=col(f"src.{hash_column}") != col(f"tgt.{hash_column}"),
    set=set_dict
).whenNotMatchedInsertAll().execute()

<delta.connect.tables.DeltaMergeBuilder at 0xff9df62b2e50>

### Query Target Table to Verify SCD1 Merge for Specific Row

In [0]:
# Fetch the target row with storage_id 1 to inspect the result of the merge.
verification_query = "select * from workspace.target.scd1 where storage_id = '1' "
display(spark.sql(verification_query))