## Full data load with SCD Type 1 and SCD Type 2 implementation

In [0]:
from delta.tables import *
from pyspark.sql.functions import col,concat_ws,lit,current_timestamp,xxhash64

In [0]:
%sql
-- Make vsarthicat the current catalog for the statements that follow.
USE CATALOG vsarthicat;

In [0]:
%sql
-- Create the formula1_implem schema unless it already exists.
CREATE DATABASE IF NOT EXISTS formula1_implem ;

In [0]:
# %sql 
# DROP TABLE formula1_implem.circuits_scd2;

In [0]:
# Source data for this load, read from the global temp view `circuits_view`.
src_df = spark.table("global_temp.circuits_view")

In [0]:
# Schema of the source DataFrame, reused to build an empty frame with the same columns.
src_schema = src_df.schema

In [0]:
# Zero-row DataFrame carrying the source schema; written out when a target table
# has to be created for the first time.
empty_tgt_table = spark.createDataFrame([],src_schema)

In [0]:
# Whether the SCD1 target already exists. The fully qualified name is used so the
# check does not depend on the session's current catalog and matches the name the
# table is created, read and merged under elsewhere in this notebook.
table_scd1_exists = spark.catalog.tableExists("vsarthicat.formula1_implem.circuits_scd1")

In [0]:
if not table_scd1_exists:
    # First run: create the empty SCD1 target. The session default makes tables
    # created from here on start with the change data feed enabled.
    spark.conf.set("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")
    empty_tgt_table.write.format("delta").saveAsTable("vsarthicat.formula1_implem.circuits_scd1")
else:
    tgt_table_df = spark.table("vsarthicat.formula1_implem.circuits_scd1")

    # New records are the SOURCE rows whose circuit_id has no counterpart in the
    # target, so the source must be the left side of the anti join. Joining on
    # the column name also avoids referring to DataFrame aliases
    # ("tgt_table_df", "src_df") that were never declared with .alias().
    src_insert_df = src_df.join(tgt_table_df, on="circuit_id", how="left_anti")



In [0]:
if table_scd1_exists:
    # Hash the same columns, in the same order, on both sides. Hashing each
    # frame over its own column list would flag every row as modified as soon
    # as the column order differs, and would fold a pre-existing row_hash into
    # the hash if this cell is run twice.
    hash_cols = [c for c in src_df.columns if c != "row_hash"]
    src_df = src_df.withColumn("row_hash", xxhash64(*hash_cols))
    tgt_table_df = tgt_table_df.withColumn("row_hash", xxhash64(*hash_cols))

    # A source row is unchanged only when a target row has the same id AND hash.
    join_cond = [
        src_df["circuit_id"] == tgt_table_df["circuit_id"],
        src_df["row_hash"] == tgt_table_df["row_hash"]
    ]

    # The anti join keeps source rows with no identical target row, i.e. both
    # brand-new and modified records.
    src_modify_df = src_df.join(tgt_table_df, on=join_cond, how="left_anti")


In [0]:
if table_scd1_exists:
    # src_modify_df is an anti join on (circuit_id, row_hash), so it already
    # contains the new rows as well as the modified ones. Unioning it with
    # src_insert_df would duplicate the new rows, and the positional union would
    # fail outright because only src_modify_df carries the row_hash column.
    # The helper hash column is dropped so the frame has the source schema again.
    src_update_df = src_modify_df.drop("row_hash")
else:
    # The target was just created empty: every source row has to be loaded.
    src_update_df = src_df.drop("row_hash")

In [0]:
# Render the source DataFrame in the notebook for inspection.
src_df.display()

In [0]:
deltaTableCircuitsScd1 = DeltaTable.forName(spark, 'vsarthicat.formula1_implem.circuits_scd1')

# SCD1 sync of the target with the source snapshot.
#
# The merge source must be the FULL source snapshot: whenNotMatchedBySourceDelete()
# removes every target row that has no partner in the source, so feeding it only
# the new/changed rows (src_update_df) would delete all unchanged rows.
# The helper column row_hash is excluded because the target does not have it.
scd1_cols = [c for c in src_df.columns if c != "row_hash"]
scd1_src_df = src_df.select(*scd1_cols)

# Null-safe "at least one column differs" predicate. Restricting the update to
# rows that really changed keeps unchanged rows untouched, so they are not
# rewritten by the merge.
# NOTE(review): assumes every column type supports the <=> comparison — confirm
# if map-typed columns are ever added to the source.
scd1_row_changed = " OR ".join(
    f"NOT (src.`{c}` <=> tgt.`{c}`)" for c in scd1_cols
)

deltaTableCircuitsScd1.alias('tgt') \
  .merge(
    scd1_src_df.alias('src'),
    'src.circuit_id = tgt.circuit_id'
  ) \
  .whenMatchedUpdateAll(condition=scd1_row_changed) \
  .whenNotMatchedInsertAll() \
  .whenNotMatchedBySourceDelete() \
  .execute()

In [0]:
%sql
-- Show the current contents of the SCD1 table.
SELECT * FROM formula1_implem.circuits_scd1;

In [0]:
if not spark.catalog.tableExists("vsarthicat.formula1_implem.circuits_scd2"):
    # Create the empty SCD2 history table: the source columns plus the validity
    # window (insertion_timestamp / end_timestamp) and the SCD1 commit version
    # that produced each row. The frame has no rows, so the expressions below
    # only determine the column types.
    final_empty_tgt_table = (
        empty_tgt_table
        .withColumn("insertion_timestamp", current_timestamp())
        .withColumn("end_timestamp", current_timestamp())
        # The change feed exposes _commit_version as a long; use the same type
        # here instead of the int that a bare lit(0) would produce.
        .withColumn("commit_version", lit(0).cast("long"))
    )
    final_empty_tgt_table.write.format("delta").saveAsTable("vsarthicat.formula1_implem.circuits_scd2")

In [0]:
deltaTableCircuitsScd2 = DeltaTable.forName(spark, 'vsarthicat.formula1_implem.circuits_scd2')
circuits_scd2_df = deltaTableCircuitsScd2.toDF()

# Find the most recent SCD1 commit already recorded in the SCD2 table. The sort
# must be descending: an ascending sort returns the OLDEST processed version,
# which makes every run re-read (and re-insert) changes that were handled before.
latest_row = (
    circuits_scd2_df
    .select("commit_version")
    .orderBy(col("commit_version").desc())
    .first()
)

if latest_row is None or latest_row["commit_version"] is None:
    # Nothing processed yet: read the change feed from the first table version.
    start_version = 0
else:
    # Resume right after the last processed commit so it is not applied twice.
    # NOTE(review): this relies on the SCD1 merge above having produced a newer
    # commit; a starting version beyond the table's latest commit is expected to
    # be rejected by the change feed reader — confirm for no-op runs.
    start_version = latest_row["commit_version"] + 1


In [0]:
# Read the change data feed of the SCD1 table, starting at start_version.
cdf_reader = spark.read.option("readChangeFeed", "true")
cdf_reader = cdf_reader.option("startingVersion", start_version)
cdf = cdf_reader.table("formula1_implem.circuits_scd1")

In [0]:
# Timestamp expression reused for insertion_timestamp / end_timestamp in the SCD2 steps.
# NOTE(review): current_timestamp() yields a Column expression, so presumably it is
# evaluated when each query runs rather than captured at this point — confirm.
current_time = current_timestamp()

In [0]:
# cdf.display()

In [0]:
# Change-feed rows describing a superseded or removed record: the pre-image of
# an update, or a delete. They carry a merge_key built from circuit_id and
# circuit_ref, and both timestamps are set to current_time.
change_type = col("_change_type")
is_old_image = (change_type == "update_preimage") | (change_type == "delete")

del_df = (
    cdf.filter(is_old_image)
    .drop("_change_type", "_commit_timestamp")
    .withColumn("merge_key", concat_ws("_", "circuit_id", "circuit_ref"))
    .withColumnRenamed("_commit_version", "commit_version")
    .withColumn("insertion_timestamp", current_time)
    .withColumn("end_timestamp", current_time)
)



In [0]:
# del_df.display()

In [0]:
# Change-feed rows describing a new record version: the post-image of an update,
# or an insert. Their merge_key is the literal string "NULL", insertion_timestamp
# is current_time and end_timestamp is left null.
change_type = col("_change_type")
is_new_image = (change_type == "update_postimage") | (change_type == "insert")

upd_df = (
    cdf.filter(is_new_image)
    .drop("_change_type", "_commit_timestamp")
    .withColumn("merge_key", lit("NULL"))
    .withColumnRenamed("_commit_version", "commit_version")
    .withColumn("insertion_timestamp", current_time)
    .withColumn("end_timestamp", lit(None))
)

In [0]:
# Render the new-version rows in the notebook for inspection.
upd_df.display()

In [0]:
# Stack both row sets into the single source of the SCD2 merge. union() matches
# columns by position, so del_df and upd_df must keep the same column order.
union_df = del_df.union(upd_df)

In [0]:
# Apply the change-feed rows to the SCD2 history table.
#
# A source row may only close the CURRENT version of a record, i.e. the target
# row with the same key whose end_timestamp is still null. Matching on the key
# alone would also hit versions closed by earlier runs and overwrite their
# end_timestamp and commit_version.
scd2_match_cond = (
    (concat_ws("_", col("tgt.circuit_id"), col("tgt.circuit_ref")) == col("src.merge_key"))
    & col("tgt.end_timestamp").isNull()
)

deltaTableCircuitsScd2.alias('tgt') \
  .merge(
    source=union_df.alias('src'),
    condition=scd2_match_cond
  ) \
  .whenMatchedUpdate(set = {
      # Close the open version and record the commit that superseded it.
      "end_timestamp": current_time,
      "commit_version":col("src.commit_version")

  }) \
  .whenNotMatchedInsertAll() \
  .execute()

In [0]:
%sql 
-- Show the current contents of the SCD2 history table.
SELECT * FROM formula1_implem.circuits_scd2;