# **Incremental Parameter**

In [83]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 86, Finished, Available, Finished)

# **CREATING DIMENSION - MODEL**

In [84]:
# Source rows for the model dimension, read straight from the silver-layer
# parquet files. Note that DISTINCT applies to the whole selected row, i.e. to
# the (Model_ID, model_category) pair; the parentheses around Model_ID do not
# restrict it to that column.
model_source_query = '''
SELECT DISTINCT(Model_ID),model_category 
FROM parquet.`abfss://incre_ws@onelake.dfs.fabric.microsoft.com/lake_san.Lakehouse/Files/silver/carsales`
'''
df_src = spark.sql(model_source_query)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 87, Finished, Available, Finished)

In [85]:
# Preview the distinct (Model_ID, model_category) rows read from the silver layer.
display(df_src)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 88, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e6d43058-028a-486b-999b-dc79e3cd94da)

## Initial and incremental load

In [86]:
# Current contents of the dimension, used to tell already-known models apart
# from new ones.
existing_dim_query = '''
    SELECT dim_model_key,Model_ID,model_category
    FROM dim_model'''

# Used on the very first run, when dim_model does not exist yet: the
# WHERE 1=0 predicate returns no rows, so the result is an empty DataFrame
# that only carries the expected columns.
empty_dim_query = '''
    SELECT 1 AS dim_model_key,Model_ID,model_category
    FROM parquet.`abfss://incre_ws@onelake.dfs.fabric.microsoft.com/lake_san.Lakehouse/Files/silver/carsales`
    WHERE 1=0 '''

df_sink = spark.sql(
    existing_dim_query if spark.catalog.tableExists("dim_model") else empty_dim_query
)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 89, Finished, Available, Finished)

In [87]:
# Left-join the source onto the existing dimension: every source row is kept,
# and dim_model_key is null for models that are not in the dimension yet.
same_model = df_src['Model_ID'] == df_sink['Model_ID']
df_filter = (
    df_src
    .join(df_sink, same_model, 'left')
    .select(
        df_src['Model_ID'],
        df_src['model_category'],
        df_sink['dim_model_key'],
    )
)

# Rows that already have a surrogate key (present in the dimension).
df_old = df_filter.where(col('dim_model_key').isNotNull())

# Rows without a key (new models); the null key column is dropped here.
df_new = (
    df_filter
    .where(col('dim_model_key').isNull())
    .select(df_src['Model_ID'], df_src['model_category'])
)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 90, Finished, Available, Finished)

In [88]:
# Preview the source rows that found no match in df_sink (no surrogate key yet).
display(df_new)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 91, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 966edc24-05bd-4aa1-baec-cddc95e4ace7)

## Surrogate key creation

In [89]:
# Highest surrogate key currently stored in dim_model, or 0 when there is none.
# Intended as the offset from which keys for new rows are numbered.
if spark.catalog.tableExists("dim_model"):
    # MAX() over a table with no rows returns NULL, which would surface as
    # max_value = None; COALESCE turns that into 0 so the value is always an int.
    max_value_df = spark.sql(
        '''SELECT COALESCE(MAX(dim_model_key), 0) AS max_dim_model_key FROM dim_model'''
    )  # returned as a single-row DataFrame
    max_value = max_value_df.collect()[0][0]
else:
    # Initial run: the table does not exist, so no key has been issued yet.
    max_value = 0

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 92, Finished, Available, Finished)

In [90]:
# Assign surrogate keys to the new models, continuing after the highest key
# that already exists (max_value) so keys issued on an incremental run can
# never collide with keys already stored in dim_model.
#
# row_number() over a fixed ordering yields dense, deterministic keys
# (offset+1, offset+2, ...). monotonically_increasing_id() is only guaranteed
# to be unique, not consecutive, and its values depend on how the data is
# partitioned. The un-partitioned window moves all rows to one partition,
# which is acceptable for a small dimension table.
key_offset = max_value or 0  # tolerate None (MAX over an empty table)
df_new = df_new.withColumn(
    "dim_model_key",
    # Cast to long so the column type matches keys produced by
    # monotonically_increasing_id() on earlier runs.
    (lit(key_offset) + row_number().over(Window.orderBy("Model_ID"))).cast("long"))


StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 93, Finished, Available, Finished)

In [91]:
# Preview the new rows with the dim_model_key column added.
display(df_new)

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 94, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f215cba9-d118-4b4e-8a07-f8f306cccb5d)

In [92]:
# Combine new and already-known models into the upsert source.
# unionByName matches columns by name instead of by position, so the result
# does not depend on both DataFrames listing their columns in the same order.
df_final = df_new.unionByName(df_old)

# De-duplicate on the business key, Model_ID. De-duplicating on dim_model_key
# would silently discard an entire model whenever two rows happened to share a
# key, and would leave duplicate Model_ID rows in place.
df_final = df_final.dropDuplicates(['Model_ID'])

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 95, Finished, Available, Finished)

# **SCD TYPE-1: UPSERT**

In [93]:
from delta.tables import DeltaTable

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 96, Finished, Available, Finished)

In [94]:
# Storage location of the gold-layer Delta table backing dim_model.
dim_model_path = 'abfss://incre_ws@onelake.dfs.fabric.microsoft.com/lake_san.Lakehouse/Files/gold/dim_model'

if not spark.catalog.tableExists('dim_model'):
    # Initial run: write df_final as a Delta table at dim_model_path and
    # register it in the catalog as dim_model.
    initial_writer = (
        df_final.write
        .format("delta")
        .mode('overwrite')
        .option('path', dim_model_path)
    )
    initial_writer.saveAsTable('dim_model')
else:
    # Incremental run: SCD Type-1 upsert keyed on Model_ID. Matched rows are
    # overwritten with the source values; unmatched source rows are inserted.
    delta_table = DeltaTable.forPath(spark, dim_model_path)
    upsert = (
        delta_table.alias("trg")
        .merge(df_final.alias("src"), "trg.Model_ID=src.Model_ID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()

StatementMeta(, 899f9616-f907-4cd5-9f81-1cb89b221f46, 97, Finished, Available, Finished)

In [2]:
%%sql
-- Inspect the full contents of the model dimension.
SELECT *
FROM dim_model

StatementMeta(, c8077879-f13b-45f0-817b-6850e6ea663d, 3, Finished, Available, Finished)

<Spark SQL result set with 282 rows and 3 fields>

In [1]:
%%sql
-- Uniqueness check: return only the rows whose dim_model_key occurs exactly
-- once. If this returns as many rows as the table holds, every key is unique.
WITH unique_keys AS (
    SELECT dim_model_key
    FROM dim_model
    GROUP BY dim_model_key
    HAVING COUNT(*) = 1
)
SELECT *
FROM dim_model
WHERE dim_model_key IN (SELECT dim_model_key FROM unique_keys);


StatementMeta(, c8077879-f13b-45f0-817b-6850e6ea663d, 2, Finished, Available, Finished)

<Spark SQL result set with 282 rows and 3 fields>