`
Version History :
Created by - Shaurya Rawat
`

<br>

## Identity Conversion Utility
- Backup Delta files: move the EDW table files from the current table's location to the backup (old) table's location
- Create an unmanaged (external) table over the backup location
- Convert the identity column of the current table from <b>GENERATED ALWAYS AS IDENTITY</b> to <b>GENERATED BY DEFAULT AS IDENTITY</b>
- Resync the identity column

In [0]:
# Remove every widget currently defined on this notebook so the parameter
# widgets created in the next cell start from a clean state.
dbutils.widgets.removeAll()

In [0]:
# Default run parameters: the EDW table whose identity column is converted,
# and the table name under which its existing data is kept as a backup.
p_table_name_current = 'Invoice_Payment_Status_Dim'
p_table_name_backup = 'Invoice_Payment_Status_Dim_Old'

# Storage settings that stay fixed for this notebook.
p_container_name = 'datalake'
p_adls_zone = 'EDW'

# Publish each parameter as a text widget (same order as the assignments
# above) so the values can be overridden from the notebook UI or a job.
for widget_name, widget_default in (
    ("p_table_name_current", p_table_name_current),
    ("p_table_name_backup", p_table_name_backup),
    ("p_container_name", p_container_name),
    ("p_adls_zone", p_adls_zone),
):
    dbutils.widgets.text(widget_name, widget_default)

In [0]:
# Back up the current EDW table: capture its DDL, drop the table definition,
# then move its Delta files to the backup location.
#
# Any failure is reported and re-raised so the notebook stops here. A later
# cell deletes source_path, so continuing after a failed move could destroy
# the only copy of the data.

# Read the parameters back from the widgets so values edited in the UI or
# passed by a job take precedence over the defaults.
p_table_name_current = dbutils.widgets.get("p_table_name_current")
p_table_name_backup = dbutils.widgets.get("p_table_name_backup")
p_container_name = dbutils.widgets.get("p_container_name")
p_adls_zone = dbutils.widgets.get("p_adls_zone")
storage_account_name = dbutils.secrets.get(scope="kv-edw-scope", key="ADLS-StorageAccount")

# The DDL must be captured before the table is dropped; the conversion cell
# rebuilds the table from this statement.
Current_query_schema = f"Show create table edw.{p_table_name_current}"
EDW_table_schema = spark.sql(Current_query_schema).collect()[0][0]

source_path = f"abfss://{p_container_name}@{storage_account_name}.dfs.core.windows.net/{p_adls_zone}/{p_table_name_current}/Internal/"
destination_path = f"abfss://{p_container_name}@{storage_account_name}.dfs.core.windows.net/{p_adls_zone}/{p_table_name_backup}/Internal/"

# Drop the existing EDW table definition.
# NOTE(review): this assumes the table is external, so DROP TABLE leaves the
# files under source_path in place - confirm before running on a new table.
Drop_Current_Table = f"Drop table edw.{p_table_name_current}"
try:
    spark.sql(Drop_Current_Table)
    print(f"Dropped EDW External Table: {p_table_name_current}")
except Exception as e:
    print("Error dropping table", e)
    raise

# Move (not copy) the table's files to the backup location.
try:
    print(f"Moving files from {source_path} to {destination_path}")
    dbutils.fs.mv(source_path, destination_path, True)
    print("All files processed...")
    print("Number of files processed: " + str(len(dbutils.fs.ls(destination_path))))
except Exception as e:
    # Include the cause; the original message gave no hint of what went wrong.
    print("Error processing files", e)
    raise

Dropped EDW External Table: Invoice_Payment_Status_Dim
Moving files from abfss://datalake@[REDACTED].dfs.core.windows.net/EDW/Invoice_Payment_Status_Dim/Internal/ to abfss://datalake@[REDACTED].dfs.core.windows.net/EDW/Invoice_Payment_Status_Dim_Old/Internal/
All files processed...
Number of files processed: 8


In [0]:
# Register the files at the backup location as a Delta table so the backed-up
# data can be queried (and re-inserted) by the following cells.
query = f"""
CREATE TABLE IF NOT EXISTS edw.`{p_table_name_backup}`
USING DELTA 
LOCATION 'abfss://{p_container_name}@{storage_account_name}.dfs.core.windows.net/{p_adls_zone}/{p_table_name_backup}/Internal'
"""

try:
    spark.sql(query)
    print(query)
    print(f"Table {p_table_name_backup} created successfully or already exists.")
except Exception as e:
    print(f"Table creation failed due to: {e}")
    # Stop the run: the conversion cell reads from this backup table, so it
    # must not execute when the backup could not be registered.
    raise


CREATE TABLE IF NOT EXISTS edw.`Invoice_Payment_Status_Dim_Old`
USING DELTA 
LOCATION 'abfss://datalake@[REDACTED].dfs.core.windows.net/EDW/Invoice_Payment_Status_Dim_Old/Internal'

Table Invoice_Payment_Status_Dim_Old created successfully or already exists.


In [0]:

# Recreate the current EDW table with a GENERATED BY DEFAULT identity column,
# reload its rows from the backup table, and resync the identity counter.

# NOTE(review): the identity column is assumed to be named "<table>_Id";
# confirm this holds for every table the notebook is run against.
Sync_identity_query = f"ALTER TABLE edw.{p_table_name_current} ALTER COLUMN {p_table_name_current}_Id SYNC IDENTITY"
insert_query = f"INSERT INTO edw.{p_table_name_current} SELECT * FROM edw.{p_table_name_backup}"
modified_schema = EDW_table_schema.replace("GENERATED ALWAYS AS IDENTITY", "GENERATED BY DEFAULT AS IDENTITY")

if modified_schema == EDW_table_schema:
    # Nothing was replaced, so the table is rebuilt with its original DDL.
    print("Warning: 'GENERATED ALWAYS AS IDENTITY' not found in the table DDL; identity definition is unchanged.")

try:
    # Safety guard: never delete the source directory unless the backup
    # location actually holds files (ls raises if the path does not exist).
    if not dbutils.fs.ls(destination_path):
        raise RuntimeError(f"Backup location {destination_path} is empty; refusing to delete {source_path}")

    # Delete existing directory in ADLS for EDW table to avoid sync issues
    dbutils.fs.rm(source_path, recurse=True)

    # Recreate EDW table with new Identity features
    spark.sql(modified_schema)

    # Insert data into EDW table
    spark.sql(insert_query)

    # Syncing Identity Value
    spark.sql(Sync_identity_query)

    print(modified_schema)
except Exception as e:
    print(f"Failed due to: {e}")
    # Re-raise so the run stops instead of reaching validation in an unknown state.
    raise




CREATE TABLE sdev.edw.Invoice_Payment_Status_Dim (
  Invoice_Payment_Status_Dim_Id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1),
  Rec_Process_Log_Id BIGINT NOT NULL,
  Rec_Src_Application_Id INT NOT NULL,
  Rec_Src_Key VARCHAR(150) NOT NULL,
  Rec_Src_Create_Date TIMESTAMP,
  Rec_Src_Create_User STRING,
  Rec_Src_Update_Date TIMESTAMP,
  Rec_Src_Update_User STRING,
  Rec_EDW_Hash VARCHAR(255),
  Rec_EDW_Create_Date TIMESTAMP NOT NULL,
  Rec_EDW_Update_Date TIMESTAMP NOT NULL,
  Invoice_Payment_Status_Code INT NOT NULL,
  Invoice_Payment_Status VARCHAR(100) NOT NULL)
USING delta
LOCATION 'abfss://datalake@[REDACTED].dfs.core.windows.net/EDW/Invoice_Payment_Status_Dim/Internal'
TBLPROPERTIES (
  'delta.enableDeletionVectors' = 'true',
  'delta.feature.deletionVectors' = 'supported',
  'delta.feature.identityColumns' = 'supported',
  'delta.feature.invariants' = 'supported',
  'delta.minReaderVersion' = '3',
  'delta.minWriterVersion' = '7')



In [0]:
# Validate that the rebuilt table holds exactly the same rows as the backup.
# Backup_table_validation is set to 1 only when every check passes.
Backup_table_validation = 0

# Rows present in the backup but missing from the rebuilt table.
result_backup_minus_current = spark.sql(f"""
    SELECT * FROM edw.`{p_table_name_backup}`
    MINUS
    SELECT * FROM edw.`{p_table_name_current}`
""")

# Rows present in the rebuilt table but not in the backup. A one-directional
# MINUS cannot detect these.
result_current_minus_backup = spark.sql(f"""
    SELECT * FROM edw.`{p_table_name_current}`
    MINUS
    SELECT * FROM edw.`{p_table_name_backup}`
""")

# MINUS compares distinct rows, so duplicate-row differences only show up in
# the total row counts.
backup_row_count = spark.sql(f"SELECT COUNT(*) FROM edw.`{p_table_name_backup}`").collect()[0][0]
current_row_count = spark.sql(f"SELECT COUNT(*) FROM edw.`{p_table_name_current}`").collect()[0][0]

missing_in_current = result_backup_minus_current.count()
extra_in_current = result_current_minus_backup.count()

if missing_in_current == 0 and extra_in_current == 0 and backup_row_count == current_row_count:
    Backup_table_validation = 1
    print("Success: Both tables have the same data.")
else:
    print("Error: Data mismatch found.")
    print(f"Row counts - backup: {backup_row_count}, current: {current_row_count}")
    if missing_in_current:
        print(f"Rows in {p_table_name_backup} missing from {p_table_name_current}: {missing_in_current}")
        result_backup_minus_current.show()
    if extra_in_current:
        print(f"Rows in {p_table_name_current} not present in {p_table_name_backup}: {extra_in_current}")
        result_current_minus_backup.show()

Success: Both tables have the same data.
