Notebook: GSA_SAM_Initial_Delta_Table_Creation_PUBLIC<br>
Created by: Joshua Wilshere<br>
Created On: 4/3/24<br><br>
Synapse Spark Pool Version: 3.4<br><br>
Dependencies:<br> 
1. Unzipped pipe-delimited SAM monthly file in source_dir
    - Historical Public Files available here: https://sam.gov/data-services/Entity%20Registration/Public%20-%20Historical?privacy=Public
    - Current Public Files available here: https://sam.gov/data-services/Entity%20Registration/Public%20V2?privacy=Public
2. Accurate list of SAM column headers in GSA_SAM_COLUMN_HEADERS_PUBLIC.csv<br><br>
Purpose: Write initial monthly SAM file to target data source to serve as a base for future daily and monthly file incremental loads

In [None]:
# Processes in this notebook
# 1. Load SAM file column headers into Pandas dataframe
# 2. Create dictionary of column headers with all dtypes set to string
# 3. Load initial monthly SAM file into Pandas dataframe using dictionary of column headers and datatypes
# 4. Convert Pandas dataframe to Spark dataframe
# 5. Combine composite key columns into single unique entity key
# 6. Add initial audit columns
# 7. Convert NullType() Columns to StringType()
# 8. Create hash code of fields to track changes in (AUD_HASH_CODE)
# 9. Create surrogate unique key (AUD_SEQ_ID)
# 10. Update spark dataframe with final column order
# 11. Derive partition key and write initial data to silver delta file
# 12 (optional). Run Z-Order clustering on data

# Adapted from: https://iterationinsights.com/article/how-to-implement-slowly-changing-dimensions-scd-type-2-using-delta-table/

# <font size="5">0. Initialize packages, file names, and file paths</font>

In [1]:
import os

import pandas as pd

# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html
#from pyspark.sql.functions import concat, col, coalesce, lit, sha2, concat_ws, row_number, cast, window
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.types import NullType
from pyspark.sql.window import Window

StatementMeta(spk3u4py3u10, 2, 2, Finished, Available, Finished)

In [2]:
# Column-header definition file (stored next to the data file in source_dir)
header_csv = 'GSA_SAM_COLUMN_HEADERS_PUBLIC.csv'

# Monthly SAM extract that becomes the baseline load
source_file = 'SAM_PUBLIC_MONTHLY_V2_20241103.dat'

# The file date is the trailing yyyyMMdd stamp of the file name, extension removed
source_date = os.path.splitext(source_file)[0][-8:]

StatementMeta(spk3u4py3u10, 2, 3, Finished, Available, Finished)

In [None]:
# Data lake locations (abfss URLs); replace <storage account name> before running.
# source_dir: bronze folder holding the header CSV and the unzipped SAM file
source_dir = 'abfss://bronze@<storage account name>.dfs.core.usgovcloudapi.net/GSA_SAM_PUBLIC/'
# silver_path: location the Delta table is written to by this notebook
silver_path = 'abfss://silver@<storage account name>.dfs.core.usgovcloudapi.net/GSA_SAM_PUBLIC/'
# gold_path: defined here but not referenced by the cells shown in this notebook
gold_path = 'abfss://gold@<storage account name>.dfs.core.usgovcloudapi.net/GSA_SAM_PUBLIC/'

StatementMeta(spk3u4py3u10, 2, 4, Finished, Available, Finished)

# <font size="5">1. Load SAM file column headers into Pandas dataframe</font>

In [None]:
# Read the header-definition CSV; its column labels are the SAM field names.
# The directory and file name are passed to os.path.join as separate arguments
# (the same way the data file path is built) so the result does not depend on
# source_dir ending with a trailing slash.
col_df = pd.read_csv(os.path.join(source_dir, header_csv),
    storage_options = {'linked_service' : '<linked service name>'})

StatementMeta(spk3u4py3u10, 2, 5, Finished, Available, Finished)

In [5]:
#col_df

StatementMeta(spk3u4py3u10, 2, 6, Finished, Available, Finished)

# <font size="5">2. Create dictionary of column headers with all dtypes set to string</font>

In [6]:
# var_list: SAM column names in file order (the labels of the header CSV)
var_list = list(col_df.columns)

# var_dict: every column mapped to the 'str' dtype so all fields are read as text
var_dict = {column_name: 'str' for column_name in var_list}

StatementMeta(spk3u4py3u10, 2, 7, Finished, Available, Finished)

In [7]:
#print(var_dict)

StatementMeta(spk3u4py3u10, 2, 8, Finished, Available, Finished)

In [8]:
#print(var_list)

StatementMeta(spk3u4py3u10, 2, 9, Finished, Available, Finished)

In [None]:
# Full path of the monthly SAM file (source directory + file name); printed as a sanity check
source_path = os.path.join(source_dir,source_file)
print(source_path)

# <font size="5">3. Load initial monthly SAM file into Pandas dataframe using dictionary of column headers and datatypes</font>

In [None]:
# Parser settings for the pipe-delimited SAM extract, collected in one place
# NOTE(review): pandas' default NA parsing is left enabled, so literal field values such as
# 'NA' or 'NULL' are presumably read as missing - confirm this is intended for SAM data.
sam_read_options = dict(
    sep='|',
    # column names and dtypes (all 'str') come from the header definitions loaded above
    names=var_list,
    dtype=var_dict,
    # 3 = QUOTE_NONE: quote characters are treated as ordinary data
    quoting=3,
    doublequote=False,
    # skip garbage header and footer rows
    skiprows=1,
    skipfooter=1,
    # python engine must be specified for skipfooter to work
    engine='python',
    # Bad-line handling (useful for testing errors/rejected records):
    #   'error' - raise an Exception when a bad line is encountered
    #   'warn'  - raise a warning when a bad line is encountered and skip that line
    #   'skip'  - skip bad lines without raising or warning
    on_bad_lines='warn',
    storage_options={'linked_service' : '<linked service name>'},
)

# Load the monthly file into a pandas dataframe
pdf = pd.read_csv(source_path, **sam_read_options)

StatementMeta(spk3u4py3u10, 2, 11, Finished, Available, Finished)

In [11]:
# Let printed data frames show more rows (up to 999) and every column
# Reference: https://pandas.pydata.org/docs/user_guide/options.html
# Tip: in VSCode, also enable output scrolling (User Settings -> notebook.output.scrolling)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', None)

StatementMeta(spk3u4py3u10, 2, 12, Finished, Available, Finished)

In [12]:
# Preview the first two parsed records to confirm the columns line up with the headers
pdf.head(2)

StatementMeta(spk3u4py3u10, 2, 13, Finished, Available, Finished)

Unnamed: 0,UNIQUE_ENTITY_ID,BLANK_DEPRECATED,ENTITY_EFT_INDICATOR,CAGE_CODE,DODAAC,SAM_EXTRACT_CODE,PURPOSE_OF_REGISTRATION,INITIAL_REGISTRATION_DATE,REGISTRATION_EXPIRATION_DATE,LAST_UPDATE_DATE,ACTIVATION_DATE,LEGAL_BUSINESS_NAME,DBA_NAME,ENTITY_DIVISION_NAME,ENTITY_DIVISION_NUMBER,PHYSICAL_ADDRESS_LINE_1,PHYSICAL_ADDRESS_LINE_2,PHYSICAL_ADDRESS_CITY,PHYSICAL_ADDRESS_PROVINCE_OR_STATE,PHYSICAL_ADDRESS_ZIP/POSTAL_CODE,PHYSICAL_ADDRESS_ZIP_CODE_4,PHYSICAL_ADDRESS_COUNTRY_CODE,PHYSICAL_ADDRESS_CONGRESSIONAL_DISTRICT,D&B_OPEN_DATA_FLAG,ENTITY_START_DATE,FISCAL_YEAR_END_CLOSE_DATE,ENTITY_URL,ENTITY_STRUCTURE,STATE_OF_INCORPORATION,COUNTRY_OF_INCORPORATION,BUSINESS_TYPE_COUNTER,BUS_TYPE_STRING,PRIMARY_NAICS,NAICS_CODE_COUNTER,NAICS_CODE_STRING,PSC_CODE_COUNTER,PSC_CODE_STRING,CREDIT_CARD_USAGE,CORRESPONDENCE_FLAG,MAILING_ADDRESS_LINE_1,MAILING_ADDRESS_LINE_2,MAILING_ADDRESS_CITY,MAILING_ADDRESS_ZIP/POSTAL_CODE,MAILING_ADDRESS_ZIP_CODE_4,MAILING_ADDRESS_COUNTRY,MAILING_ADDRESS_STATE_OR_PROVINCE,GOVT_BUS_POC_FIRST_NAME,GOVT_BUS_POC_MIDDLE_INITIAL,GOVT_BUS_POC_LAST_NAME,GOVT_BUS_POC_TITLE,GOVT_BUS_POC_ST_ADD_1,GOVT_BUS_POC_ST_ADD_2,GOVT_BUS_POC_CITY,GOVT_BUS_POC_ZIP/POSTAL_CODE,GOVT_BUS_POC_ZIP_CODE_4,GOVT_BUS_POC_COUNTRY_CODE,GOVT_BUS_POC_STATE_OR_PROVINCE,ALT_GOVT_BUS_POC_FIRST_NAME,ALT_GOVT_BUS_POC_MIDDLE_INITIAL,ALT_GOVT_BUS_POC_LAST_NAME,ALT_GOVT_BUS_POC_TITLE,ALT_GOVT_BUS_POC_ST_ADD_1,ALT_GOVT_BUS_POC_ST_ADD_2,ALT_GOVT_BUS_POC_CITY,ALT_GOVT_BUS_POC_ZIP/POSTAL_CODE,ALT_GOVT_BUS_POC_ZIP_CODE_4,ALT_GOVT_BUS_POC_COUNTRY_CODE,ALT_GOVT_BUS_POC_STATE_OR_PROVINCE,PAST_PERF_POC_POC_FIRST_NAME,PAST_PERF_POC_POC_MIDDLE_INITIAL,PAST_PERF_POC_POC_LAST_NAME,PAST_PERF_POC_POC_TITLE,PAST_PERF_POC_ST_ADD_1,PAST_PERF_POC_ST_ADD_2,PAST_PERF_POC_CITY,PAST_PERF_POC_ZIP/POSTAL_CODE,PAST_PERF_POC_ZIP_CODE_4,PAST_PERF_POC_COUNTRY_CODE,PAST_PERF_POC_STATE_OR_PROVINCE,ALT_PAST_PERF_POC_FIRST_NAME,ALT_PAST_PERF_POC_MIDDLE_INITIAL,ALT_PAST_PERF_POC_LAST_NAME,ALT_PAST_PERF_POC_TITLE,ALT_PAST_PER
F_POC_ST_ADD_1,ALT_PAST_PERF_POC_ST_ADD_2,ALT_PAST_PERF_POC_CITY,ALT_PAST_PERF_POC_ZIP/POSTAL_CODE,ALT_PAST_PERF_POC_ZIP_CODE_4,ALT_PAST_PERF_POC_COUNTRY_CODE,ALT_PAST_PERF_POC_STATE_OR_PROVINCE,ELEC_BUS_POC_FIRST_NAME,ELEC_BUS_POC_MIDDLE_INITIAL,ELEC_BUS_POC_LAST_NAME,ELEC_BUS_POC_TITLE,ELEC_BUS_POC_ST_ADD_1,ELEC_BUS_POC_ST_ADD_2,ELEC_BUS_POC_CITY,ELEC_BUS_POC_ZIP/POSTAL_CODE,ELEC_BUS_POC_ZIP_CODE_4,ELEC_BUS_POC_COUNTRY_CODE,ELEC_BUS_POC_STATE_OR_PROVINCE,ALT_ELEC_POC_BUS_POC_FIRST_NAME,ALT_ELEC_POC_BUS_POC_MIDDLE_INITIAL,ALT_ELEC_POC_BUS_POC_LAST_NAME,ALT_ELEC_POC_BUS_POC_TITLE,ALT_ELEC_POC_BUS_ST_ADD_1,ALT_ELEC_POC_BUS_ST_ADD_2,ALT_ELEC_POC_BUS_CITY,ALT_ELEC_POC_BUS_ZIP/POSTAL_CODE,ALT_ELEC_POC_BUS_ZIP_CODE_4,ALT_ELEC_POC_BUS_COUNTRY_CODE,ALT_ELEC_POC_BUS_STATE_OR_PROVINCE,NAICS_EXCEPTION_COUNTER,NAICS_EXCEPTION_STRING,DEBT_SUBJECT_TO_OFFSET_FLAG,EXCLUSION_STATUS_FLAG,SBA_BUSINESS_TYPES_COUNTER,SBA_BUSINESS_TYPES_STRING,NO_PUBLIC_DISPLAY_FLAG,DISASTER_RESPONSE_COUNTER,DISASTER_RESPONSE_STRING,ENTITY_EVS_SOURCE,FLEX_FIELD_1,FLEX_FIELD_2,FLEX_FIELD_3,FLEX_FIELD_4,FLEX_FIELD_5,FLEX_FIELD_6,FLEX_FIELD_7,FLEX_FIELD_8,FLEX_FIELD_9,FLEX_FIELD_10,FLEX_FIELD_11,FLEX_FIELD_12,FLEX_FIELD_13,FLEX_FIELD_14,FLEX_FIELD_15,FLEX_FIELD_16,FLEX_FIELD_17,FLEX_FIELD_18,FLEX_FIELD_19,END_OF_RECORD_INDICATOR
0,C111ATT311C8,,,53YC5,,A,Z2,20131112,20250625,20240627,20240627,K & K CONSTRUCTION SUPPLY INC,,,,11400 WHITE ROCK RD,,RANCHO CORDOVA,CA,95742,6600,USA,6,,20060525,1215,www.kkconstructionsupply.com,2L,NV,USA,5,2X~8W~A2~HQ~XS,423390.0,9,423310Y~423320Y~423390Y~423510Y~423710Y~423990...,1,5680.0,Y,,11400 WHITE ROCK ROAD,,RANCHO CORDOVA,95742,7518,USA,CA,TRACY,,LOVELAND,DIRECTOR,11400 WHITE ROCK ROAD,,RANCHO CORDOVA,95742,,USA,CA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TRACY,,LOVELAND,PRESIDENT,11400 WHITE ROCK ROAD,,RANCHO CORDOVA,95742,,USA,CA,,,,,,,,,,,,0,,N,,0,,,0,,E&Y,,,,,,,,,,,,,,,,,,,,!end
1,C111BG66D155,,,6M9A6,,A,Z1,20111228,20250724,20240729,20240726,NEW ADVANCES FOR PEOPLE WITH DISABILITIES,NAPD,NAPD,,3400 N SILLECT AVE,,BAKERSFIELD,CA,93308,6363,USA,20,,19750301,922,www.napd-bak.org,8H,CA,USA,1,A8,,0,,0,,Y,,3400 N. SILLECT AVENUE,,BAKERSFIELD,93308,1815,USA,CA,RICHARD,,BARENCHI,DIRECTOR OF OPERATIONS & FINANCE,3400 N. SILLECT AVENUE,,BAKERSFIELD,93308,,USA,CA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,RICHARD,,BARENCHI,DIRECTOR OF OPERATIONS & FINANCE,3400 N. SILLECT AVENUE,,BAKERSFIELD,93308,,USA,CA,,,,,,,,,,,,0,,N,,0,,,0,,E&Y,,,,,,,,,,,,,,,,,,,,!end


In [13]:
# Number of records loaded from the monthly file
len(pdf)

StatementMeta(spk3u4py3u10, 2, 14, Finished, Available, Finished)

848539

# <font size="5">4. Convert Pandas dataframe to Spark dataframe</font>

In [14]:
# Convert pandas dataframe to spark dataframe
# No schema is passed, so Spark infers each column's type from the pandas data;
# the schema inspection later in this notebook shows some columns come out as NullType().
sam_main_df = spark.createDataFrame(pdf)

StatementMeta(spk3u4py3u10, 2, 15, Finished, Available, Finished)

In [15]:
# Snapshot the source column order before any derived/audit columns are added;
# reused when the final column order is set
base_cols = sam_main_df.columns

StatementMeta(spk3u4py3u10, 2, 16, Finished, Available, Finished)

In [16]:
# Preview ten rows of the Spark dataframe
display(sam_main_df.limit(10))

StatementMeta(spk3u4py3u10, 2, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ceb40075-1e2a-4f6e-af57-f53be17b840d)

# <font size="5">5. Combine composite key columns into single unique entity key</font>

In [17]:
# Build the entityKey column from UNIQUE_ENTITY_ID, CAGE_CODE and DODAAC (in that order).
# Each part falls back to '' when it is null so a missing part contributes nothing
# to the concatenated key instead of nulling it out.
entity_key_parts = [
    coalesce(col(part_name), lit(''))
    for part_name in ("UNIQUE_ENTITY_ID", "CAGE_CODE", "DODAAC")
]
sam_main_df = sam_main_df.withColumn("entityKey", concat(*entity_key_parts))

StatementMeta(spk3u4py3u10, 2, 18, Finished, Available, Finished)

In [18]:
# Confirm column was added (entityKey should appear at the end of the column list)
sam_main_df.columns

StatementMeta(spk3u4py3u10, 2, 19, Finished, Available, Finished)

['UNIQUE_ENTITY_ID',
 'BLANK_DEPRECATED',
 'ENTITY_EFT_INDICATOR',
 'CAGE_CODE',
 'DODAAC',
 'SAM_EXTRACT_CODE',
 'PURPOSE_OF_REGISTRATION',
 'INITIAL_REGISTRATION_DATE',
 'REGISTRATION_EXPIRATION_DATE',
 'LAST_UPDATE_DATE',
 'ACTIVATION_DATE',
 'LEGAL_BUSINESS_NAME',
 'DBA_NAME',
 'ENTITY_DIVISION_NAME',
 'ENTITY_DIVISION_NUMBER',
 'PHYSICAL_ADDRESS_LINE_1',
 'PHYSICAL_ADDRESS_LINE_2',
 'PHYSICAL_ADDRESS_CITY',
 'PHYSICAL_ADDRESS_PROVINCE_OR_STATE',
 'PHYSICAL_ADDRESS_ZIP/POSTAL_CODE',
 'PHYSICAL_ADDRESS_ZIP_CODE_4',
 'PHYSICAL_ADDRESS_COUNTRY_CODE',
 'PHYSICAL_ADDRESS_CONGRESSIONAL_DISTRICT',
 'D&B_OPEN_DATA_FLAG',
 'ENTITY_START_DATE',
 'FISCAL_YEAR_END_CLOSE_DATE',
 'ENTITY_URL',
 'ENTITY_STRUCTURE',
 'STATE_OF_INCORPORATION',
 'COUNTRY_OF_INCORPORATION',
 'BUSINESS_TYPE_COUNTER',
 'BUS_TYPE_STRING',
 'PRIMARY_NAICS',
 'NAICS_CODE_COUNTER',
 'NAICS_CODE_STRING',
 'PSC_CODE_COUNTER',
 'PSC_CODE_STRING',
 'CREDIT_CARD_USAGE',
 'CORRESPONDENCE_FLAG',
 'MAILING_ADDRESS_LINE_1',
 'MAILI

In [19]:
# Confirm contents of entityKey by showing it beside the three columns it is built from
display(sam_main_df.limit(10).select('entityKey', 'UNIQUE_ENTITY_ID','CAGE_CODE','DODAAC'))

StatementMeta(spk3u4py3u10, 2, 20, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 03ca53e5-1420-49ed-8dcc-a43461b06ab7)

# <font size="5">6. Add initial audit columns</font>

In [20]:
# Attach the initial audit columns to every row:
#   AUD_DIRECTORY / AUD_FILENAME - where the record was loaded from
#   AUD_ACTIVE_FLAG              - "Y" for every row of this initial load
#   EffectiveFromDate            - the file date parsed from source_date (yyyyMMdd)
#   EffectiveToDate              - null date
# Spark SQL date/time patterns: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
sam_main_df = (
    sam_main_df
    .withColumn("AUD_DIRECTORY", lit(source_dir))
    .withColumn("AUD_FILENAME", lit(source_file))
    .withColumn("AUD_ACTIVE_FLAG", lit("Y"))
    .withColumn("EffectiveFromDate", to_date(lit(source_date), "yyyyMMdd"))
    .withColumn("EffectiveToDate", lit(None).cast("date"))
)


StatementMeta(spk3u4py3u10, 2, 21, Finished, Available, Finished)

In [21]:
# Confirm column was added
#sam_main_df.columns

StatementMeta(spk3u4py3u10, 2, 22, Finished, Available, Finished)

In [22]:
# Create list of audit columns shared across tables
# (the five audit columns added in the previous step, used below to preview them)
aud_cols = ['AUD_DIRECTORY', 'AUD_FILENAME', 'AUD_ACTIVE_FLAG', 'EffectiveFromDate', 'EffectiveToDate']

StatementMeta(spk3u4py3u10, 2, 23, Finished, Available, Finished)

In [23]:
# Confirm column contents: preview the audit columns on two rows
display(sam_main_df.limit(2).select(*aud_cols))

StatementMeta(spk3u4py3u10, 2, 24, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3f676005-0cff-4d70-ba36-874357162397)

# <font size="5">7. Convert NullType() Columns to StringType()</font>

In [24]:
# Assign the spark dataframe schema to variable schemaStruct
# .schema returns a StructType object
# Note: this is the schema as of this cell; columns added or cast later are not reflected in it
schemaStruct = sam_main_df.schema

StatementMeta(spk3u4py3u10, 2, 25, Finished, Available, Finished)

In [25]:
# Flatten the Spark schema into a pandas dataframe for inspection:
# one row per field with its name, data type (as text) and nullability
# https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.types.StructType.html
# https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.types.StructField.html#pyspark.sql.types.StructField
schemaList = [
    {
        'fieldname': field.name,
        'datatype': str(field.dataType),
        'nullable': field.nullable,
    }
    for field in schemaStruct.fields
]

schema_df = pd.DataFrame(schemaList)
schema_df

StatementMeta(spk3u4py3u10, 2, 26, Finished, Available, Finished)

Unnamed: 0,fieldname,datatype,nullable
0,UNIQUE_ENTITY_ID,StringType(),True
1,BLANK_DEPRECATED,NullType(),True
2,ENTITY_EFT_INDICATOR,StringType(),True
3,CAGE_CODE,StringType(),True
4,DODAAC,StringType(),True
5,SAM_EXTRACT_CODE,StringType(),True
6,PURPOSE_OF_REGISTRATION,StringType(),True
7,INITIAL_REGISTRATION_DATE,StringType(),True
8,REGISTRATION_EXPIRATION_DATE,StringType(),True
9,LAST_UPDATE_DATE,StringType(),True


In [26]:
# List any columns that are NullType() before the spark dataframe is written to delta lake
# NullType() columns will not be written to the delta table, so they must be converted to another data type
schema_df[schema_df['datatype'] == 'NullType()']

StatementMeta(spk3u4py3u10, 2, 27, Finished, Available, Finished)

Unnamed: 0,fieldname,datatype,nullable
1,BLANK_DEPRECATED,NullType(),True
23,D&B_OPEN_DATA_FLAG,NullType(),True
38,CORRESPONDENCE_FLAG,NullType(),True
122,FLEX_FIELD_1,NullType(),True
123,FLEX_FIELD_2,NullType(),True
124,FLEX_FIELD_3,NullType(),True
125,FLEX_FIELD_4,NullType(),True
126,FLEX_FIELD_5,NullType(),True
127,FLEX_FIELD_6,NullType(),True
128,FLEX_FIELD_7,NullType(),True


In [27]:
# Cast every NullType() column of the dataframe to StringType() so it can be written to delta.
# The type is tested with isinstance rather than by comparing its printed form, because the
# text returned by str() for a data type is not stable across Spark versions.
# The schema is read from the dataframe itself so the check always reflects its current columns.
for field in sam_main_df.schema.fields:
    if isinstance(field.dataType, NullType):
        sam_main_df = sam_main_df.withColumn(field.name, col(field.name).cast('string'))

StatementMeta(spk3u4py3u10, 2, 28, Finished, Available, Finished)

In [28]:
# List the dataframe's columns again after the NullType() casts
sam_main_df.columns

StatementMeta(spk3u4py3u10, 2, 29, Finished, Available, Finished)

['UNIQUE_ENTITY_ID',
 'BLANK_DEPRECATED',
 'ENTITY_EFT_INDICATOR',
 'CAGE_CODE',
 'DODAAC',
 'SAM_EXTRACT_CODE',
 'PURPOSE_OF_REGISTRATION',
 'INITIAL_REGISTRATION_DATE',
 'REGISTRATION_EXPIRATION_DATE',
 'LAST_UPDATE_DATE',
 'ACTIVATION_DATE',
 'LEGAL_BUSINESS_NAME',
 'DBA_NAME',
 'ENTITY_DIVISION_NAME',
 'ENTITY_DIVISION_NUMBER',
 'PHYSICAL_ADDRESS_LINE_1',
 'PHYSICAL_ADDRESS_LINE_2',
 'PHYSICAL_ADDRESS_CITY',
 'PHYSICAL_ADDRESS_PROVINCE_OR_STATE',
 'PHYSICAL_ADDRESS_ZIP/POSTAL_CODE',
 'PHYSICAL_ADDRESS_ZIP_CODE_4',
 'PHYSICAL_ADDRESS_COUNTRY_CODE',
 'PHYSICAL_ADDRESS_CONGRESSIONAL_DISTRICT',
 'D&B_OPEN_DATA_FLAG',
 'ENTITY_START_DATE',
 'FISCAL_YEAR_END_CLOSE_DATE',
 'ENTITY_URL',
 'ENTITY_STRUCTURE',
 'STATE_OF_INCORPORATION',
 'COUNTRY_OF_INCORPORATION',
 'BUSINESS_TYPE_COUNTER',
 'BUS_TYPE_STRING',
 'PRIMARY_NAICS',
 'NAICS_CODE_COUNTER',
 'NAICS_CODE_STRING',
 'PSC_CODE_COUNTER',
 'PSC_CODE_STRING',
 'CREDIT_CARD_USAGE',
 'CORRESPONDENCE_FLAG',
 'MAILING_ADDRESS_LINE_1',
 'MAILI

# <font size="5">8. Create hash code of fields to track changes in (AUD_HASH_CODE)</font>

In [29]:
# Columns that are left out of the AUD_HASH_CODE calculation for the sam_main table:
# the listed key columns plus SAM_EXTRACT_CODE and END_OF_RECORD_INDICATOR
key_list = ['UNIQUE_ENTITY_ID', 'CAGE_CODE', 'DODAAC', 'END_OF_RECORD_INDICATOR']
remove_list = ['SAM_EXTRACT_CODE', 'END_OF_RECORD_INDICATOR', *key_list]

StatementMeta(spk3u4py3u10, 2, 30, Finished, Available, Finished)

In [30]:
#print(remove_list)

StatementMeta(spk3u4py3u10, 2, 31, Finished, Available, Finished)

In [31]:
# hash_cols: the source columns whose changes are tracked in sam_main,
# i.e. every column of var_list that is not named in remove_list (original order kept)
hash_cols = [column_name for column_name in var_list if column_name not in remove_list]

StatementMeta(spk3u4py3u10, 2, 32, Finished, Available, Finished)

In [32]:
#print(hash_cols)

StatementMeta(spk3u4py3u10, 2, 33, Finished, Available, Finished)

In [33]:
# Create the audit hash code of the change-tracked columns for sam_main:
# SHA-256 of the hash_cols values joined with "~".
# sha2() already returns a Column, so it is passed to withColumn directly (no lit() wrapper needed).
# NOTE(review): concat_ws leaves null inputs out of the joined string, so two rows that differ
# only in WHICH tracked column holds a value may hash the same. Changing the expression would
# alter every stored hash, so confirm against the incremental-load logic before touching it.
sam_main_df = sam_main_df.withColumn("AUD_HASH_CODE", sha2(concat_ws("~", *hash_cols), 256))

StatementMeta(spk3u4py3u10, 2, 34, Finished, Available, Finished)

# <font size="5">9. Create surrogate unique key (AUD_SEQ_ID)</font>

In [34]:
# Temporary surrogate key / sequence id for each record in sam_main:
# rows are numbered 1..N in (entityKey, AUD_HASH_CODE) order and the number is stored as a long.
# The window has no partitionBy, so the numbering is a single sequence over the whole dataframe.
window_sort = ['entityKey', 'AUD_HASH_CODE']
w = Window.orderBy(*window_sort)
sam_main_df = sam_main_df.withColumn("AUD_SEQ_ID", row_number().over(w).cast('long'))

StatementMeta(spk3u4py3u10, 2, 35, Finished, Available, Finished)

In [35]:
# Create list with order of audit columns to appear at beginning of the table
# (key, hash and sequence id first, followed by the remaining audit columns)
initial_aud_cols = ['entityKey','AUD_HASH_CODE', 'AUD_SEQ_ID',  'AUD_ACTIVE_FLAG', 'AUD_DIRECTORY', 'AUD_FILENAME','EffectiveFromDate', 'EffectiveToDate']

StatementMeta(spk3u4py3u10, 2, 36, Finished, Available, Finished)

# <font size="5">10. Update spark dataframe with final column order</font>

In [36]:
# Final column order: audit/key columns first, then the source columns in file order
# Delta Lake "indexes" on initial 32 columns in table, so join conditions should be up front
final_column_order = initial_aud_cols + base_cols
sam_main_df = sam_main_df.select(final_column_order)

StatementMeta(spk3u4py3u10, 2, 37, Finished, Available, Finished)

In [37]:
# Preview the key and audit columns on two rows after the reorder
display(sam_main_df.limit(2).select("entityKey", "AUD_SEQ_ID", "AUD_HASH_CODE", "AUD_DIRECTORY", "AUD_FILENAME", "AUD_ACTIVE_FLAG", "EffectiveFromDate", "EffectiveToDate"))

StatementMeta(spk3u4py3u10, 2, 38, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8cca1d57-a47e-4b5f-b2ab-4aba14ee448c)

In [38]:
# Convert INITIAL_REGISTRATION_DATE from yyyyMMdd text to date type
# This runs after AUD_HASH_CODE was computed, so the hash is based on the original text value
sam_main_df = sam_main_df.withColumn("INITIAL_REGISTRATION_DATE",to_date(col("INITIAL_REGISTRATION_DATE"), "yyyyMMdd"))
# Preview the converted column on two rows
display(sam_main_df.limit(2).select("INITIAL_REGISTRATION_DATE"))

StatementMeta(spk3u4py3u10, 2, 39, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d514dfae-b181-4030-9129-355844cc9f99)

In [39]:
# Final check prior to writing data to delta file: no column of sam_main_df may be NullType()
# This cell will only print output if NullType() columns are found
# The type is tested with isinstance rather than by comparing its printed form, because the
# text returned by str() for a data type is not stable across Spark versions.
for field in sam_main_df.schema.fields:
    if isinstance(field.dataType, NullType):
        print('sam_main_df column {} is NullType()'.format(field.name))

StatementMeta(spk3u4py3u10, 2, 40, Finished, Available, Finished)

# <font size="5">11. Write initial data to silver delta file</font>

In [40]:
# PARTITION_KEY is the first character of UNIQUE_ENTITY_ID, which has a relatively
# equal distribution across possible values
first_uei_char = col("UNIQUE_ENTITY_ID").substr(1, 1)
sam_main_df = sam_main_df.withColumn("PARTITION_KEY", first_uei_char)

StatementMeta(spk3u4py3u10, 2, 41, Finished, Available, Finished)

In [41]:
# Baseline write: save sam_main_df to the silver path as a Delta table,
# partitioned on PARTITION_KEY. Mode "overwrite" replaces whatever is already at that path.
(
    sam_main_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("PARTITION_KEY")
    .save(silver_path)
)

StatementMeta(spk3u4py3u10, 2, 42, Finished, Available, Finished)

In [42]:
# Read the Delta log written by the sam_main save and show each log line as text
delta_log_rows = spark.read.text(silver_path + "_delta_log/").collect()
[log_line.value for log_line in delta_log_rows]

StatementMeta(spk3u4py3u10, 2, 43, Finished, Available, Finished)

['{"commitInfo":{"timestamp":1749694120845,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[\\"PARTITION_KEY\\"]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"22","numOutputRows":"848539","numOutputBytes":"305075978"},"engineInfo":"Apache-Spark/3.4.3.5.3.20250408.3 Delta-Lake/2.4.0.23","txnId":"7873476c-15cf-4531-ba13-3e356d1a27c7"}}',
 '{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}',
 '{"metaData":{"id":"4f220f5b-88e8-4dff-93ca-82a0f5463da8","format":{"provider":"parquet","options":{}},"schemaString":"{\\"type\\":\\"struct\\",\\"fields\\":[{\\"name\\":\\"entityKey\\",\\"type\\":\\"string\\",\\"nullable\\":true,\\"metadata\\":{}},{\\"name\\":\\"AUD_HASH_CODE\\",\\"type\\":\\"string\\",\\"nullable\\":true,\\"metadata\\":{}},{\\"name\\":\\"AUD_SEQ_ID\\",\\"type\\":\\"long\\",\\"nullable\\":true,\\"metadata\\":{}},{\\"name\\":\\"AUD_ACTIVE_FLAG\\",\\"type\\":\\"string\\",\\"nullable\\":true,\\"metadata\\":{}

# <font size="5">12 (optional). Run Z-Order clustering on data</font>

In [43]:
# Run Z-Order clustering on the silver Delta table
# Note - order by keys may not be optimal for queries in daily file load yet
# DeltaTable is referenced directly (it is what `from delta.tables import *` provides);
# the package-qualified form `delta.DeltaTable` only resolves if the name `delta`
# happens to be bound in the notebook namespace.
(
    DeltaTable.forPath(spark, silver_path)
    .optimize()
    .executeZOrderBy("AUD_SEQ_ID", "AUD_HASH_CODE")
)

StatementMeta(spk3u4py3u10, 2, 44, Finished, Available, Finished)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemovedBreakdown:array<struct<reason:string,metrics:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>>>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClust

In [45]:
# Read the silver table back and preview one record from this monthly file
# that has an update in the daily file
silver_df = spark.read.format("delta").load(silver_path)
row_df = silver_df.where(col("UNIQUE_ENTITY_ID") == 'F743Q1LZ3VN9')

preview_cols = [
    "UNIQUE_ENTITY_ID", "LEGAL_BUSINESS_NAME", "entityKey", "AUD_SEQ_ID", "AUD_HASH_CODE",
    "AUD_DIRECTORY", "AUD_FILENAME", "AUD_ACTIVE_FLAG", "EffectiveFromDate", "EffectiveToDate",
]
display(row_df.select(*preview_cols))

StatementMeta(spk3u4py3u10, 2, 46, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2507452b-844a-49b9-bcd3-144141c31569)

In [46]:
# Print the column order of the table as read back from silver
print(row_df.columns)

StatementMeta(spk3u4py3u10, 2, 47, Finished, Available, Finished)

['entityKey', 'AUD_HASH_CODE', 'AUD_SEQ_ID', 'AUD_ACTIVE_FLAG', 'AUD_DIRECTORY', 'AUD_FILENAME', 'EffectiveFromDate', 'EffectiveToDate', 'UNIQUE_ENTITY_ID', 'BLANK_DEPRECATED', 'ENTITY_EFT_INDICATOR', 'CAGE_CODE', 'DODAAC', 'SAM_EXTRACT_CODE', 'PURPOSE_OF_REGISTRATION', 'INITIAL_REGISTRATION_DATE', 'REGISTRATION_EXPIRATION_DATE', 'LAST_UPDATE_DATE', 'ACTIVATION_DATE', 'LEGAL_BUSINESS_NAME', 'DBA_NAME', 'ENTITY_DIVISION_NAME', 'ENTITY_DIVISION_NUMBER', 'PHYSICAL_ADDRESS_LINE_1', 'PHYSICAL_ADDRESS_LINE_2', 'PHYSICAL_ADDRESS_CITY', 'PHYSICAL_ADDRESS_PROVINCE_OR_STATE', 'PHYSICAL_ADDRESS_ZIP/POSTAL_CODE', 'PHYSICAL_ADDRESS_ZIP_CODE_4', 'PHYSICAL_ADDRESS_COUNTRY_CODE', 'PHYSICAL_ADDRESS_CONGRESSIONAL_DISTRICT', 'D&B_OPEN_DATA_FLAG', 'ENTITY_START_DATE', 'FISCAL_YEAR_END_CLOSE_DATE', 'ENTITY_URL', 'ENTITY_STRUCTURE', 'STATE_OF_INCORPORATION', 'COUNTRY_OF_INCORPORATION', 'BUSINESS_TYPE_COUNTER', 'BUS_TYPE_STRING', 'PRIMARY_NAICS', 'NAICS_CODE_COUNTER', 'NAICS_CODE_STRING', 'PSC_CODE_COUNTER',