# Update Silver Schema

The goal of this notebook is to update the schema of all silver tables. It reads the bronze tables' schema (Lakehouse/Tables/tableschema), removes every column whose values are all null, all blank, or single-valued, and saves the result as the silver tables' schema in a table called tableschema_silver (Lakehouse/Tables/tableschema_silver).

In [1]:
from pyspark.sql import functions as F
from functools import reduce
from pyspark.sql.functions import monotonically_increasing_id, col, count, countDistinct, broadcast


StatementMeta(, 0daed764-000e-4f11-8491-9fc798f56906, 3, Finished, Available)

In [2]:
# Names of tables that must NOT have their silver schema refreshed automatically.
# `extract_names` (defined in this cell) receives this list and skips every name
# found in it, in addition to any table whose name contains "_silver".
# To keep a table out of the automatic update, add its name here.
# Note: the schema table written by this notebook is `tableschema_silver`
# (the previous entry was misspelled as 'tablesschema_silver').
tables_to_be_excluded = ['tableschema', 'tableschema_silver', 'sys_company', 'sys_usr']

def extract_names(file_info_list, tables_to_be_excluded):
    '''
    Return the names of the tables whose schema should be updated.

    Reads the `name` attribute of every entry in file_info_list and keeps,
    in input order, the names that neither contain the substring "_silver"
    nor appear in tables_to_be_excluded.
    '''
    candidate_names = (entry.name for entry in file_info_list)
    return [
        table_name
        for table_name in candidate_names
        if not ("_silver" in table_name or table_name in tables_to_be_excluded)
    ]

# List the FileInfo objects found under the lakehouse 'Tables/' folder
file_info_of_tables = mssparkutils.fs.ls('Tables/')

# Reduce the FileInfo objects to the list of table names whose schema will be updated
tables_list = extract_names(file_info_of_tables,tables_to_be_excluded)
# Bare expression: as the last statement of the cell it shows the list in the cell output
tables_list

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 4, Finished, Available)

['ak5010',
 'aov010',
 'cnf010',
 'fp0010',
 'fp1010',
 'fpa010',
 'fpg010',
 'fq4010',
 'p10010',
 'sa1010',
 'sa2010',
 'sa6010',
 'sc5010',
 'sc7010',
 'scp010',
 'scr010',
 'sd1010',
 'se1010',
 'se2010',
 'se5010',
 'sed010',
 'sf1010',
 'st7010',
 'st9010',
 'stj010',
 'stl010',
 'tqy010']

In [3]:
def find_completely_null_and_single_value_columns(df, columns_to_not_be_removed):
    '''
    List the columns of df that hold at most one distinct non-null value.

    A column is reported when countDistinct over it is 0 (no non-null value
    at all) or 1 (a single distinct value). Spark's countDistinct does not
    count NULLs, so a column mixing NULLs with exactly one other value is
    reported as well.

    columns_to_not_be_removed is a list of substrings: any column whose name
    contains one of them is never reported, whatever its distinct count.

    Returns the reported column names, in the order of df.columns.
    '''
    # One distinct-count expression per column, all evaluated in a single
    # aggregation; first() brings the one resulting row back to the driver.
    distinct_counts = df.agg(
        *(countDistinct(col(name)).alias(name) for name in df.columns)
    ).first()

    removable = []
    for name in df.columns:
        # More than one distinct value: the column carries information, keep it
        if distinct_counts[name] > 1:
            continue
        # Protected column (substring match against the keep-list): keep it
        if any(marker in name for marker in columns_to_not_be_removed):
            continue
        removable.append(name)

    return removable

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 5, Finished, Available)

In [4]:
def generate_silver_table_schema(df_tablesschema_silver, table, columns_to_be_removed, idx):
    '''
    Build the silver schema rows for a single table.

    Parameters:
    - df_tablesschema_silver: schema DataFrame; must expose the columns
      TABLE_NAME, COLUMN_NAME, ORDINAL_POSITION and DATA_TYPE.
    - table: table name; it is upper-cased before being compared to TABLE_NAME.
    - columns_to_be_removed: iterable of COLUMN_NAME values to leave out.
    - idx: table index; idx*1000 is added to every generated position so that
      each table gets its own block of ORDINAL_POSITION values.

    Returns a DataFrame with TABLE_NAME, COLUMN_NAME, ORDINAL_POSITION and
    DATA_TYPE, where ORDINAL_POSITION is idx*1000, idx*1000 + 1, ... following
    the order in which the rows arrive.
    '''
    # Local import: Window is not part of the notebook's top-level imports
    from pyspark.sql import Window

    # Keep only the rows of the requested table, minus the columns to be removed
    df_aux = (
        df_tablesschema_silver
        .select('TABLE_NAME', 'COLUMN_NAME', 'ORDINAL_POSITION', 'DATA_TYPE')
        .where(
            (~col('COLUMN_NAME').isin(list(columns_to_be_removed)))
            & (col('TABLE_NAME') == table.upper())
        )
    )

    # monotonically_increasing_id() is unique and increasing but NOT consecutive:
    # the partition id is encoded in the upper bits, so rows coming from a second
    # partition jump to values in the billions and idx*1000 no longer separates
    # the tables. It is therefore used only as a sort key that reproduces the
    # incoming row order; row_number() then yields the dense 0, 1, 2, ... sequence.
    # The window has no partitionBy, so Spark gathers the rows into one partition;
    # presumably fine here because only one table's schema rows are involved.
    incoming_order = Window.orderBy('_incoming_row_order')
    # cast to long to keep the type monotonically_increasing_id() used to produce
    new_position = (F.row_number().over(incoming_order) - 1 + (idx * 1000)).cast('long')

    df_silver_table_schema = (
        df_aux
        .withColumn('_incoming_row_order', monotonically_increasing_id())
        .withColumn('ORDINAL_POSITION', new_position)
        .drop('_incoming_row_order')
    )
    return df_silver_table_schema

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 6, Finished, Available)

In [5]:
def updade_silver_full_tables_schema(df_tablesschema_silver, table, df_silver_table_schema):
    '''
    Replace the rows of one table inside the full schema DataFrame.

    Takes TABLE_NAME, COLUMN_NAME, ORDINAL_POSITION and DATA_TYPE from
    df_tablesschema_silver, drops the rows whose TABLE_NAME equals the
    upper-cased table, and appends df_silver_table_schema in their place.

    The append is a plain union, which matches columns by position, so
    df_silver_table_schema is expected to carry the same four columns in the
    same order.

    Returns the combined DataFrame.
    '''
    schema_columns = ['TABLE_NAME', 'COLUMN_NAME', 'ORDINAL_POSITION', 'DATA_TYPE']
    belongs_to_other_table = col('TABLE_NAME') != table.upper()

    # Everything except the table being refreshed ...
    rows_of_other_tables = df_tablesschema_silver.select(*schema_columns).where(belongs_to_other_table)

    # ... plus the freshly generated rows for that table
    return rows_of_other_tables.union(df_silver_table_schema)

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 7, Finished, Available)

In [6]:
def write_parquet_file(df_full_silver_updated):
    '''
    Save a Spark DataFrame as the Delta table `tableschema_silver`.

    Despite the function name, the output is written with format "delta" via
    saveAsTable, not as a standalone parquet file. The table name is fixed
    inside the function and the write mode is "overwrite" with
    overwriteSchema enabled, so any existing content AND schema of the table
    are replaced. Be careful!

    Returns nothing.
    '''
    table_name = 'tableschema_silver'

    # The write mode is set once through .mode(); the previous call also repeated
    # it in saveAsTable and passed ifNotExists=True, which is not a saveAsTable
    # parameter (it was forwarded as an arbitrary writer option) and wrongly
    # suggested the table would only be created when missing.
    (
        df_full_silver_updated.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 8, Finished, Available)

In [21]:
'''
Main logic of the notebook. Steps performed here:
- Load the bronze layer schema from silver_protheus.tableschema.
- For every table in tables_list, detect the columns with no value or a single value
  and leave them out of that table's silver schema rows.
  NOTE(review): the detection helper uses countDistinct, which does not count NULLs, so a
  column holding only NULLs plus one other value (e.g. {None, 0}) counts as single-valued
  and IS removed. An earlier version of this note stated the opposite; confirm which rule
  is intended.
- Inside the loop, replace that table's rows in the running schema with the new silver rows.
- Renumber 'ORDINAL_POSITION' (offset by 1000 per table) so it can be used to order the
  columns like in Protheus.
- Save the updated silver schema as a Delta table named tableschema_silver.
- Exit the notebook returning a summary of the tables that succeeded and failed.
'''

# Substrings of column names that must always stay in the silver schema
columns_to_not_be_removed = ['_FILIAL', 'INGESTION_DATE', 'D_E_L_E_T_', 'R_E_C_N_O_','R_E_C_D_E_L_']

# Full bronze schema: source of the per-table rows handed to generate_silver_table_schema
df_tablesschema_raw = spark.sql("SELECT * FROM silver_protheus.tableschema ORDER BY sequential_number")

# Running schema: starts as the bronze schema and is updated table by table in the loop
df_tablesschema = spark.sql("SELECT TABLE_NAME,COLUMN_NAME,ORDINAL_POSITION,DATA_TYPE FROM silver_protheus.tableschema ORDER BY sequential_number")

output = {}
log_table_error = []
log_table_succs = []
print(log_table_succs)

for idx, table in enumerate(tables_list):
    try:
        # Read the bronze table whose columns will be analysed.
        # NOTE(review): broadcast() is a join hint and no join is visible here -- confirm it is needed.
        df = broadcast(spark.sql(f"SELECT * FROM silver_protheus.{table}"))

        # Columns with no value or a single value
        empty_and_single_value_columns = find_completely_null_and_single_value_columns(df, columns_to_not_be_removed)

        # NOTE(review): 'a' and 'b' are always appended to the removal list; they look like
        # placeholders -- confirm whether real column names were meant.
        columns_to_be_removed = (*empty_and_single_value_columns, 'a', 'b')

        # Silver schema rows for the table of this iteration (position offsets start at 1)
        df_silver_table_schema = generate_silver_table_schema(df_tablesschema_raw, table, columns_to_be_removed, (idx + 1))

        # Swap this table's rows in the running schema for the ones just generated
        df_tablesschema = updade_silver_full_tables_schema(df_tablesschema, table, df_silver_table_schema)

        log_table_succs.append(f'{table}')
        print(log_table_succs)
    except Exception as e:
        # Best effort: record the failure and carry on with the next table
        log_table_error.append(f'{table}: {e}')

# Order the complete schema and persist it as the tableschema_silver Delta table (overwrite)
df_tablesschema_silver = df_tablesschema.orderBy('ORDINAL_POSITION')
write_parquet_file(df_tablesschema_silver)

# Summary printed in the cell and returned as the notebook exit value
output.update({
    'successes': f'Silver schema update for tables in ({log_table_succs})',
    'Fail': f'Silver schema update for tables in ({log_table_error})',
})
print(output)
mssparkutils.notebook.exit(output)

StatementMeta(, 60e13ed9-4d11-4d36-bba1-35a9a03f76cd, 23, Finished, Available)

[]
['ak5010']
['ak5010', 'aov010']
['ak5010', 'aov010', 'cnf010']
{'successes': "Silver schema update for tables in (['ak5010', 'aov010', 'cnf010'])", 'Fail': 'Silver schema update for tables in ([])'}
ExitValue: {'successes': "Silver schema update for tables in (['ak5010', 'aov010', 'cnf010'])", 'Fail': 'Silver schema update for tables in ([])'}