In [0]:
import os

def read_all_delta_tables(base_path):
    """
    Reads every Delta table stored in an immediate sub-folder of ``base_path``.

    A sub-folder is treated as a table when it directly contains a ``_delta_log``
    directory or at least one ``.parquet`` file. Each table is loaded with the Delta
    reader and registered under the name ``<folder>_modified`` (first letter of the
    folder name lower-cased) in two places: the returned dictionary and this
    module's globals(). Folders that fail to load are reported with print() and skipped.

    Args:
    base_path (str): Folder whose sub-folders hold the tables. A trailing '/' is accepted.

    Returns:
    dict: Generated DataFrame name -> DataFrame, for every table that was read.
    """
    # Build child paths from a slash-free root so "/mnt/x/" does not become "/mnt/x//Table/".
    root = base_path.rstrip('/')

    # Directory names may carry a trailing '/', so strip it before comparing;
    # otherwise the '_delta_log' exclusion can never match.
    folders = [
        entry.name
        for entry in dbutils.fs.ls(base_path)
        if entry.isDir() and entry.name.rstrip('/') != '_delta_log'
    ]

    dataframes = {}

    for folder in folders:
        folder_path = f"{root}/{folder}"

        # List the files in the folder
        entries = dbutils.fs.ls(folder_path)

        # A partitioned table keeps its parquet files in nested directories, so the
        # presence of a transaction log also counts as "this folder is a table".
        has_delta_log = any(
            entry.isDir() and entry.name.rstrip('/') == '_delta_log' for entry in entries
        )
        has_parquet = any(entry.name.endswith(".parquet") for entry in entries)
        if not (has_delta_log or has_parquet):
            continue

        try:
            # Read the Delta table using the folder path
            df = spark.read.format("delta").load(folder_path)

            # Folder name without the trailing slash
            folder_name = folder.rstrip('/')

            # Naming convention: lower-case only the first character, then add the suffix
            dataframe_name = f"{folder_name[:1].lower() + folder_name[1:]}_modified"

            # Publish the DataFrame as a global variable as well as in the result dict;
            # later cells look the DataFrames up by name through globals().
            globals()[dataframe_name] = df
            dataframes[dataframe_name] = df
        except Exception as e:
            # Best effort: one unreadable folder must not abort the remaining tables.
            print(f"Failed to read {folder}: {e}")
    return dataframes

# Root folder of the SalesLT tables in the silver layer; each sub-folder is treated as one table.
base_path = "/mnt/silver/SalesLT"

# Load every readable table under base_path. The result maps "<table>_modified" -> DataFrame;
# read_all_delta_tables also publishes each DataFrame as a global variable of that name.
dataframes = read_all_delta_tables(base_path)

In [0]:
from pyspark.sql import DataFrame

def standardize_column_names(dataframe: DataFrame) -> DataFrame:
    """
    Standardizes column names by converting them from camel case to capitalized words separated by underscores.
    Example: ThisIsColumn -> This_Is_Column

    An underscore is inserted only where a lowercase letter is immediately followed by
    an uppercase letter. Every underscore-separated word is then passed through
    str.capitalize(), so "CustomerID" becomes "Customer_Id" and "rowguid" becomes
    "Rowguid". A name that is already in the target form is returned unchanged, which
    makes it safe to apply this function to the same DataFrame more than once.

    Args:
    dataframe (DataFrame): The input DataFrame with original column names.

    Returns:
    DataFrame: A new DataFrame with standardized column names.
    """
    def split_camel_case(column_name: str) -> str:
        pieces = []
        for index, char in enumerate(column_name):
            # Word boundary: a lowercase character directly followed by an uppercase one.
            if index > 0 and char.isupper() and column_name[index - 1].islower():
                pieces.append('_')
            pieces.append(char)

        # capitalize() upper-cases the first character and lower-cases the rest of each word.
        return '_'.join(word.capitalize() for word in ''.join(pieces).split('_'))

    renamed_columns = [split_camel_case(name) for name in dataframe.columns]

    # Rename positionally with toDF instead of building "`old` as `new`" SQL strings:
    # no expression is parsed, so a backtick (or any other character) inside a column
    # name cannot break the rename.
    return dataframe.toDF(*renamed_columns)

In [0]:
def apply_standardization_and_overwrite(dataframes: dict) -> None:
    """
    Replaces every DataFrame in ``dataframes`` with its column-standardized version.

    The replacement is stored back under the same key in ``dataframes`` and is also
    assigned to the global variable of that name, so both references stay in sync.

    Args:
    dataframes (dict): Dictionary of DataFrames with keys as DataFrame names.
    """
    namespace = globals()

    # Snapshot the keys first; only the values of existing keys are replaced below.
    for key in list(dataframes):
        renamed = standardize_column_names(dataframes[key])

        # Keep the dictionary entry and the same-named global pointing at the new DataFrame
        dataframes[key] = renamed
        namespace[key] = renamed

In [0]:
# Re-read every table from base_path so this cell always starts from the stored column
# names (this repeats the load done right after base_path is defined).
dataframes = read_all_delta_tables(base_path)

# Standardize the column names of each DataFrame; both the dictionary entries and the
# same-named global variables are replaced with the renamed DataFrames.
apply_standardization_and_overwrite(dataframes)

In [0]:
def process_all_dataframes(created_dataframes, base_folder="SalesLT", layer="gold"):
    """
    Processes all created DataFrames by renaming columns and saving them to the specified layer.

    Each name in ``created_dataframes`` is looked up in this module's globals(), its
    columns are standardized, and the result is written in Delta format (overwrite
    mode) to ``/mnt/<layer>/<base_folder>/<Folder>/``. ``<Folder>`` is the DataFrame
    name without its trailing ``_modified`` suffix, passed through str.capitalize()
    (for example ``customerAddress_modified`` -> ``Customeraddress``).

    Args:
    created_dataframes (iterable of str): Names of the global DataFrame variables to
        process. A dict may be passed as well; only its keys are used.
    base_folder (str): Base folder name for saving DataFrames (default: "SalesLT").
    layer (str): The target layer to save DataFrames ('gold' by default).

    Raises:
    KeyError: If a name has no matching global variable.
    """
    suffix = "_modified"
    namespace = globals()

    for dataframe_name in created_dataframes:
        original_df = namespace[dataframe_name]

        # Apply column standardization
        modified_df = standardize_column_names(original_df)

        # Refresh the variable that was actually read, and keep the all-lowercase
        # alias as well so code relying on either spelling sees the new DataFrame.
        namespace[dataframe_name] = modified_df
        namespace[dataframe_name.lower()] = modified_df

        # Remove only the trailing suffix. Cutting at the first underscore would map
        # e.g. "sales_Order_modified" and "sales_Item_modified" to the same folder,
        # and the second write would overwrite the first table.
        if dataframe_name.endswith(suffix):
            table_name = dataframe_name[:-len(suffix)]
        else:
            table_name = dataframe_name.split('_')[0]
        folder_name = table_name.capitalize()

        # Define the path for saving the modified DataFrame in the target layer
        gold_path = f"/mnt/{layer}/{base_folder}/{folder_name}/"

        # Save DataFrame as Delta format in overwrite mode
        modified_df.write.format("delta").mode("overwrite").save(gold_path)
        print(f"✅ Successfully saved DataFrame '{dataframe_name}' to '{gold_path}'")

# Pass only the DataFrame names: process_all_dataframes resolves each name through
# globals() and, with its default arguments, writes the result under /mnt/gold/SalesLT/.
process_all_dataframes(dataframes.keys())

✅ Successfully saved DataFrame 'address_modified' to '/mnt/gold/SalesLT/Address/'
✅ Successfully saved DataFrame 'customer_modified' to '/mnt/gold/SalesLT/Customer/'
✅ Successfully saved DataFrame 'customeraddress_modified' to '/mnt/gold/SalesLT/Customeraddress/'
✅ Successfully saved DataFrame 'product_modified' to '/mnt/gold/SalesLT/Product/'
✅ Successfully saved DataFrame 'productcategory_modified' to '/mnt/gold/SalesLT/Productcategory/'
✅ Successfully saved DataFrame 'productdescription_modified' to '/mnt/gold/SalesLT/Productdescription/'
✅ Successfully saved DataFrame 'productmodel_modified' to '/mnt/gold/SalesLT/Productmodel/'
✅ Successfully saved DataFrame 'productmodelproductdescription_modified' to '/mnt/gold/SalesLT/Productmodelproductdescription/'
✅ Successfully saved DataFrame 'salesorderdetail_modified' to '/mnt/gold/SalesLT/Salesorderdetail/'
✅ Successfully saved DataFrame 'salesorderheader_modified' to '/mnt/gold/SalesLT/Salesorderheader/'
