In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import monotonically_increasing_id
import sys
import inspect

Upsert Data

In [0]:
# utils notebook

def upsert_table(source_table: str, target_table: str, merge_keys: list) -> None:
    """
    Perform an upsert (MERGE) from source_table into target_table.

    The target is first created as an empty Delta table with the source's
    columns if it does not exist yet. Afterwards a MERGE updates every target
    row that matches on all merge keys and inserts every source row that has
    no match.

    Parameters:
        source_table (str): Name of the table or view supplying the rows.
        target_table (str): Name of the Delta table to merge into.
        merge_keys (list): Column names that identify a row in both tables.
            A single column may also be given as a plain string.

    Raises:
        ValueError: If merge_keys is empty/None or contains an entry that is
            not a non-blank string.

    Note:
        Relies on the notebook-global ``spark`` session. Table and column
        names are interpolated into the SQL text as-is, so they must come
        from trusted code, not from user input.
    """
    # A bare string would otherwise be iterated character by character and
    # yield a condition such as "target.i = source.i AND target.d = source.d".
    if isinstance(merge_keys, str):
        keys = [merge_keys]
    else:
        keys = list(merge_keys or ())

    # Without keys the ON clause would be empty; fail early with a clear
    # message instead of a Spark parse error after the target was created.
    if not keys:
        raise ValueError("merge_keys must contain at least one column name")
    for key in keys:
        if not isinstance(key, str) or not key.strip():
            raise ValueError(f"Invalid merge key: {key!r}")

    merge_condition = " AND ".join([f"target.{k} = source.{k}" for k in keys])
    
    # Ensure target exists (empty schema copy if new)
    spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {target_table}
    USING DELTA
    AS SELECT * FROM {source_table} WHERE 1=0
    """)
    
    merge_sql = f"""
    MERGE INTO {target_table} AS target
    USING {source_table} AS source
    ON {merge_condition}
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
    """
    
    print(f"MERGE {source_table} → {target_table} ON {keys}")
    spark.sql(merge_sql)




Surrogate Keys

In [0]:
def add_surrogate_key(df: DataFrame, key_name: str = "surrogate_key") -> DataFrame:
    """
    Return a copy of ``df`` extended with a generated surrogate key column.

    The key values come from Spark's ``monotonically_increasing_id``. Per the
    Spark documentation these IDs are unique and increasing but not
    consecutive, so they should not be read as row numbers.

    Parameters:
        df (DataFrame): DataFrame to extend.
        key_name (str): Name given to the generated column
            (defaults to "surrogate_key").

    Returns:
        DataFrame: The result of ``df.withColumn(key_name, <generated id>)``.
    """
    generated_id = monotonically_increasing_id()
    keyed_df = df.withColumn(key_name, generated_id)
    return keyed_df

In [0]:
# Module object for the namespace this cell is running in
current_module = sys.modules[__name__]

# Names of the plain functions defined in this module. Imported functions
# carry a different __module__ and are skipped; classes and builtins are not
# matched by inspect.isfunction. A comprehension is used on purpose so its
# loop variables never land in the namespace that is being scanned.
utils_functions = [
    fn_name
    for fn_name, candidate in vars(current_module).items()
    if inspect.isfunction(candidate) and candidate.__module__ == __name__
]

print("✅ Utils notebook loaded.")
print("Available functions:", utils_functions)
print("Tip: Use help(function_name) for details.")

In [0]:
# Databricks notebook source
# List the helpers from the live namespace rather than from a hard-coded
# string: the fixed text omitted upsert_table and named clean_strings, which
# has no visible definition in this notebook.
utils_functions = [
    fn_name
    for fn_name, candidate in globals().items()
    if inspect.isfunction(candidate) and candidate.__module__ == __name__
]

print("✅ Utils notebook loaded.")
print("Available functions:", utils_functions)
