In [None]:
#Flatten JSON files

def flatten(df):
    """Flatten every nested column of a Spark DataFrame.

    Struct columns are replaced by one top-level column per sub-field, named
    ``<parent>_<child>``.  Array columns are replaced by the result of
    ``explode_outer``, i.e. one row per array element.  The schema is
    re-inspected after every step and the process repeats until no struct or
    array column is left, so arbitrarily deep nesting is handled.

    A progress line is printed for each column that is processed.

    Args:
        df: DataFrame whose schema may contain ``StructType`` and/or
            ``ArrayType`` columns.

    Returns:
        A DataFrame whose schema contains no struct or array columns.
    """

    def quoted(name):
        # Back-tick quoting makes Spark take the name literally, so field
        # names that contain dots or other special characters still resolve
        # to the intended column instead of being parsed as a nested path.
        return "`" + name.replace("`", "``") + "`"

    def complex_fields_of(frame):
        # Map column name -> data type for the struct/array columns only.
        return {
            field.name: field.dataType
            for field in frame.schema.fields
            if isinstance(field.dataType, (ArrayType, StructType))
        }

    complex_fields = complex_fields_of(df)
    while complex_fields:
        # Always work on the first remaining complex column.
        col_name, data_type = next(iter(complex_fields.items()))
        print("Processing :" + col_name + " Type : " + str(type(data_type)))

        if isinstance(data_type, StructType):
            # Flatten structs: promote every sub-field to its own column,
            # then drop the original struct column.
            expanded = [
                col(quoted(col_name) + "." + quoted(child.name)).alias(
                    col_name + "_" + child.name
                )
                for child in data_type
            ]
            df = df.select("*", *expanded).drop(col_name)
        else:
            # Explode arrays: each array element becomes its own row.
            df = df.withColumn(col_name, explode_outer(col(quoted(col_name))))

        # The step above may have surfaced new nested columns.
        complex_fields = complex_fields_of(df)
    return df

In [2]:
# Find latest date folder

def find_latest_date_folder(directory_path):
    """Return the name of the most recent ``YYYYMMDD`` sub-folder.

    Only immediate sub-directories whose name consists of exactly eight
    ASCII digits are considered.  Any other entry (plain files, checkpoint
    or log folders, ...) is ignored rather than aborting the lookup.

    Args:
        directory_path: Directory that contains the date-named folders.

    Returns:
        The folder name (not the full path) with the greatest date.

    Raises:
        IndexError: If the directory contains no ``YYYYMMDD``-named folder.
    """
    date_folders = [
        name
        for name in os.listdir(directory_path)
        if len(name) == 8
        and name.isascii()
        and name.isdigit()
        and os.path.isdir(os.path.join(directory_path, name))
    ]

    if not date_folders:
        raise IndexError(
            f"No YYYYMMDD-named folder found in {directory_path!r}"
        )

    # Fixed-width digit strings compare lexicographically in numeric order,
    # so the maximum string is the latest date.
    return max(date_folders)

StatementMeta(, aeccec2a-b75c-4a05-8abb-afe5b063ac9a, 4, Finished, Available, Finished)

In [None]:
# OAuth 2.0 API request as a function 

def get_access_token(timeout=30):
    """Request an OAuth 2.0 access token.

    Sends a GET request to the authorisation endpoint, then a POST request
    to the token endpoint, and returns the ``access_token`` entry of the
    JSON token response.  The endpoint URLs and request parameters below are
    blank placeholders that must be filled in before use.

    A failure of the authorisation request is printed and does not stop the
    function; errors raised by the token request propagate to the caller.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled server would block the call
            indefinitely.

    Returns:
        The value stored under ``access_token`` in the token response, or
        ``None`` if the response has no such key.
    """
    authorisation_endpoint = ""

    authorisation_params = {
        "client_id": "",
        "scope": "",
        "redirect_uri": "",
        "response_type": ""
    }

    try:
        # HTTP GET request
        response = requests.get(
            authorisation_endpoint,
            params=authorisation_params,
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for HTTP errors
        # NOTE(review): the body of this response is not used by the token
        # request below - wire it in here if the flow requires it.

    except requests.exceptions.RequestException as e:
        # Handle request exceptions such as network errors or invalid URLs
        print("Error making request:", e)

    # Token request
    # Token endpoint URL
    token_endpoint = ""

    token_params = {
        "grant_type": "",
        "scope": ""
    }

    # NOTE(review): `credentials` is not defined in this function and is
    # read from the enclosing scope - presumably the encoded client
    # credentials for HTTP Basic auth; confirm it is defined before calling.
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "",
        "Authorization": "Basic " + credentials,
    }

    # Send HTTP POST request
    response = requests.post(
        token_endpoint,
        data=token_params,
        headers=headers,
        timeout=timeout,
    )

    # Parse response JSON
    token_data = response.json()

    return token_data.get("access_token")

In [None]:
# Cleansing column names
# Function to remove whitespace and capitalize each word in column names (first letter upper-cased, remaining letters lower-cased)
def camel_case(column_name):
    """Join the whitespace-separated words of a column name, each capitalized.

    Every word is passed through ``str.capitalize`` (first character
    upper-cased, the remaining characters lower-cased) and the words are
    concatenated without any separator, so all whitespace is removed.
    """
    words = column_name.split()
    return ''.join(map(str.capitalize, words))

# Create new column names
new_column_names = [camel_case(c) for c in df.columns]

# Rename all columns positionally in one step instead of one
# withColumnRenamed projection per column
df = df.toDF(*new_column_names)

# Remove leading and trailing spaces - from string columns only.
# trim() operates on strings, so applying it to every column would
# implicitly convert numeric, date, ... columns to strings.
# A single select avoids stacking one withColumn projection per column.
trimmed_columns = []
for name, dtype in df.dtypes:
    # Back-tick quoting keeps names with dots or special characters literal
    quoted = col("`" + name.replace("`", "``") + "`")
    trimmed_columns.append(
        trim(quoted).alias(name) if dtype == "string" else quoted
    )
df = df.select(trimmed_columns)

In [None]:
# Gold layer testing function. Compares NULL counts and distinct-value counts per column: source vs. after transformation (transformed)
def differences(source_table: DataFrame, transformed_table: DataFrame) -> DataFrame:
    """Report columns whose null or distinct counts changed in transformation.

    For every column of both inputs the number of null rows and the
    ``countDistinct`` value are computed.  The two result sets are
    inner-joined on the column name, so only columns present under the same
    name in both tables are compared.

    Args:
        source_table: DataFrame as read from the source.
        transformed_table: DataFrame after the transformation.

    Returns:
        A DataFrame with one row per differing column (``Column``,
        ``NullCount``, ``DistinctCount`` and their ``Transformed*``
        counterparts), or ``None`` - after printing a message - when no
        compared column differs.
    """
    def compute_counts(df: DataFrame, columns: list) -> DataFrame:
        # One row per column: (name, null count, distinct count).
        # NOTE(review): relies on a `spark` session from the enclosing scope.
        stats = [
            (
                name,
                df.filter(col(name).isNull()).count(),
                df.select(countDistinct(col(name))).collect()[0][0],
            )
            for name in columns
        ]
        return spark.createDataFrame(stats, ["Column", "NullCount", "DistinctCount"])

    # Counts for the source table
    source_stats = compute_counts(source_table, source_table.columns)

    # Counts for the transformed table, prefixed so both sides can be joined
    transformed_stats = compute_counts(transformed_table, transformed_table.columns)
    for plain, prefixed in (
        ("Column", "TransformedColumn"),
        ("NullCount", "TransformedNullCount"),
        ("DistinctCount", "TransformedDistinctCount"),
    ):
        transformed_stats = transformed_stats.withColumnRenamed(plain, prefixed)

    # Pair up equally named columns and keep those with deviating counts
    same_column = source_stats.Column == transformed_stats.TransformedColumn
    counts_differ = (
        (col("DistinctCount") != col("TransformedDistinctCount"))
        | (col("NullCount") != col("TransformedNullCount"))
    )
    mismatches = source_stats.join(transformed_stats, same_column).filter(counts_differ)

    if mismatches.count() == 0:
        print("Sorce and transformed DataFrames match.")
        return None

    return mismatches

## Example use case:
# Use differences(source, transformed) to return a DataFrame of the columns whose counts do not match

#source = absenceevents  # Replace with actual DataFrame (source read in)
#transformed = cez_person_absence_events  # Replace with actual DataFrame (name of delta table out of source)

#differences_df = differences(source, transformed)
#if differences_df is not None:
    #differences_df.show(100, truncate=False)