In [None]:
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.5.1

In [None]:
from pyspark.sql import SparkSession

# Importing functions from the custom utility package
from custom_utils import dataframe, helper
from custom_utils.dp_storage import reader, writer, initialize_config
from custom_utils.dp_storage.validation import verify_paths_and_files
from custom_utils.dp_storage.connector import mount

# Standardization

## Setup

In [None]:
# Build the shared configuration object used by every later cell.
# Replace each '<...>' placeholder with the value for your dataset.
config = initialize_config(
    dbutils,
    helper,
    '<source_environment>',
    '<destination_environment>',
    '<source_container>',
    '<source_datasetidentifier>',
)

In [None]:
# Verify paths and files; the call hands back the schema file path,
# the data file path and the file type for use in the cells below.
(
    schema_file_path,
    data_file_path,
    file_type,
) = verify_paths_and_files(dbutils, config, helper)

## Read
Read the source data from storage and parse it using the dataset schema.

In [None]:
# Path to the triggering file; only the non-JSON examples at the bottom of this cell use it.
source_file_path = reader.get_path_to_triggering_file(
    config.source_folder_path,
    config.source_filename,
    config_for_triggered_dataset=config.source_environment,
)

# If you need the path to a source dataset instead:
# source_dataset_path = reader.get_dataset_path(config.source_environment)

# Turn the schema file into a Spark schema, then read the JSON data file with it.
schema, spark_schema = reader.json_schema_to_spark_struct(schema_file_path)
df_raw = reader.read_json_from_binary(
    spark,
    spark_schema,
    data_file_path,
)

# If your file is not JSON, replace the `df_raw = ...` statement above.
# Examples:
# df_raw = spark.read.option("delimiter", ",").csv(source_file_path, header=True)
# df_raw = spark.read.parquet(source_file_path)

## Standardize the data

Standardize the data here. Follow this style guide: https://github.com/palantir/pyspark-style-guide

In [None]:
# Examples of functionality: flatten df_raw (down to config.depth_level),
# then rename columns using the '.' -> '_' replacement.
df = dataframe.rename_columns(
    dataframe.flatten_df(
        df_raw,
        depth_level=config.depth_level,
        type_mapping=dataframe.get_type_mapping(),
    ),
    replacements={'.': '_'},
)

## Merge and upload

In [None]:
# Look up the storage path and the Databricks database/table names for the destination.
destination_path = writer.get_destination_path(config.destination_environment)
database_name_databricks, table_name_databricks = writer.get_databricks_table_info(config.destination_environment)

# Write the processed data as a Delta table at destination_path.
# mode("overwrite") + overwriteSchema replace both the existing data and its schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", destination_path)
    .saveAsTable(f'{database_name_databricks}.{table_name_databricks}')
)