In [None]:
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.6.0

In [None]:
from pyspark.sql import SparkSession

# Importing functions from the custom utility package
from custom_utils import dataframe, helper
from custom_utils.dp_storage import reader, writer, initialize_config, table_management, merge_management, feedback_management, quality
from custom_utils.validation import verify_paths_and_files
from pyspark.sql.utils import AnalysisException

# Standardization Template

## Setup

### Configuration Handling with `initialize_config`
This template initializes the configuration, sets up paths, and prepares the environment for the data processing pipeline. The `initialize_config` function handles the configuration centrally, making it easy to reuse across notebooks.

In [None]:
# Build the shared configuration object for this notebook.
# The four quoted arguments are template placeholders (source environment,
# destination environment, source container, source dataset identifier);
# replace them with real values when instantiating the template.
config = initialize_config(dbutils, helper, '<source_environment>', '<destination_environment>', '<source_container>', '<source_datasetidentifier>')
# Use the Spark session exposed by the configuration object.
spark = config.spark_session
# Hand the notebook's global namespace to the config object. Later cells use
# destination_environment, source_datasetidentifier, key_columns and
# feedback_column without assigning them anywhere in this notebook, so they
# are presumably injected by this call -- TODO confirm against custom_utils.
# NOTE(review): it is not visible here whether unpack also (re)binds `spark`;
# the statement order is intentionally left as-is.
config.unpack(globals())
# Presumably prints the configuration parameters for inspection -- verify in custom_utils.
config.print_params()

## Read
In this section, we load the JSON schema and source data, handle nested structures, and prepare the data for standardization.

In [None]:
# Verify paths and files. The call returns three values; schema_file_path and
# data_file_path are reused by the flattening cell, while file_type is not
# referenced again in this notebook.
schema_file_path, data_file_path, file_type = verify_paths_and_files(dbutils, config, helper)

# Build the Spark schema from the JSON schema file, then read the source data with it.
# schema_json is not referenced again in this notebook.
schema_json, spark_schema = reader.json_schema_to_spark_struct(schema_file_path)
df_raw = reader.read_json_from_binary(spark, spark_schema, data_file_path)
# Render the raw DataFrame for inspection. df_raw is only displayed here; the
# flattening cell is given the file paths, not this DataFrame.
display(df_raw)

## Data Standardization

### Flattening and Renaming Data
We flatten complex nested structures (like arrays and structs) by calling `dataframe.process_and_flatten_json`, which also applies type mappings and handles column renaming.

In [None]:
# Flatten and standardize the DataFrame.
# The helper is given the schema and data file paths rather than df_raw, so it
# presumably reads the source data itself -- TODO confirm against custom_utils.
# Of the four returned values, only df_flattened (displayed below) and
# view_name (passed to the merge and feedback cells) are used later in this
# notebook; df and columns_of_interest are not referenced again.
df, df_flattened, columns_of_interest, view_name = dataframe.process_and_flatten_json(
    spark=spark,
    config=config,
    schema_file_path=schema_file_path,
    data_file_path=data_file_path,
    helper=helper
)
# Render the flattened result for inspection.
display(df_flattened)

## Merge and Upload
In this step, we manage table creation and data merging using Delta Lake. The logic tracks Delta table versions to monitor changes during the merge operation.

In [None]:
# Keyword arguments common to the table-creation and merge calls: both are
# given the same Spark session, destination environment, dataset identifier
# and helper object.
delta_target_kwargs = dict(
    spark=spark,
    destination_environment=destination_environment,
    source_datasetidentifier=source_datasetidentifier,
    helper=helper,
)

# Step 1: manage table creation if the table does not exist yet
table_management.manage_table_creation(**delta_target_kwargs)

# Step 2: manage the data merge, passing the view produced by the flattening
# step and the key columns (merge semantics are defined in custom_utils)
merge_management.manage_data_merge(
    view_name=view_name,
    key_columns=key_columns,
    **delta_target_kwargs,
)


## Feedback Timestamps
The final step is to generate feedback timestamps for tracking data processing intervals.

In [None]:
# Generate feedback timestamps.
# view_name is the value returned by the flattening step. feedback_column is
# not assigned anywhere in this notebook; it is presumably injected by
# config.unpack(globals()) in the setup cell -- TODO confirm.
feedback_management.generate_feedback_timestamps(
    spark=spark,
    view_name=view_name,
    feedback_column=feedback_column,
    dbutils=dbutils,
    helper=helper
)

## Notebook Completion
The notebook exits after processing and logging the feedback results.

In [None]:
# Exit the notebook with a success message. The string is handed to
# dbutils.notebook.exit, so nothing placed after this call is expected to run;
# presumably the message becomes the notebook's exit value for a calling
# notebook or job -- verify against the Databricks documentation.
dbutils.notebook.exit("Notebook completed successfully.")