In [None]:
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.5.1

In [None]:
import pyspark.sql.functions as F
from delta.tables import DeltaTable

from custom_utils import adf
from custom_utils import dataframe
from custom_utils.dp_storage import reader, writer
from custom_utils.dp_storage.connector import mount

# Standardization

## Setup

In [None]:
# Default source/destination configurations. Replace the <...> placeholders
# with the values for your dataset.
default_source_config = {
    "<dataset_identifier>": {
        "type": "adls",
        "dataset": "<dataset_name>",
        "container": "<container>",
        "account": "<storage_account>",
    }
}
default_destination_config = {
    "<dataset_identifier>": {
        "type": "adls",
        "dataset": "<dataset_name>",
        "container": "<container>",
        "account": "<storage_account>",
    }
}

# Get the configs from ADF if executed from ADF. The defaults above are passed
# along to the helpers (presumably used as a fallback otherwise - confirm in custom_utils.adf).
source_config = adf.get_source_config(dbutils, default_source_config)
destination_config = adf.get_destination_config(dbutils, default_destination_config)

In [None]:
# Notebook parameters, fetched through the ADF helper. Add or remove parameters as needed.
# Remember that SourceFolderPath has the format "<container>/<directory>".
source_folder_path = adf.get_parameter(dbutils, "SourceFolderPath")
source_filename = adf.get_parameter(dbutils, "SourceFileName")

## Read
Reads the source data from storage into a Spark DataFrame.

In [None]:
# Build the path to the file that triggered this run from the folder/file
# parameters and the source dataset configuration.
source_file_path = reader.get_path_to_triggering_file(
    source_folder_path,
    source_filename,
    config_for_triggered_dataset=source_config["<dataset_identifier>"],
)

# To get the path to a source dataset:
# source_dataset_path = reader.get_dataset_path(source_config['<dataset_identifier>'])

# Load the file as JSON with the "inferSchema" and "multiLine" options enabled.
df_raw = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .load(source_file_path)
)

# Rewrite the read above if your file is not JSON.
# Examples:
# df_raw = spark.read.option("delimiter", ",").csv(source_file_path, header=True)
# df_raw = spark.read.parquet(source_file_path)

## Standardize the data

Standardize the data here. Follow this style guide: https://github.com/palantir/pyspark-style-guide

In [None]:
# Examples of functionality: flatten df_raw using "_" as the layer separator,
# then apply the {".": "_"} replacement mapping to the column names.
df = dataframe.rename_columns(
    dataframe.flatten(df_raw, layer_separator="_"),
    replacements={".": "_"},
)

## Merge and upload

In [None]:
# Resolve the destination storage path and the Databricks database/table names
# from the destination configuration.
destination_path = writer.get_destination_path(destination_config)
database_name_databricks, table_name_databricks = writer.get_databricks_table_info(destination_config)

# Check if the Delta table exists. Otherwise do a first-time write.
# DeltaTable is provided by `delta.tables` (imported in the import cell).
if DeltaTable.isDeltaTable(spark, destination_path):
    dest_table = DeltaTable.forPath(spark, destination_path)

    # TODO: Specify the write pattern. Below is an example with upsert:
    # rows matching the join condition are updated, all others are inserted.
    (
        dest_table.alias("t")
        .merge(
            df.alias("s"),
            "s.some_column = t.some_column and s.another_column = t.another_column",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # Insert for the first time: write the data as Delta to the destination path
    # and register it as a table under <database>.<table>.
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("path", destination_path)
        .saveAsTable(f"{database_name_databricks}.{table_name_databricks}")
    )