In [None]:
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.3.0

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col

from custom_utils import adf
from custom_utils import dp_storage
# Importing reader/writer explicitly guarantees that dp_storage.reader and
# dp_storage.writer (used in the Read and Merge-and-upload cells) resolve.
from custom_utils.dp_storage import reader, writer  # noqa: F401
from custom_utils.dp_storage.connector import mount, unmount_if_prod

# Standardization

## Setup

In [None]:
# Default source and destination configurations.
# Replace the <dataset_identifier> and <dataset_name> placeholders with your dataset's values.
default_source_config = {
    "<dataset_identifier>": {
        "type": "adls",
        "dataset": "<dataset_name>",
        "container": "landing",
        "account": "dplandingstorage",
    }
}
default_destination_config = {
    "<dataset_identifier>": {
        "type": "adls",
        "dataset": "<dataset_name>",
        "container": "uniform",
        "account": "dpuniformstorage",
    }
}

# When the notebook is executed from ADF, the configs are taken from ADF;
# the defaults above are handed to the helpers for any other kind of run.
source_config = adf.get_source_config(dbutils, default_source_config)
destination_config = adf.get_destination_config(dbutils, default_destination_config)

In [None]:
# Mount the storage described by both configs. The pair returned by mount()
# replaces the original configs for the rest of the notebook.
source_config, destination_config = mount(
    dbutils,
    source_config,
    destination_config,
)

In [None]:
# Get other parameters from ADF
# NOTE(review): Databricks documents that widgets cannot be created in the same cell
# that calls removeAll(). If adf.get_parameter creates widgets, move removeAll() into
# its own cell — TODO confirm against the custom_utils.adf implementation.
dbutils.widgets.removeAll()

# Add or remove parameters below to match the parameters your ADF pipeline passes.
source_folder_path = adf.get_parameter(dbutils, 'SourceFolderPath')  # Remember that it has the format "<container>/<directory>"
source_filename = adf.get_parameter(dbutils, 'SourceFileName')

## Read
Reads the triggering source file from storage into a Spark DataFrame.

In [None]:
# Resolve the path of the file that triggered this run from the folder path and
# file name parameters.
# TODO: replace the placeholder key below with your dataset identifier (not the guid).
# Presumably it must match the key used in the source config — confirm.
source_file_path = dp_storage.reader.get_path_to_triggering_file(
    source_folder_path,
    source_filename,
    config_for_triggered_dataset=source_config['TODO: dataset_identifier(not guid)']
)

# Load the triggering file as parquet.
df = spark.read.parquet(source_file_path)
# Rewrite the line above if your file is not parquet. Example for csv:
# df = spark.read.option("delimiter", ",").csv(source_file_path, header=True)

In [None]:
# Preview the first three rows as a quick check of what was read.
df.show(n=3)

## Standardize the data

Standardize the data here. Follow this style guide: https://github.com/palantir/pyspark-style-guide

## Merge and upload

In [None]:
# Derive the storage path and the Databricks database/table names from the destination config.
destination_path = dp_storage.writer.get_destination_path(destination_config)
database_name_databricks, table_name_databricks = dp_storage.writer.get_databricks_table_info(
    destination_config
)

# Full load: write the DataFrame as a Delta table at destination_path,
# overwriting both the existing data and the existing schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", destination_path)
    .saveAsTable(f'{database_name_databricks}.{table_name_databricks}')
)

In [None]:
# Always keep this as the last cell of the notebook: it hands both configs
# back to the connector so the storage mounted earlier can be unmounted.
unmount_if_prod(
    dbutils,
    source_config,
    destination_config,
)