In [None]:
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.2.2

In [None]:
from pyspark.sql.functions import col
import pyspark.sql.functions as F

from custom_utils.dp_storage import connector, reader, writer
from custom_utils import adf

# Transformation

## Setup

In [None]:
# Mount the storage once; the returned mount point is reused by the read, merge/upload and unmount cells below.
mount_point = connector.mount(dbutils)

In [None]:
# NOTE(review): Databricks documents that a widget cannot be created in the same cell after a
# command that removes widgets. If initialize_config_widgets creates widgets, consider moving
# removeAll() into its own cell — TODO confirm.
dbutils.widgets.removeAll()
adf.initialize_config_widgets(dbutils)  # Included to make it easier to add the configs when setting up the notebook.

# Get the parameters from ADF. When developing, insert values in the text widgets.
source_config = adf.get_source_config(dbutils)
destination_config = adf.get_destination_config(dbutils)

source_folder_path = adf.get_parameter(dbutils, 'SourceFolderPath')  # Remember that it has the format "<container>/<directory>"
source_filename = adf.get_parameter(dbutils, 'SourceFileName')

# TODO: If you want more parameters from ADF, add them here.

## Read
Resolve the path of the file that triggered this run and read it from the mounted storage into a Spark DataFrame.

In [None]:
# Resolve the path of the file that triggered this run from the mount point and the ADF parameters.
# TODO: Replace the placeholder key below with the identifier of the triggering dataset (not its guid).
source_file_path = reader.get_path_to_triggering_file(
    mount_point,
    source_folder_path,
    source_filename,
    config_for_triggered_dataset=source_config['TODO: dataset_identifier(not guid)']
)

df = spark.read.parquet(source_file_path)
# Rewrite the line above, if your file is not parquet. Example for csv:
# df = spark.read.option("delimiter", ",").csv(source_file_path, header=True)

In [None]:
# Preview the first 3 rows as a quick sanity check of what was read.
df.show(3)

## Transform

Transform the data here. Follow this style guide: https://github.com/palantir/pyspark-style-guide

## Merge and upload

In [None]:
# TODO: Update the following parameters
timestamp_column = 'NAME_OF_TIMESTAMP_COLUMN'
# The timestamp column is part of the index together with any other key columns.
index_columns = [timestamp_column, 'ANOTHER_KEY_COLUMN']
time_resolution_egress = 'month'  # 'month', 'day', or 'hour' (presumably also 'year' — TODO confirm against writer.merge_and_upload)
# Replace the placeholder key with the identifier of your egress dataset (not its guid).
egress_identifier = destination_config['YOUR_EGRESS_DATASET_IDENTIFIER(not guid)']['dataset']

# Merge data into the existing egress data.
# If the timestamp column is a string column, add the format in the key word parameter time_format.
# (example: time_format="yyyy-MM-dd'T'HH:mm:ss'Z'")
# The arguments are passed positionally, so keep them in this order.
writer.merge_and_upload(
    df,
    egress_identifier,
    timestamp_column,
    index_columns,
    time_resolution_egress,
    mount_point,
    spark
)

In [None]:
# Always keep this at the end of your notebook
# Releases the mount point created in the setup cell; judging by the helper's name this only
# happens in production — TODO confirm against connector.unmount_if_prod.
connector.unmount_if_prod(mount_point, dbutils)