
# JSON Standardization and Flattening
This notebook is responsible for standardizing and flattening JSON files. It reads the raw JSON data from the landing zone, applies a schema for validation, flattens nested structures, and then writes the transformed data as Delta Parquet files.

Key Features:
- Reads JSON files from the landing zone.
- Applies a predefined schema for data validation (ensure the schema file is available at landing/schemachecks/[datasetidentifier]/[datasetidentifier]_schema.json).
- Flattens nested JSON structures for easier querying and analysis.
- Saves the processed data as Delta Parquet files for efficient storage and querying.

## Widget Initialization and Configuration

### triton__flow_plans example

In [None]:
# Drop any widgets left over from a previous run before defining this notebook's set.
# NOTE(review): Databricks documents that a widget created in the same cell as a
# remove command may not behave as expected — confirm, or move removeAll() to its own cell.
dbutils.widgets.removeAll()

# One entry per text widget: (widget name, default value, label shown in the UI).
# The widgets are created in exactly this order.
_widget_specs = [
    # Source and destination storage accounts
    ("SourceStorageAccount", "dplandingstoragetest", "Source Storage Account"),
    ("DestinationStorageAccount", "dpuniformstoragetest", "Destination Storage Account"),
    # Source container and dataset identifier
    ("SourceContainer", "landing", "Source Container"),
    ("SourceDatasetidentifier", "triton__flow_plans", "Source Datasetidentifier"),
    # Source file pattern and key columns
    ("SourceFileName", "*", "Source File Name"),
    ("KeyColumns", "Guid", "Key Columns"),
    # Feedback column, flattening depth and schema folder
    ("FeedbackColumn", "EventTimestamp", "Feedback Column"),
    ("DepthLevel", "1", "Depth Level"),
    ("SchemaFolderName", "schemachecks", "Schema Folder Name"),
]

for _widget_name, _widget_default, _widget_label in _widget_specs:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

## Setup

### Package Installation and Management

In [None]:
# Install the packages this notebook depends on.
# Pin versions explicitly so that reruns resolve the same code.

# Uninstall the existing package version (uncomment if a previous version is present).
# %pip uninstall databricks-custom-utils -y

# Install utils-databricks from GitHub at the git ref "v0.6.8", together with
# sqlparse and openpyxl.
# NOTE(review): sqlparse and openpyxl carry no version pin here — consider pinning them too.
%pip install git+https://github.com/Open-Dataplatform/utils-databricks.git@v0.6.8 sqlparse openpyxl

### Initialize notebook and get parameters

In [None]:
from custom_utils.logging.logger import Logger
from custom_utils.config.config import Config

# Shared logger with debug output enabled; it is passed to every helper class
# instantiated later in this notebook.
logger = Logger(debug=True)

# Build the configuration object. It receives dbutils, presumably so it can read
# the widget values defined earlier — confirm against the library.
config = Config.initialize(dbutils=dbutils, logger=logger, debug=True)

# Inject the configuration values into the notebook's global namespace.
# NOTE(review): later cells use names that are never assigned in this notebook
# (depth_level, sheet_name, key_columns, feedback_column, destination_folder_path,
# destination_environment, source_datasetidentifier); presumably unpack() creates
# them — confirm.
config.unpack(globals())

### Verify paths and files

In [None]:
from custom_utils.validation.validation import Validator

# Create the validator from the shared configuration and logger (debug output on).
validator = Validator(config, logger=logger, debug=True)

# Validate the schema and source locations. The call returns four values, unpacked
# here as: the schema file path, the data file path, the matched data files and
# the data file type. All four are consumed by the flattening step further down.
schema_file_path, data_file_path, matched_data_files, data_file_type = validator.verify_paths_and_files()

### Exit notebook if validation fails

In [None]:
# Stop the run early when validation has raised its exit flag.
# NOTE(review): the flag and message are read from the Validator class, not from
# the `validator` instance created earlier — presumably they are class-level
# attributes set during validation; confirm.
if Validator.exit_notebook:
    print(Validator.exit_notebook_message)  # Ensure the message is visible in the notebook
    # Also return the message as the notebook's exit value.
    dbutils.notebook.exit(Validator.exit_notebook_message)

## Processing Workflow

### Flattening and Processing Nested JSON Data

In [None]:
from pyspark.sql.functions import col
from custom_utils.transformations.dataframe import DataFrameTransformer

# Transformer used for the read-and-flatten step (debug output on).
transformer = DataFrameTransformer(config=config, logger=logger, dbutils=dbutils, debug=True)

# Arguments for process_and_flatten_data, gathered in one place.
# The paths, file list and file type come from the validation step; depth_level
# and sheet_name are expected to already exist in the notebook namespace.
_flatten_args = {
    "schema_file_path": schema_file_path,
    "data_file_path": data_file_path,
    "file_type": data_file_type,
    "matched_data_files": matched_data_files,
    "depth_level": depth_level,
    "sheet_name": sheet_name,
    "include_schema": False,
}
df_initial, df_flattened = transformer.process_and_flatten_data(**_flatten_args)

# Optional, dataset-specific step: expose "Timestamp" as "EventTimestamp" and cast
# it to a timestamp type.
# withColumnRenamed is a no-op when "Timestamp" is absent, so in that case the cast
# requires an "EventTimestamp" column to exist already.
df_flattened = (
    df_flattened
    .withColumnRenamed("Timestamp", "EventTimestamp")
    .withColumn("EventTimestamp", col("EventTimestamp").cast("timestamp"))
)

# Show both frames so the result of the flattening can be checked visually.
print("Initial DataFrame:")
display(df_initial)

print("Flattened DataFrame:")
display(df_flattened)

## Quality check

### Abort if duplicates exist in new data

In [None]:
from custom_utils.quality.quality import DataQualityManager

# Quality manager sharing the notebook logger (debug output on).
quality_manager = DataQualityManager(logger=logger, debug=True)

# Ask the manager to describe the quality checks it offers.
quality_manager.describe_available_checks()

# Arguments for the quality checks on the flattened DataFrame.
_quality_args = {
    "spark": spark,
    "df": df_flattened,
    "key_columns": key_columns,
    # Default order if feedback_column is not provided
    "order_by": key_columns,
    # Optional column for ordering during duplicate removal
    "feedback_column": feedback_column,
    "join_column": key_columns,
    "columns_to_exclude": ["input_file_name"],
    # Set to True to use Python DataFrame operations; False for SQL
    "use_python": False,
}

# The returned value is kept as cleaned_data_view and handed to the write step
# and to the feedback-timestamp step.
cleaned_data_view = quality_manager.perform_data_quality_checks(**_quality_args)

## Write

### Unified Data Storage Management: Table Creation and Data Merging

In [None]:
from custom_utils.catalog.catalog_utils import DataStorageManager

# Storage manager sharing the notebook logger (debug output on).
storage_manager = DataStorageManager(logger=logger, debug=True)

# Arguments for the table-creation / merge operation.
_storage_args = {
    "spark": spark,
    "dbutils": dbutils,
    # The view you want to merge
    "cleaned_data_view": cleaned_data_view,
    # Key columns to use for merging
    "key_columns": key_columns,
    # Destination folder path
    "destination_folder_path": destination_folder_path,
    # The database name
    "destination_environment": destination_environment,
    # The table name
    "source_datasetidentifier": source_datasetidentifier,
    # Set to True to use Python DataFrame operations; False for SQL
    "use_python": False,
}

storage_manager.manage_data_operation(**_storage_args)

## Finishing

### Return period (from_datetime, to_datetime) covered by data read

In [None]:
# Compute the notebook's return value from the cleaned data view, using the
# feedback column and the key columns.
# Per the section heading this is the (from_datetime, to_datetime) period covered
# by the data; the exact format is defined by the library — TODO confirm.
notebook_output = storage_manager.generate_feedback_timestamps(
    spark=spark, 
    view_name=cleaned_data_view, 
    feedback_column=feedback_column,
    key_columns=key_columns
)

## Exit the notebook

In [None]:
# End the run and hand notebook_output (computed in the previous step) back to the
# caller as the notebook's exit value.
dbutils.notebook.exit(notebook_output)