
# JSON Standardization and Flattening
This notebook is responsible for standardizing and flattening JSON files. It reads the raw JSON data from the landing zone, applies a schema for validation, flattens nested structures, and then writes the transformed data as Delta Parquet files.

Key Features:
- Reads JSON files from the landing zone.
- Applies a predefined schema for data validation (ensure the schema file is available at landing/schemachecks/[datasetidentifier]/[datasetidentifier]_schema.json).
- Flattens nested JSON structures for easier querying and analysis.
- Saves the processed data as Delta Parquet files for efficient storage and querying.

## Widget Initialization and Configuration

In [None]:
# Notebook parameters, declared as (widget name, default value, label) triples.
# The widgets are registered in the order listed here.
_widget_definitions = [
    # Source and destination storage accounts
    ("SourceStorageAccount", "dplandingstoragetest", "Source Storage Account"),
    ("DestinationStorageAccount", "dpuniformstoragetest", "Destination Storage Account"),
    # Source container and dataset identifier
    ("SourceContainer", "landing", "Source Container"),
    ("SourceDatasetidentifier", "triton__flow_plans", "Source Datasetidentifier"),
    # Source file name pattern and key columns
    ("SourceFileName", "*", "Source File Name"),
    ("KeyColumns", "Guid", "Key Columns"),
    # Feedback column, flattening depth and schema folder
    ("FeedbackColumn", "EventTimestamp", "Feedback Column"),
    ("DepthLevel", "", "Depth Level"),
    ("SchemaFolderName", "schemachecks", "Schema Folder Name"),
]

# Create one text widget per definition.
for _widget_name, _widget_default, _widget_label in _widget_definitions:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

## Setup

### Package Installation and Management

In [None]:
# Install the project utility package required by the cells below.
# The install is pinned to a specific git ref so runs are reproducible.

# Uninstall the existing package version (uncomment if a previous version is present).
# %pip uninstall databricks-custom-utils -y

# Install the package from the GitHub repository at git ref v0.6.4.
# --force-reinstall makes pip reinstall the package even when it is already present.
%pip install --force-reinstall git+https://github.com/xazms/utils-databricks.git@v0.6.4

In [None]:
try:
    import databricks_custom_utils
except ImportError:
    # Package is not installed, so install it
    %pip install git+https://github.com/xazms/utils-databricks.git@v0.6.4

### Initialize notebook and get parameters

In [None]:
from custom_utils.config import initialize_notebook

# Initialize the notebook; the call returns three values that are bound here
# as the Spark session, the configuration object and the logger used by all
# following cells.
spark, config, logger = initialize_notebook(dbutils=dbutils, debug=True)

# Unpack the configuration parameters into the global namespace for easy access throughout the notebook.
# NOTE(review): later cells read depth_level, destination_environment,
# source_datasetidentifier, key_columns and feedback_column without assigning
# them, so they are presumably injected by this call - confirm against
# config.unpack.
config.unpack(globals())

### Verify paths and files

In [None]:
from custom_utils.validation.validation import PathValidator

# Build the path validator from the notebook configuration and shared logger.
validator = PathValidator(config, logger, debug=True)

# Validate the schema and source paths. The call returns four values:
# schema_file_path and data_file_path are consumed by the flattening cell,
# while matched_files and file_type are not referenced again in this notebook.
schema_file_path, data_file_path, matched_files, file_type = validator.verify_paths_and_files()

## Processing Workflow

### Flattening and Processing Nested JSON Data

In [None]:
from pyspark.sql.functions import col
from custom_utils.transformations.dataframe import DataFrameTransformer

# Transformer used to read and flatten the JSON source data
transformer = DataFrameTransformer(logger=logger, debug=True)

# Process the source files against the schema file; the call returns two
# DataFrames, bound here as `df` and `df_flattened`.
df, df_flattened = transformer.process_and_flatten_json(
    schema_file_path=schema_file_path,
    data_file_path=data_file_path,
    logger=logger,
    depth_level=depth_level,  # flattening depth taken from the notebook configuration
    debug=False,              # per-call debug flag is switched off here
    include_schema=False,     # schema JSON logging is switched off here
)

# Rename "Timestamp" to "EventTimestamp", then cast that column to timestamp
df_flattened = (
    df_flattened
    .withColumnRenamed("Timestamp", "EventTimestamp")
    .withColumn("EventTimestamp", col("EventTimestamp").cast("timestamp"))
)

# Show the flattened result for visual verification
display(df_flattened)

## Quality check 

### Abort if duplicates exist in new data

In [None]:
from custom_utils.quality.quality import Quality

# Quality checker shared by all checks in this cell
quality_checker = Quality(logger=logger, debug=True)

# The existing destination table is passed as the reference DataFrame.
# NOTE(review): spark.table requires this table to exist already, while table
# creation appears to happen in the later write cell - confirm the behaviour
# on a first run.
reference_table_name = f"{destination_environment}.{source_datasetidentifier}"
reference_df = spark.table(reference_table_name)

# Both balance columns share the same bounds (presumably (min, max) - confirm
# against Quality.apply_all_checks).
balance_bounds = (-5869732, 1000000)

# Run the checks on the flattened data; the returned value is used below as
# the name of a queryable view.
temp_view_name = quality_checker.apply_all_checks(
    spark=spark,
    df=df_flattened,
    key_columns=["Guid"],
    critical_columns=["Guid", "EventTimestamp", "ValidFrom", "ValidTo"],
    column_ranges={
        "CommercialBalance": balance_bounds,
        "PhysicalBalance": balance_bounds,
    },
    reference_df=reference_df,
    join_column="Guid",
    consistency_pairs=[("ValidFrom", "ValidTo")],
    columns_to_exclude=["input_file_name"],
)

# Print the contents of the resulting view
spark.sql(f"SELECT * FROM {temp_view_name}").show()

## Write

### Unified Data Storage Management: Table Creation and Data Merging

In [None]:
from custom_utils.catalog.catalog_utils import DataStorageManager

# Initialize the DataStorageManager class
storage_manager = DataStorageManager(logger=logger, debug=True)

# Hand the view produced by the quality-check cell to the storage manager,
# together with the destination environment, dataset identifier and key
# columns. Per this section's heading, this step covers table creation and
# data merging.
storage_manager.manage_data_operation(
    spark=spark,
    dbutils=dbutils,
    destination_environment=destination_environment,
    source_datasetidentifier=source_datasetidentifier,
    temp_view_name=temp_view_name,
    key_columns=key_columns
)

## Finishing

### Return period (from_datetime, to_datetime) covered by data read

In [None]:
from custom_utils.catalog.catalog_utils import DataStorageManager

# A fresh DataStorageManager is created in this cell
storage_manager = DataStorageManager(logger=logger, debug=True)

# Derive the feedback timestamps from the quality-checked view, using the
# configured feedback column; the result is kept for the notebook exit cell.
notebook_output = storage_manager.generate_feedback_timestamps(
    spark=spark, view_name=temp_view_name, feedback_column=feedback_column
)

### Exit the notebook 

In [None]:
# Exit the notebook, passing the feedback-timestamp output computed in the
# previous cell as the notebook's exit value.
dbutils.notebook.exit(notebook_output)