In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

# Set up logging: configure the root logger at INFO level and create the
# module-level logger used for all messages in this script.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers attached, so an already-configured host environment keeps its own
# logging configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 1: Source Extraction
# Read every source table from Unity Catalog, then log the row count and schema
# of each one. An AnalysisException raised while loading or counting is logged
# and re-raised so the run stops at the failing source.
try:
    pes_prep_df = spark.table("genai_demo.jnj.pes_prep")
    c19_ivl_data_df = spark.table("genai_demo.jnj.c19_ivl_data")
    c04_ekpo_df = spark.table("genai_demo.jnj.c04_ekpo")
    c04_bseg_df = spark.table("genai_demo.jnj.c04_bseg")
    # NOTE(review): this table name ends with an underscore — confirm it is intentional.
    pjotr_df = spark.table("genai_demo.jnj.pjotr_")
    pjotr_in_pes_df = spark.table("genai_demo.jnj.pjotr_in_pes")
    pjotr_prod_df = spark.table("genai_demo.jnj.pjotr_prod")

    # Label/DataFrame pairs, in the order their diagnostics are written.
    # Each count() below runs a Spark job purely to produce the log line.
    _loaded_sources = (
        ("pes_prep_df", pes_prep_df),
        ("c19_ivl_data_df", c19_ivl_data_df),
        ("c04_ekpo_df", c04_ekpo_df),
        ("c04_bseg_df", c04_bseg_df),
        ("pjotr_df", pjotr_df),
        ("pjotr_in_pes_df", pjotr_in_pes_df),
        ("pjotr_prod_df", pjotr_prod_df),
    )
    for _source_label, _source_df in _loaded_sources:
        logger.info(f"{_source_label}: {_source_df.count()} records, schema: {_source_df.schema}")
except AnalysisException as load_error:
    logger.error(f"Error loading source data: {load_error}")
    raise

# Step 2: Data Cleansing and Standardization
# Adds a "trimmed_field" column to pes_prep_df holding the trimmed value of
# "field_name". The source column is kept and no rows are removed here.
try:
    _trimmed_value = F.trim(F.col("field_name"))
    pes_prep_df = pes_prep_df.withColumn("trimmed_field", _trimmed_value)

    # Report the size and shape of the cleansed data (count() runs a Spark job).
    _cleansed_rows = pes_prep_df.count()
    _cleansed_schema = pes_prep_df.schema
    logger.info(f"Cleaned pes_prep_df: {_cleansed_rows} records, schema: {_cleansed_schema}")
except Exception as cleansing_error:
    logger.error(f"Error during data cleansing: {cleansing_error}")
    raise

# Step 3: Field Selection
# Narrow pes_prep_df down to the three columns carried into the later steps.
try:
    _kept_columns = ["field1", "field2", "trimmed_field"]
    selected_fields_df = pes_prep_df.select(*_kept_columns)

    # Log the resulting column list and schema (no Spark job is triggered here).
    _selected_names = selected_fields_df.columns
    _selected_schema = selected_fields_df.schema
    logger.info(f"Selected fields: {_selected_names}, schema: {_selected_schema}")
except Exception as selection_error:
    logger.error(f"Error during field selection: {selection_error}")
    raise

# Step 4: Data Integration
# Inner-join the selected fields with c19_ivl_data_df on "common_field", then
# remove the "duplicate_field" column from the joined result.
try:
    joined_df = (
        selected_fields_df
        .join(c19_ivl_data_df, on="common_field", how="inner")
        .drop("duplicate_field")
    )

    # Report the size and shape of the joined data (count() runs a Spark job).
    _joined_rows = joined_df.count()
    _joined_schema = joined_df.schema
    logger.info(f"Joined data: {_joined_rows} records, schema: {_joined_schema}")
except Exception as integration_error:
    logger.error(f"Error during data integration: {integration_error}")
    raise

# Step 5: Custom Calculations
# Derive "new_field" as the SQL expression field1 + field2 on the joined data.
try:
    _field_total = F.expr("field1 + field2")
    custom_calculations_df = joined_df.withColumn("new_field", _field_total)

    # Log the resulting column list and schema (no Spark job is triggered here).
    _calculated_names = custom_calculations_df.columns
    _calculated_schema = custom_calculations_df.schema
    logger.info(f"Custom calculations: {_calculated_names}, schema: {_calculated_schema}")
except Exception as calculation_error:
    logger.error(f"Error during custom calculations: {calculation_error}")
    raise

# Step 6: Data Merging
# Append the rows of pjotr_df to the calculated data.
#
# DataFrame.union() matches columns purely by position, so two inputs holding
# the same columns in a different order would be merged silently with values
# landing under the wrong names. When both inputs expose exactly the same set
# of unique column names they are aligned by name instead (identical to the
# positional result when the order already matches). Otherwise the original
# positional union is kept and a warning records the differing column lists.
try:
    _left_columns = custom_calculations_df.columns
    _right_columns = pjotr_df.columns
    # True only when both sides have the same names and no name is repeated,
    # which is the case where aligning by name is unambiguous.
    _same_unique_names = (
        len(set(_left_columns)) == len(_left_columns) == len(_right_columns)
        and set(_left_columns) == set(_right_columns)
    )

    if _same_unique_names:
        merged_df = custom_calculations_df.unionByName(pjotr_df)
    else:
        logger.warning(
            "Merging by column position because column names differ: "
            f"{_left_columns} vs {_right_columns}"
        )
        merged_df = custom_calculations_df.union(pjotr_df)

    # Log merged data stream and schema (count() runs a Spark job).
    logger.info(f"Merged data: {merged_df.count()} records, schema: {merged_df.schema}")
except Exception as e:
    logger.error(f"Error during data merging: {e}")
    raise

# Step 7: Data Filtering
# Keep only the merged rows whose "new_field" value is greater than 100.
try:
    _exceeds_threshold = F.col("new_field") > 100
    filtered_df = merged_df.where(_exceeds_threshold)

    # Report the size and shape of the filtered data (count() runs a Spark job).
    _filtered_rows = filtered_df.count()
    _filtered_schema = filtered_df.schema
    logger.info(f"Filtered data: {_filtered_rows} records, schema: {_filtered_schema}")
except Exception as filtering_error:
    logger.error(f"Error during data filtering: {filtering_error}")
    raise

# Step 8: Data Aggregation
# Group the filtered rows by "group_field" and total "new_field" per group
# into a column named "sum_new_field".
try:
    _new_field_total = F.sum("new_field").alias("sum_new_field")
    _grouped_rows = filtered_df.groupBy("group_field")
    aggregated_df = _grouped_rows.agg(_new_field_total)

    # Report the size and shape of the aggregate (count() runs a Spark job).
    _aggregated_rows = aggregated_df.count()
    _aggregated_schema = aggregated_df.schema
    logger.info(f"Aggregated data: {_aggregated_rows} records, schema: {_aggregated_schema}")
except Exception as aggregation_error:
    logger.error(f"Error during data aggregation: {aggregation_error}")
    raise

# Step 9: Field Cleanup
# Give the aggregate column its final name: "sum_new_field" becomes
# "total_new_field".
try:
    _rename_from, _rename_to = "sum_new_field", "total_new_field"
    cleaned_df = aggregated_df.withColumnRenamed(_rename_from, _rename_to)

    # Report the size and shape of the final data (count() runs a Spark job).
    _cleaned_rows = cleaned_df.count()
    _cleaned_schema = cleaned_df.schema
    logger.info(f"Cleaned data: {_cleaned_rows} records, schema: {_cleaned_schema}")
except Exception as cleanup_error:
    logger.error(f"Error during field cleanup: {cleanup_error}")
    raise

# Step 10: Load to Target
# Make sure the target schema exists, then save cleaned_df as a Delta table in
# "overwrite" mode. Any failure is logged and re-raised.
try:
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "final_transformed_data"

    # Build the qualified names once and reuse them for the SQL and the logs.
    _schema_fqn = f"{target_catalog}.{target_schema}"
    _table_fqn = f"{_schema_fqn}.{target_table}"

    # Create the schema first so that saveAsTable has somewhere to write.
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {_schema_fqn}")
    logger.info(f"Schema {_schema_fqn} ensured")

    # Write to the Unity Catalog target table.
    _delta_writer = cleaned_df.write.format("delta").mode("overwrite")
    _delta_writer.saveAsTable(_table_fqn)
    logger.info(f"Data successfully loaded to {_table_fqn}")
except Exception as loading_error:
    logger.error(f"Error during data loading: {loading_error}")
    raise
