In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Football Data ETL Process
# MAGIC This notebook performs an ETL process on football data using PySpark in Databricks.

# COMMAND ----------

# MAGIC
import logging
import fs
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configure Logging
# MAGIC Set up logging to capture information about the ETL process.

# COMMAND ----------

# MAGIC
# Notebook-scoped logger used by every ETL step below.
logger = logging.getLogger(__name__)
# Request INFO-level output on the root logger so progress messages are shown.
# (basicConfig does nothing if the root logger already has handlers attached.)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Steps 1–2: Data Source Configuration and Ingestion
# MAGIC Point at the Unity Catalog source tables and load them into DataFrames.

# COMMAND ----------

# MAGIC
def load_source_tables():
    """Read the two source tables from Unity Catalog.

    Returns:
        A ``(football, other)`` tuple of DataFrames: the full
        ``catalog.db.football_data`` table, and ``catalog.db.other_data``
        narrowed to the ``team_id`` and ``other_column`` columns.
    """
    logger.info("Loading data from Unity Catalog tables.")

    # Step 2: Data Ingestion
    football = spark.table("catalog.db.football_data")
    # Keep only the join key and the one column carried into the output.
    other = spark.table("catalog.db.other_data").select("team_id", "other_column")

    logger.info("Data ingestion completed successfully.")
    return football, other

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Data Cleaning
# MAGIC Remove duplicates and fill missing values.

# COMMAND ----------

# MAGIC
def clean_football_data(football):
    """Drop duplicate rows and replace nulls in ``goals`` with 0.

    Args:
        football: DataFrame of raw football data.

    Returns:
        The de-duplicated DataFrame with nulls in ``goals`` filled.
    """
    logger.info("Starting data cleaning process.")
    # NOTE(review): only 'goals' is filled here, but the transformation step
    # reads 'goals_for' and 'goals_against'; nulls in those columns would give
    # a null goal_difference. Confirm against the table schema.
    cleaned = football.dropDuplicates().na.fill({'goals': 0})
    logger.info("Data cleaning completed successfully.")
    return cleaned

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Data Transformation
# MAGIC Calculate goal difference.

# COMMAND ----------

# MAGIC
def add_goal_difference(cleaned):
    """Add a ``goal_difference`` column (``goals_for`` minus ``goals_against``).

    Args:
        cleaned: DataFrame containing ``goals_for`` and ``goals_against``.

    Returns:
        The input DataFrame with the extra ``goal_difference`` column.
    """
    logger.info("Starting data transformation process.")
    goal_difference = F.col('goals_for') - F.col('goals_against')
    transformed = cleaned.withColumn('goal_difference', goal_difference)
    logger.info("Data transformation completed successfully.")
    return transformed

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Data Integration
# MAGIC Join the transformed data with other data using a broadcast join.

# COMMAND ----------

# MAGIC
def integrate_team_data(transformed, other):
    """Inner-join the football data with the supplementary table on ``team_id``.

    Args:
        transformed: DataFrame produced by the transformation step.
        other: Supplementary DataFrame; it is marked for broadcast.

    Returns:
        The joined DataFrame; rows without a matching ``team_id`` on both
        sides are dropped (inner join).
    """
    logger.info("Starting data integration process.")
    # The broadcast hint only pays off if `other` is small -- assumed here,
    # TODO confirm the size of catalog.db.other_data.
    combined = transformed.join(broadcast(other), "team_id", 'inner')
    logger.info("Data integration completed successfully.")
    return combined

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Output Generation
# MAGIC Write the final dataset to Unity Catalog.

# COMMAND ----------

# MAGIC
def write_performance_table(combined):
    """Save the result as Delta table ``catalog.db.football_performance_analysis``.

    The write uses ``overwrite`` mode, so existing table contents are replaced.

    Args:
        combined: Final DataFrame to persist.
    """
    logger.info("Writing the final dataset to Unity Catalog.")
    writer = combined.write.format("delta").mode("overwrite")
    writer.saveAsTable("catalog.db.football_performance_analysis")
    logger.info("Output generation completed successfully.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Run the ETL Pipeline
# MAGIC Execute the steps defined above in order; any failure is logged and re-raised.

# COMMAND ----------

# Each notebook cell has to be valid Python on its own, so the steps are
# defined as functions above and the single try/except lives in this one cell.
try:
    df_football, df_other = load_source_tables()
    df_cleaned = clean_football_data(df_football)
    df_transformed = add_goal_difference(df_cleaned)
    df_combined = integrate_team_data(df_transformed, df_other)
    write_performance_table(df_combined)
except Exception as e:
    # Top-level boundary: record the failure, then let it propagate so the
    # notebook run is marked as failed.
    logger.error("An error occurred during the ETL process: %s", e)
    raise
