In [21]:
%run /home/jovyan/work/operations/logging_set.ipynb import set_logging
%run /home/jovyan/work/ETL/process/extract.ipynb import extract_process
%run /home/jovyan/work/ETL/process/transform.ipynb import transform_process
%run /home/jovyan/work/ETL/process/load.ipynb import load_process

In [22]:
import logging
from typing import Dict, Type
from pyspark.sql import SparkSession

In [23]:
class ExtractionParamaters:
    """
    Static configuration holder for the extraction step.

    Attributes are read directly from the class (no instance is required):

    files: mapping of a dataset name ("scorers", "results", "shootouts")
           to the path of the CSV file that holds its raw data.
    """
    # Keyword-form dict keeps the insertion order: scorers, results, shootouts.
    files: Dict[str, str] = dict(
        scorers="/home/jovyan/work/dataset/goalscorers.csv",
        results="/home/jovyan/work/dataset/results.csv",
        shootouts="/home/jovyan/work/dataset/shootouts.csv",
    )


In [24]:
def ETL_pipeline(spark: SparkSession, config: Type[ExtractionParamaters]) -> None:
    """
    Executes a pipeline for ETL (Extract, Transform, Load) process

    This pipeline performs the following steps:
    1. **Logging setup**: Calls ``set_logging()`` before anything is logged.
    2. **Data Extraction**: Passes ``config.files`` (dataset name -> file path)
       to ``extract_process`` and receives the scorers, results and shootouts
       datasets.
    3. **Data Transformation**: Hands the three datasets to ``transform_process``.
    4. **Data Loading**: Hands the transformed tables to ``load_process``
       (presumably writes them to the Data Warehouse; confirm in load.ipynb).

    Args:
        spark: Active SparkSession
        config: Class containing variable including raw data paths and names

    Raises:
        Exception: Any exception raised by a pipeline step is logged and
            re-raised unchanged.
    """
    try:
        # Configure logging BEFORE the first log call. A module-level
        # logging.info() on an unconfigured root logger implicitly calls
        # basicConfig(), so logging first would emit the start message with
        # default settings and could turn a later basicConfig() inside
        # set_logging() into a no-op (assumes set_logging relies on
        # basicConfig; verify in logging_set.ipynb).
        set_logging()
        logging.info("Started ETL pipeline")
        scorers, results, shootouts = extract_process(spark=spark, files=config.files)
        transformed_tables = transform_process(spark, scorers, results, shootouts)
        load_process(spark, transformed_tables)
        logging.info("\nSuccessfully finished ETL pipeline\n")
    except Exception as e:
        logging.error(f"Error while executing ETL pipeline: {e}")
        # Bare raise re-raises the active exception without adding noise.
        raise