In [1]:
%run /home/jovyan/work/etl/src/transform_data_types.ipynb import ScorersDataType, ResultsDataType, ShootoutsDataType
%run /home/jovyan/work/etl/src/queries.ipynb import BestWinRatio, BestScorer, WorldCupWinner, Tournaments, GoalsPerMinute

In [2]:
import logging
from typing import Dict
from pyspark.sql import DataFrame, SparkSession

In [None]:
def transform_process(spark: SparkSession,
                      scorers: DataFrame,
                      results: DataFrame,
                      shootouts: DataFrame
                      ) -> Dict[str, DataFrame]:
    """
    Apply the data-type definitions to the inputs and run every query on them.

    Each input is first passed through ``define_dtype`` of its matching helper
    (``ScorersDataType``, ``ResultsDataType``, ``ShootoutsDataType``). The
    resulting frames are then handed to the query classes and the query
    results are collected into a dictionary.

    Args:
        spark: Active SparkSession, forwarded to every query
        scorers: DataFrame containing 'scorers' data
        results: DataFrame containing 'results' data
        shootouts: DataFrame containing 'shootouts' data
    Returns:
        Dict[str, DataFrame]: Query results keyed by table name:
            'best_win_ratio', 'best_scorer', 'world_cup',
            'tournament_percentage' and 'goals_per_minute'
    Raises:
        Exception: Any error raised by a typing or query step is logged
            (with its traceback) and re-raised unchanged
    """
    try:
        logging.info("\nStarted the transformation process:")

        # Bind the typed frames to new names so the caller-supplied inputs
        # are not shadowed inside the function.
        scorers_typed = ScorersDataType().define_dtype(scorers)
        results_typed = ResultsDataType().define_dtype(results)
        shootouts_typed = ShootoutsDataType().define_dtype(shootouts)

        # Dict displays evaluate their entries in order, so the queries run
        # in the sequence listed here.
        transformed_tables = {
            "best_win_ratio": BestWinRatio().query(spark, results_typed),
            "best_scorer": BestScorer().query(spark, scorers_typed),
            "world_cup": WorldCupWinner().query(spark, results_typed, shootouts_typed),
            "tournament_percentage": Tournaments().query(spark, results_typed),
            "goals_per_minute": GoalsPerMinute().query(spark, scorers_typed),
        }

        logging.info("Finished the transformation process")
        return transformed_tables
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # which a plain logging.error(str) would drop.
        logging.exception("Error in the transformation process: %s", e)
        # Bare raise re-raises the active exception as-is.
        raise