In [261]:
%run /home/jovyan/work/operations/spark_db_connection.ipynb import create_spark_session
%run /home/jovyan/work/operations/logging_set.ipynb import set_logging
%run /home/jovyan/work/ETL/process/extract.ipynb import extract_process
%run /home/jovyan/work/ETL/process/transform.ipynb import transform_process

In [262]:
import logging
from typing import Dict, Type

In [263]:
class ExtractionParamaters:
    """
    Static configuration consumed by the extraction step.

    Attributes:
        files: Mapping from a dataset key to the path of the CSV file
            that dataset is read from. The keys are presumably reused
            as table names by the extraction step (verify in extract.ipynb).
    """
    files: Dict[str, str] = {
        "scorers": "/home/jovyan/work/dataset/goalscorers.csv",
        "results": "/home/jovyan/work/dataset/results.csv",
        "shootouts": "/home/jovyan/work/dataset/shootouts.csv",
    }


In [264]:
def ETL_pipeline(config: Type[ExtractionParamaters]):
    """
    Run the extract and transform stages of the ETL pipeline.

    Configures logging, creates a Spark session, passes ``config.files`` to
    ``extract_process`` and forwards the three values it returns to
    ``transform_process``.

    Args:
        config: Configuration class (the class itself, not an instance)
            exposing a ``files`` attribute that is handed to
            ``extract_process``.

    Returns:
        None.

    Raises:
        Exception: Any exception raised by a stage is logged and then
            re-raised unchanged.
    """
    try:
        set_logging()
        spark = create_spark_session()
        scorers, results, shootouts = extract_process(spark=spark, files=config.files)
        transform_process(spark, scorers, results, shootouts)
    except Exception as e:
        # logging.exception logs at ERROR level and attaches the active
        # traceback to the record, so the log shows where the failure
        # happened and not just its message. Lazy %s formatting keeps the
        # message text identical to the previous f-string.
        logging.exception("Error while executing ETL pipeline: %s", e)
        # Bare raise re-raises the active exception as-is, the idiomatic
        # form inside an except block.
        raise

In [265]:
# Run the pipeline using the file mapping defined on ExtractionParamaters.
ETL_pipeline(ExtractionParamaters)

Started the extraction process:
Successfully ingested data from: ['/home/jovyan/work/dataset/goalscorers.csv']
Successfully saved table: scorers
Successfully ingested data from: ['/home/jovyan/work/dataset/results.csv']
Successfully saved table: results
Successfully ingested data from: ['/home/jovyan/work/dataset/shootouts.csv']
Successfully saved table: shootouts
Successfully loaded table: scorers
Successfully loaded table: results
Successfully loaded table: shootouts
Finished the extraction process
Started the transformation process:
Error in SQL query (Fifa World Cup trophies): [NESTED_AGGREGATE_FUNCTION] It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; line 16 pos 50;
'WithCTE
:- CTERelationDef 489, false
:  +- SubqueryAlias world_cup
:     +- Project [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neut

AnalysisException: [NESTED_AGGREGATE_FUNCTION] It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; line 16 pos 50;
'WithCTE
:- CTERelationDef 489, false
:  +- SubqueryAlias world_cup
:     +- Project [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, extract(YEAR, date#19092) AS year#19158]
:        +- Filter (tournament#19097 = FIFA World Cup)
:           +- SubqueryAlias results
:              +- View (`results`, [date#19092,home_team#19093,away_team#19094,home_score#19095,away_score#19096,tournament#19097,city#19098,country#19099,neutral#19100])
:                 +- Project [cast(date#19048 as date) AS date#19092, cast(home_team#19049 as string) AS home_team#19093, cast(away_team#19050 as string) AS away_team#19094, cast(home_score#19051 as int) AS home_score#19095, cast(away_score#19052 as int) AS away_score#19096, cast(tournament#19053 as string) AS tournament#19097, cast(city#19054 as string) AS city#19098, cast(country#19055 as string) AS country#19099, cast(neutral#19056 as boolean) AS neutral#19100]
:                    +- Relation [date#19048,home_team#19049,away_team#19050,home_score#19051,away_score#19052,tournament#19053,city#19054,country#19055,neutral#19056] JDBCRelation(results) [numPartitions=1]
:- CTERelationDef 490, false
:  +- SubqueryAlias ranked_matches
:     +- Project [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, year#19158, match_rank#19159]
:        +- Project [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, year#19158, match_rank#19159, match_rank#19159]
:           +- Window [row_number() windowspecdefinition(YEAR#19158, date#19092 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS match_rank#19159], [YEAR#19158], [date#19092 DESC NULLS LAST]
:              +- Project [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, year#19158]
:                 +- SubqueryAlias wc
:                    +- SubqueryAlias world_cup
:                       +- CTERelationRef 489, true, [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, year#19158]
+- 'UnresolvedHaving cast(CASE WHEN ('year IN (1930,1950) AND (games_in_final_date#19157L > cast(1 as bigint))) THEN ('match_rank <= 3) WHEN (NOT 'year IN (1930,1950) AND (games_in_final_date#19157L > cast(1 as bigint))) THEN ('match_rank <= 4) ELSE ('match_rank <= 1) END as boolean)
   +- Aggregate [date#19092], [date#19092, count(max(date#19092)) AS games_in_final_date#19157L]
      +- SubqueryAlias rm
         +- SubqueryAlias ranked_matches
            +- CTERelationRef 490, true, [date#19092, home_team#19093, away_team#19094, home_score#19095, away_score#19096, tournament#19097, city#19098, country#19099, neutral#19100, year#19158, match_rank#19159]
