In [1]:
import logging
from typing import Tuple
from pyspark.sql import DataFrame
from abc import ABC, abstractmethod

In [2]:
class DefineDataType(ABC):
    """
    Abstract base class for objects that define the data types of a table.

    Subclasses must implement ``define_dtype``; the class itself cannot be
    instantiated because that method is abstract.
    """
    @abstractmethod
    def define_dtype(self, df: DataFrame) -> DataFrame:
        """
        Defines data types

        Args:
            df: DataFrame to transform

        Returns:
            DataFrame: Data after defining schema
        """

In [3]:
class ShootoutsDataType(DefineDataType):
    """
    Defines data types for 'shootouts' table
    """
    def define_dtype(self, df: DataFrame) -> DataFrame:
        """
        Casts the 'shootouts' columns to their target data types.

        The input is projected through ``selectExpr``, so the result holds
        only the columns listed in the expressions below, in that order.

        Args:
            df: 'shootouts' DataFrame providing the columns date, home_team,
                away_team, winner and first_shooter

        Returns:
            DataFrame: Data after defining schema

        Raises:
            Exception: Any error raised by ``selectExpr`` is logged and then
                re-raised unchanged
        """
        try:
            return df.selectExpr(
                "CAST(date AS DATE) as date",
                "CAST(home_team AS STRING) as home_team",
                "CAST(away_team AS STRING) as away_team",
                "CAST(winner AS STRING) as winner",
                "CAST(first_shooter AS STRING) as first_shooter"
            )
        except Exception as e:
            # Lazy %-style arguments: the text is only formatted if the record is emitted.
            logging.error("Error while defining data type schema for 'shootouts' table: %s", e)
            # Bare raise re-raises the active exception without adding a
            # second traceback entry for this frame (as `raise e` would).
            raise

In [4]:
class ResultsDataType(DefineDataType):
    """
    Defines data types for 'results' table
    """
    def define_dtype(self, df: DataFrame) -> DataFrame:
        """
        Casts the 'results' columns to their target data types.

        The input is projected through ``selectExpr``, so the result holds
        only the columns listed in the expressions below, in that order.

        Args:
            df: 'results' DataFrame providing the columns date, home_team,
                away_team, home_score, away_score, tournament, city, country
                and neutral

        Returns:
            DataFrame: Data after defining schema

        Raises:
            Exception: Any error raised by ``selectExpr`` is logged and then
                re-raised unchanged
        """
        try:
            return df.selectExpr(
                "CAST(date AS DATE) as date",
                "CAST(home_team AS STRING) as home_team",
                "CAST(away_team AS STRING) as away_team",
                "CAST(home_score AS INTEGER) as home_score",
                "CAST(away_score AS INTEGER) as away_score",
                "CAST(tournament AS STRING) as tournament",
                "CAST(city AS STRING) as city",
                "CAST(country AS STRING) as country",
                "CAST(neutral AS BOOLEAN) as neutral"
            )
        except Exception as e:
            # Lazy %-style arguments: the text is only formatted if the record is emitted.
            logging.error("Error while defining data type schema for 'results' table: %s", e)
            # Bare raise re-raises the active exception without adding a
            # second traceback entry for this frame (as `raise e` would).
            raise

In [5]:
class ScorersDataType(DefineDataType):
    """
    Defines data types for 'scorers' table
    """
    def define_dtype(self, df: DataFrame) -> DataFrame:
        """
        Casts the 'scorers' columns to their target data types.

        The input is projected through ``selectExpr``, so the result holds
        only the columns listed in the expressions below, in that order.

        Args:
            df: 'scorers' DataFrame providing the columns date, home_team,
                away_team, team, scorer, minute, own_goal and penalty

        Returns:
            DataFrame: Data after defining schema

        Raises:
            Exception: Any error raised by ``selectExpr`` is logged and then
                re-raised unchanged
        """
        try:
            return df.selectExpr(
                "CAST(date AS DATE) as date",
                "CAST(home_team AS STRING) as home_team",
                "CAST(away_team AS STRING) as away_team",
                "CAST(team AS STRING) as team",
                "CAST(scorer AS STRING) as scorer",
                "CAST(minute AS INTEGER) as minute",
                "CAST(own_goal AS BOOLEAN) as own_goal",
                "CAST(penalty AS BOOLEAN) as penalty"
            )
        except Exception as e:
            # Lazy %-style arguments: the text is only formatted if the record is emitted.
            logging.error("Error while defining data type schema for 'scorers' table: %s", e)
            # Bare raise re-raises the active exception without adding a
            # second traceback entry for this frame (as `raise e` would).
            raise

In [4]:
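# NOTE(review): legacy helper kept for reference only. It no longer matches the
# classes above: DefineDataType is abstract and cannot be instantiated, and the
# *_schema methods called below are not defined in the visible code.
# def assign_data_type(scorers: DataFrame, results: DataFrame, shootouts: DataFrame) -> Tuple[DataFrame, ...]: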
#     """
#     Corrects data types

#     Args:
#         scorers: 'scorers' DataFrame
#         results: 'results' DataFrame
#         shootouts: 'shootouts' DataFrame
#     Returns:
#         Tuple[DataFrame, ...]: Corrected data types in DataFrames
#     """
#     try:
#         define_data_type = DefineDataType()
#         scorers = define_data_type.scorers_schema(scorers)
#         results = define_data_type.results_schema(results)
#         shootouts = define_data_type.shootouts_schema(shootouts)
#         return scorers, results, shootouts
#     except Exception as e:
#         logging.error(f"Error while correcting data types: {e}")
#         raise e