In [5]:
%run /home/jovyan/work/database_operations/db_operations.ipynb import DataOperations

In [6]:
import logging
from typing import Tuple, Annotated, List
from pyspark.ml.feature import StringIndexer
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, when, year

In [7]:
class TransformData:
    """
    Cleaning and encoding steps that prepare a Spark DataFrame for modelling.

    Every method takes a DataFrame and returns a new one; no state is kept on
    the instance, so the methods can be called independently of each other.
    """

    def check_empty_fields(self, df: DataFrame) -> DataFrame:
        """
        Deletes rows with empty fields (``DataFrame.dropna`` with its defaults)

        Args:
            df: DataFrame before transformation
        Returns:
            DataFrame: DataFrame after transformation
        Raises:
            Exception: Any error raised while dropping rows is logged and re-raised
        """
        try:
            df = df.dropna()
            logging.info("\nSuccessfully deleted emty fields")
            return df
        except Exception as e:
            logging.error(f"Error while cleaning 'results' dataset: {e}")
            raise

    def filter_data(self, df: DataFrame, min_year: int = 1980) -> DataFrame:
        """
        Keeps only the rows whose "year" is at or after the cutoff

        Args:
            df: DataFrame before transformation; must contain a "year" column
            min_year: Earliest year (inclusive) to keep. Defaults to 1980
        Returns:
            DataFrame: A DataFrame after transformation
        Raises:
            Exception: Any error raised while filtering is logged and re-raised
        """
        try:
            # Bracket access avoids any clash between a column name and a
            # DataFrame attribute; it resolves to the same column as df.year.
            df = df.filter(df["year"] >= min_year)
            logging.info("Successfully filtered data")
            return df
        except Exception as e:
            logging.error(f"Error in data filtering: {e}")
            raise

    def convert_date_into_years(self, df: DataFrame) -> DataFrame:
        """
        Converts date into years

        Adds a "year" column extracted from "date" and then drops "date".

        Args:
            df: DataFrame before transformation; must contain a "date" column
        Returns:
            DataFrame: DataFrame after "date" feature transformation
        Raises:
            Exception: Any error raised during the conversion is logged and re-raised
        """
        try:
            df = df.withColumn("year", year("date")).drop("date")
            logging.info("Successfully changed date into years")
            return df
        except Exception as e:
            logging.error(f"Error in converting date into years: {e}")
            raise

    def string_into_numeric(self, df: DataFrame) -> Tuple[
        Annotated[DataFrame, "df"],
        Annotated[list, "categorical_features"],
        Annotated[list, "numeric_features"],
        Annotated[list, "targets"]
        ]:
        """
        Changes features with string data type into numeric

        "home_team" and "away_team" are indexed against one shared list of
        teams, "city", "country" and "tournament" are indexed individually, and
        "neutral" becomes 1/0. The encoded columns replace the raw ones under
        their original names and the original column order is restored.

        Args:
            df: DataFrame before transformation
        Returns:
            Tuple[
            Annotated[DataFrame, "df"],
            Annotated[list, "categorical_features"],
            Annotated[list, "numeric_features"],
            Annotated[list, "targets"]
            ]:
            - df: DataFrame after transformation
            - categorical_features: List of categorical features
            - numeric_features: List of numeric features
            - targets: List of targets
        Raises:
            Exception: Any error raised during the encoding is logged and re-raised
        """
        try:
            categorical_features = ["away_team", "city", "country", "home_team", "year", "tournament"]
            numeric_features = ["neutral"]
            targets = ["home_score", "away_score"]
            original_columns = df.columns
            # Every input column except the targets and "year" is replaced by
            # its "<name>_index" counterpart further down.
            columns_to_transformation = [
                org_col for org_col in original_columns
                if org_col not in targets and org_col != "year"
            ]

            # One shared team list, so a team gets the same index whether it
            # appears in "home_team" or in "away_team".
            combined_home_away_team = (
                df.select(col("home_team").alias("team"))
                .union(df.select(col("away_team").alias("team")))
                .distinct()
            )

            team_indexer = StringIndexer(inputCol="team", outputCol="team_index")
            indexed_team = team_indexer.fit(combined_home_away_team).transform(combined_home_away_team)

            df_with_home_index = (
                df.join(
                    indexed_team.withColumnRenamed("team_index", "home_team_index"),
                    df.home_team == indexed_team.team,
                    "left",
                )
                .drop("team")
            )

            df = (
                df_with_home_index.join(
                    indexed_team.withColumnRenamed("team_index", "away_team_index"),
                    df_with_home_index.away_team == indexed_team.team,
                    "left",
                )
                .drop("team")
            )

            for cat in ["city", "country", "tournament"]:
                string_indexer = StringIndexer(inputCol=cat, outputCol=cat + "_index")
                df = string_indexer.fit(df).transform(df)

            # `== True` builds a Spark column expression here; it is not a
            # Python truthiness test, so it must not be rewritten as `is True`.
            df = df.withColumn("neutral_index", when(col("neutral") == True, 1).otherwise(0))

            # Remove all raw columns in a single projection, then hand each
            # encoded column the name of the raw column it replaces.
            df = df.drop(*columns_to_transformation)
            encoded_columns = set(df.columns)
            for raw_name in columns_to_transformation:
                index_name = raw_name + "_index"
                if index_name in encoded_columns:
                    df = df.withColumnRenamed(index_name, raw_name)

            # Restore the caller's original column order.
            df = df.select(*original_columns)
            logging.info("Successfully converted string features into numeric")
            return df, categorical_features, numeric_features, targets
        except Exception as e:
            logging.error(f"Error while changing string features into numeric: {e}")
            raise

In [8]:
def apply_transform_strategy(spark: SparkSession, df: DataFrame) -> Tuple[
        Annotated[List[str], "categorical_features"], 
        Annotated[List[str], "numeric_features"],
        Annotated[List[str], "targets"]
        ]:
    """
    Runs the full TransformData pipeline and stores the result

    The steps run in this order: drop rows with empty fields, convert "date"
    into "year", filter the rows, encode string features as numbers. The
    resulting DataFrame is written through ``DataOperations.save_data`` to the
    "cleaned_data" table; it is not returned to the caller.

    Args:
        spark: Active SparkSession, handed to DataOperations
        df: Dataset for transforming
    Returns:
        Tuple[
        Annotated[List[str], "categorical_features"], 
        Annotated[List[str], "numeric_features"],
        Annotated[List[str], "targets"]
        ]:
            - categorical_features: List of categorical columns
            - numeric_features: List of numeric columns
            - targets: List of targets
    Raises:
        Exception: Any error from a pipeline step or from saving is logged and re-raised
    """
    try:
        transformer = TransformData()

        without_empty_rows = transformer.check_empty_fields(df)
        with_year_column = transformer.convert_date_into_years(without_empty_rows)
        filtered_df = transformer.filter_data(with_year_column)
        encoded = transformer.string_into_numeric(df=filtered_df)
        encoded_df, categorical_features, numeric_features, targets = encoded

        # Persist the cleaned frame; only the feature lists go back to the caller.
        DataOperations(spark).save_data(df=encoded_df, table_name="cleaned_data")

        return categorical_features, numeric_features, targets
    except Exception as error:
        logging.error(f"Error in data transformation: {error}")
        raise error