In [20]:
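# Uncomment to install the notebook's dependencies into the kernel's environment.
# %pip install pandas
# %pip install tensorflow
# %pip install numpy
# %pip install pyspark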

In [21]:
import logging
import tensorflow as tf
import numpy as np
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, year
from pyspark.ml.feature import StringIndexer
from typing import Tuple, Annotated, List, Dict, Union
from tensorflow.keras.layers import Input, Embedding, Reshape, Concatenate

In [22]:
class TransformData:
    """
    Turns a Spark DataFrame of match results into inputs for a Keras model.

    The methods are meant to be applied in this order: drop incomplete rows,
    reduce "date" to "year", index the categorical columns, split the rows,
    build the embedding input layers and export each split as numpy arrays.
    """

    def check_empty_fields(self, df: DataFrame) -> DataFrame:
        """
        Deletes rows with empty fields

        Args:
            df: DataFrame before transformation
        Returns:
            DataFrame: DataFrame after transformation
        Raises:
            Exception: Any error raised while dropping rows is logged and re-raised
        """
        try:
            return df.dropna()
        except Exception as e:
            logging.error(f"Error while cleaning 'results' dataset: {e}")
            raise

    def convert_date_into_years(self, df: DataFrame) -> DataFrame:
        """
        Converts date into years

        Args:
            df: DataFrame before transformation; must contain a "date" column
        Returns:
            DataFrame: DataFrame with a new "year" column and without "date"
        Raises:
            Exception: Any error raised during the conversion is logged and re-raised
        """
        try:
            return df.withColumn("year", year("date")).drop("date")
        except Exception as e:
            logging.error(f"Error in converting date into years: {e}")
            raise

    def string_into_numeric(self, df: DataFrame) -> Tuple[
        Annotated[DataFrame, "df"],
        Annotated[list, "categorical_features"],
        Annotated[list, "numeric_features"],
        Annotated[list, "labels"]
        ]:
        """
        Changes features with string data type into numeric

        "home_team" and "away_team" are indexed with one shared StringIndexer so
        a team gets the same index on either side. "city", "country" and
        "tournament" each get their own indexer, and "neutral" becomes 1/0.
        "year" is passed through unchanged.

        Args:
            df: DataFrame before transformation
        Returns:
            Tuple[
            Annotated[DataFrame, "df"],
            Annotated[list, "categorical_features"],
            Annotated[list, "numeric_features"],
            Annotated[list, "labels"]
            ]:
            - df: DataFrame after transformation, with the original column
              names and column order
            - categorical_features: List of categorical features
            - numeric_features: List of numeric features
            - labels: List of labels
        Raises:
            Exception: Any error raised during the transformation is logged and re-raised
        """
        try:
            categorical_features = ["away_team", "city", "country", "home_team", "year", "tournament"]
            numeric_features = ["neutral"]
            labels = ["home_score", "away_score"]
            original_columns = df.columns

            # One vocabulary for both team columns: fit on the union of all team names.
            all_teams = df.select(col("home_team").alias("team")) \
                          .union(df.select(col("away_team").alias("team"))) \
                          .distinct()
            team_indexer = StringIndexer(inputCol="team", outputCol="team_index")
            indexed_team = team_indexer.fit(all_teams).transform(all_teams)

            # Attach the shared index once per side, then discard the join key.
            for team_col in ("home_team", "away_team"):
                lookup = indexed_team.withColumnRenamed("team_index", team_col + "_index")
                df = df.join(lookup, df[team_col] == lookup["team"], "left").drop("team")

            for cat in ("city", "country", "tournament"):
                string_indexer = StringIndexer(inputCol=cat, outputCol=cat + "_index")
                df = string_indexer.fit(df).transform(df)

            df = df.withColumn("neutral_index", when(col("neutral") == True, 1).otherwise(0))

            # Swap each encoded column in for its source column. Only columns that
            # actually received an "_index" twin are replaced, so any other column
            # survives for the final select.
            available = set(df.columns)
            encoded = [name for name in original_columns if name + "_index" in available]
            for name in encoded:
                df = df.drop(name).withColumnRenamed(name + "_index", name)

            # Restore the caller's column order.
            df = df.select(*original_columns)
            return df, categorical_features, numeric_features, labels
        except Exception as e:
            logging.error(f"Error while changing string features into numeric: {e}")
            raise

    def divide_data(self, df: DataFrame, seed: Union[int, None] = None) -> Tuple[
        Annotated[DataFrame, "train_df"],
        Annotated[DataFrame, "val_df"],
        Annotated[DataFrame, "test_df"]
        ]:
        """
        Divides data into training, validation and test datasets

        The rows are first split 80/20 into a train+validation pool and a test
        set; the pool is then split 75/25. This gives roughly 60% training, 20%
        validation and 20% test rows (the split is random, so sizes are
        approximate).

        Args:
            df: DataFrame before splitting
            seed: Optional seed for reproducible splits. With the default
                  (None) every call produces a different split.
        Returns:
            Tuple[Annotated[DataFrame, "train_df"],
                  Annotated[DataFrame, "val_df"],
                  Annotated[DataFrame, "test_df"]]:
                - Training dataset
                - Validation dataset
                - Testing dataset
        Raises:
            Exception: Any error raised while splitting is logged and re-raised
        """
        try:
            # Use a different seed for the second split so it is not driven by
            # the same random stream as the first one.
            second_seed = None if seed is None else seed + 1
            train_val_df, test_df = df.randomSplit([0.8, 0.2], seed=seed)
            train_df, val_df = train_val_df.randomSplit([0.75, 0.25], seed=second_seed)
            return train_df, val_df, test_df
        except Exception as e:
            logging.error(f"Error while dividing data into training, validation, and test datasets: {e}")
            raise

    def embedding_categorical_data(self,
                                   df: DataFrame,
                                   categorical_features: List[str],
                                   numeric_features: List[str]
                                   ) -> Tuple[tf.Tensor, List[tf.Tensor]]:
        """
        Applies embedding on categorical data and prepares inputs for neural network

        Each categorical feature gets an Input named "input_<feature>" followed
        by a trainable Embedding. All numeric features share a single Input
        named "input_number_features".

        Args:
            df: DataFrame containing a dataset; must contain "home_team" and "away_team"
            categorical_features: List of categorical features
            numeric_features: List of numeric features
        Returns:
            Tuple[tf.Tensor, List[tf.Tensor]]:
                - A tensor representing concatenated continuous features Inputs
                  and embeddings for the categorical features
                - A List containing inputs layers for each feature
        Raises:
            Exception: Any error raised while building the layers is logged and re-raised
        """

        try:
            # NOTE: this is a process-wide TensorFlow switch, not local to this call.
            tf.debugging.enable_check_numerics()
            embedded_parts = []
            input_layers = []

            # Both team columns share one index space, so they share one vocabulary size.
            team_vocab_size = df.select(col("home_team").alias("team")) \
                                .union(df.select(col("away_team").alias("team"))) \
                                .distinct() \
                                .count()

            for cat in categorical_features:
                if cat in ("home_team", "away_team"):
                    vocab_size = team_vocab_size
                else:
                    vocab_size = df.select(cat).distinct().count()

                # An Embedding can only look up indices below its input_dim. A feature
                # that is not indexed from zero (e.g. "year", which string_into_numeric
                # passes through as raw years) has values far above its distinct count,
                # so the table must also cover the largest value present.
                max_value = df.agg({cat: "max"}).first()[0]
                try:
                    highest_index = int(max_value)
                except (TypeError, ValueError):
                    # Empty or non-numeric column: fall back to the distinct count.
                    highest_index = 0
                input_dim = max(vocab_size, highest_index) + 1

                output_dim = min(50, (vocab_size // 2) + 1)

                inpt = Input(shape=(1,), name="input_" + cat)
                embed = Embedding(input_dim,
                                  output_dim,
                                  trainable=True,
                                  embeddings_initializer=tf.initializers.random_normal) \
                                  (inpt)

                # Collapse the length-1 sequence axis so the parts can be concatenated.
                embedded_parts.append(Reshape(target_shape=(output_dim,))(embed))
                input_layers.append(inpt)

            num_input = Input(shape=(len(numeric_features),), name="input_number_features")
            embedded_parts.append(num_input)
            input_layers.append(num_input)

            merge_models = Concatenate()(embedded_parts)
            return merge_models, input_layers
        except Exception as e:
            logging.error(f"Error while embedding categorical features: {e}")
            raise

    def prepare_for_model(self,
                          df: DataFrame,
                          categorical_features: List[str],
                          number_features: List[str],
                          labels: List[str]
                          ) -> Dict[str, Dict[str, np.ndarray]]:
        """
        Prepares data for model entry

        The whole DataFrame is collected to the driver with toPandas().

        Args:
            df: DataFrame to prepare
            categorical_features: List of categorical features
            number_features: List of numeric features
            labels: List of labels
        Returns:
            Dict[str, Dict[str, np.ndarray]]:
                - A dictionary with two keys:
                    - "input_features": A dictionary with "input_<feature>" as
                       key for every categorical feature, plus
                       "input_number_features" holding all numeric features as
                       one 2-D array
                    - "labels": A dictionary with every label name as a key
                       and each value is a numpy array containing the label data
        Raises:
            Exception: Any error raised during the conversion is logged and re-raised
        """
        try:
            pandas_df = df.toPandas()

            input_features = {"input_" + cat: pandas_df[cat].values for cat in categorical_features}
            input_features["input_number_features"] = pandas_df[number_features].values

            return {
                "input_features": input_features,
                "labels": {lab: pandas_df[lab].values for lab in labels},
                }
        except Exception as e:
            logging.error(f"Error while preparing data for model: {e}")
            # Re-raise so callers never receive a silent None instead of a dataset.
            raise

In [23]:
def apply_transform_strategy(df: DataFrame) -> Tuple[
        Annotated[dict, "training_dataset"], 
        Annotated[dict, "test_dataset"],
        Annotated[dict, "validation_dataset"],
        Annotated[list, "merge_models"],
        Annotated[list, "inputs"]
        ]:
    """
    Handles TransformData operations 

    Runs the full preparation pipeline: drop incomplete rows, convert "date"
    to "year", encode string features, split the rows, build the embedding
    input layers and export every split as numpy arrays.
    
    Args:
        df: Dataset for transforming
    Returns:
        Tuple[
        Annotated[dict, "training_dataset"], 
        Annotated[dict, "test_dataset"],
        Annotated[dict, "validation_dataset"],
        Annotated[list, "merge_models"],
        Annotated[list, "inputs"]
        ]:
            - training_dataset: Dictionary with training dataset
            - test_dataset: Dictionary with test dataset
            - validation_dataset: Dictionary with validation dataset
            - merge_models: A tensor representing concatenated continuous features Inputs 
                            and embeddings for the categorical features 
            - inputs: A List containing inputs layers for each feature
    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
    """
    try:
        transformer = TransformData()
        complete_rows = transformer.check_empty_fields(df)
        rows_with_year = transformer.convert_date_into_years(complete_rows)
        encoded_df, categorical_features, numeric_features, labels = transformer.string_into_numeric(df=rows_with_year)
        training_df, validation_df, test_df = transformer.divide_data(df=encoded_df)

        # Embedding sizes are derived from the full encoded dataset, not from a single split.
        merge_models, inputs = transformer.embedding_categorical_data(df=encoded_df,
                                                                      categorical_features=categorical_features,
                                                                      numeric_features=numeric_features)

        def to_model_input(split_df):
            # Same feature/label configuration for every split.
            return transformer.prepare_for_model(df=split_df,
                                                 categorical_features=categorical_features,
                                                 number_features=numeric_features,
                                                 labels=labels)

        training_dataset = to_model_input(training_df)
        # Each dataset is built from its own split: test from test_df and
        # validation from validation_df.
        test_dataset = to_model_input(test_df)
        validation_dataset = to_model_input(validation_df)

        return training_dataset, test_dataset, validation_dataset, merge_models, inputs
    except Exception as e:
        logging.error(f"Error in preparing data for model: {e}")
        raise