In [3]:
%run /home/jovyan/work/database_operations/db_operations.ipynb import DataOperations

In [4]:
import logging
from typing import Tuple, Annotated, List, Dict, Optional

import numpy as np
import tensorflow as tf
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, monotonically_increasing_id
from tensorflow.keras.layers import Input, Embedding, Reshape, Concatenate

In [5]:
class PreProcess:
    """
    Class for preparing dataset into model predictions
    """

    def divide_data(self, df: DataFrame, seed: Optional[int] = None) -> Tuple[
        Annotated[DataFrame, "train_df"], 
        Annotated[DataFrame, "val_df"],
        Annotated[DataFrame, "test_df"]
        ]:
        """
        Divides data into training, validation and test datasets

        The data is first split 80/20 into (train + validation) / test, and the
        80% part is then split 75/25 into train / validation, which gives
        roughly 60% / 20% / 20% of the original rows.

        Args:
            df: DataFrame before splitting
            seed: Optional seed forwarded to ``randomSplit`` so the split can be
                reproduced. With the default (None) each call produces a
                different random split, exactly as before.
        Returns:
            Tuple[Annotated[DataFrame, "train_df"], 
                  Annotated[DataFrame, "val_df"],
                  Annotated[DataFrame, "test_df"]]:
                - Training dataset 
                - Validation dataset
                - Testing dataset
        Raises:
            Exception: Any error raised while splitting is logged and re-raised.
        """
        try:
            # Use a different seed for the second split so that it does not
            # replay the random sequence that produced the first split.
            second_seed = None if seed is None else seed + 1

            train_val_df, test_df = df.randomSplit([0.8, 0.2], seed=seed)
            train_df, val_df = train_val_df.randomSplit([0.75, 0.25], seed=second_seed)
            logging.info("Successfully divided data into training, validation and test datasets")
            return train_df, val_df, test_df
        except Exception as e:
            logging.error(f"Error while dividing data into training, validation, and test datasets: {e}")
            raise
        
    def standardize_data(self, 
                         train_df: DataFrame,
                         val_df: DataFrame,
                         test_df: DataFrame,
                         targets: List[str]
                         ) -> Tuple[DataFrame, DataFrame, DataFrame]:
        """
        Standardizes all features in all datasets (training, validation, test)

        Every column of ``train_df`` that is not listed in ``targets`` is treated
        as a feature. The scaler (zero mean, unit standard deviation) is fitted
        on the training dataset only and then applied to all three datasets, so
        validation and test data are scaled with the training statistics.
        Target columns are passed through unchanged.

        Args:
            train_df: Training dataset
            val_df: Validation dataset
            test_df: Test dataset
            targets: List of target features
        Returns:
            Tuple[DataFrame, DataFrame, DataFrame]:
                - Standardized training dataset
                - Standardized validation dataset
                - Standardized test dataset
              Each result holds the scaled feature columns (original names and
              order) followed by the target columns.
        Raises:
            Exception: Any error raised during standardization is logged and
                re-raised.
        """
        try:
            feature_cols = [name for name in train_df.columns if name not in targets]

            assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol="features")
            scaler = StandardScaler(inputCol="features",
                                    outputCol="scaled_features",
                                    withMean=True,
                                    withStd=True)

            # Fit on the training data only to avoid leaking validation/test
            # statistics into the scaling.
            scaler_model = scaler.fit(assembler.transform(train_df))

            def _scale(dataset: DataFrame) -> DataFrame:
                # Both transformers only append a column, so the target columns
                # stay on the same row as their features. This avoids
                # re-attaching targets through a join on generated row ids,
                # which is not guaranteed to line up across two DataFrames.
                scaled = scaler_model.transform(assembler.transform(dataset))
                # vector_to_array (Spark 3.0+) unpacks the ML vector natively,
                # without a round trip through Python RDDs.
                scaled_values = vector_to_array(col("scaled_features"))
                feature_exprs = [
                    scaled_values[position].alias(name)
                    for position, name in enumerate(feature_cols)
                ]
                return scaled.select(*feature_exprs, *targets)

            final_train = _scale(train_df)
            final_val = _scale(val_df)
            final_test = _scale(test_df)
            logging.info("Successfully standardized datasets")
            return final_train, final_val, final_test
        except Exception as e:
            logging.error(f"Error while data standardization: {e}")
            raise

    def embedding_categorical_data(self, 
                                   df: DataFrame, 
                                   categorical_features: List[str], 
                                   numeric_features: List[str]
                                   ) -> Tuple[tf.Tensor, List[tf.Tensor]]:
        """
        Applies embedding on categorical data and prepares inputs for neural network

        One ``Input`` + ``Embedding`` pair is created per categorical feature and
        a single ``Input`` is created for all numeric features together. The
        vocabulary size of each categorical feature is the number of distinct
        values found in ``df``; ``home_team`` and ``away_team`` share one
        vocabulary built from the distinct values of both columns.

        Args:
            df: DataFrame containing a dataset
            categorical_features: List of categorical features
            numeric_features: List of numeric features (only its length is used,
                to size the numeric input)
        Returns:
            Tuple[tf.Tensor, List[tf.Tensor]]:
                - A tensor representing concatenated continuous features Inputs 
                  and embeddings for the categorical features 
                - A List containing input layers: one per categorical feature
                  (named "input_<feature>") followed by the numeric input
                  (named "input_number_features")
        Raises:
            Exception: Any error raised while building the layers is logged and
                re-raised.
        """

        try:
            # Make TensorFlow raise as soon as an op produces NaN/Inf values.
            tf.debugging.enable_check_numerics()
            models = []
            inputs = []

            # Shared home/away vocabulary size. Computed lazily (and only once)
            # so that no Spark job is run, and no missing-column error is
            # raised, when neither team column is requested.
            team_vocab_size = None

            for cat in categorical_features:
                if cat in ("home_team", "away_team"):
                    if team_vocab_size is None:
                        team_vocab_size = df.select(col("home_team").alias("team")) \
                                            .union(df.select(col("away_team").alias("team"))) \
                                            .distinct() \
                                            .count()
                    vocab_size = team_vocab_size
                else:
                    vocab_size = df.select(cat).distinct().count()

                # Embedding width: half the vocabulary size plus one, capped at 50.
                output_dim = min(50, (vocab_size // 2) + 1)

                inpt = Input(shape=(1,), name="input_" + cat)
                # NOTE(review): the extra row (vocab_size + 1) presumably leaves
                # room for one id beyond the observed values - confirm against
                # the encoder that produces the category ids.
                embed = Embedding(vocab_size + 1,
                                  output_dim,
                                  trainable=True,
                                  embeddings_initializer=tf.initializers.random_normal) \
                                  (inpt)

                # Drop the length-1 sequence axis so the embedding can be
                # concatenated with the flat numeric input.
                embed_reshaped = Reshape(target_shape=(output_dim,))(embed)
                models.append(embed_reshaped)
                inputs.append(inpt)

            num_input = Input(shape=(len(numeric_features),), name="input_number_features")
            models.append(num_input)
            inputs.append(num_input)

            merge_models = Concatenate()(models)
            logging.info("Successfully created inputs and embedding layers for model deployment")
            return merge_models, inputs
        except Exception as e:
            logging.error(f"Error while embedding categorical features: {e}")
            raise


    def prepare_for_model(self, 
                          df: DataFrame, 
                          df_name: str,
                          categorical_features: List[str], 
                          number_features: List[str], 
                          targets: List[str]
                          ) -> Dict[str, Dict[str, np.ndarray]]:
        """
        Prepares data for model entry

        The Spark DataFrame is collected to pandas and split into the named
        arrays expected by the model inputs and outputs.

        Args:
            df: DataFrame to prepare
            df_name: Name of data to prepare (used only in the log message)
            categorical_features: List of categorical features
            number_features: List of numeric features
            targets: List of targets
        Returns:
            Dict[str, Dict[str, np.ndarray]]: 
                - A dictionary with two keys:
                    - "input_features": A dictionary with one entry per
                       categorical feature, keyed "input_<feature>", plus the
                       key "input_number_features" holding all numeric features
                       as a single 2-D array
                    - "targets": A dictionary mapping each target name to a
                       numpy array with its values
        Raises:
            Exception: Any error raised while preparing the data is logged and
                re-raised, so callers never receive None in place of the
                dictionary.
        """
        try:
            # Collects the whole dataset to the driver.
            pandas_df = df.toPandas()

            input_features = {
                "input_" + cat: pandas_df[cat].values
                for cat in categorical_features
            }
            input_features["input_number_features"] = pandas_df[number_features].values

            input_dict = {
                "input_features": input_features,
                "targets": {tar: pandas_df[tar].values for tar in targets},
                }
            logging.info(f"Successfully prepared dataset for model: {df_name}")
            return input_dict           
        except Exception as e:
            logging.error(f"Error while preparing data for model: {e}")
            # Propagate the failure, consistent with the other PreProcess methods.
            raise