In [1]:
%run /home/jovyan/work/model/src/preprocess_data.ipynb import PreProcess
%run /home/jovyan/work/operations/db_operations.ipynb import DataOperations

In [3]:
import logging
from pyspark.sql import SparkSession
from typing import List, Annotated, Tuple

In [None]:
def preprocess(spark: SparkSession,
               categorical_features: List[str],
               numeric_features: List[str],
               targets: List[str]
               ) -> Tuple[
                   Annotated[dict, "training_dataset"], 
                   Annotated[dict, "validation_dataset"],
                   Annotated[dict, "test_dataset"],
                   Annotated[list, "merge_models"],
                   Annotated[list, "inputs"]
                   ]:
    """
    Load the stored datasets and turn them into model-ready inputs.

    The function reads the "cleaned_data", "train", "val" and "test" datasets
    through ``DataOperations.load_data``, builds the embedding/input layers from
    the full cleaned dataset with ``PreProcess.embedding_categorical_data`` and
    then prepares each split with ``PreProcess.prepare_for_model``.

    NOTE(review): any persistence of the processed datasets happens inside the
    ``PreProcess`` / ``DataOperations`` helpers, which are not visible from
    this cell - confirm there before relying on it.

    Args:
        spark: Active SparkSession, passed on to ``DataOperations``
        categorical_features: List of categorical columns
        numeric_features: List of numeric columns
        targets: List of targets
    Returns:
        A 5-tuple, in this order:
            - training_dataset: result of ``prepare_for_model`` for the "train" data
            - validation_dataset: result of ``prepare_for_model`` for the "val" data
            - test_dataset: result of ``prepare_for_model`` for the "test" data
            - merge_models: first value returned by ``embedding_categorical_data``
                            (presumably the concatenated continuous inputs and
                            categorical embeddings - verify against PreProcess)
            - inputs: second value returned by ``embedding_categorical_data``
                      (presumably one input layer per feature - verify against PreProcess)
    Raises:
        Exception: any error raised while loading or preparing the data is
                   logged and re-raised unchanged.
    """
    try:
        logging.info("Started preprocessing")
        data_oper = DataOperations(spark=spark)
        df = data_oper.load_data("cleaned_data")
        train = data_oper.load_data("train")
        val = data_oper.load_data("val")
        test = data_oper.load_data("test")

        # Named `preprocessor` so the instance does not shadow this function's own name.
        preprocessor = PreProcess()

        # Embeddings are built from the full cleaned dataset, not from a single split.
        merge_models, inputs = preprocessor.embedding_categorical_data(
            df=df,
            categorical_features=categorical_features,
            numeric_features=numeric_features,
        )

        training_dataset = preprocessor.prepare_for_model(
            df=train,
            df_name="training",
            categorical_features=categorical_features,
            number_features=numeric_features,
            targets=targets,
        )

        # The "val" data must end up in `validation_dataset` and the "test" data in
        # `test_dataset`; previously the two were assigned to each other's variable,
        # so callers received them swapped.
        validation_dataset = preprocessor.prepare_for_model(
            df=val,
            df_name="validation",
            categorical_features=categorical_features,
            number_features=numeric_features,
            targets=targets,
        )

        test_dataset = preprocessor.prepare_for_model(
            df=test,
            df_name="test",
            categorical_features=categorical_features,
            number_features=numeric_features,
            targets=targets,
        )
        logging.info("Successfully finished preprocessing \n")
        return training_dataset, validation_dataset, test_dataset, merge_models, inputs
    except Exception as e:
        logging.error(f"Error in data preprocessing: {e}")
        # Bare `raise` re-raises the active exception with its original traceback.
        raise