IMPORTS

In [None]:
import sys
import logging
from typing import Tuple, List, Dict, cast
from dataclasses import dataclass
import datetime as dt
import pandas as pd
import numpy as np
from joblib import Memory
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

UTILS

In [None]:
def load_data(
    train_path: str = './data/train.csv',
    test_path: str = './data/test.csv',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the training and test datasets from CSV files.

    Args:
        train_path (str, optional): Path of the training CSV. Defaults to './data/train.csv'.
        test_path (str, optional): Path of the test CSV. Defaults to './data/test.csv'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The (train, test) DataFrames, in that order.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

In [None]:
def setup_logger(name: str | None = None) -> logging.Logger:
    """A function to setup the logger

    Args:
        name (str | None, optional): Logger name. Defaults to None.

    Returns:
        logging.Logger: Configured logger
    """

    file_handler = logging.FileHandler("logs.log", mode="a")
    stream_handler = logging.StreamHandler(stream=sys.stdout)

    logger = logging.getLogger(name or __name__)
    if logger.hasHandlers():
        logger.handlers.clear()

    logging.basicConfig(
        level=logging.WARNING,
        format="%(levelname)s %(name)s %(asctime)s | %(message)s",
        datefmt="%H:%M:%S",
        handlers=[file_handler, stream_handler],
    )

    
    logger.setLevel(logging.INFO)

    return logger

# Module-level logger named "Main"; the functions defined below in this file
# log through this global.
logger = setup_logger("Main")
logger.info("Logger setup complete")

In [None]:
@dataclass
class ColumnTypes:
    """
    A class to store the features and target columns of the dataset.

    Attributes:
    - ids: Columns that are used as the ID of the dataset.
    - labels: Columns that are used as the labels of the dataset. Label
      encoding is to be used when there is a sequential correlation between
      the labels.
    - numerical: Columns that are numerical in nature.
    - categorical: Columns that are categorical in nature.
    - targets: Columns that are the target of the dataset.
    """

    ids: List[str]
    labels: List[str]
    numerical: List[str]
    categorical: List[str]
    targets: List[str]

    def to_list(self) -> List[str]:
        """A function that returns a list of all the columns in the ColumnTypes object.

        Returns:
            List[str]: All columns, in the order ids, labels, numerical,
            categorical, targets.
        """
        return self.ids + self.labels + self.numerical + self.categorical + self.targets

    def filter(self, columns: List[str], keep_ids_targets : bool = False) -> "ColumnTypes":
        """A function that returns a subset of the ColumnTypes object.

        Args:
            columns (List[str]): A list of columns to be taken from the ColumnTypes object.
            keep_ids_targets (bool, optional): When True, the ids and targets are
                carried over in full regardless of ``columns``. Defaults to False.

        Returns:
            ColumnTypes: A ColumnTypes object that contains the subset of columns.
            Within each type the columns follow the order of ``columns``.
        """

        def pick(names: List[str]) -> List[str]:
            return [col for col in columns if col in names]

        return ColumnTypes(
            # Copy so the new object never shares list objects with this one.
            ids=list(self.ids) if keep_ids_targets else pick(self.ids),
            labels=pick(self.labels),
            numerical=pick(self.numerical),
            categorical=pick(self.categorical),
            targets=list(self.targets) if keep_ids_targets else pick(self.targets),
        )

    def adapt_names(self, columns: List[str]) -> "ColumnTypes":
        """A function that adapts the names of the columns in the ColumnTypes object.
        The basic idea is that after the pipeline transformations, the names of the columns will be changed.
        So we want to preserve the types of the original columns and update them to any
        new name that contains ``__<original name>``.

        When several original names match the same new column (one name being a
        prefix of another, e.g. "cap" and "cap-shape"), only the longest matching
        name decides the type, so the column is not reported under both types.
        Only names known to this object take part in that comparison.

        Args:
            columns (List[str]): The column names produced by the pipeline.

        Returns:
            ColumnTypes: A ColumnTypes object that contains the adapted names of the columns.
        """
        groups: Dict[str, List[str]] = {
            "ids": self.ids,
            "labels": self.labels,
            "numerical": self.numerical,
            "categorical": self.categorical,
            "targets": self.targets,
        }
        adapted: Dict[str, List[str]] = {role: [] for role in groups}

        for col in columns:
            best_length = -1
            owners: List[str] = []
            for role, names in groups.items():
                for original in names:
                    if f"__{original}" not in col:
                        continue
                    if len(original) > best_length:
                        # A longer match is more specific: it replaces earlier owners.
                        best_length = len(original)
                        owners = [role]
                    elif len(original) == best_length and role not in owners:
                        owners.append(role)
            for role in owners:
                adapted[role].append(col)

        return ColumnTypes(**adapted)
        
@dataclass
class ColumnSet:
    """A group of dataset columns that is handled as one unit.

    Attributes:
        is_optional: Whether the group is optional (True) or not (False).
        columns: Names of the columns that belong to the group.
    """

    is_optional: bool
    columns: List[str]

def get_possible_columns_configs(column_sets : List[ColumnSet]) -> List[List[str]]:
    """A function that returns the possible column configurations for the dataset.

    Every configuration contains the columns of all mandatory sets followed by
    the columns of one combination of the optional sets. The mandatory-only
    configuration comes first (and is skipped when there are no mandatory
    columns); the remaining ones enumerate every non-empty combination of
    optional sets, i.e. 2**n - 1 entries for n optional sets.

    Args:
        column_sets (List[ColumnSet]): The mandatory and optional column sets.

    Returns:
        List[List[str]]: A list of lists that contain the possible column configurations.
    """
    optional_columns_sets = [
        column_set for column_set in column_sets if column_set.is_optional
    ]
    mandatory_columns: List[str] = [
        column
        for column_set in column_sets
        if not column_set.is_optional
        for column in column_set.columns
    ]

    n_optional = len(optional_columns_sets)
    if n_optional > 10:
        logger.warning(
            f"The number of optional columns sets is too high (more than 10) - {n_optional}"
        )
    else:
        logger.info(
            f"The number of optional columns sets is {n_optional}"
        )

    possible_columns_configs: List[List[str]] = []
    if mandatory_columns:
        # Append a copy so callers mutating the result never touch other configs.
        possible_columns_configs.append(list(mandatory_columns))

    # Each value of `mask` is a bitmap: bit j set means optional set j is included.
    for mask in range(1, 2 ** n_optional):
        columns_config: List[str] = list(mandatory_columns)
        for bit, optional_set in enumerate(optional_columns_sets):
            if mask & (1 << bit):
                columns_config.extend(optional_set.columns)

        possible_columns_configs.append(columns_config)

    return possible_columns_configs

def verify_all_columns_types_exist(column_types: ColumnTypes, X: pd.DataFrame) -> bool:
    """A function that verifies if all the columns in the dataset exist in the ColumnTypes object.

    Args:
        column_types (ColumnTypes): The declared column types.
        X (pd.DataFrame): A pandas DataFrame that contains the features of the dataset.

    Returns:
        bool: True when every column of X is declared in ``column_types``.
        False (after logging the first offending column) otherwise.
    """
    # Build the lookup once instead of re-concatenating the lists per column.
    known_columns = set(column_types.to_list())

    for column in X.columns:
        if column not in known_columns:
            logger.error(f"Column {column} does not exist in the column types")
            return False

    return True

def verify_all_columns_sets_exist(column_sets: List[ColumnSet], X: pd.DataFrame) -> bool:
    """A function that verifies if all the columns of the ColumnSet objects exist in the dataset.

    Args:
        column_sets (List[ColumnSet]): The column sets to check.
        X (pd.DataFrame): A pandas DataFrame that contains the features of the dataset.

    Returns:
        bool: True when every column named by the column sets is a column of X.
        False (after logging the first missing column) otherwise.
    """
    for column_set in column_sets:
        for column in column_set.columns:
            if column not in X.columns:
                # The check is "set column missing from X", so say so.
                logger.error(f"Column {column} does not exist in the dataset")
                return False

    return True

SETUP

In [None]:
def get_columns_types() -> ColumnTypes:
    """Declare which role each column of the dataset plays.

    Returns:
        ColumnTypes: The id, label, numerical, categorical and target columns.
    """
    numerical_columns = ["cap-diameter", "stem-height", "stem-width"]
    categorical_columns = [
        "cap-shape",
        "cap-surface",
        "cap-color",
        "does-bruise-or-bleed",
        "gill-attachment",
        "gill-color",
        "stem-root",
        "stem-surface",
        "stem-color",
        "veil-type",
        "veil-color",
        "has-ring",
        "ring-type",
        "spore-print-color",
        "habitat",
        "season",
    ]

    return ColumnTypes(
        ids=["id"],
        labels=["gill-spacing"],
        numerical=numerical_columns,
        categorical=categorical_columns,
        targets=["class"],
    )

def get_column_sets() -> List[ColumnSet]:
    """Declare the groups of columns from which configurations are built.

    Returns:
        List[ColumnSet]: One ColumnSet per group, each flagged as optional or not.
    """
    # (is_optional, columns) pairs, in the order the sets are returned.
    groups = [
        (True, ["does-bruise-or-bleed"]),
        (False, ["stem-height", "stem-width"]),
        (True, ["cap-diameter", "cap-shape", "cap-surface", "cap-color"]),
        (True, ["gill-spacing", "gill-attachment", "gill-color"]),
        (True, ["stem-root", "stem-surface", "stem-color"]),
        (True, ["veil-type", "veil-color"]),
        (True, ["has-ring", "ring-type"]),
        (True, ["spore-print-color", "habitat", "season"]),
    ]
    return [
        ColumnSet(is_optional=optional, columns=names) for optional, names in groups
    ]

PIPELINES FUNCTIONS

In [None]:
def clean_categorical(X: pd.DataFrame, columns: List[str], name:str) -> pd.DataFrame:
    """A function that cleans the categorical data in the dataset.
    It will remove the outliers (categories that are less than 1% of the total data) and replace them with "nan".
    Additionally it will fill "real" NaNs with "nan" as well.

    The frequencies are computed on the non-missing values of the frame that is
    passed in. The input frame is left untouched; a cleaned copy is returned.

    Args:
        X (pd.DataFrame): The data to clean.
        columns (List[str]): The categorical columns of X to clean.
        name (str): Pipeline name, used only in the log messages.

    Returns:
        pd.DataFrame: A copy of X whose ``columns`` are cleaned and converted
        to the "category" dtype.
    """
    categorical_outliers_frequency_limit = 0.01
    logger.info(f"Pipeline {name}: Cleaning categorical data...")
    logger.info(
        f"Outliers frequency limit is {categorical_outliers_frequency_limit}"
    )

    # Work on a copy: mutating the caller's frame makes the result depend on
    # whether the (cached) pipeline step actually ran.
    X = X.copy()

    for column in columns:
        # Object dtype lets the "nan" placeholder be written even if the
        # column already has a categorical dtype.
        values = X[column].astype(object)

        # Use the counts Series directly: the name of the counts column
        # produced by to_frame() differs between pandas versions.
        counts = values.value_counts()
        threshold = counts.sum() * categorical_outliers_frequency_limit
        outliers = counts[counts < threshold].index.to_list()
        logger.info(f"Outliers for column '{column}' are {outliers}")

        X[column] = (
            values.where(~values.isin(outliers), "nan")
            .fillna("nan")
            .astype("category")
        )

    logger.info(f"Pipeline {name}: Cleaning categorical data complete")
    return X

def clean_numerical(X: pd.DataFrame, columns: List[str], name:str) -> pd.DataFrame:
    """Fill the missing values of numerical columns with the column mean.

    The mean is computed on the frame that is passed in. The input frame is
    left untouched; a cleaned copy is returned.

    Args:
        X (pd.DataFrame): The data to clean.
        columns (List[str]): The numerical columns of X to clean.
        name (str): Pipeline name, used only in the log messages.

    Returns:
        pd.DataFrame: A copy of X with the NaNs of ``columns`` replaced.
    """
    logger.info(f"Pipeline {name}: Cleaning numerical data...")

    # Work on a copy so the caller's frame is not modified as a side effect.
    X = X.copy()

    for column in columns:
        n_nans = X[column].isna().sum()
        if n_nans > 0:
            column_mean = X[column].mean()
            logger.info(f"Column '{column}' has {n_nans} NaNs. Filling with mean {column_mean}")
            X[column] = X[column].fillna(column_mean)

    logger.info(f"Pipeline {name}: Cleaning numerical data complete")
    return X

def encode_label(X: pd.DataFrame, columns: List[str], name:str) -> pd.DataFrame:
    """Replace the values of the given columns with integer codes.

    A fresh LabelEncoder is fitted per column on the frame that is passed in.
    The input frame is left untouched; an encoded copy is returned.

    Args:
        X (pd.DataFrame): The data to encode.
        columns (List[str]): The columns of X to label encode.
        name (str): Pipeline name, used only in the log messages.

    Returns:
        pd.DataFrame: A copy of X with ``columns`` label encoded.
    """
    logger.info(f"Pipeline {name}: Encoding labels...")

    # Work on a copy so the caller's frame is not modified as a side effect.
    X = X.copy()

    for column in columns:
        # Assign the raw array (positional). Wrapping it in a new Series would
        # align on a fresh 0..n-1 index and scramble / NaN-fill the values
        # whenever X does not carry the default index.
        X[column] = LabelEncoder().fit_transform(X[column])

    logger.info(f"Pipeline {name}: Encoding labels complete")
    return X

def encode_target(
    X: pd.DataFrame,
    name:str,
    column: str = "class",
    positive_label: str = "e",
) -> pd.DataFrame:
    """Encode the target column as 1.0 / 0.0.

    The input frame is left untouched; an encoded copy is returned.

    Args:
        X (pd.DataFrame): The data containing the target column.
        name (str): Pipeline name, used only in the log messages.
        column (str, optional): Name of the target column. Defaults to "class".
        positive_label (str, optional): Value that is mapped to 1.0; every
            other value (including missing ones) is mapped to 0.0. Defaults to "e".

    Returns:
        pd.DataFrame: A copy of X with ``column`` replaced by its float encoding.
    """
    logger.info(f"Pipeline {name}: Encoding target...")

    # Work on a copy so the caller's frame is not modified as a side effect.
    X = X.copy()
    X[column] = (X[column] == positive_label).astype(float)

    logger.info(f"Pipeline {name}: Encoding target complete")
    return X

def convert_raw_pipeline_output_to_df(X: np.ndarray, pipeline: Pipeline, pipeline_step: str = "column_transformer") -> pd.DataFrame:
    """A function that converts the output of a pipeline to a DataFrame.

    Args:
        X (np.ndarray): Raw pipeline output
        pipeline (Pipeline): Pipeline that was used to transform the data
        pipeline_step (str, optional): Name of the step from which the output
            feature names should be taken. Defaults to "column_transformer".

    Raises:
        ValueError: If ``pipeline_step`` is empty or is not a step of ``pipeline``.

    Returns:
        pd.DataFrame: A DataFrame that contains the transformed data.
    """

    logger.info(f"Converting raw pipeline output to DataFrame using pipeline_step: {pipeline_step}")

    if not pipeline_step:
        raise ValueError("Pipeline step cannot be empty")

    # Check membership explicitly: indexing named_steps with an unknown name
    # raises KeyError before any "not found" check could run.
    if pipeline_step not in pipeline.named_steps:
        raise ValueError(f"Pipeline step {pipeline_step} not found in the pipeline")

    pipeline_transformer = pipeline.named_steps[pipeline_step]
    if pipeline_transformer is None:
        raise ValueError(f"Pipeline step {pipeline_step} not found in the pipeline")

    features_names = cast(ColumnTransformer, pipeline_transformer).get_feature_names_out()

    return pd.DataFrame(X, columns=features_names)

def filter_columns(X: pd.DataFrame, columns: List[str], name:str) -> pd.DataFrame:
    """Select the given columns of X, in the given order.

    Args:
        X (pd.DataFrame): The data to select from.
        columns (List[str]): The columns to keep.
        name (str): Pipeline name, used only in the log messages.

    Returns:
        pd.DataFrame: The selection ``X[columns]``.
    """
    logger.info(f"Pipeline {name}: Filtering columns...")
    selected = X[columns]
    logger.info(f"Pipeline {name}: Filtering columns complete")
    return selected


PIPELINES

In [None]:
def create_preprocessing_pipeline(column_types: ColumnTypes, name:str = 'preprocessing_default') -> Pipeline:
    """Assemble the pipeline that cleans and encodes the raw dataset.

    The steps run in this order: numerical cleaning, categorical cleaning,
    label encoding, target encoding, and finally one-hot encoding of the
    categorical columns (all other columns are passed through unchanged).
    Step results are cached with joblib under "./cache/preprocessing/<name>".

    Args:
        column_types (ColumnTypes): The column roles used to configure the steps.
        name (str, optional): Pipeline name, used for logging and for the cache
            location. Defaults to 'preprocessing_default'.

    Returns:
        Pipeline: The unfitted preprocessing pipeline.
    """
    logger.info("Creating preprocessing pipeline...")

    one_hot_encoder = OneHotEncoder(drop=None, sparse_output=False, handle_unknown="error")
    column_transformer = ColumnTransformer(
        transformers=[("categorical", one_hot_encoder, column_types.categorical)],
        remainder="passthrough",
    )

    numerical_cleaner = FunctionTransformer(
        clean_numerical, kw_args={"columns": column_types.numerical, "name": name}
    )
    categorical_cleaner = FunctionTransformer(
        clean_categorical, kw_args={"columns": column_types.categorical, "name": name}
    )
    label_encoder = FunctionTransformer(
        encode_label, kw_args={"columns": column_types.labels, "name": name}
    )
    target_encoder = FunctionTransformer(encode_target, kw_args={"name": name})

    steps = [
        ("clean_numerical", numerical_cleaner),
        ("clean_categorical", categorical_cleaner),
        ("encode_labels", label_encoder),
        ("encode_targets", target_encoder),
        ("column_transformer", column_transformer),
    ]
    cache = Memory(location=f"./cache/preprocessing/{name}", verbose=0)
    preprocessing_pipeline = Pipeline(steps=steps, memory=cache)

    logger.info("Preprocessing pipeline created")
    return preprocessing_pipeline

def create_scaling_filtering_pipeline(column_types: ColumnTypes, name:str = 'scaling_filtering_default') -> Pipeline:
    """Assemble the pipeline that selects the feature columns and scales the numerical ones.

    The first step keeps every column of ``column_types`` except the ids and
    the targets; the second standardizes the numerical columns and passes the
    remaining ones through unchanged.

    Args:
        column_types (ColumnTypes): The column roles of the data the pipeline
            will receive.
        name (str, optional): Pipeline name, used only in the log messages.
            Defaults to 'scaling_filtering_default'.

    Returns:
        Pipeline: The unfitted scaling / filtering pipeline.
    """
    logger.info("Creating scaling filtering pipeline...")

    # Keep the declaration order (deduplicated). Going through a set made the
    # feature order depend on string hashing, which changes between
    # interpreter runs and made results non-reproducible.
    excluded = set(column_types.ids) | set(column_types.targets)
    filtered_columns : List[str] = list(
        dict.fromkeys(col for col in column_types.to_list() if col not in excluded)
    )

    column_transformer = ColumnTransformer(
        transformers=[
            ("scaled", StandardScaler(), column_types.numerical),
        ],
        remainder="passthrough",
    )
    scaling_pipeline = Pipeline(
        steps=[
            ("filter_transformer", FunctionTransformer(filter_columns, kw_args={"columns": filtered_columns, "name": name})),
            ("column_transformer", column_transformer),
        ],
        memory=None,
    )

    logger.info("Scaling filtering pipeline created")
    return scaling_pipeline


DEBUG

In [None]:
# Switch for the exploratory cells below: they only run when this is True.
DEBUG = False

In [None]:
# Debug cell: load the data, build the column definitions and check that they
# are consistent with the first 10 rows of the training data.
# NOTE(review): `display` is not defined in this file — presumably the
# IPython/Jupyter built-in; confirm when running outside a notebook.
if DEBUG:
    train, test = load_data()
    column_types = get_columns_types()
    column_sets = get_column_sets()
    possible_columns_configs = get_possible_columns_configs(column_sets)

    # Copies are passed so the checks cannot modify `train`.
    display(verify_all_columns_types_exist(column_types, train.head(10).copy()))
    display(verify_all_columns_sets_exist(column_sets, train.head(10).copy()))


In [None]:
# Debug cell: run the preprocessing pipeline on the first 10,000 training rows
# and show the result as a DataFrame. Uses `column_types` and `train` from the
# previous debug cell.
if DEBUG:
    preprocessing_pipeline = create_preprocessing_pipeline(column_types)

    raw_preprocessed_data = preprocessing_pipeline.fit_transform(train.head(10*1000).copy())
    # Re-attach the output feature names of the "column_transformer" step.
    df_preprocessed_data = convert_raw_pipeline_output_to_df(raw_preprocessed_data, preprocessing_pipeline)

    display(df_preprocessed_data.head())


In [None]:
# Debug cell: try the scaling / filtering pipeline on one column configuration.
# Uses the variables created by the previous debug cells.
if DEBUG:
    # NOTE(review): index 5 looks like an arbitrary sample configuration — confirm.
    dev_column_set = possible_columns_configs[5]
    dev_column_types = column_types.filter(dev_column_set, keep_ids_targets=True)
    preprocessed_column_names = df_preprocessed_data.columns.to_list()
    display(dev_column_set)
    display(dev_column_types)

    # Map the original column names onto the names produced by preprocessing.
    adapted_column_types = dev_column_types.adapt_names(preprocessed_column_names)
    display(adapted_column_types)

    scaling_filtering_pipeline = create_scaling_filtering_pipeline(adapted_column_types)

    raw_scaled_data = scaling_filtering_pipeline.fit_transform(df_preprocessed_data.copy())
    df_scaled_data = convert_raw_pipeline_output_to_df(raw_scaled_data, scaling_filtering_pipeline)

    display(df_scaled_data.head())


TESTING

In [None]:
def get_models() -> List[BaseEstimator]:
    """Instantiate the classifiers to evaluate, each with its default parameters.

    Returns:
        List[BaseEstimator]: One new, unfitted instance per classifier.
    """
    model_classes = (
        RandomForestClassifier,
        SVC,
        KNeighborsClassifier,
        LogisticRegression,
        RidgeClassifier,
    )
    return [model_class() for model_class in model_classes]


In [None]:
# Load the train and test DataFrames used by the cells below.
train, test = load_data()

In [None]:
def get_cv_config_accuracy(column_types : ColumnTypes, model: BaseEstimator, columns: List[str]) -> float:
    """A function that returns the accuracy of the model using cross-validation.

    The model is evaluated as the last step of a pipeline whose first step is
    the scaling / filtering pipeline for ``columns``, so the scaler is fitted
    on the training part of every fold only.

    Reads the module-level ``preprocessed_data`` (output of the preprocessing
    pipeline) and ``preprocessed_column_names``; both must exist before the
    function is called.

    Args:
        column_types (ColumnTypes): Default column types of the dataset
        model (BaseEstimator): Model to be used for training
        columns (List[str]): A list of columns to be used in the training

    Returns:
        float: Mean accuracy of the model over the 5 cross-validation folds
    """
    logger.info(f"Starting the training process of model: {model.__class__.__name__} for columns: {columns}")

    config_column_types = (
        column_types
            .filter(columns, keep_ids_targets=True)
            .adapt_names(preprocessed_column_names)
    )

    # The model has to be part of the evaluated estimator: a pipeline made of
    # transformers only cannot predict, hence cannot be scored.
    estimator = Pipeline(
        steps=[
            ("scaling_filtering", create_scaling_filtering_pipeline(config_column_types)),
            ("model", model),
        ]
    )

    # The adapted column names only exist in the preprocessed frame. Ids and
    # targets are dropped from X by the filtering step of the pipeline.
    X = preprocessed_data.copy()
    # The encoded target holds 0.0 / 1.0 (possibly with object dtype); cast it
    # to int so it is treated as class labels.
    y = preprocessed_data[config_column_types.targets].squeeze(axis="columns").astype(int)

    scores = cross_val_score(
        estimator,
        X,
        y,
        cv=5,
        scoring="accuracy",
    )
    acc = scores.mean()
    logger.info(f"Accuracy: {acc}")

    return acc

In [None]:
# Experiment: cross-validate every model on every column configuration and
# store the accuracies.
column_types = get_columns_types()
column_sets = get_column_sets()
possible_columns_configs = get_possible_columns_configs(column_sets)
preprocessing_pipeline = create_preprocessing_pipeline(column_types)
models = get_models()

data = train.head(100*1000).copy()

# Preprocess once; get_cv_config_accuracy reads these module-level results.
preprocessed_data = convert_raw_pipeline_output_to_df(preprocessing_pipeline.fit_transform(data), preprocessing_pipeline)
preprocessed_column_names = preprocessed_data.columns.to_list()

logger.info("Starting the test training process...")
logger.info(f"Number of combinations: {len(possible_columns_configs) * len(models)}")

# Collect the rows in a plain list and build the frame once at the end:
# DataFrame.append no longer exists in pandas 2.x (and copied the whole frame
# on every call in the versions that had it).
accuracy_records: List[Dict[str, object]] = []

for columns in possible_columns_configs:
    for model in models:
        acc = get_cv_config_accuracy(column_types, model, columns)

        accuracy_records.append(
            {
                "model": model.__class__.__name__,
                "columns": columns,
                "accuracy": acc,
            }
        )

accuracies_df : pd.DataFrame = pd.DataFrame(accuracy_records, columns=["model", "columns", "accuracy"])

display(accuracies_df.sort_values(by="accuracy", ascending=False).head(10))

accuracies_df.to_csv(f"accuracies_{dt.datetime.now().isoformat()}.csv", index=False)





