IMPORTS

In [None]:
import sys
import logging
from typing import Tuple, List, TypedDict, cast
from dataclasses import dataclass
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



LOGBOOK:

- Observations:
    - There are a lot of missing values in the dataset
    - Some columns contain additional irrelevant information (like the name of the column or random gibberish)

- Ideas:
    - Why don't we try to implement a solution that would predict only for rows that have all the values, rows without one certain column, rows without two certain columns, and so on?
    - Why don't we make models for each of the cases and then compare them to the one that predicts for all rows?
    If we use a model that outputs the probability instead of the class, we could ensemble the predictions by getting the predictions from different models, comparing the probabilities vs accuracy, getting the weights, and ensembling the final model.

LOAD DATA

In [None]:
def load_data(
    train_path: str = "./data/train.csv", test_path: str = "./data/test.csv"
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the train and test datasets from CSV files.

    Args:
        train_path (str, optional): Path of the training CSV file.
            Defaults to "./data/train.csv".
        test_path (str, optional): Path of the test CSV file.
            Defaults to "./data/test.csv".

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: The train and test DataFrames, in that order.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

LOGGING

In [None]:
def setup_logger(name: str | None = None) -> logging.Logger:
    """A function to setup the logger

    Output handlers (a "logs.log" file handler in append mode and a stdout
    stream handler) are installed on the root logger, and only when the root
    logger has no handlers yet. The named logger itself carries no handlers
    and relies on propagation to the root logger.

    Args:
        name (str | None, optional): Logger name. Defaults to None, in which
            case the module's ``__name__`` is used.

    Returns:
        logging.Logger: Configured logger with its level set to INFO
    """
    logger = logging.getLogger(name or __name__)
    # Remove handlers attached directly to this logger so that records are
    # emitted only once, through the root handlers.
    logger.handlers.clear()

    # logging.basicConfig() does nothing when the root logger already has
    # handlers, so only create the handlers when they will actually be used.
    # Creating a FileHandler opens the file immediately; doing so on every
    # call would leak one open file handle per call.
    if not logging.getLogger().handlers:
        file_handler = logging.FileHandler("logs.log", mode="a")
        stream_handler = logging.StreamHandler(stream=sys.stdout)
        logging.basicConfig(
            level=logging.WARNING,
            format="%(levelname)s %(name)s %(asctime)s | %(message)s",
            datefmt="%H:%M:%S",
            handlers=[file_handler, stream_handler],
        )

    # The root logger stays at WARNING; this logger is more verbose.
    logger.setLevel(logging.INFO)

    return logger

# Create the notebook-wide logger. The helper functions defined in later cells
# log through this module-level `logger` name, so this cell must run first.
logger = setup_logger("Main")
logger.info("Logger setup complete")

UTILS

In [None]:
class ColumnTypes(TypedDict):
    """
    A typed dictionary that groups the dataset's column names by their role.

    This is a plain ``TypedDict`` (not a dataclass): calling
    ``ColumnTypes(...)`` produces a regular ``dict`` at runtime, so the values
    are read with ``column_types["numerical"]`` rather than attribute access.

    Keys:
    - ids: List[str]
    - labels: List[str]
    - numerical: List[str]
    - categorical: List[str]
    - targets: List[str]
    """

    ids: List[str]
    """An array of the columns that are used as the ID of the dataset."""
    labels: List[str]
    """An array of the columns that are used as the labels of the dataset. 
    Label encoding is to be used when there is a sequential correlation between the labels."""

    numerical: List[str]
    """An array of the columns that are numerical in nature."""
    categorical: List[str]
    """An array of the columns that are categorical in nature."""
    targets: List[str]
    """An array of the columns that are the target of the dataset."""


def get_columns_types() -> ColumnTypes:
    """Describe which role each column of the dataset plays.

    Returns:
        ColumnTypes: Mapping from role ("ids", "targets", "labels",
        "numerical", "categorical") to the list of column names in that role.
    """
    id_columns: List[str] = ["id"]
    target_columns: List[str] = ["class"]
    label_columns: List[str] = ["gill-spacing"]
    numerical_columns: List[str] = ["cap-diameter", "stem-height", "stem-width"]
    categorical_columns: List[str] = [
        "cap-shape",
        "cap-surface",
        "cap-color",
        "does-bruise-or-bleed",
        "gill-attachment",
        "gill-color",
        "stem-root",
        "stem-surface",
        "stem-color",
        "veil-type",
        "veil-color",
        "has-ring",
        "ring-type",
        "spore-print-color",
        "habitat",
        "season",
    ]

    return ColumnTypes(
        ids=id_columns,
        targets=target_columns,
        labels=label_columns,
        numerical=numerical_columns,
        categorical=categorical_columns,
    )


@dataclass
class ColumnSet:
    """A named group of dataset columns that is included or left out as a unit.

    Attributes:
    - is_optional: bool - whether this group of columns is optional.
    - columns: List[str] - the names of the columns belonging to this group.
    """

    is_optional: bool
    columns: List[str]


def get_column_sets() -> List[ColumnSet]:
    """A function that defines the possible column combinations for the dataset.
    It will create a list of ColumnSet objects that contain the possible column combinations.

    Exactly one set is mandatory (the stem measurements); every other set is
    optional.

    Returns:
        List[ColumnSet]: A list of ColumnSet objects that contain the possible column combinations.
    """
    return [
        ColumnSet(is_optional=True, columns=["does-bruise-or-bleed"]),
        # Column names use hyphens, matching the names declared in
        # get_columns_types(); underscore spellings would never match a column.
        ColumnSet(is_optional=False, columns=["stem-height", "stem-width"]),
        ColumnSet(
            is_optional=True,
            columns=["cap-diameter", "cap-shape", "cap-surface", "cap-color"],
        ),
        ColumnSet(
            is_optional=True, columns=["gill-spacing", "gill-attachment", "gill-color"]
        ),
        ColumnSet(
            is_optional=True, columns=["stem-root", "stem-surface", "stem-color"]
        ),
        ColumnSet(is_optional=True, columns=["veil-type", "veil-color"]),
        ColumnSet(is_optional=True, columns=["has-ring", "ring-type"]),
        ColumnSet(is_optional=True, columns=["spore-print-color", "habitat", "season"]),
    ]


def get_column_transformer(
    column_types: ColumnTypes, categorical_outliers_frequency_limit: float
) -> Pipeline:
    """A function that creates a pipeline object that contains the transformers to be used for the columns.

    The pipeline runs four steps in order: clean the categorical/label
    columns, clean the numerical columns, label-encode the label/target
    columns, and finally scale the numerical columns and one-hot encode the
    categorical columns (all remaining columns are passed through).

    Args:
        column_types (ColumnTypes): Column names grouped by their role.
        categorical_outliers_frequency_limit (float): Categories whose count is
            below this fraction of the column's non-null values are replaced
            with the placeholder category "gibberish".

    Returns:
        Pipeline: A pipeline object that contains the transformers to be used for the columns.
    """

    # NOTE: the three cleaning functions below are stateless - they recompute
    # their statistics (category frequencies, means, label codes) from whatever
    # frame they are given, on every call, including `transform` calls.

    def label_encode(x: pd.DataFrame) -> pd.DataFrame:
        """Replace the label and target columns present in `x` by integer codes."""
        logger.info("Label encoding started")
        encoded_columns = {*column_types["labels"], *column_types["targets"]}
        # Work on a copy so the caller's frame is never modified.
        x = x.copy()
        for column in x.columns:
            if column in encoded_columns:
                # Assign the encoded array directly (positionally). Wrapping it
                # in a fresh Series would give it a 0..n-1 index and misalign
                # the values whenever `x` has a different index.
                x[column] = LabelEncoder().fit_transform(x[column])

        logger.info("Label encoding complete")
        return x

    def clean_categorical_columns(x: pd.DataFrame) -> pd.DataFrame:
        """Collapse rare categories into "gibberish" and missing values into "na"."""
        logger.info("Cleaning categorical columns started")
        logger.info(
            f"Outliers frequency limit is {categorical_outliers_frequency_limit}"
        )
        cleaned_columns = {*column_types["labels"], *column_types["categorical"]}
        # Work on a copy so the caller's frame is never modified.
        x = x.copy()
        for column in x.columns:
            if column not in cleaned_columns:
                continue

            # value_counts() ignores NaN, so the total is the non-null count.
            counts = x[column].value_counts()
            min_count = counts.sum() * categorical_outliers_frequency_limit
            outliers = counts[counts < min_count].index.to_list()
            logger.info(f"Outliers for column '{column}' are {outliers}")

            outlier_lookup = set(outliers)
            x[column] = (
                x[column]
                .apply(lambda el: "gibberish" if el in outlier_lookup else el)
                .fillna("na")
                .astype("category")
            )

        logger.info("Cleaning categorical columns complete")
        return x

    def clean_numerical_columns(x: pd.DataFrame) -> pd.DataFrame:
        """Fill missing numerical values with the column mean and cast to float16."""
        logger.info("Cleaning numerical columns started")
        # Work on a copy so the caller's frame is never modified.
        x = x.copy()
        for column in x.columns:
            if column in column_types["numerical"]:
                n_nans = x[column].isna().sum()
                if n_nans > 0:
                    column_mean = x[column].mean()
                    logger.info(
                        f"Column '{column}' has {n_nans} NaNs. Filling with mean {column_mean}"
                    )
                    x[column] = x[column].fillna(column_mean)

                # NOTE(review): float16 keeps only ~3 significant digits;
                # presumably chosen to save memory - confirm it is precise
                # enough for these measurements.
                x[column] = x[column].astype("float16")

        logger.info("Cleaning numerical columns complete")
        return x

    categorical_column_cleaner = FunctionTransformer(clean_categorical_columns)
    numerical_column_cleaner = FunctionTransformer(clean_numerical_columns)
    label_encoder_setup = FunctionTransformer(label_encode)

    column_transformer = ColumnTransformer(
        transformers=[
            ("numerical", StandardScaler(), column_types["numerical"]),
            (
                "categorical",
                OneHotEncoder(drop="first", sparse_output=False),
                column_types["categorical"],
            ),
        ],
        # Columns not listed above (ids, labels, targets) are kept unchanged.
        remainder="passthrough",
    )
    logger.info("Column transformer created")
    transform_pipeline = Pipeline(
        steps=[
            ("categorical_column_cleaner", categorical_column_cleaner),
            ("numerical_column_cleaner", numerical_column_cleaner),
            ("label_transformer", label_encoder_setup),
            ("column_transformer", column_transformer),
        ],
        verbose=True,
    )

    return transform_pipeline


@dataclass
class ColumnCreator:
    """A class to store the information about the columns to be created.

    It enumerates the possible column configurations from the column sets and
    runs the transformation pipeline on a dataset.
    """

    columns_sets: List[ColumnSet]
    """An array of the columns that are to be used in the dataset."""
    column_types: ColumnTypes
    """A ColumnTypes object that contains the columns of the dataset."""
    pipeline: Pipeline
    """A pipeline object that contains the transformers to be used for the columns."""

    def get_possible_columns_configs(self) -> List[List[str]]:
        """A function that returns the possible column configurations for the dataset.

        Every configuration contains all mandatory columns plus the columns of
        one subset of the optional column sets. When there is at least one
        mandatory column, the first configuration is the mandatory-only one;
        the remaining configurations cover every non-empty subset of the
        optional sets.

        Returns:
            List[List[str]]: A list of lists that contain the possible column configurations.
        """
        optional_columns_sets = [
            column_set for column_set in self.columns_sets if column_set.is_optional
        ]
        mandatory_columns: List[str] = [
            column
            for column_set in self.columns_sets
            if not column_set.is_optional
            for column in column_set.columns
        ]

        # The number of configurations doubles with every optional set.
        if len(optional_columns_sets) > 10:
            logger.warning(
                f"The number of optional columns sets is too high (more than 10) - {len(optional_columns_sets)}"
            )
        else:
            logger.info(
                f"The number of optional columns sets is {len(optional_columns_sets)}"
            )

        possible_columns_configs: List[List[str]] = []
        if mandatory_columns:
            possible_columns_configs.append(list(mandatory_columns))

        # Each value of `mask` selects one non-empty subset of the optional
        # sets: bit j set means optional set j is included.
        for mask in range(1, 2 ** len(optional_columns_sets)):
            columns_config: List[str] = list(mandatory_columns)
            for bit, optional_set in enumerate(optional_columns_sets):
                if mask & (1 << bit):
                    columns_config.extend(optional_set.columns)

            possible_columns_configs.append(columns_config)

        return possible_columns_configs

    def get_transformed_columns_out_names(self) -> List[str]:
        """A function that returns the names of the columns after transformation.

        The pipeline's "column_transformer" step must already be fitted.

        Returns:
            List[str]: A list of strings that contain the names of the columns after transformation.
        """
        feature_names = self.pipeline.named_steps[
            "column_transformer"
        ].get_feature_names_out()
        # Convert to a real list so the return value matches the annotation.
        return list(feature_names)

    def transform_columns(self, data: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """A function that transforms the columns of the dataset.

        Args:
            data (pd.DataFrame): The dataset to be transformed. It is copied
                before being handed to the pipeline.
            fit (bool, optional): When True the pipeline is fitted on `data`
                before transforming it; when False the already fitted pipeline
                is only applied. Defaults to True.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        if fit:
            raw_transformed_data = self.pipeline.fit_transform(data.copy())
        else:
            raw_transformed_data = self.pipeline.transform(data.copy())

        return pd.DataFrame(
            data=raw_transformed_data,
            columns=self.get_transformed_columns_out_names(),
        )

In [None]:
def train_cv(column_set : List[str], pipeline : Pipeline, data: pd.DataFrame, n_splits: int = 5) -> float:
    """A function that trains a model using cross-validation.

    The feature columns are every column of `data` whose name contains one of
    the names in `column_set`. The target columns are taken from the
    module-level `column_types["targets"]`, which must be defined before this
    function is called.

    Args:
        column_set (List[str]): The columns to be used in the dataset.
        pipeline (Pipeline): The estimator pipeline to be cross-validated.
        data (pd.DataFrame): The dataset to be used for training.
        n_splits (int, optional): The number of splits to be used for cross-validation. Defaults to 5.

    Raises:
        KeyError: If a target column cannot be found in `data`.

    Returns:
        float: The mean accuracy of the model.
    """
    data_columns = data.columns.to_list()

    # any() stops at the first matching name, so a data column is selected at
    # most once even if several names of `column_set` occur in it.
    matched_columns : List[str] = [
        data_column
        for data_column in data_columns
        if any(column_name in data_column for column_name in column_set)
    ]

    target_columns : List[str] = []
    for target in column_types["targets"]:
        if target in data_columns:
            target_columns.append(target)
            continue

        # A ColumnTransformer prefixes its output names with the transformer
        # name by default (e.g. "remainder__class"), so also accept that form.
        prefixed_targets = [
            data_column
            for data_column in data_columns
            if str(data_column).endswith(f"__{target}")
        ]
        if not prefixed_targets:
            raise KeyError(f"Target column '{target}' not found in the data")
        target_columns.extend(prefixed_targets)

    y = data[target_columns].squeeze()
    X = data[matched_columns]

    scores = cross_val_score(pipeline, X, y, cv=n_splits, scoring="accuracy")

    return scores.mean()
    

DEBUG

In [None]:
# Read the train and test CSV files into DataFrames.
train, test = load_data()

In [None]:
# Build the column metadata and the preprocessing pipeline.
column_sets = get_column_sets()
column_types = get_columns_types()
# Despite its name, `column_transformer` holds the full preprocessing Pipeline.
# Categories whose count is below 1% of a column's non-null values are
# collapsed into a single placeholder category.
column_transformer = get_column_transformer(column_types=column_types, categorical_outliers_frequency_limit=0.01)

column_creator = ColumnCreator(
    columns_sets=column_sets,
    column_types=column_types,
    pipeline=column_transformer
)
possible_columns_configs = column_creator.get_possible_columns_configs()
# Debug run: fit and transform only the first 10000 training rows.
transformed_data = column_creator.transform_columns(train.head(10000))

In [None]:
# Show the first rows of the raw and the transformed data side by side.
# NOTE(review): `display` is not imported in the visible code - presumably the
# IPython/Jupyter built-in; confirm when running outside a notebook.
display(train.head())
display(transformed_data.head())
# Disabled debug output: logs every generated column configuration.
# for i, columns_config in enumerate(possible_columns_configs):
#     logger.info(f"Columns config {i+1}/{len(possible_columns_configs)}: {columns_config}")
    