From 2ac2a3d507963292a00bfdb8a615cb3074c300de Mon Sep 17 00:00:00 2001 From: Patrick Leonardy Date: Mon, 5 Dec 2022 13:07:09 +0100 Subject: [PATCH 1/4] Added drop of columns containing only NANs --- cobra/preprocessing/preprocessor.py | 41 +++++++++++++++- tests/preprocessing/test_preprocessor.py | 61 ++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index f40ab5e..99cbf6e 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -61,7 +61,7 @@ def __init__(self, self._is_fitted = is_fitted self.model_type = categorical_data_processor.model_type - + @classmethod def from_params(cls, model_type: str="classification", @@ -234,6 +234,10 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, # Ensure to operate on separate copy of data train_data = train_data.copy() + + # drop NAN columns if they exist + train_data = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(train_data) + # Fit discretizer, categorical preprocessor & target encoder # Note that in order to fit target_encoder, we first have to transform # the data using the fitted discretizer & categorical_data_processor @@ -486,3 +490,38 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: raise ValueError("Variable var_list is None or empty list.") return var_list + + def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) -> pd.DataFrame: + """Checkes how much missing values are in the dataframe and drops columns that contain only missing values. + It also logs an error message displaying the percentage of missing values in the diffenent columns + (columns are only diosplaied if they contain a missing values) + + Parameters + ---------- + data : pd.DataFrame + Data that should be checked for columns that contain only missing values + + Returns + ------- + pd.DataFrame + Data without columns conatining only missing values + """ + + # Check how much NaN values are in each variable + # and output a warning if a variable has more than 0% of missing values + + perc_na = data.isna().mean() * 100 + + if not perc_na[perc_na > 0].empty: + logging.warning("\nPercentage of missing values per variable:\n" + perc_na[perc_na > 0].round(2).to_string(float_format=lambda x: str(x)+"%")) + + + # drop variables that have only missing values + to_drop = [perc_na.index[i] for i, percentage in enumerate(perc_na) if percentage == 100] + + + if to_drop: + data = data.drop(to_drop, axis=1) + logging.warning(f"Following variables contain only missing values and were droped: {to_drop}") + + return data diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 7d4d46f..2586647 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -178,3 +178,64 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): ) assert "new_column" not in train_data.columns assert "new_column" in result.columns + + + + @pytest.mark.parametrize(("input, expected"), + [ + # example 1 + (pd.DataFrame({ + "a":[1,8,np.nan], + "b":[np.nan,8,np.nan], + "c":[np.nan,np.nan,np.nan], + "d":[np.nan,np.nan,5], + "e":[1,960,np.nan], + "f":[np.nan,np.nan,np.nan] + }), + pd.DataFrame({ + 'a': [1.0, 8.0, np.nan], + 'b': [np.nan, 8.0, np.nan], + 'd': [np.nan, np.nan, 5.0], + 'e': [1.0, 960.0, np.nan] + })), + + #example 2 + (pd.DataFrame({ + "a":[1,8,np.nan], + "b":[np.nan,8,np.nan], + "c":[np.nan,np.nan,np.nan], + "d":[np.nan,np.nan,5], + "e":[1,960,np.nan], + }), + pd.DataFrame({ + 'a': [1.0, 8.0, np.nan], + 'b': [np.nan, 8.0, np.nan], + 'd': [np.nan, np.nan, 5.0], + 'e': [1.0, 960.0, np.nan] + })), + + #example 3 + (pd.DataFrame({ + "a":[1,8,np.nan], + "b":[np.nan,8,np.nan], + "d":[np.nan,np.nan,5], + "e":[1,960,np.nan], + }), + pd.DataFrame({ + 'a': [1.0, 8.0, np.nan], + 'b': [np.nan, 8.0, np.nan], + 'd': [np.nan, np.nan, 5.0], + 'e': [1.0, 960.0, np.nan] + })) + ]) + def test_drops_columns_containing_only_nan(self, input, expected): + + output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(input) + + assert output.equals(expected) + + + + + + From 0cfb3e46af353b0fde6fc118ebea7b9acb900852 Mon Sep 17 00:00:00 2001 From: patrickleonardy <116005033+patrickleonardy@users.noreply.github.com> Date: Mon, 5 Dec 2022 15:13:54 +0100 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Jano Roelandt <48879098+ZlaTanskY@users.noreply.github.com> --- cobra/preprocessing/preprocessor.py | 10 +++++----- tests/preprocessing/test_preprocessor.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 99cbf6e..1ef4294 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -492,9 +492,9 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: return var_list def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) -> pd.DataFrame: - """Checkes how much missing values are in the dataframe and drops columns that contain only missing values. - It also logs an error message displaying the percentage of missing values in the diffenent columns - (columns are only diosplaied if they contain a missing values) + """Checks how much missing values are in the dataframe and drops columns that contain only missing values. + It also logs an error message displaying the percentage of missing values in the different columns + (columns are only displayed if they contain a missing values) Parameters ---------- @@ -504,7 +504,7 @@ def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) Returns ------- pd.DataFrame - Data without columns conatining only missing values + Data without columns containing only missing values """ # Check how much NaN values are in each variable @@ -522,6 +522,6 @@ def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) if to_drop: data = data.drop(to_drop, axis=1) - logging.warning(f"Following variables contain only missing values and were droped: {to_drop}") + logging.warning(f"Following variables contain only missing values and were dropped: {to_drop}") return data diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 2586647..871d053 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -185,7 +185,8 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): [ # example 1 (pd.DataFrame({ - "a":[1,8,np.nan], + "a":[1, 8 , np.nan], + "b":[np.nan,8,np.nan], "c":[np.nan,np.nan,np.nan], "d":[np.nan,np.nan,5], From 6f09fd271747995db04676fdad1de3fc53318bcf Mon Sep 17 00:00:00 2001 From: Patrick Leonardy Date: Mon, 5 Dec 2022 15:35:06 +0100 Subject: [PATCH 3/4] Applied suggested changes --- cobra/preprocessing/preprocessor.py | 298 +++++++++++++---------- tests/preprocessing/test_preprocessor.py | 282 +++++++++++---------- 2 files changed, 325 insertions(+), 255 deletions(-) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 1ef4294..e0e01cc 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -1,4 +1,3 @@ - # standard lib imports import inspect import time @@ -19,6 +18,7 @@ log = logging.getLogger(__name__) + class PreProcessor(BaseEstimator): """This class implements a so-called facade pattern to define a higher-level interface to work with the CategoricalDataProcessor, @@ -48,11 +48,13 @@ class PreProcessor(BaseEstimator): (``classification`` or ``regression``). """ - def __init__(self, - categorical_data_processor: CategoricalDataProcessor, - discretizer: KBinsDiscretizer, - target_encoder: TargetEncoder, - is_fitted: bool = False): + def __init__( + self, + categorical_data_processor: CategoricalDataProcessor, + discretizer: KBinsDiscretizer, + target_encoder: TargetEncoder, + is_fitted: bool = False, + ): self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer @@ -61,26 +63,28 @@ def __init__(self, self._is_fitted = is_fitted self.model_type = categorical_data_processor.model_type - + @classmethod - def from_params(cls, - model_type: str="classification", - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean"): + def from_params( + cls, + model_type: str = "classification", + n_bins: int = 10, + strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False, + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}, + weight: float = 0.0, + imputation_strategy: str = "mean", + ): """Constructor to instantiate PreProcessor from all the parameters that can be set in all its required (attribute) classes along with good default values. @@ -147,21 +151,28 @@ def from_params(cls, PreProcessor Class encapsulating CategoricalDataProcessor, KBinsDiscretizer, and TargetEncoder instances. - """ - categorical_data_processor = CategoricalDataProcessor(model_type, - regroup, - regroup_name, keep_missing, - category_size_threshold, - p_value_threshold, - scale_contingency_table, - forced_categories) - - discretizer = KBinsDiscretizer(n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format) - + """ + categorical_data_processor = CategoricalDataProcessor( + model_type, + regroup, + regroup_name, + keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories, + ) + + discretizer = KBinsDiscretizer( + n_bins, + strategy, + closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format, + ) + target_encoder = TargetEncoder(weight, imputation_strategy) return cls(categorical_data_processor, discretizer, target_encoder) @@ -189,8 +200,10 @@ def from_pipeline(cls, pipeline: dict): """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " - "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( @@ -204,11 +217,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, - is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"], + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, + ): """Fit the data to the preprocessing pipeline. Parameters @@ -224,9 +246,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, """ # get list of all variables - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) log.info("Starting to fit pipeline") start = time.time() @@ -234,9 +256,12 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, # Ensure to operate on separate copy of data train_data = train_data.copy() - # drop NAN columns if they exist - train_data = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(train_data) + train_data = ( + PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( + train_data + ) + ) # Fit discretizer, categorical preprocessor & target encoder # Note that in order to fit target_encoder, we first have to transform @@ -244,35 +269,39 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin)) + log.info( + "Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin) + ) - train_data = self._discretizer.transform(train_data, - continuous_vars) + train_data = self._discretizer.transform(train_data, continuous_vars) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit(train_data, - discrete_vars, - target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin)) - - train_data = (self._categorical_data_processor - .transform(train_data, discrete_vars)) + self._categorical_data_processor.fit( + train_data, discrete_vars, target_column_name + ) + log.info( + "Fitting categorical_data_processor class took {} seconds".format( + time.time() - begin + ) + ) + + train_data = self._categorical_data_processor.transform( + train_data, discrete_vars + ) begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, - target_column_name) - log.info("Fitting TargetEncoder took {} seconds" - .format(time.time() - begin)) + self._target_encoder.fit( + train_data, preprocessed_variable_names, target_column_name + ) + log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True - log.info("Fitting pipeline took {} seconds" - .format(time.time() - start)) + log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def transform(self, data: pd.DataFrame, continuous_vars: list, - discrete_vars: list) -> pd.DataFrame: + def transform( + self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list + ) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -301,33 +330,36 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, data = data.copy() if not self._is_fitted: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) if discrete_vars: - data = self._categorical_data_processor.transform(data, - discrete_vars) + data = self._categorical_data_processor.transform(data, discrete_vars) - data = self._target_encoder.transform(data, - preprocessed_variable_names) + data = self._target_encoder.transform(data, preprocessed_variable_names) - log.info("Transforming data took {} seconds" - .format(time.time() - start)) + log.info("Transforming data took {} seconds".format(time.time() - start)) return data - def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, - target_column_name: str) -> pd.DataFrame: + def fit_transform( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, + ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. Parameters @@ -347,16 +379,17 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, Transformed (preprocessed) data. """ - self.fit(train_data, continuous_vars, discrete_vars, - target_column_name) + self.fit(train_data, continuous_vars, discrete_vars, target_column_name) return self.transform(train_data, continuous_vars, discrete_vars) @staticmethod - def train_selection_validation_split(data: pd.DataFrame, - train_prop: float=0.6, - selection_prop: float=0.2, - validation_prop: float=0.2) -> pd.DataFrame: + def train_selection_validation_split( + data: pd.DataFrame, + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2, + ) -> pd.DataFrame: """Adds `split` column with train/selection/validation values to the dataset. @@ -381,8 +414,10 @@ def train_selection_validation_split(data: pd.DataFrame, DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError("The sum of train_prop, selection_prop and " - "validation_prop must be 1.0.") + raise ValueError( + "The sum of train_prop, selection_prop and " + "validation_prop must be 1.0." + ) if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -394,16 +429,18 @@ def train_selection_validation_split(data: pd.DataFrame, size_train = int(train_prop * nrows) size_select = int(selection_prop * nrows) size_valid = int(validation_prop * nrows) - correction = nrows - (size_train+size_select+size_valid) + correction = nrows - (size_train + size_select + size_valid) - split = ['train'] * size_train \ - + ['train'] * correction \ - + ['selection'] * size_select \ - + ['validation'] * size_valid + split = ( + ["train"] * size_train + + ["train"] * correction + + ["selection"] * size_select + + ["validation"] * size_valid + ) shuffle(split) - data['split'] = split + data["split"] = split return data @@ -417,18 +454,15 @@ def serialize_pipeline(self) -> dict: Return the pipeline as a dictionary. """ pipeline = { - "metadata": { - "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") - } + "metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")} } - pipeline["categorical_data_processor"] = (self - ._categorical_data_processor - .attributes_to_dict()) + pipeline[ + "categorical_data_processor" + ] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self._target_encoder - .attributes_to_dict()) + pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() pipeline["_is_fitted"] = True @@ -445,13 +479,13 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys - if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [key for key in keys if key not in ["cls", "serialization_path"]] + ) input_keys = set() for key in pipeline: - if key in ["categorical_data_processor", "discretizer", - "target_encoder"]: + if key in ["categorical_data_processor", "discretizer", "target_encoder"]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) @@ -483,17 +517,20 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) + var_list = [col + "_processed" for col in discrete_vars] + [ + col + "_bin" for col in continuous_vars + ] if not var_list: raise ValueError("Variable var_list is None or empty list.") return var_list - def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) -> pd.DataFrame: - """Checks how much missing values are in the dataframe and drops columns that contain only missing values. - It also logs an error message displaying the percentage of missing values in the different columns + def _check_nan_columns_and_drop_columns_containing_only_nan( + data: pd.DataFrame, + ) -> pd.DataFrame: + """Checks how much missing values are in the dataframe and drops columns that contain only missing values. + It also logs an error message displaying the percentage of missing values in the different columns (columns are only displayed if they contain a missing values) Parameters @@ -506,22 +543,33 @@ def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) pd.DataFrame Data without columns containing only missing values """ + # Ensure to operate on separate copy of data + data = data.copy() + + # Check how much NaN values are in each variable + # and output a warning if a variable has more than 0% of missing values - # Check how much NaN values are in each variable - # and output a warning if a variable has more than 0% of missing values - perc_na = data.isna().mean() * 100 if not perc_na[perc_na > 0].empty: - logging.warning("\nPercentage of missing values per variable:\n" + perc_na[perc_na > 0].round(2).to_string(float_format=lambda x: str(x)+"%")) + logging.warning( + "\nPercentage of missing values per variable:\n" + + perc_na[perc_na > 0] + .round(2) + .to_string(float_format=lambda x: str(x) + "%") + ) - # drop variables that have only missing values - to_drop = [perc_na.index[i] for i, percentage in enumerate(perc_na) if percentage == 100] - + to_drop = [ + perc_na.index[i] + for i, percentage in enumerate(perc_na) + if percentage == 100 + ] if to_drop: data = data.drop(to_drop, axis=1) - logging.warning(f"Following variables contain only missing values and were dropped: {to_drop}") - + logging.warning( + f"Following variables contain only missing values and were dropped: {to_drop}" + ) + return data diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 871d053..825c391 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -1,4 +1,3 @@ - from contextlib import contextmanager from typing import Any from unittest.mock import MagicMock @@ -9,38 +8,38 @@ from cobra.preprocessing.preprocessor import PreProcessor + @contextmanager def does_not_raise(): yield class TestPreProcessor: - - @pytest.mark.parametrize("train_prop, selection_prop, validation_prop, " - "expected_sizes", - [(0.6, 0.2, 0.2, {"train": 6, - "selection": 2, - "validation": 2}), - (0.7, 0.3, 0.0, {"train": 7, - "selection": 3}), - # Error "The sum of train_prop, selection_prop and - # validation_prop must be 1.0." should not be - # raised: - (0.7, 0.2, 0.1, {"train": 7, - "selection": 2, - "validation": 1})]) - def test_train_selection_validation_split(self, train_prop: float, - selection_prop: float, - validation_prop: float, - expected_sizes: dict): + @pytest.mark.parametrize( + "train_prop, selection_prop, validation_prop, " "expected_sizes", + [ + (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), + (0.7, 0.3, 0.0, {"train": 7, "selection": 3}), + # Error "The sum of train_prop, selection_prop and + # validation_prop must be 1.0." should not be + # raised: + (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}), + ], + ) + def test_train_selection_validation_split( + self, + train_prop: float, + selection_prop: float, + validation_prop: float, + expected_sizes: dict, + ): X = np.arange(100).reshape(10, 10) data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) - actual = PreProcessor.train_selection_validation_split(data, - train_prop, - selection_prop, - validation_prop) + actual = PreProcessor.train_selection_validation_split( + data, train_prop, selection_prop, validation_prop + ) # check for the output schema assert list(actual.columns) == list(data.columns) @@ -55,14 +54,15 @@ def test_train_selection_validation_split(self, train_prop: float, def test_train_selection_validation_split_error_wrong_prop(self): - error_msg = ("The sum of train_prop, selection_prop and " - "validation_prop must be 1.0.") + error_msg = ( + "The sum of train_prop, selection_prop and " "validation_prop must be 1.0." + ) train_prop = 0.7 selection_prop = 0.3 - self._test_train_selection_validation_split_error(train_prop, - selection_prop, - error_msg) + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) def test_train_selection_validation_split_error_zero_selection_prop(self): @@ -70,29 +70,34 @@ def test_train_selection_validation_split_error_zero_selection_prop(self): train_prop = 0.9 selection_prop = 0.0 - self._test_train_selection_validation_split_error(train_prop, - selection_prop, - error_msg) + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) - def _test_train_selection_validation_split_error(self, - train_prop: float, - selection_prop: float, - error_msg: str): + def _test_train_selection_validation_split_error( + self, train_prop: float, selection_prop: float, error_msg: str + ): df = pd.DataFrame() with pytest.raises(ValueError, match=error_msg): - (PreProcessor - .train_selection_validation_split(df, - train_prop=train_prop, - selection_prop=selection_prop, - validation_prop=0.1)) - - @pytest.mark.parametrize("injection_location, expected", - [(None, True), - ("categorical_data_processor", False), - ("discretizer", False), - ("target_encoder", False)]) - def test_is_valid_pipeline(self, injection_location: str, - expected: bool): + ( + PreProcessor.train_selection_validation_split( + df, + train_prop=train_prop, + selection_prop=selection_prop, + validation_prop=0.1, + ) + ) + + @pytest.mark.parametrize( + "injection_location, expected", + [ + (None, True), + ("categorical_data_processor", False), + ("discretizer", False), + ("target_encoder", False), + ], + ) + def test_is_valid_pipeline(self, injection_location: str, expected: bool): # is_valid_pipeline only checks for relevant keys atm pipeline_dict = { @@ -118,7 +123,7 @@ def test_is_valid_pipeline(self, injection_location: str, "target_encoder": { "weight": None, "imputation_strategy": None, - } + }, } if injection_location: @@ -128,24 +133,30 @@ def test_is_valid_pipeline(self, injection_location: str, assert actual == expected - @pytest.mark.parametrize(("continuous_vars, discrete_vars, expectation, " - "expected"), - [([], [], pytest.raises(ValueError), None), - (["c1", "c2"], ["d1", "d2"], does_not_raise(), - ["d1_processed", "d2_processed", - "c1_bin", "c2_bin"]), - (["c1", "c2"], [], does_not_raise(), - ["c1_bin", "c2_bin"]), - ([], ["d1", "d2"], does_not_raise(), - ["d1_processed", "d2_processed"])]) - def test_get_variable_list(self, continuous_vars: list, - discrete_vars: list, - expectation: Any, - expected: list): + @pytest.mark.parametrize( + ("continuous_vars, discrete_vars, expectation, " "expected"), + [ + ([], [], pytest.raises(ValueError), None), + ( + ["c1", "c2"], + ["d1", "d2"], + does_not_raise(), + ["d1_processed", "d2_processed", "c1_bin", "c2_bin"], + ), + (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]), + ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]), + ], + ) + def test_get_variable_list( + self, + continuous_vars: list, + discrete_vars: list, + expectation: Any, + expected: list, + ): with expectation: - actual = PreProcessor._get_variable_list(continuous_vars, - discrete_vars) + actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars) assert actual == expected @@ -157,11 +168,12 @@ def mock_transform(df: pd.DataFrame, args): def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): """Test if the train_data input is not changed when performing fit_transform.""" - train_data = pd.DataFrame([[1, "2", 3], [10, "20", 30], [100, "200", 300]], columns=["foo", "bar", "baz"]) + train_data = pd.DataFrame( + [[1, "2", 3], [10, "20", 30], [100, "200", 300]], + columns=["foo", "bar", "baz"], + ) preprocessor = PreProcessor.from_params( - model_type="classification", - n_bins=10, - weight= 0.8 + model_type="classification", n_bins=10, weight=0.8 ) preprocessor._categorical_data_processor = MagicMock() preprocessor._categorical_data_processor.transform = self.mock_transform @@ -174,69 +186,79 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): train_data, continuous_vars=["foo"], discrete_vars=["bar"], - target_column_name=["baz"] - ) + target_column_name=["baz"], + ) assert "new_column" not in train_data.columns assert "new_column" in result.columns - - - - @pytest.mark.parametrize(("input, expected"), - [ - # example 1 - (pd.DataFrame({ - "a":[1, 8 , np.nan], - - "b":[np.nan,8,np.nan], - "c":[np.nan,np.nan,np.nan], - "d":[np.nan,np.nan,5], - "e":[1,960,np.nan], - "f":[np.nan,np.nan,np.nan] - }), - pd.DataFrame({ - 'a': [1.0, 8.0, np.nan], - 'b': [np.nan, 8.0, np.nan], - 'd': [np.nan, np.nan, 5.0], - 'e': [1.0, 960.0, np.nan] - })), - - #example 2 - (pd.DataFrame({ - "a":[1,8,np.nan], - "b":[np.nan,8,np.nan], - "c":[np.nan,np.nan,np.nan], - "d":[np.nan,np.nan,5], - "e":[1,960,np.nan], - }), - pd.DataFrame({ - 'a': [1.0, 8.0, np.nan], - 'b': [np.nan, 8.0, np.nan], - 'd': [np.nan, np.nan, 5.0], - 'e': [1.0, 960.0, np.nan] - })), - - #example 3 - (pd.DataFrame({ - "a":[1,8,np.nan], - "b":[np.nan,8,np.nan], - "d":[np.nan,np.nan,5], - "e":[1,960,np.nan], - }), - pd.DataFrame({ - 'a': [1.0, 8.0, np.nan], - 'b': [np.nan, 8.0, np.nan], - 'd': [np.nan, np.nan, 5.0], - 'e': [1.0, 960.0, np.nan] - })) - ]) + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "f": [np.nan, np.nan, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 2 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 3 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + ], + ) def test_drops_columns_containing_only_nan(self, input, expected): - - output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(input) + output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( + input + ) assert output.equals(expected) - - - - - - From d598002c9bed1d430cb99a539c8ee7158d9d2b1c Mon Sep 17 00:00:00 2001 From: Patrick Leonardy Date: Mon, 5 Dec 2022 16:01:50 +0100 Subject: [PATCH 4/4] added example 4 categorical in test_drops_columns_containing_only_nan --- tests/preprocessing/test_preprocessor.py | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 825c391..1239e50 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -254,11 +254,56 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): } ), ), + # example 4 categorical + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + "category_3": [np.nan, np.nan, np.nan], + }, + ).astype( + { + "a": np.float64(), + "b": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + "category_3": pd.CategoricalDtype(), + } + ), + pd.DataFrame( + { + "a": [1, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + } + ).astype( + { + "a": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + } + ), + ), ], ) def test_drops_columns_containing_only_nan(self, input, expected): + print(input) output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( input ) + + print(output) + print(expected) assert output.equals(expected)