diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
index f40ab5e..e0e01cc 100644
--- a/cobra/preprocessing/preprocessor.py
+++ b/cobra/preprocessing/preprocessor.py
@@ -1,4 +1,3 @@
-
 # standard lib imports
 import inspect
 import time
@@ -19,6 +18,7 @@
 log = logging.getLogger(__name__)
 
+
 class PreProcessor(BaseEstimator):
     """This class implements a so-called facade pattern to define a
     higher-level interface to work with the CategoricalDataProcessor,
@@ -48,11 +48,13 @@ class PreProcessor(BaseEstimator):
         (``classification`` or ``regression``).
     """
 
-    def __init__(self,
-                 categorical_data_processor: CategoricalDataProcessor,
-                 discretizer: KBinsDiscretizer,
-                 target_encoder: TargetEncoder,
-                 is_fitted: bool = False):
+    def __init__(
+        self,
+        categorical_data_processor: CategoricalDataProcessor,
+        discretizer: KBinsDiscretizer,
+        target_encoder: TargetEncoder,
+        is_fitted: bool = False,
+    ):
 
         self._categorical_data_processor = categorical_data_processor
         self._discretizer = discretizer
@@ -63,24 +65,26 @@ def __init__(self,
         self.model_type = categorical_data_processor.model_type
 
     @classmethod
-    def from_params(cls,
-                    model_type: str="classification",
-                    n_bins: int=10,
-                    strategy: str="quantile",
-                    closed: str="right",
-                    auto_adapt_bins: bool=False,
-                    starting_precision: int=0,
-                    label_format: str="{} - {}",
-                    change_endpoint_format: bool=False,
-                    regroup: bool=True,
-                    regroup_name: str="Other",
-                    keep_missing: bool=True,
-                    category_size_threshold: int=5,
-                    p_value_threshold: float=0.001,
-                    scale_contingency_table: bool=True,
-                    forced_categories: dict={},
-                    weight: float=0.0,
-                    imputation_strategy: str="mean"):
+    def from_params(
+        cls,
+        model_type: str = "classification",
+        n_bins: int = 10,
+        strategy: str = "quantile",
+        closed: str = "right",
+        auto_adapt_bins: bool = False,
+        starting_precision: int = 0,
+        label_format: str = "{} - {}",
+        change_endpoint_format: bool = False,
+        regroup: bool = True,
+        regroup_name: str = "Other",
+        keep_missing: bool = True,
+        category_size_threshold: int = 5,
+        p_value_threshold: float = 0.001,
+        scale_contingency_table: bool = True,
+        forced_categories: dict = {},
+        weight: float = 0.0,
+        imputation_strategy: str = "mean",
+    ):
         """Constructor to instantiate PreProcessor from all the parameters
         that can be set in all its required (attribute) classes
         along with good default values.
@@ -147,21 +151,28 @@ def from_params(cls,
         PreProcessor
             Class encapsulating CategoricalDataProcessor,
             KBinsDiscretizer, and TargetEncoder instances.
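 
+        Examples
+        --------
+        A minimal sketch of typical usage; the data and column names are
+        hypothetical::
+
+            preprocessor = PreProcessor.from_params(
+                model_type="classification", n_bins=5
+            )
+            preprocessor.fit(
+                train_data,
+                continuous_vars=["age"],
+                discrete_vars=["gender"],
+                target_column_name="target",
+            )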
- """ - categorical_data_processor = CategoricalDataProcessor(model_type, - regroup, - regroup_name, keep_missing, - category_size_threshold, - p_value_threshold, - scale_contingency_table, - forced_categories) - - discretizer = KBinsDiscretizer(n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format) - + """ + categorical_data_processor = CategoricalDataProcessor( + model_type, + regroup, + regroup_name, + keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories, + ) + + discretizer = KBinsDiscretizer( + n_bins, + strategy, + closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format, + ) + target_encoder = TargetEncoder(weight, imputation_strategy) return cls(categorical_data_processor, discretizer, target_encoder) @@ -189,8 +200,10 @@ def from_pipeline(cls, pipeline: dict): """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " - "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( @@ -204,11 +217,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, - is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"], + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, + ): """Fit the data to the preprocessing pipeline. 
 
         Parameters
@@ -224,9 +246,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list,
         """
         # get list of all variables
-        preprocessed_variable_names = (PreProcessor
-                                       ._get_variable_list(continuous_vars,
-                                                           discrete_vars))
+        preprocessed_variable_names = PreProcessor._get_variable_list(
+            continuous_vars, discrete_vars
+        )
 
         log.info("Starting to fit pipeline")
         start = time.time()
@@ -234,41 +256,52 @@
         # Ensure to operate on separate copy of data
         train_data = train_data.copy()
 
+        # drop columns that contain only NaN values, if any
+        train_data = (
+            PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(
+                train_data
+            )
+        )
+
         # Fit discretizer, categorical preprocessor & target encoder
         # Note that in order to fit target_encoder, we first have to transform
         # the data using the fitted discretizer & categorical_data_processor
         if continuous_vars:
             begin = time.time()
             self._discretizer.fit(train_data, continuous_vars)
-            log.info("Fitting KBinsDiscretizer took {} seconds"
-                     .format(time.time() - begin))
+            log.info(
+                "Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin)
+            )
 
-            train_data = self._discretizer.transform(train_data,
-                                                     continuous_vars)
+            train_data = self._discretizer.transform(train_data, continuous_vars)
 
         if discrete_vars:
             begin = time.time()
-            self._categorical_data_processor.fit(train_data,
-                                                 discrete_vars,
-                                                 target_column_name)
-            log.info("Fitting categorical_data_processor class took {} seconds"
-                     .format(time.time() - begin))
-
-            train_data = (self._categorical_data_processor
-                          .transform(train_data, discrete_vars))
+            self._categorical_data_processor.fit(
+                train_data, discrete_vars, target_column_name
+            )
+            log.info(
+                "Fitting categorical_data_processor class took {} seconds".format(
+                    time.time() - begin
+                )
+            )
+
+            train_data = self._categorical_data_processor.transform(
+                train_data, discrete_vars
+            )
 
         begin = time.time()
-        self._target_encoder.fit(train_data, preprocessed_variable_names,
-                                 target_column_name)
-        log.info("Fitting TargetEncoder took {} seconds"
-                 .format(time.time() - begin))
+        self._target_encoder.fit(
+            train_data, preprocessed_variable_names, target_column_name
+        )
+        log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin))
 
         self._is_fitted = True  # set fitted boolean to True
 
-        log.info("Fitting pipeline took {} seconds"
-                 .format(time.time() - start))
+        log.info("Fitting pipeline took {} seconds".format(time.time() - start))
 
-    def transform(self, data: pd.DataFrame, continuous_vars: list,
-                  discrete_vars: list) -> pd.DataFrame:
+    def transform(
+        self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list
+    ) -> pd.DataFrame:
         """Transform the data by applying the preprocessing pipeline.
 
         Parameters
@@ -297,33 +330,36 @@ def transform(self, data: pd.DataFrame, continuous_vars: list,
         data = data.copy()
 
         if not self._is_fitted:
-            msg = ("This {} instance is not fitted yet. Call 'fit' with "
-                   "appropriate arguments before using this method.")
+            msg = (
+                "This {} instance is not fitted yet. Call 'fit' with "
+                "appropriate arguments before using this method."
+            )
             raise NotFittedError(msg.format(self.__class__.__name__))
 
-        preprocessed_variable_names = (PreProcessor
-                                       ._get_variable_list(continuous_vars,
-                                                           discrete_vars))
+        preprocessed_variable_names = PreProcessor._get_variable_list(
+            continuous_vars, discrete_vars
+        )
 
         if continuous_vars:
             data = self._discretizer.transform(data, continuous_vars)
 
         if discrete_vars:
-            data = self._categorical_data_processor.transform(data,
-                                                              discrete_vars)
+            data = self._categorical_data_processor.transform(data, discrete_vars)
 
-        data = self._target_encoder.transform(data,
-                                              preprocessed_variable_names)
+        data = self._target_encoder.transform(data, preprocessed_variable_names)
 
-        log.info("Transforming data took {} seconds"
-                 .format(time.time() - start))
+        log.info("Transforming data took {} seconds".format(time.time() - start))
 
         return data
 
-    def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list,
-                      discrete_vars: list,
-                      target_column_name: str) -> pd.DataFrame:
+    def fit_transform(
+        self,
+        train_data: pd.DataFrame,
+        continuous_vars: list,
+        discrete_vars: list,
+        target_column_name: str,
+    ) -> pd.DataFrame:
         """Fit preprocessing pipeline and transform the data.
 
         Parameters
@@ -343,16 +379,17 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list,
             Transformed (preprocessed) data.
         """
-        self.fit(train_data, continuous_vars, discrete_vars,
-                 target_column_name)
+        self.fit(train_data, continuous_vars, discrete_vars, target_column_name)
 
         return self.transform(train_data, continuous_vars, discrete_vars)
 
     @staticmethod
-    def train_selection_validation_split(data: pd.DataFrame,
-                                         train_prop: float=0.6,
-                                         selection_prop: float=0.2,
-                                         validation_prop: float=0.2) -> pd.DataFrame:
+    def train_selection_validation_split(
+        data: pd.DataFrame,
+        train_prop: float = 0.6,
+        selection_prop: float = 0.2,
+        validation_prop: float = 0.2,
+    ) -> pd.DataFrame:
         """Adds `split` column with train/selection/validation values
         to the dataset.
 
@@ -377,8 +414,10 @@ def train_selection_validation_split(data: pd.DataFrame,
             DataFrame with additional split column.
         """
         if not math.isclose(train_prop + selection_prop + validation_prop, 1.0):
-            raise ValueError("The sum of train_prop, selection_prop and "
-                             "validation_prop must be 1.0.")
+            raise ValueError(
+                "The sum of train_prop, selection_prop and "
+                "validation_prop must be 1.0."
+            )
 
         if train_prop == 0.0:
             raise ValueError("train_prop cannot be zero!")
 
@@ -390,16 +429,18 @@
         size_train = int(train_prop * nrows)
         size_select = int(selection_prop * nrows)
         size_valid = int(validation_prop * nrows)
-        correction = nrows - (size_train+size_select+size_valid)
+        correction = nrows - (size_train + size_select + size_valid)
 
-        split = ['train'] * size_train \
-                + ['train'] * correction \
-                + ['selection'] * size_select \
-                + ['validation'] * size_valid
+        split = (
+            ["train"] * size_train
+            + ["train"] * correction
+            + ["selection"] * size_select
+            + ["validation"] * size_valid
+        )
 
         shuffle(split)
 
-        data['split'] = split
+        data["split"] = split
 
         return data
 
@@ -413,18 +454,15 @@ def serialize_pipeline(self) -> dict:
             Return the pipeline as a dictionary.
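 
+        Examples
+        --------
+        A sketch of a serialize/reload round trip; the JSON file handling
+        shown here is illustrative and not part of this class::
+
+            import json
+
+            with open("pipeline.json", "w") as file:
+                json.dump(preprocessor.serialize_pipeline(), file)
+
+            with open("pipeline.json", "r") as file:
+                preprocessor = PreProcessor.from_pipeline(json.load(file))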
""" pipeline = { - "metadata": { - "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") - } + "metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")} } - pipeline["categorical_data_processor"] = (self - ._categorical_data_processor - .attributes_to_dict()) + pipeline[ + "categorical_data_processor" + ] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self._target_encoder - .attributes_to_dict()) + pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() pipeline["_is_fitted"] = True @@ -441,13 +479,13 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys - if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [key for key in keys if key not in ["cls", "serialization_path"]] + ) input_keys = set() for key in pipeline: - if key in ["categorical_data_processor", "discretizer", - "target_encoder"]: + if key in ["categorical_data_processor", "discretizer", "target_encoder"]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) @@ -479,10 +517,59 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) + var_list = [col + "_processed" for col in discrete_vars] + [ + col + "_bin" for col in continuous_vars + ] if not var_list: raise ValueError("Variable var_list is None or empty list.") return var_list + + def _check_nan_columns_and_drop_columns_containing_only_nan( + data: pd.DataFrame, + ) -> pd.DataFrame: + """Checks how much missing values are in the dataframe and drops columns that contain only missing values. 
+        It also logs a warning listing the percentage of missing values per
+        column (only columns that contain at least one missing value are
+        listed).
+
+        Parameters
+        ----------
+        data : pd.DataFrame
+            Data that should be checked for columns that contain only missing values.
+
+        Returns
+        -------
+        pd.DataFrame
+            Data without columns containing only missing values.
+        """
+        # Ensure to operate on separate copy of data
+        data = data.copy()
+
+        # Compute the percentage of missing values per variable and warn
+        # about every variable that has at least one missing value
+        perc_na = data.isna().mean() * 100
+
+        if not perc_na[perc_na > 0].empty:
+            log.warning(
+                "\nPercentage of missing values per variable:\n"
+                + perc_na[perc_na > 0]
+                .round(2)
+                .to_string(float_format=lambda x: str(x) + "%")
+            )
+
+        # drop variables that have only missing values
+        to_drop = perc_na[perc_na == 100].index.tolist()
+
+        if to_drop:
+            data = data.drop(to_drop, axis=1)
+            log.warning(
+                f"Following variables contain only missing values and were dropped: {to_drop}"
+            )
+
+        return data
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
index 7d4d46f..1239e50 100644
--- a/tests/preprocessing/test_preprocessor.py
+++ b/tests/preprocessing/test_preprocessor.py
@@ -1,4 +1,3 @@
-
 from contextlib import contextmanager
 from typing import Any
 from unittest.mock import MagicMock
@@ -9,38 +8,38 @@
 from cobra.preprocessing.preprocessor import PreProcessor
 
+
 @contextmanager
 def does_not_raise():
     yield
 
 
 class TestPreProcessor:
-
-    @pytest.mark.parametrize("train_prop, selection_prop, validation_prop, "
-                             "expected_sizes",
-                             [(0.6, 0.2, 0.2, {"train": 6,
-                                               "selection": 2,
-                                               "validation": 2}),
-                              (0.7, 0.3, 0.0, {"train": 7,
-                                               "selection": 3}),
-                              # Error "The sum of train_prop, selection_prop and
-                              # validation_prop must be 1.0." should not be
-                              # raised:
-                              (0.7, 0.2, 0.1, {"train": 7,
-                                               "selection": 2,
-                                               "validation": 1})])
-    def test_train_selection_validation_split(self, train_prop: float,
-                                              selection_prop: float,
-                                              validation_prop: float,
-                                              expected_sizes: dict):
+    @pytest.mark.parametrize(
+        "train_prop, selection_prop, validation_prop, expected_sizes",
+        [
+            (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}),
+            (0.7, 0.3, 0.0, {"train": 7, "selection": 3}),
+            # Error "The sum of train_prop, selection_prop and
+            # validation_prop must be 1.0." should not be
+            # raised:
+            (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}),
+        ],
+    )
+    def test_train_selection_validation_split(
+        self,
+        train_prop: float,
+        selection_prop: float,
+        validation_prop: float,
+        expected_sizes: dict,
+    ):
         X = np.arange(100).reshape(10, 10)
         data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)])
         data.loc[:, "target"] = np.array([0] * 7 + [1] * 3)
 
-        actual = PreProcessor.train_selection_validation_split(data,
-                                                               train_prop,
-                                                               selection_prop,
-                                                               validation_prop)
+        actual = PreProcessor.train_selection_validation_split(
+            data, train_prop, selection_prop, validation_prop
+        )
 
         # check for the output schema
         assert list(actual.columns) == list(data.columns)
@@ -55,14 +54,15 @@ def test_train_selection_validation_split(self, train_prop: float,
 
     def test_train_selection_validation_split_error_wrong_prop(self):
 
-        error_msg = ("The sum of train_prop, selection_prop and "
-                     "validation_prop must be 1.0.")
+        error_msg = (
+            "The sum of train_prop, selection_prop and "
+            "validation_prop must be 1.0."
+        )
 
         train_prop = 0.7
         selection_prop = 0.3
 
-        self._test_train_selection_validation_split_error(train_prop,
-                                                          selection_prop,
-                                                          error_msg)
+        self._test_train_selection_validation_split_error(
+            train_prop, selection_prop, error_msg
+        )
 
     def test_train_selection_validation_split_error_zero_selection_prop(self):
 
@@ -70,29 +70,34 @@ def test_train_selection_validation_split_error_zero_selection_prop(self):
         train_prop = 0.9
         selection_prop = 0.0
 
-        self._test_train_selection_validation_split_error(train_prop,
-                                                          selection_prop,
-                                                          error_msg)
+        self._test_train_selection_validation_split_error(
+            train_prop, selection_prop, error_msg
+        )
 
-    def _test_train_selection_validation_split_error(self,
-                                                     train_prop: float,
-                                                     selection_prop: float,
-                                                     error_msg: str):
+    def _test_train_selection_validation_split_error(
+        self, train_prop: float, selection_prop: float, error_msg: str
+    ):
 
         df = pd.DataFrame()
 
         with pytest.raises(ValueError, match=error_msg):
-            (PreProcessor
-             .train_selection_validation_split(df,
-                                               train_prop=train_prop,
-                                               selection_prop=selection_prop,
-                                               validation_prop=0.1))
-
-    @pytest.mark.parametrize("injection_location, expected",
-                             [(None, True),
-                              ("categorical_data_processor", False),
-                              ("discretizer", False),
-                              ("target_encoder", False)])
-    def test_is_valid_pipeline(self, injection_location: str,
-                               expected: bool):
+            PreProcessor.train_selection_validation_split(
+                df,
+                train_prop=train_prop,
+                selection_prop=selection_prop,
+                validation_prop=0.1,
+            )
+
+    @pytest.mark.parametrize(
+        "injection_location, expected",
+        [
+            (None, True),
+            ("categorical_data_processor", False),
+            ("discretizer", False),
+            ("target_encoder", False),
+        ],
+    )
+    def test_is_valid_pipeline(self, injection_location: str, expected: bool):
 
         # is_valid_pipeline only checks for relevant keys atm
         pipeline_dict = {
@@ -118,7 +123,7 @@ def test_is_valid_pipeline(self, injection_location: str,
             "target_encoder": {
                 "weight": None,
                 "imputation_strategy": None,
-            }
+            },
         }
 
         if injection_location:
@@ -128,24 +133,30 @@
 
         assert actual == expected
 
-    @pytest.mark.parametrize(("continuous_vars, discrete_vars, expectation, "
-                              "expected"),
-                             [([], [], pytest.raises(ValueError), None),
-                              (["c1", "c2"], ["d1", "d2"], does_not_raise(),
-                               ["d1_processed", "d2_processed",
-                                "c1_bin", "c2_bin"]),
-                              (["c1", "c2"], [], does_not_raise(),
-                               ["c1_bin", "c2_bin"]),
-                              ([], ["d1", "d2"], does_not_raise(),
-                               ["d1_processed", "d2_processed"])])
-    def test_get_variable_list(self, continuous_vars: list,
-                               discrete_vars: list,
-                               expectation: Any,
-                               expected: list):
+    @pytest.mark.parametrize(
+        "continuous_vars, discrete_vars, expectation, expected",
+        [
+            ([], [], pytest.raises(ValueError), None),
+            (
+                ["c1", "c2"],
+                ["d1", "d2"],
+                does_not_raise(),
+                ["d1_processed", "d2_processed", "c1_bin", "c2_bin"],
+            ),
+            (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]),
+            ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]),
+        ],
+    )
+    def test_get_variable_list(
+        self,
+        continuous_vars: list,
+        discrete_vars: list,
+        expectation: Any,
+        expected: list,
+    ):
         with expectation:
-            actual = PreProcessor._get_variable_list(continuous_vars,
-                                                     discrete_vars)
+            actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars)
 
             assert actual == expected
 
@@ -157,11 +168,12 @@ def mock_transform(df: pd.DataFrame, args):
 
     def test_mutable_train_data_fit_transform(self, mocker: MockerFixture):
         """Test if the train_data input is not changed when performing
fit_transform.""" - train_data = pd.DataFrame([[1, "2", 3], [10, "20", 30], [100, "200", 300]], columns=["foo", "bar", "baz"]) + train_data = pd.DataFrame( + [[1, "2", 3], [10, "20", 30], [100, "200", 300]], + columns=["foo", "bar", "baz"], + ) preprocessor = PreProcessor.from_params( - model_type="classification", - n_bins=10, - weight= 0.8 + model_type="classification", n_bins=10, weight=0.8 ) preprocessor._categorical_data_processor = MagicMock() preprocessor._categorical_data_processor.transform = self.mock_transform @@ -174,7 +186,124 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): train_data, continuous_vars=["foo"], discrete_vars=["bar"], - target_column_name=["baz"] - ) + target_column_name=["baz"], + ) assert "new_column" not in train_data.columns assert "new_column" in result.columns + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "f": [np.nan, np.nan, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 2 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 3 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 4 categorical + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + "category_3": [np.nan, np.nan, np.nan], + }, + ).astype( + { + "a": np.float64(), + "b": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + "category_3": pd.CategoricalDtype(), + } + ), + pd.DataFrame( + { + "a": [1, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + } + ).astype( + { + "a": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + } + ), + ), + ], + ) + def test_drops_columns_containing_only_nan(self, input, expected): + + print(input) + output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( + input + ) + + print(output) + print(expected) + assert output.equals(expected)