From 762149a6e233e1377945da4949f127a664599ef4 Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Thu, 21 Apr 2022 11:35:52 +0200
Subject: [PATCH 1/9] feat: added files to encourage PEP8

---
 Makefile             | 41 +++++++++++++++++++++++++++++++++++++++++
 requirements.dev.txt |  6 ++++++
 2 files changed, 47 insertions(+)
 create mode 100644 Makefile
 create mode 100644 requirements.dev.txt

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b31a1db
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,41 @@
+# Makefile with some simple commands to make a developer's life easier
+
+
+install-requirements: install-build-essential
+	pip install -r requirements.txt
+
+dev/install-requirements: install-requirements
+	pip install -r requirements.dev.txt
+
+install-build-essential:
+	sudo apt-get update
+	sudo apt-get install build-essential
+
+update-setuptools:
+	pip install --upgrade setuptools wheel
+
+test-unit:
+	pytest tests
+	@echo 'unit tests OK'
+
+lint:
+	pylint cobra
+	@echo 'lint OK'
+
+lint-minimal:
+	pylint -E cobra
+	@echo 'lint minimal OK'
+
+typecheck:
+	mypy cobra
+	@echo 'typecheck OK'
+
+codestyle:
+	pycodestyle cobra
+	@echo 'codestyle OK'
+
+docstyle:
+	pydocstyle cobra
+	@echo 'docstyle OK'
+
+code-qa: typecheck codestyle docstyle lint-minimal
diff --git a/requirements.dev.txt b/requirements.dev.txt
new file mode 100644
index 0000000..3d87710
--- /dev/null
+++ b/requirements.dev.txt
@@ -0,0 +1,6 @@
+mypy>=0.942
+pycodestyle>=2.8.0
+pydocstyle>=6.1.1
+pylint>=2.13.7
+pytest>=7.1.1
+pytest-mock>=3.7.0
\ No newline at end of file

From a488879afee4c283895dc6a105996810763586af Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Thu, 21 Apr 2022 11:36:50 +0200
Subject: [PATCH 2/9] fix: pydocstyle

Except for target_encoder _fit_column, transform and _transform_column.
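For context on the rule set driving this patch: the pydocstyle checks addressed
throughout are mainly D205 (one blank line required between summary line and
description), D400 (first line should end with a period) and D401 (first line
should be in imperative mood). A minimal compliant shape, sketched on a
hypothetical function rather than one taken from cobra:

    def fit(self, data):
        """Fit the estimator.

        The summary above is a single imperative sentence ending in a
        period; any longer description follows after one blank line.
        """

Most of the docstring churn below is exactly this transformation.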
--- cobra/__init__.py | 2 + cobra/evaluation/__init__.py | 2 + cobra/evaluation/evaluator.py | 59 +++++------- cobra/evaluation/pigs_tables.py | 5 +- cobra/evaluation/plotting_utils.py | 6 +- cobra/model_building/__init__.py | 2 + cobra/model_building/forward_selection.py | 73 ++++++++------- cobra/model_building/models.py | 48 +++++----- cobra/model_building/univariate_selection.py | 32 +++---- cobra/preprocessing/__init__.py | 2 + .../categorical_data_processor.py | 60 +++++++------ cobra/preprocessing/kbins_discretizer.py | 79 ++++++++-------- cobra/preprocessing/preprocessor.py | 90 ++++++++++--------- cobra/preprocessing/target_encoder.py | 24 +++-- cobra/utils.py | 9 +- 15 files changed, 268 insertions(+), 225 deletions(-) diff --git a/cobra/__init__.py b/cobra/__init__.py index 7152555..8afad45 100644 --- a/cobra/__init__.py +++ b/cobra/__init__.py @@ -1 +1,3 @@ +"""Cobra module.""" + from .version import __version__ \ No newline at end of file diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 1f8f487..d480bdb 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -1,3 +1,5 @@ +"""The evaluation module includes utils and plots to evaluate a created model.""" + from .pigs_tables import generate_pig_tables from .pigs_tables import compute_pig_table from .pigs_tables import plot_incidence diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 5a530dc..f550431 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,3 +1,4 @@ +"""Evaluate the created model.""" import numpy as np import pandas as pd @@ -26,8 +27,7 @@ from sklearn.metrics import r2_score class ClassificationEvaluator(): - """Evaluator class encapsulating classification model metrics - and plotting functionality. + """Evaluator class encapsulating classification model metrics and plotting functionality. Attributes ---------- @@ -56,11 +56,13 @@ class ClassificationEvaluator(): (by default 10, so deciles). """ - def __init__(self, - probability_cutoff: float=None, - lift_at: float=0.05, - n_bins: int = 10): - + def __init__( + self, + probability_cutoff: float=None, + lift_at: float=0.05, + n_bins: int = 10 + ): + """Initialize the ClassificationEvaluator.""" self.y_true = None self.y_pred = None @@ -76,8 +78,7 @@ def __init__(self, self.cumulative_gains = None def fit(self, y_true: np.ndarray, y_pred: np.ndarray): - """Fit the evaluator by computing the relevant evaluation metrics on - the inputs. + """Fit the evaluator by computing the relevant evaluation metrics on the inputs. Parameters ---------- @@ -119,8 +120,7 @@ def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, lift_at: float) -> pd.Series: - """Convenient function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -168,7 +168,6 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.roc_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -211,7 +210,6 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), labels : list, optional Optional list of labels, default "0" and "1". """ - if self.confusion_matrix is None: msg = ("This {} instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") @@ -240,7 +238,6 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -291,7 +288,6 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") @@ -340,7 +336,6 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) @@ -375,8 +370,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): @staticmethod def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: - """Find the optimal probability cut off point for a - classification model. Wrapper around _compute_optimal_cutoff. + """Find the optimal probability cut off point for a classification model. Parameters ---------- @@ -396,8 +390,7 @@ def _find_optimal_cutoff(y_true: np.ndarray, @staticmethod def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: - """Find the optimal probability cut-off point for a - classification model. + """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence TPR - (1-FPR) should be zero or close to zero for the optimal cut-off. @@ -426,8 +419,7 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, @staticmethod def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple: - """Compute cumulative gains of the model, returns percentages and - gains cumulative gains curves. + """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ 2dd3e6a76df77edcbd724c4db25575f70abb57cb/ @@ -445,7 +437,6 @@ def _compute_cumulative_gains(y_true: np.ndarray, tuple With x-labels, and gains. """ - # make y_true a boolean vector y_true = (y_true == 1) @@ -467,8 +458,7 @@ def _compute_cumulative_gains(y_true: np.ndarray, def _compute_lift_per_bin(y_true: np.ndarray, y_pred: np.ndarray, n_bins: int=10) -> tuple: - """Compute lift of the model for a given number of bins, returns x-labels, - lifts and the target incidence to create cumulative response curves. + """Compute lift of the model for a given number of bins. Parameters ---------- @@ -485,7 +475,6 @@ def _compute_lift_per_bin(y_true: np.ndarray, tuple Includes x-labels, lifts per decile, and target incidence. """ - lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) @@ -498,7 +487,7 @@ def _compute_lift_per_bin(y_true: np.ndarray, @staticmethod def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float=0.05) -> float: - """Calculates lift given two arrays on specified level. + """Calculate lift on a specified level. Parameters ---------- @@ -514,7 +503,6 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, float Lift of the model. 
""" - # Make sure it is numpy array y_true_ = np.array(y_true) y_pred_ = np.array(y_pred) @@ -544,8 +532,7 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, class RegressionEvaluator(): - """Evaluator class encapsulating regression model metrics - and plotting functionality. + """Evaluator class encapsulating regression model metrics and plotting functionality. Attributes ---------- @@ -560,7 +547,7 @@ class RegressionEvaluator(): """ def __init__(self): - + """Initialize the RegressionEvaluator.""" self.y_true = None self.y_pred = None @@ -569,8 +556,7 @@ def __init__(self): self.qq = None def fit(self, y_true: np.ndarray, y_pred: np.ndarray): - """Fit the evaluator by computing the relevant evaluation metrics on - the inputs. + """Fit the evaluator by computing the relevant evaluation metrics on the inputs. Parameters ---------- @@ -591,8 +577,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): @staticmethod def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: - """Convenient function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -620,8 +605,7 @@ def _compute_scalar_metrics(y_true: np.ndarray, @staticmethod def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: - """Convenience function to compute various scalar performance measures - and return them in a pd.Series. + """Compute various scalar performance measures. Parameters ---------- @@ -698,7 +682,6 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): dim : tuple, optional Tuple with width and length of the plot. """ - if self.qq is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 7f03b42..8915c5e 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -1,3 +1,4 @@ +"""Create Predictor Insight Graph tables.""" import pandas as pd import matplotlib.pyplot as plt @@ -98,7 +99,9 @@ def plot_incidence(pig_tables: pd.DataFrame, model_type: str, column_order: list=None, dim: tuple=(12, 8)): - """Plots a Predictor Insights Graph (PIG), a graph in which the mean + """Plot a Predictor Insights Graph (PIG). + + A PIG is a graph in which the mean target value is plotted for a number of bins constructed from a predictor variable. When the target is a binary classification target, the plotted mean target value is a true incidence rate. diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 7683f24..5aaf1a2 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -1,3 +1,4 @@ +"""Collection of plotting utils.""" # third party imports import numpy as np @@ -22,7 +23,6 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, path : str, optional Path to store the figure. """ - if "AUC selection" in df_metric.columns: metric = "AUC" ascending = False @@ -86,8 +86,7 @@ def plot_performance_curves(model_performance: pd.DataFrame, "selection": "#ff9500", "validation": "#8064a2"}, metric_name: str=None): - """Plot performance curves generated by the forward feature selection - for the train-selection-validation sets. + """Plot performance curves for the train-selection-validation sets. 
Parameters ---------- @@ -105,7 +104,6 @@ def plot_performance_curves(model_performance: pd.DataFrame, Defaults to RMSE in case of regression and AUC in case of classification. """ - model_type = model_performance["model_type"][0] if metric_name is None: diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 7a646c3..288a2c4 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -1,3 +1,5 @@ +"""This module includes utils to calculate the best features.""" + from .univariate_selection import compute_univariate_preselection from .univariate_selection import get_preselected_predictors from .univariate_selection import compute_correlations diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 29e06b3..693fed3 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,3 +1,4 @@ +"""Feature forward selection.""" import logging from typing import Callable, Optional @@ -10,8 +11,7 @@ log = logging.getLogger(__name__) class ForwardFeatureSelection: - """Perform forward feature selection for a given dataset using a given - algorithm. + """Perform forward feature selection for a given dataset using a given algorithm. Predictors are sequentially added to the model, starting with the one that has the highest univariate predictive power, and then proceeding with those that @@ -35,11 +35,13 @@ class ForwardFeatureSelection: List of fitted models. """ - def __init__(self, - model_type: str="classification", - max_predictors: int=50, - pos_only: bool=True): - + def __init__( + self, + model_type: str="classification", + max_predictors: int=50, + pos_only: bool=True + ): + """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": self.MLModel = LogisticRegressionModel @@ -75,14 +77,17 @@ def get_model_from_step(self, step: int): return self._fitted_models[step] - def compute_model_performances(self, data: pd.DataFrame, - target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, - ) -> pd.DataFrame: - """Compute for each model the performance for different sets (e.g. - train-selection-validation) and return them along with a list of - predictors used in the model. Note that the computation of the + def compute_model_performances( + self, data: pd.DataFrame, + target_column_name: str, + splits: list=["train", "selection", "validation"], + metric: Optional[Callable]=None, + ) -> pd.DataFrame: + """ + Compute for each model the performance for different sets. + + Different sets could be cross validation, train-selection-validation, ... + Note that the computation of the performance for each split is cached inside the model itself, so it is inexpensive to perform it multiple times! @@ -168,7 +173,6 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, In case the number of forced predictors is larger than the maximum number of allowed predictors in the model. """ - assert "split" in train_data.columns, "The train_data input df does not include a split column." assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \ "The train_data input df does not include a 'train' and 'selection' split." 
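The two asserts above spell out the input contract of fit(): train_data must
carry a "split" column that contains at least the values "train" and
"selection". A toy frame that satisfies it — the predictor and target column
names here are purely illustrative, not from cobra's test suite:

    import numpy as np
    import pandas as pd

    train_data = pd.DataFrame({
        "age_enc": np.random.rand(100),           # hypothetical encoded predictor
        "target": np.random.randint(0, 2, 100),   # binary classification target
        "split": ["train"] * 60 + ["selection"] * 20 + ["validation"] * 20,
    })
    # set(["train", "selection"]).difference(train_data["split"].unique())
    # is empty, so both asserts pass.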
@@ -196,14 +200,18 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, filtered_predictors, forced_predictors) - def _forward_selection(self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list, - forced_predictors: list = []) -> list: - """Perform the forward feature selection algorithm to compute a list - of models (with increasing performance). The length of the list, - i.e. the number of models, is bounded by the max_predictors class + def _forward_selection( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = [] + ) -> list: + """Perform the forward feature selection algorithm. + + The algorithm will compute a list of models (with increasing performance). + The length of the list, i.e. the number of models, + is bounded by the max_predictors class attribute. Parameters @@ -262,12 +270,17 @@ def _forward_selection(self, return fitted_models - def _find_next_best_model(self, - train_data: pd.DataFrame, - target_column_name: str, - candidate_predictors: list, - current_predictors: list): - """Given a list of current predictors which are already selected to + def _find_next_best_model( + self, + train_data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list + ): + """ + Find the next best model with candidate predictors. + + Given a list of current predictors which are already selected to be include in the model, find amongst a list candidate predictors the predictor to add to the selected list so that the resulting model has the best performance. diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 233162c..7c55acf 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,3 +1,4 @@ +"""Contains all types of models supported by Cobra.""" from typing import Callable, Optional @@ -15,7 +16,10 @@ from cobra.evaluation import ClassificationEvaluator class LogisticRegressionModel: - """Wrapper around the LogisticRegression class, with additional methods + """ + Cobra's LogisticRegression model. + + Wrapper around the LogisticRegression class, with additional methods implemented such as evaluation (using AUC), getting a list of coefficients, a dictionary of coefficients per predictor, ... for convenience. @@ -28,6 +32,7 @@ class LogisticRegressionModel: """ def __init__(self): + """Initialize the LogisticRegressionModel class.""" self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver='liblinear', random_state=42) self._is_fitted = False @@ -73,7 +78,6 @@ def deserialize(self, model_dict: dict): ValueError In case JSON file is no valid serialized model. """ - if not self._is_valid_dict(model_dict): raise ValueError("No valid serialized model") @@ -87,7 +91,7 @@ def deserialize(self, model_dict: dict): self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] def get_coef(self) -> np.array: - """Returns the model coefficients. + """Return the model coefficients. Returns ------- @@ -97,7 +101,7 @@ def get_coef(self) -> np.array: return self.logit.coef_[0] def get_intercept(self) -> float: - """Returns the intercept of the model. + """Return the intercept of the model. Returns ------- @@ -107,7 +111,7 @@ def get_intercept(self) -> float: return self.logit.intercept_[0] def get_coef_by_predictor(self) -> dict: - """Returns a dictionary mapping predictor (key) to coefficient (value). + """Return a dictionary mapping predictor (key) to coefficient (value). 
Returns ------- @@ -150,7 +154,10 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str=None, metric: Optional[Callable]=None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split + """ + Evaluate the model on a given dataset (X, y). + + The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets can be cached! @@ -198,8 +205,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: - """Compute the importance of each predictor in the model and return - it as a DataFrame. + """Compute the importance of each predictor in the model. Parameters ---------- @@ -211,7 +217,6 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame DataFrame containing columns predictor and importance. """ - y_pred = self.score_model(data) importance_by_variable = { @@ -230,7 +235,7 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: .reset_index(drop=True)) def _is_valid_dict(self, model_dict: dict) -> bool: - + """Check if the model dictionary is valid.""" if ("meta" not in model_dict or model_dict["meta"] != "logistic-regression"): return False @@ -248,7 +253,10 @@ def _is_valid_dict(self, model_dict: dict) -> bool: class LinearRegressionModel: - """Wrapper around the LinearRegression class, with additional methods + """ + Cobra's LinearRegression model. + + Wrapper around the LinearRegression class, with additional methods implemented such as evaluation (using RMSE), getting a list of coefficients, a dictionary of coefficients per predictor, ... for convenience. @@ -261,6 +269,7 @@ class LinearRegressionModel: """ def __init__(self): + """Initialize the LinearRegression class.""" self.linear = LinearRegression(fit_intercept=True, normalize=False) self._is_fitted = False # placeholder to keep track of a list of predictors @@ -303,7 +312,6 @@ def deserialize(self, model_dict: dict): ValueError In case JSON file is no valid serialized model. """ - if not self._is_valid_dict(model_dict): raise ValueError("No valid serialized model") @@ -315,7 +323,7 @@ def deserialize(self, model_dict: dict): self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] def get_coef(self) -> np.array: - """Returns the model coefficients. + """Return the model coefficients. Returns ------- @@ -325,7 +333,7 @@ def get_coef(self) -> np.array: return self.linear.coef_ def get_intercept(self) -> float: - """Returns the intercept of the model. + """Return the intercept of the model. Returns ------- @@ -335,7 +343,7 @@ def get_intercept(self) -> float: return self.linear.intercept_[0] def get_coef_by_predictor(self) -> dict: - """Returns a dictionary mapping predictor (key) to coefficient (value). + """Return a dictionary mapping predictor (key) to coefficient (value). Returns ------- @@ -378,7 +386,9 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str=None, metric: Optional[Callable]=None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split + """Evaluate the model on a given dataset (X, y). + + The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets can be cached! 
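Both evaluate() methods lean on the same memoization idea referenced in the
docstrings above: scores are cached in _eval_metrics_by_split, keyed by the
split name. Stripped to its essentials — a simplified sketch, not the method
body as committed, with default_metric standing in for the model-specific
default:

    def evaluate(self, X, y, split=None, metric=None):
        if metric is None:
            metric = default_metric  # stand-in: AUC (logistic) or RMSE (linear)
        if split is None:
            return metric(y, self.score_model(X))  # no caching requested
        if split not in self._eval_metrics_by_split:
            self._eval_metrics_by_split[split] = metric(y, self.score_model(X))
        return self._eval_metrics_by_split[split]

Because the dict is keyed by split, compute_model_performances() can
re-evaluate the same model on train/selection/validation repeatedly at
negligible cost.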
@@ -421,8 +431,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: - """Compute the importance of each predictor in the model and return - it as a DataFrame. + """Compute the importance of each predictor in the model. Parameters ---------- @@ -434,7 +443,6 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame DataFrame containing columns predictor and importance. """ - y_pred = self.score_model(data) importance_by_variable = { @@ -453,7 +461,7 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: .reset_index(drop=True)) def _is_valid_dict(self, model_dict: dict) -> bool: - + """Check if the model dictionary is valid.""" if ("meta" not in model_dict or model_dict["meta"] != "linear-regression"): return False diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 2db4abb..bb412e3 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -1,20 +1,23 @@ - +"""Calculate the univariate quality of predictors.""" import pandas as pd from sklearn.metrics import roc_auc_score, mean_squared_error from numpy import sqrt import cobra.utils as utils -def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, - target_enc_selection_data: pd.DataFrame, - predictors: list, - target_column: str, - model_type: str = "classification", - preselect_auc_threshold: float = 0.053, - preselect_rmse_threshold: float = 5, - preselect_overtrain_threshold: float = 0.05 - ) -> pd.DataFrame: - """Perform a preselection of predictors based on an AUC (in case of +def compute_univariate_preselection( + target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + model_type: str = "classification", + preselect_auc_threshold: float = 0.053, + preselect_rmse_threshold: float = 5, + preselect_overtrain_threshold: float = 0.05 +) -> pd.DataFrame: + """Perform a preselection of predictors. + + The preselection is based on an AUC (in case of classification) or a RMSE (in case of regression) threshold of a univariate model on a train and selection dataset and return a DataFrame containing for each variable the train and selection AUC or RMSE along with a @@ -128,7 +131,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, return df_out def get_preselected_predictors(df_metric: pd.DataFrame) -> list: - """Wrapper function to extract a list of predictors from df_metric. + """Extract a list of predictors from df_metric. Parameters ---------- @@ -142,7 +145,6 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: list List of preselected predictors. """ - if "AUC selection" in df_metric.columns: predictor_list = (df_metric[df_metric["preselection"]] .sort_values(by="AUC selection", ascending=False) @@ -156,8 +158,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: - """Given a DataFrame and a list of predictors, compute the correlations - amongst the predictors in the DataFrame. + """Compute the correlations amongst the predictors in the DataFrame. Parameters ---------- @@ -172,7 +173,6 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, pd.DataFrame The correlation matrix of the training set. 
""" - correlations = target_enc_train_data[predictors].corr() predictors_cleaned = [utils.clean_predictor_name(predictor) diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index e02ad4c..b72d1a4 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -1,3 +1,5 @@ +"""This module contains all preprocessing utils.""" + from .kbins_discretizer import KBinsDiscretizer from .target_encoder import TargetEncoder from .categorical_data_processor import CategoricalDataProcessor diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 175bfb5..c9e906d 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -1,3 +1,4 @@ +"""Process categorical data.""" # standard lib imports import re @@ -15,8 +16,7 @@ log = logging.getLogger(__name__) class CategoricalDataProcessor(BaseEstimator): - """Regroups the categories of categorical variables based on significance - with target variable. + """Regroup categorical variables based on significance with target variable. This class implements the Python Prediction's way of dealing with categorical data preprocessing. There are three steps involved: @@ -64,16 +64,18 @@ class CategoricalDataProcessor(BaseEstimator): "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories"] - def __init__(self, - model_type: str="classification", - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}): - + def __init__( + self, + model_type: str="classification", + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={} + ): + """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") @@ -108,8 +110,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary of values with key the name of the attribute. Parameters ---------- @@ -156,7 +157,6 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column : str Column name of the target. """ - if not self.regroup: # We do not need to fit anything if regroup is set to False! log.info("regroup was set to False, so no fitting is required") @@ -181,7 +181,10 @@ def fit(self, data: pd.DataFrame, column_names: list, def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: - """Compute which categories to regroup into "Other" + """ + Fit all necessary columns into "Other". + + Computes which categories to regroup into "Other" for a particular column, and return those that need to be kept as-is. @@ -271,7 +274,6 @@ def transform(self, data: pd.DataFrame, pd.DataFrame Data with additional transformed variables. """ - if self.regroup and len(self._cleaned_categories_by_column) == 0: msg = ("{} instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") @@ -291,9 +293,7 @@ def transform(self, data: pd.DataFrame, def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: - """Given a DataFrame, a column name and a list of categories to - combine, create an additional column which combines these categories - into "Other". + """Create an additional column which combines categories into "Other". Parameters ---------- @@ -307,7 +307,6 @@ def _transform_column(self, data: pd.DataFrame, pd.DataFrame Original DataFrame with an added processed column. """ - column_name_clean = column_name + "_processed" data.loc[:, column_name_clean] = data[column_name].astype(object) @@ -343,7 +342,7 @@ def _transform_column(self, data: pd.DataFrame, def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: - """Fits the data, then transforms it. + """Fit and transform the data. Parameters ---------- @@ -360,7 +359,6 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, pd.DataFrame Data with additional transformed variables. """ - self.fit(data, column_names, target_column) return self.transform(data, column_names) @@ -368,7 +366,9 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: - """Fetch categories with a size below a certain threshold. + """ + Fetch categories with a size below a certain threshold. + Note that we use an additional weighting with the overall incidence. Parameters @@ -430,7 +430,10 @@ def _replace_missings(data: pd.DataFrame, def _compute_p_value(X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool) -> float: - """Calculates p-value in order to evaluate whether category of + """ + Calculate p-value. + + Calculate p-value in order to evaluate whether category of interest is significantly different from the rest of the categories, given the target variable. @@ -484,8 +487,11 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, @staticmethod def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: - """Replace categories in set with "Other" and transform the remaining - categories to strings to avoid type errors later on in the pipeline. + """ + Replace categories in set with "Other". + + Transforms the remaining categories to strings + to avoid type errors later on in the pipeline. Parameters ---------- diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index c30d7de..3fe611a 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -1,4 +1,4 @@ - +"""Binning of continous data.""" # standard lib imports from copy import deepcopy from typing import List @@ -16,7 +16,10 @@ log = logging.getLogger(__name__) class KBinsDiscretizer(BaseEstimator): - """Bin continuous data into intervals of predefined size. It provides a + """ + Discretize continuous values into categorical values. + + Bin continuous data into intervals of predefined size. It provides a way to partition continuous data into discrete values, i.e. transform continuous data into nominal data. 
This can make a linear model more expressive as it introduces nonlinearity to the model, while maintaining @@ -63,13 +66,15 @@ class KBinsDiscretizer(BaseEstimator): "starting_precision", "label_format", "change_endpoint_format"] - def __init__(self, n_bins: int = 10, strategy: str = "quantile", - closed: str = "right", - auto_adapt_bins: bool = False, - starting_precision: int = 0, - label_format: str = "{} - {}", - change_endpoint_format: bool = False): - + def __init__( + self, n_bins: int = 10, strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False + ): + """Initialize the KBinsDiscretizer.""" # validate number of bins self._validate_n_bins(n_bins) @@ -85,8 +90,7 @@ def __init__(self, n_bins: int = 10, strategy: str = "quantile", self._bins_by_column = {} def _validate_n_bins(self, n_bins: int): - """Check if ``n_bins`` is of the proper type and if it is bigger - than two + """Check if ``n_bins`` is of the proper type and if it is bigger than one. Parameters ---------- @@ -109,7 +113,7 @@ def _validate_n_bins(self, n_bins: int): .format(KBinsDiscretizer.__name__, n_bins)) def attributes_to_dict(self) -> dict: - """Return the attributes of KBinsDiscretizer in a dictionary + """Return the attributes of KBinsDiscretizer as a dictionary. Returns ------- @@ -127,8 +131,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary. Parameters ---------- @@ -163,7 +166,7 @@ def set_attributes_from_dict(self, params: dict): return self def fit(self, data: pd.DataFrame, column_names: list): - """Fits the estimator + """Fit the estimator. Parameters ---------- @@ -172,7 +175,6 @@ def fit(self, data: pd.DataFrame, column_names: list): column_names : list Names of the columns of the DataFrame to discretize """ - if self.strategy not in self.valid_strategies: raise ValueError("{}: valid options for 'strategy' are {}. " "Got strategy={!r} instead." @@ -194,7 +196,7 @@ def fit(self, data: pd.DataFrame, column_names: list): def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: - """Compute bins for a specific column in data + """Compute bins for a specific column in data. Parameters ---------- @@ -254,8 +256,10 @@ def _fit_column(self, data: pd.DataFrame, def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Discretizes the data in the given list of columns by mapping each - number to the appropriate bin computed by the fit method + """Discretize the data in the given list of columns. + + This is done by mapping each number to + the appropriate bin computed by the fit method. Parameters ---------- @@ -291,9 +295,7 @@ def transform(self, data: pd.DataFrame, def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame: - """Given a DataFrame, a column name and a list of bins, - create an additional column which determines the bin in which the value - of column_name lies in. + """Create a new column with binned values of column_name. 
Parameters ---------- @@ -309,7 +311,6 @@ def _transform_column(self, data: pd.DataFrame, pd.DataFrame original DataFrame with an added binned column """ - interval_idx = KBinsDiscretizer._create_index(bins, self.closed) column_name_bin = column_name + "_bin" @@ -337,7 +338,7 @@ def _transform_column(self, data: pd.DataFrame, def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: - """Fits to data, then transform it + """Fit to data, then transform it. Parameters ---------- @@ -357,8 +358,7 @@ def fit_transform(self, data: pd.DataFrame, def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float) -> list: - """Compute the bin edges for a given column, a DataFrame and the number - of required bins + """Compute the desired bin edges. Parameters ---------- @@ -378,7 +378,6 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, list list of bin edges from which to compute the bins """ - bin_edges = [] if self.strategy == "quantile": bin_edges = list(data[column_name] @@ -411,8 +410,10 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, return list(dict.fromkeys(bin_edges)) def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: - """Compute the minimal precision of a list of bin_edges so that we end - up with a strictly ascending sequence of different numbers even when rounded. + """Compute the minimal precision of a list of bin_edges. + + This way we end up with a strictly ascending sequence of + different numbers even when rounded. The starting_precision attribute will be used as the initial precision. In case of a negative starting_precision, the bin edges will be rounded to the nearest 10, 100, ... (e.g. 5.55 -> 10, 246 -> 200, ...) @@ -427,7 +428,6 @@ def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: int minimal precision for the bin edges """ - precision = self.starting_precision while True: cont = False @@ -443,8 +443,8 @@ def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int: return precision def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: - """Given a list of bin edges, compute the minimal precision for which - we can make meaningful bins and make those bins + """ + Return bins with the minimal precision. Parameters ---------- @@ -471,9 +471,13 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index(intervals: List[tuple], - closed: str = "right") -> pd.IntervalIndex: - """Create an pd.IntervalIndex based on a list of tuples. + def _create_index( + intervals: List[tuple], + closed: str = "right" + ) -> pd.IntervalIndex: + """ + Create an pd.IntervalIndex based on a list of tuples. + This is basically a wrapper around pd.IntervalIndex.from_tuples However, the lower bound of the first entry in the list (the lower bin) is replaced by -np.inf. Similarly, the upper bound of the last entry in @@ -492,7 +496,6 @@ def _create_index(intervals: List[tuple], pd.IntervalIndex Description """ - # check if closed is of the proper form if closed not in ["left", "right"]: raise ValueError("{}: valid options for 'closed' are {}. " @@ -511,8 +514,8 @@ def _create_index(intervals: List[tuple], return pd.IntervalIndex.from_tuples(_intervals, closed) def _create_bin_labels(self, bins: List[tuple]) -> list: - """Given a list of bins, create a list of string containing the bins - as a string with a specific format (e.g. 
bin labels) + """ + Stringify the bin bounds to be used as bin labels. Parameters ---------- diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index e03d352..3ecadf0 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -1,3 +1,4 @@ +"""Preprocess data.""" # standard lib imports import inspect @@ -20,7 +21,10 @@ log = logging.getLogger(__name__) class PreProcessor(BaseEstimator): - """This class implements a so-called facade pattern to define a + """ + Preprocess data. + + This class implements a so-called facade pattern to define a higher-level interface to work with the CategoricalDataProcessor, KBinsDiscretizer and TargetEncoder classes, so that their fit and transform methods are called in the correct order. @@ -48,12 +52,14 @@ class PreProcessor(BaseEstimator): (``classification`` or ``regression``). """ - def __init__(self, - categorical_data_processor: CategoricalDataProcessor, - discretizer: KBinsDiscretizer, - target_encoder: TargetEncoder, - is_fitted: bool = False): - + def __init__( + self, + categorical_data_processor: CategoricalDataProcessor, + discretizer: KBinsDiscretizer, + target_encoder: TargetEncoder, + is_fitted: bool = False + ): + """Initialize the PreProcessor class.""" self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer self._target_encoder = target_encoder @@ -63,27 +69,28 @@ def __init__(self, self.model_type = categorical_data_processor.model_type @classmethod - def from_params(cls, - model_type: str="classification", - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean"): - """Constructor to instantiate PreProcessor from all the parameters - that can be set in all its required (attribute) classes - along with good default values. + def from_params( + cls, + model_type: str="classification", + n_bins: int=10, + strategy: str="quantile", + closed: str="right", + auto_adapt_bins: bool=False, + starting_precision: int=0, + label_format: str="{} - {}", + change_endpoint_format: bool=False, + regroup: bool=True, + regroup_name: str="Other", + keep_missing: bool=True, + category_size_threshold: int=5, + p_value_threshold: float=0.001, + scale_contingency_table: bool=True, + forced_categories: dict={}, + weight: float=0.0, + imputation_strategy: str="mean" + ): + """ + Instantiate a PreProcessor from given or default params. Parameters ---------- @@ -168,8 +175,11 @@ def from_params(cls, @classmethod def from_pipeline(cls, pipeline: dict): - """Constructor to instantiate PreProcessor from a (fitted) pipeline - which was stored as a JSON file and passed to this function as a dict. + """ + Instantiate a PreProcessor from a (fitted) pipeline. + + The pipeline should be stored as a JSON file and passed to this function + as a dict. Parameters ---------- @@ -187,7 +197,6 @@ def from_pipeline(cls, pipeline: dict): If the loaded pipeline does not have all required parameters and no others. 
""" - if not PreProcessor._is_valid_pipeline(pipeline): raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") @@ -222,7 +231,6 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, target_column_name : str Column name of the target. """ - # get list of all variables preprocessed_variable_names = (PreProcessor ._get_variable_list(continuous_vars, @@ -290,7 +298,6 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, NotFittedError In case PreProcessor was not fitted first. """ - start = time.time() if not self._is_fitted: @@ -339,7 +346,6 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, pd.DataFrame Transformed (preprocessed) data. """ - self.fit(train_data, continuous_vars, discrete_vars, target_column_name) @@ -350,8 +356,7 @@ def train_selection_validation_split(data: pd.DataFrame, train_prop: float=0.6, selection_prop: float=0.2, validation_prop: float=0.2) -> pd.DataFrame: - """Adds `split` column with train/selection/validation values - to the dataset. + """Add `split` column with train/selection/validation values to the dataset. Train set = data on which the model is trained and on which the encoding is based. Selection set = data used for univariate and forward feature selection. Often called the validation set. @@ -401,7 +406,10 @@ def train_selection_validation_split(data: pd.DataFrame, return data def serialize_pipeline(self) -> dict: - """Serialize the preprocessing pipeline by writing all its required + """ + Serialize the preprocessing pipeline. + + This is done by writing all its required parameters to a dictionary to later store it as a JSON file. Returns @@ -429,8 +437,7 @@ def serialize_pipeline(self) -> dict: @staticmethod def _is_valid_pipeline(pipeline: dict) -> bool: - """Validate the loaded pipeline by checking if all required parameters - are present (and no others!). + """Validate the loaded pipeline by checking if only the required parameters are present. Parameters ---------- @@ -456,8 +463,9 @@ def _is_valid_pipeline(pipeline: dict) -> bool: @staticmethod def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: - """Merge lists of continuous_vars and discrete_vars and add suffix - "_bin" resp. "_processed" to the predictors. + """Merge lists of continuous_vars and discrete_vars. + + Suffixes "_bin" resp. "_processed" are added to the predictors. Parameters ---------- diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 3eda39d..0863ae6 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,3 +1,4 @@ +"""Target encoding.""" import logging @@ -9,7 +10,10 @@ log = logging.getLogger(__name__) class TargetEncoder(BaseEstimator): - """Target encoding for categorical features, inspired by + """ + Target encoding for categorical features. + + Inspired by http://contrib.scikit-learn.org/category_encoders/targetencoder.html. 
Replace each value of the categorical feature with the average of the @@ -62,9 +66,11 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") - def __init__(self, weight: float=0.0, - imputation_strategy: str="mean"): - + def __init__( + self, weight: float=0.0, + imputation_strategy: str="mean" + ): + """Initialize the TargetEncoder class.""" if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: @@ -107,8 +113,7 @@ def attributes_to_dict(self) -> dict: return params def set_attributes_from_dict(self, params: dict): - """Set instance attributes from a dictionary of values with key the - name of the attribute. + """Set instance attributes from a dictionary. Parameters ---------- @@ -309,8 +314,11 @@ def fit_transform(self, data: pd.DataFrame, @staticmethod def _clean_column_name(column_name: str) -> str: - """Generate a name for the new column that this target encoder - generates in the given data, by removing "_bin", "_processed" or + """ + Generate a clean name. + + Cleans the name generated by the target encoder + in the given data, by removing "_bin", "_processed" or "_cleaned" from the original categorical column, and adding "_enc". Parameters diff --git a/cobra/utils.py b/cobra/utils.py index f394caf..c681cdf 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,6 +1,11 @@ +"""Cobra utils.""" + def clean_predictor_name(predictor_name: str) -> str: - """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end - of the predictor name to return a clean version of the predictor + """ + Clean the predictor name. + + This is done by stripping the redundant suffix (e.g. "_enc" or "_bin") off + from the end of the predictor name to return a clean version of the predictor """ return (predictor_name.replace("_enc", "") .replace("_bin", "") From 9ab342aab6761e6708c5846aa106499d6d9cc61c Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 22 Apr 2022 09:19:03 +0200 Subject: [PATCH 3/9] fix: codestyle and some linter issues --- .pylintrc | 585 ++++++++++++++++++ cobra/__init__.py | 2 +- cobra/evaluation/evaluator.py | 120 +++- cobra/evaluation/pigs_tables.py | 41 +- cobra/evaluation/plotting_utils.py | 58 +- cobra/model_building/forward_selection.py | 46 +- cobra/model_building/models.py | 28 +- cobra/model_building/univariate_selection.py | 5 +- .../categorical_data_processor.py | 26 +- cobra/preprocessing/kbins_discretizer.py | 9 +- cobra/preprocessing/preprocessor.py | 61 +- cobra/preprocessing/target_encoder.py | 6 +- cobra/utils.py | 1 + cobra/version.py | 2 +- setup.cfg | 2 + 15 files changed, 851 insertions(+), 141 deletions(-) create mode 100644 .pylintrc create mode 100644 setup.cfg diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..ee9601a --- /dev/null +++ b/.pylintrc @@ -0,0 +1,585 @@ +[MASTER] + +# Specify a configuration file. +#rcfile= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Files or directories to be skipped. They should be base names, not +# paths. +ignore=CVS + +# Add files or directories matching the regex patterns to the ignore-list. The +# regex matches against paths and can be in Posix or Windows format. +ignore-paths= + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. +ignore-patterns=^\.# + +# Pickle collected data for later comparisons. 
+persistent=yes + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + pylint.extensions.check_elif, + pylint.extensions.bad_builtin, + pylint.extensions.docparams, + pylint.extensions.for_any_all, + pylint.extensions.set_membership, + pylint.extensions.code_style, + pylint.extensions.overlapping_exceptions, + pylint.extensions.typing, + pylint.extensions.redefined_variable_type, + pylint.extensions.comparison_placement, + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-allow-list= + +# Minimum supported python version +py-version = 3.7.2 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10.0 + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +# confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + use-symbolic-message-instead, + useless-suppression, + fixme + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" + +disable= + attribute-defined-outside-init, + duplicate-code, + invalid-name, + missing-docstring, + protected-access, + too-few-public-methods, + # handled by black + format, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. 
+output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables 'fatal', 'error', 'warning', 'refactor', 'convention' +# and 'info', which contain the number of messages in each category, as +# well as 'statement', which is the total number of statements analyzed. This +# score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + +# Activate the evaluation score. +score=yes + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,XXX,TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Signatures are removed from the similarity computation +ignore-signatures=no + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=_$|dummy + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.* + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=100 + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Maximum number of lines in a module +max-module-lines=2000 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
+expected-line-ending-format= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=i,j,k,ex,Run,_, + ax, + cv, + df, + exc, + i, + j, + l, + lr, + m, + n, + q, + qq, + s, + t, + v, + x, + X, + X_train, + X_test, + y, + + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo,bar,baz,toto,tutu,tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names +function-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names +variable-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names +const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names +attr-rgx=[a-z_][a-z0-9_]{2,}$ + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names +argument-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names +class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. +#class-const-rgx= + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names +inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names +class-rgx=[A-Z_][a-zA-Z0-9]+$ + + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names +module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names +method-rgx=[a-z_][a-z0-9_]{2,}$ + +# Regular expression which can overwrite the naming style set by typevar-naming-style. +#typevar-rgx= + +# Regular expression which should only match function or class names that do +# not require a docstring. Use ^(?!__init__$)_ to also check __init__. +no-docstring-rgx=__.*__ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# List of decorators that define properties, such as abc.abstractproperty. 
+property-classes=abc.abstractproperty
+
+
+[TYPECHECK]
+
+# Regex pattern to define which classes are considered mixins if ignore-mixin-
+# members is set to 'yes'
+mixin-class-rgx=.*MixIn
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local
+
+# List of members which are set dynamically and missed by the pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=REQUEST,acl_users,aq_parent,argparse.Namespace
+
+# List of decorators that create context managers from functions, such as
+# contextlib.contextmanager.
+contextmanager-decorators=contextlib.contextmanager
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=10
+
+# Maximum number of locals for function / method body
+max-locals=25
+
+# Maximum number of return / yield for function / method body
+max-returns=11
+
+# Maximum number of branches for function / method body
+max-branches=27
+
+# Maximum number of statements in function / method body
+max-statements=100
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# List of qualified class names to ignore when counting class parents (see R0901).
+ignored-parents=
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=11
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=25
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# List of regular expressions of class ancestor names to
+# ignore when counting public methods (see R0903).
+exclude-too-few-public-methods=
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp,__post_init__
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+# Warn about protected attribute access inside special methods
+check-protected-access-in-special-methods=no
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
+
+
+[TYPING]
+
+# Set to ``no`` if the app / library does **NOT** need to support runtime
+# introspection of type annotations. If you use type annotations
+# **exclusively** for type checking of an application, you're probably fine.
+# For libraries, evaluate if some users want to access the type hints at
+# runtime first, e.g., through ``typing.get_type_hints``. Applies to Python
+# versions 3.7 - 3.9
+runtime-typing = no
+
+
+[DEPRECATED_BUILTINS]
+
+# List of builtins function names that should not be used, separated by a comma
+bad-functions=map,input
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never return. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit,argparse.parse_error
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=no
+
+# This flag controls whether the implicit-str-concat should generate a warning
+# on implicit string concatenation in sequences defined over several lines.
+check-str-concat-over-line-jumps=no
+
+
+[CODE_STYLE]
+
+# Max line length for which to still emit suggestions. Used to prevent optional
+# suggestions which would get split by a code formatter (e.g., black). Will
+# default to the setting for ``max-line-length``.
+#max-line-length-suggestions=
\ No newline at end of file
diff --git a/cobra/__init__.py b/cobra/__init__.py
index 8afad45..451287b 100644
--- a/cobra/__init__.py
+++ b/cobra/__init__.py
@@ -1,3 +1,3 @@
 """Cobra module."""

-from .version import __version__
\ No newline at end of file
+from .version import __version__
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
index f550431..41974e7 100644
--- a/cobra/evaluation/evaluator.py
+++ b/cobra/evaluation/evaluator.py
@@ -26,6 +26,10 @@
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics import r2_score

+
+DEFAULT_LABELS = ["0", "1"]
+
+
 class ClassificationEvaluator():
 """Evaluator class encapsulating classification model metrics and plotting functionality.

@@ -58,8 +62,8 @@ class ClassificationEvaluator():

 def __init__(
 self,
- probability_cutoff: float=None,
- lift_at: float=0.05,
+ probability_cutoff: float = None,
+ lift_at: float = 0.05,
 n_bins: int = 10
 ):
 """Initialize the ClassificationEvaluator."""
@@ -144,6 +148,12 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 F1
 Matthews correlation coefficient
 Lift at given percentage
+
+ Raises
+ ----------
+ ValueError
+ If a metric cannot be computed on the given inputs, e.g. when
+ y_true contains only a single class (propagated from scikit-learn).
 """
 return pd.Series({
 "accuracy": accuracy_score(y_true, y_pred_b),
@@ -152,13 +162,16 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 "recall": recall_score(y_true, y_pred_b),
 "F1": f1_score(y_true, y_pred_b, average=None)[1],
 "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b),
- "lift at {}".format(lift_at): np.round(ClassificationEvaluator
- ._compute_lift(y_true=y_true,
- y_pred=y_pred,
- lift_at=lift_at), 2)
+ f"lift at {lift_at}": np.round(
+ ClassificationEvaluator
+ ._compute_lift(
+ y_true=y_true,
+ y_pred=y_pred,
+ lift_at=lift_at
+ ), 2)
 })

- def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
+ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)):
 """Plot ROC curve of the model.

 Parameters
 ----------
 path : str, optional
 Path to store the figure.
 dim : tuple, optional
 Tuple with width and length of the plot.
+
+ Raises
+ ----------
+ NotFittedError
+ The instance is not fitted yet.
 """
 if self.roc_curve is None:
 msg = ("This {} instance is not fitted yet.
Call 'fit' with " @@ -178,12 +196,12 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax.plot(self.roc_curve["fpr"], self.roc_curve["tpr"], color="cornflowerblue", linewidth=3, - label="ROC curve (area = {s:.3})".format(s=auc)) + label=f"ROC curve (area = {auc:.3})") ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, linestyle="--") @@ -197,8 +215,11 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), - labels: list=["0", "1"]): + def plot_confusion_matrix( + self, path: str = None, + dim: tuple = (12, 8), + labels: list = None + ): """Plot the confusion matrix. Parameters @@ -209,14 +230,20 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), Tuple with width and length of the plot. labels : list, optional Optional list of labels, default "0" and "1". + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ + labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") raise NotFittedError(msg.format(self.__class__.__name__)) - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.heatmap(self.confusion_matrix, annot=self.confusion_matrix.astype(str), fmt="s", cmap="Blues", @@ -228,7 +255,7 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), plt.show() - def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative response curve. Parameters @@ -237,6 +264,11 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.lift_curve is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " @@ -249,7 +281,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): lifts = np.array(lifts)*inc_rate*100 with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") @@ -278,7 +310,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot lift per decile. Parameters @@ -287,6 +319,11 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.lift_curve is None: msg = ("This {} instance is not fitted yet. 
Call 'fit' with " @@ -297,7 +334,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): x_labels, lifts, _ = self.lift_curve with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") @@ -326,7 +363,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative gains per decile. Parameters @@ -337,7 +374,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): Tuple with width and length of the plot. """ with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, color="cornflowerblue", linewidth=3, @@ -354,11 +391,11 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): # Format ticks ticks_loc_y = ax.get_yticks().tolist() ax.yaxis.set_major_locator(mticker.FixedLocator(ticks_loc_y)) - ax.set_yticklabels(["{:3.0f}%".format(x) for x in ticks_loc_y]) + ax.set_yticklabels([f"{x:3.0f}%" for x in ticks_loc_y]) ticks_loc_x = ax.get_xticks().tolist() ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc_x)) - ax.set_xticklabels(["{:3.0f}%".format(x) for x in ticks_loc_x]) + ax.set_xticklabels([f"{x:3.0f}%" for x in ticks_loc_x]) # Legend ax.legend(loc="lower right") @@ -384,8 +421,8 @@ def _find_optimal_cutoff(y_true: np.ndarray, float Optimal cut-off probability for the model. """ - return ClassificationEvaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, - y_score=y_pred)) + fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred) + return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, @@ -455,9 +492,11 @@ def _compute_cumulative_gains(y_true: np.ndarray, return percentages, gains @staticmethod - def _compute_lift_per_bin(y_true: np.ndarray, - y_pred: np.ndarray, - n_bins: int=10) -> tuple: + def _compute_lift_per_bin( + y_true: np.ndarray, + y_pred: np.ndarray, + n_bins: int = 10 + ) -> tuple: """Compute lift of the model for a given number of bins. Parameters @@ -485,8 +524,11 @@ def _compute_lift_per_bin(y_true: np.ndarray, return x_labels, lifts, y_true.mean() @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, - lift_at: float=0.05) -> float: + def _compute_lift( + y_true: np.ndarray, + y_pred: np.ndarray, + lift_at: float = 0.05 + ) -> float: """Calculate lift on a specified level. Parameters @@ -619,7 +661,7 @@ def _compute_qq_residuals(y_true: np.ndarray, pd.Series Theoretical quantiles and associated actual residuals. """ - ## also possible directly via statsmodels.api.qqplot() + # also possible directly via statsmodels.api.qqplot() n = len(y_true) @@ -636,7 +678,7 @@ def _compute_qq_residuals(y_true: np.ndarray, "residuals": df["z_res"].values, }) - def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): + def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """Plot predictions from the model against actual values. Parameters @@ -645,17 +687,24 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. 
dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.y_true is None and self.y_pred is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") + raise NotFittedError(msg.format(self.__class__.__name__)) + y_true = self.y_true y_pred = self.y_pred with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable x = np.arange(1, len(y_true)+1) @@ -672,7 +721,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_qq(self, path: str=None, dim: tuple=(12, 8)): + def plot_qq(self, path: str = None, dim: tuple = (12, 8)): """Display a Q-Q plot from the standardized prediction residuals. Parameters @@ -681,6 +730,11 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): Path to store the figure. dim : tuple, optional Tuple with width and length of the plot. + + Raises + ---------- + NotFittedError + The instance is not fitted yet. """ if self.qq is None: msg = ("This {} instance is not fitted yet. Call 'fit' with " @@ -690,7 +744,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable x = self.qq["quantiles"] y = self.qq["residuals"] @@ -710,4 +764,4 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): if path: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - plt.show() \ No newline at end of file + plt.show() diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 8915c5e..5503349 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -6,12 +6,15 @@ import numpy as np from matplotlib.ticker import FuncFormatter -import cobra.utils as utils +from cobra import utils -def generate_pig_tables(basetable: pd.DataFrame, - id_column_name: str, - target_column_name: str, - preprocessed_predictors: list) -> pd.DataFrame: + +def generate_pig_tables( + basetable: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list +) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. The output is a DataFrame with columns ``variable``, ``label``, @@ -94,13 +97,15 @@ def compute_pig_table(basetable: pd.DataFrame, return res[column_order] -def plot_incidence(pig_tables: pd.DataFrame, - variable: str, - model_type: str, - column_order: list=None, - dim: tuple=(12, 8)): +def plot_incidence( + pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8) +): """Plot a Predictor Insights Graph (PIG). - + A PIG is a graph in which the mean target value is plotted for a number of bins constructed from a predictor variable. When the target is a binary classification target, @@ -123,6 +128,12 @@ def plot_incidence(pig_tables: pd.DataFrame, on the PIG. dim: tuple, default=(12, 8) Optional tuple to configure the width and length of the plot. + + Raises + ---------- + ValueError + The `column_order` and `pig_tables` parameters do not contain + the same set of variables. 
""" if model_type not in ["classification", "regression"]: raise ValueError("An unexpected value was set for the model_type " @@ -170,7 +181,7 @@ def plot_incidence(pig_tables: pd.DataFrame, # Set labels & ticks ax.set_ylabel('incidence' if model_type == "classification" else "mean target value", fontsize=16) - ax.set_xlabel('{} bins' ''.format(variable), fontsize=16) + ax.set_xlabel(f'{variable} bins' '', fontsize=16) ax.xaxis.set_tick_params(labelsize=14) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") @@ -181,7 +192,7 @@ def plot_incidence(pig_tables: pd.DataFrame, # so format them as percentages ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + FuncFormatter(lambda y, _: f'{y:.1%}')) elif model_type == "regression": # If the difference between the highest avg. target of all bins # versus the global avg. target AND the difference between the @@ -213,12 +224,12 @@ def plot_incidence(pig_tables: pd.DataFrame, align='center', color="#939598", zorder=1) # Set labels & ticks - ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16) + ax2.set_xlabel(f'{variable} bins' '', fontsize=16) ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) ax2.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + FuncFormatter(lambda y, _: f'{y:.1%}')) ax2.set_ylabel('population size', fontsize=16) ax2.tick_params(axis='y', colors="#939598") ax2.yaxis.label.set_color('#939598') diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 5aaf1a2..8f0a6b0 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -7,9 +7,19 @@ import matplotlib.pyplot as plt import seaborn as sns -def plot_univariate_predictor_quality(df_metric: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +DEFAULT_COLOURS = { + "train": "#0099bf", + "selection": "#ff9500", + "validation": "#8064a2" +} + + +def plot_univariate_predictor_quality( + df_metric: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None +): """Plot univariate quality of the predictors. Parameters @@ -40,7 +50,7 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, # plot data with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.barplot(x=metric, y="predictor", hue="split", data=df) ax.set_title("Univariate Quality of Predictors") @@ -56,9 +66,12 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, plt.show() -def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +def plot_correlation_matrix( + df_corr: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None +): """Plot correlation matrix amongst the predictors. Parameters @@ -70,7 +83,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, path : str, optional Path to store the figure. 
""" - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.heatmap(df_corr, cmap='Blues') ax.set_title('Correlation Matrix') @@ -79,13 +92,14 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, plt.show() -def plot_performance_curves(model_performance: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None, - colors: dict={"train": "#0099bf", - "selection": "#ff9500", - "validation": "#8064a2"}, - metric_name: str=None): + +def plot_performance_curves( + model_performance: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None, + colors: dict = None, + metric_name: str = None +): """Plot performance curves for the train-selection-validation sets. Parameters @@ -104,6 +118,7 @@ def plot_performance_curves(model_performance: pd.DataFrame, Defaults to RMSE in case of regression and AUC in case of classification. """ + colors = colors or DEFAULT_COLOURS model_type = model_performance["model_type"][0] if metric_name is None: @@ -155,10 +170,13 @@ def plot_performance_curves(model_performance: pd.DataFrame, plt.show() -def plot_variable_importance(df_variable_importance: pd.DataFrame, - title: str=None, - dim: tuple=(12, 8), - path: str=None): + +def plot_variable_importance( + df_variable_importance: pd.DataFrame, + title: str = None, + dim: tuple = (12, 8), + path: str = None +): """Plot variable importance of a given model. Parameters @@ -173,7 +191,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, Path to store the figure. """ with plt.style.context("seaborn-ticks"): - fig, ax = plt.subplots(figsize=dim) + fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 693fed3..ed5f119 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -10,6 +10,12 @@ log = logging.getLogger(__name__) + +DEFAULT_SPLIT_NAMES = ["train", "selection", "validation"] +DEFAULT_FORCED_PREDICTORS = [] +DEFAULT_EXCLUDED_PREDICTORS = [] + + class ForwardFeatureSelection: """Perform forward feature selection for a given dataset using a given algorithm. @@ -37,9 +43,9 @@ class ForwardFeatureSelection: def __init__( self, - model_type: str="classification", - max_predictors: int=50, - pos_only: bool=True + model_type: str = "classification", + max_predictors: int = 50, + pos_only: bool = True ): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type @@ -80,12 +86,12 @@ def get_model_from_step(self, step: int): def compute_model_performances( self, data: pd.DataFrame, target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, + splits: list = None, + metric: Optional[Callable] = None, ) -> pd.DataFrame: """ Compute for each model the performance for different sets. - + Different sets could be cross validation, train-selection-validation, ... Note that the computation of the performance for each split is cached inside the model itself, so it @@ -99,7 +105,7 @@ def compute_model_performances( Name of the target column. splits : list, optional List of splits to compute performance on. 
- metric: Callable (function), optional
+ metric : Callable (function), optional
 Function that computes an evaluation metric to evaluate the model's
 performances, instead of the default metric (AUC for classification,
 RMSE for regression).
@@ -113,6 +119,7 @@
 Contains for each model the performance for train, selection and
 validation sets as well as the set of predictors used in this model.
 """
+ splits = splits or DEFAULT_SPLIT_NAMES
 results = []
 predictor_set = set([])

@@ -145,9 +152,13 @@
 return df

- def fit(self, train_data: pd.DataFrame, target_column_name: str,
- predictors: list, forced_predictors: list=[],
- excluded_predictors: list=[]):
+ def fit(
+ self, train_data: pd.DataFrame,
+ target_column_name: str,
+ predictors: list,
+ forced_predictors: list = None,
+ excluded_predictors: list = None
+ ):
 """Fit the forward feature selection estimator.

 Parameters
@@ -178,6 +189,8 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
 "The train_data input df does not include a 'train' and 'selection' split."

 # remove excluded predictors from predictor lists
+ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
+ excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS
 filtered_predictors = [var for var in predictors
 if (var not in excluded_predictors and
 var not in forced_predictors)]
@@ -205,10 +218,10 @@
 train_data: pd.DataFrame,
 target_column_name: str,
 predictors: list,
- forced_predictors: list = []
+ forced_predictors: list = None
 ) -> list:
 """Perform the forward feature selection algorithm.
- 
+
 The algorithm will compute a list of models (with increasing performance).
 The length of the list, i.e. the number of models, is bounded
 by the max_predictors class
@@ -231,6 +244,7 @@
 List of fitted models where the index of the list indicates the number of
 predictors minus one (as indices start from 0).
 """
+ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
 fitted_models = []
 current_predictors = []

@@ -279,7 +293,7 @@
 ):
 """
 Find the next best model with candidate predictors.
- 
+
 Given a list of current predictors which are already selected to be
 include in the model, find amongst a list candidate predictors the
 predictor to add to the selected list so that the resulting model
 -------
 self.MLModel
 Best performing model.
+
+ Raises
+ ----------
+ ValueError
+ If no metric comparison method is configured for the given
+ model_type of the ForwardFeatureSelection instance.
 """
 # placeholders
 best_model = None
diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py
index 7c55acf..cad6381 100644
--- a/cobra/model_building/models.py
+++ b/cobra/model_building/models.py
@@ -6,15 +6,15 @@
 import numpy as np
 import pandas as pd
 from scipy import stats
-from sklearn.metrics import roc_auc_score, mean_squared_error
+from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve
 from numpy import sqrt
 from sklearn.linear_model import LogisticRegression, LinearRegression
-from sklearn.metrics import roc_curve

 # custom imports
 import cobra.utils as utils
 from cobra.evaluation import ClassificationEvaluator

+
 class LogisticRegressionModel:
 """
 Cobra's LogisticRegression model.
@@ -151,9 +151,11 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
 # ensure we have the proper predictors and the proper order
 return self.logit.predict_proba(X[self.predictors])[:, 1]

- def evaluate(self, X: pd.DataFrame, y: pd.Series,
- split: str=None,
- metric: Optional[Callable]=None) -> float:
+ def evaluate(
+ self, X: pd.DataFrame, y: pd.Series,
+ split: str = None,
+ metric: Optional[Callable] = None
+ ) -> float:
 """
 Evaluate the model on a given dataset (X, y).

@@ -383,9 +385,11 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
 # ensure we have the proper predictors and the proper order
 return self.linear.predict(X[self.predictors])

- def evaluate(self, X: pd.DataFrame, y: pd.Series,
- split: str=None,
- metric: Optional[Callable]=None) -> float:
+ def evaluate(
+ self, X: pd.DataFrame, y: pd.Series,
+ split: str = None,
+ metric: Optional[Callable] = None
+ ) -> float:
 """Evaluate the model on a given dataset (X, y).

 The optional split
@@ -425,8 +429,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,

 if split is None:
 return performance
- else:
- self._eval_metrics_by_split[split] = performance
+ self._eval_metrics_by_split[split] = performance

 return self._eval_metrics_by_split[split]

@@ -460,7 +463,8 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
 return (df.sort_values(by="importance", ascending=False)
 .reset_index(drop=True))

- def _is_valid_dict(self, model_dict: dict) -> bool:
+ @staticmethod
+ def _is_valid_dict(model_dict: dict) -> bool:
 """Check if the model dictionary is valid."""
 if ("meta" not in model_dict
 or model_dict["meta"] != "linear-regression"):
@@ -468,7 +472,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool:
 attr = ["coef_", "intercept_", "predictors"]
 for key in attr:
- if not (key in model_dict or type(model_dict[key]) != list):
+ if key not in model_dict:
 return False

 if ("params" not in model_dict
diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index bb412e3..48c960b 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -5,6 +5,7 @@
 import cobra.utils as utils

+
 def compute_univariate_preselection(
 target_enc_train_data: pd.DataFrame,
 target_enc_selection_data: pd.DataFrame,
@@ -16,7 +17,7 @@ def compute_univariate_preselection(
 preselect_overtrain_threshold: float = 0.05
 ) -> pd.DataFrame:
 """Perform a preselection of predictors.
- 
+
 The preselection is based on an AUC (in case of classification)
 or a RMSE (in case of regression)
 threshold of a univariate model on a train and selection dataset and
 return a DataFrame
@@ -130,6 +131,7 @@ def compute_univariate_preselection(
 return df_out

+
def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 """Extract a list of predictors from df_metric.

@@ -156,6 +158,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 return [col + "_enc" for col in predictor_list]

+
def compute_correlations(target_enc_train_data: pd.DataFrame,
 predictors: list) -> pd.DataFrame:
 """Compute the correlations amongst the predictors in the DataFrame.
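Note on the univariate_selection functions touched above: the idea behind the preselection is easier to see outside the diff. Below is a minimal, self-contained sketch of that idea using scikit-learn directly rather than cobra's API; the column names, thresholds and random seed are illustrative only. Each target-encoded predictor is scored by the AUC it achieves on its own on the train and selection splits, and it is kept only when the selection AUC clears a threshold and the train-minus-selection gap stays under an overtraining threshold (the regression variant described in the docstring uses RMSE instead).

    import numpy as np
    import pandas as pd
    from sklearn.metrics import roc_auc_score

    # Toy basetable with two target-encoded predictors and a binary target.
    rng = np.random.default_rng(42)
    df = pd.DataFrame({"age_enc": rng.random(1000), "noise_enc": rng.random(1000)})
    df["target"] = (df["age_enc"] + 0.3 * rng.random(1000) > 0.8).astype(int)
    df["split"] = np.where(np.arange(len(df)) % 5 < 3, "train", "selection")

    train = df[df["split"] == "train"]
    selection = df[df["split"] == "selection"]
    auc_threshold, overtrain_threshold = 0.55, 0.05  # illustrative values

    preselected = []
    for pred in ["age_enc", "noise_enc"]:
        # The target-encoded column itself serves as the univariate model's score.
        auc_train = roc_auc_score(train["target"], train[pred])
        auc_sel = roc_auc_score(selection["target"], selection[pred])
        if auc_sel >= auc_threshold and auc_train - auc_sel <= overtrain_threshold:
            preselected.append(pred)

    print(preselected)  # "age_enc" should pass; "noise_enc" should normally fail

This also explains why get_preselected_predictors returns names with the "_enc" suffix: the candidates scored here are the target-encoded columns, not the raw ones.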
diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index c9e906d..bf60079 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -15,6 +15,7 @@ log = logging.getLogger(__name__) + class CategoricalDataProcessor(BaseEstimator): """Regroup categorical variables based on significance with target variable. @@ -66,18 +67,21 @@ class CategoricalDataProcessor(BaseEstimator): def __init__( self, - model_type: str="classification", - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={} + model_type: str = "classification", + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {} ): """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") + raise ValueError( + "An unexpected model_type was provided. " + "A valid model_type is either 'classification' or 'regression'." + ) self.model_type = model_type self.regroup = regroup @@ -183,7 +187,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: """ Fit all necessary columns into "Other". - + Computes which categories to regroup into "Other" for a particular column, and return those that need to be kept as-is. @@ -432,7 +436,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, scale_contingency_table: bool) -> float: """ Calculate p-value. - + Calculate p-value in order to evaluate whether category of interest is significantly different from the rest of the categories, given the target variable. diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 3fe611a..1a903d9 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -15,6 +15,7 @@ log = logging.getLogger(__name__) + class KBinsDiscretizer(BaseEstimator): """ Discretize continuous values into categorical values. @@ -399,12 +400,12 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, log.warning(f"Column {column_name} " "has NaNs present in bin definitions") - # Make absolutely sure bin edges are ordered, + # Make absolutely sure bin edges are ordered, # in very rare situations this wasn't the case - # due to rounding in quantile calculation (e.g. + # due to rounding in quantile calculation (e.g. # distributions with strong mass for same value) bin_edges = sorted(bin_edges) - + # Make sure the bin_edges are unique # and order remains the same return list(dict.fromkeys(bin_edges)) @@ -460,7 +461,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: # this can be a negative number, which then # rounds numbers to the nearest 10, 100, ... 
precision = self._compute_minimal_precision_of_bin_edges(bin_edges) - + bins = [] for a, b in zip(bin_edges, bin_edges[1:]): fmt_a = round(a, precision) diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 3ecadf0..5aa9bda 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -20,6 +20,7 @@ log = logging.getLogger(__name__) + class PreProcessor(BaseEstimator): """ Preprocess data. @@ -71,23 +72,23 @@ def __init__( @classmethod def from_params( cls, - model_type: str="classification", - n_bins: int=10, - strategy: str="quantile", - closed: str="right", - auto_adapt_bins: bool=False, - starting_precision: int=0, - label_format: str="{} - {}", - change_endpoint_format: bool=False, - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}, - weight: float=0.0, - imputation_strategy: str="mean" + model_type: str = "classification", + n_bins: int = 10, + strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False, + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}, + weight: float = 0.0, + imputation_strategy: str = "mean" ): """ Instantiate a PreProcessor from given or default params. @@ -154,7 +155,7 @@ def from_params( PreProcessor Class encapsulating CategoricalDataProcessor, KBinsDiscretizer, and TargetEncoder instances. - """ + """ categorical_data_processor = CategoricalDataProcessor(model_type, regroup, regroup_name, keep_missing, @@ -162,13 +163,13 @@ def from_params( p_value_threshold, scale_contingency_table, forced_categories) - + discretizer = KBinsDiscretizer(n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format) - + target_encoder = TargetEncoder(weight, imputation_strategy) return cls(categorical_data_processor, discretizer, target_encoder) @@ -352,10 +353,12 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, return self.transform(train_data, continuous_vars, discrete_vars) @staticmethod - def train_selection_validation_split(data: pd.DataFrame, - train_prop: float=0.6, - selection_prop: float=0.2, - validation_prop: float=0.2) -> pd.DataFrame: + def train_selection_validation_split( + data: pd.DataFrame, + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2 + ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. Train set = data on which the model is trained and on which the encoding is based. 
@@ -394,10 +397,12 @@ def train_selection_validation_split(data: pd.DataFrame, size_valid = int(validation_prop * nrows) correction = nrows - (size_train+size_select+size_valid) - split = ['train'] * size_train \ - + ['train'] * correction \ - + ['selection'] * size_select \ - + ['validation'] * size_valid + split = ( + ['train'] * size_train + + ['train'] * correction + + ['selection'] * size_select + + ['validation'] * size_valid + ) shuffle(split) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 0863ae6..0a9028f 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -7,8 +7,10 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError + log = logging.getLogger(__name__) + class TargetEncoder(BaseEstimator): """ Target encoding for categorical features. @@ -67,8 +69,8 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") def __init__( - self, weight: float=0.0, - imputation_strategy: str="mean" + self, weight: float = 0.0, + imputation_strategy: str = "mean" ): """Initialize the TargetEncoder class.""" if weight < 0: diff --git a/cobra/utils.py b/cobra/utils.py index c681cdf..b7727dd 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,5 +1,6 @@ """Cobra utils.""" + def clean_predictor_name(predictor_name: str) -> str: """ Clean the predictor name. diff --git a/cobra/version.py b/cobra/version.py index ff1068c..6849410 100644 --- a/cobra/version.py +++ b/cobra/version.py @@ -1 +1 @@ -__version__ = "1.1.0" \ No newline at end of file +__version__ = "1.1.0" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..15fbabe --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[pycodestyle] +max-line-length = 120 \ No newline at end of file From 7f82013c6f4efff04da5a2b9e78384aedf5e6dd6 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 22 Apr 2022 13:57:55 +0200 Subject: [PATCH 4/9] chore: clean up formatting --- Makefile | 4 +- cobra/evaluation/evaluator.py | 148 +++++++----- cobra/evaluation/pigs_tables.py | 77 ++++--- cobra/evaluation/plotting_utils.py | 76 ++++--- cobra/model_building/forward_selection.py | 141 ++++++++---- cobra/model_building/models.py | 18 +- cobra/model_building/univariate_selection.py | 56 +++-- .../categorical_data_processor.py | 168 +++++++++----- cobra/preprocessing/kbins_discretizer.py | 205 +++++++++++------ cobra/preprocessing/preprocessor.py | 215 ++++++++++++------ cobra/preprocessing/target_encoder.py | 81 ++++--- 11 files changed, 772 insertions(+), 417 deletions(-) diff --git a/Makefile b/Makefile index b31a1db..29466d4 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ lint: @echo 'lint OK' lint-minimal: - pylint E cobra + pylint -E cobra @echo 'lint minimal OK' typecheck: - mypy cobra + mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports @echo 'typecheck OK' codestyle: diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 41974e7..3255fa2 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -120,10 +120,12 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains(y_true, y_pred) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float) -> pd.Series: + def _compute_scalar_metrics( + y_true: np.ndarray, + y_pred: np.ndarray, + y_pred_b: np.ndarray, + lift_at: 
float + ) -> pd.Series: """Compute various scalar performance measures. Parameters @@ -187,17 +189,16 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.roc_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) auc = float(self.scalar_metrics.loc["AUC"]) with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.roc_curve["fpr"], self.roc_curve["tpr"], color="cornflowerblue", linewidth=3, @@ -238,16 +239,19 @@ def plot_confusion_matrix( """ labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.heatmap(self.confusion_matrix, - annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Blues", - xticklabels=labels, yticklabels=labels) + ax = sns.heatmap( + self.confusion_matrix, + annot=self.confusion_matrix.astype(str), + fmt="s", cmap="Blues", + xticklabels=labels, yticklabels=labels + ) ax.set_title("Confusion matrix", fontsize=20) if path: @@ -271,27 +275,37 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) The instance is not fitted yet. """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, inc_rate = self.lift_curve - lifts = np.array(lifts)*inc_rate*100 with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar( + x_labels[::-1], + lifts, + align="center", + color="cornflowerblue") plt.ylabel("response (%)", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Incidence") + plt.axhline( + y=inc_rate*100, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Incidence" + ) # Legend ax.legend(loc="upper right") @@ -326,9 +340,10 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, _ = self.lift_curve @@ -343,8 +358,15 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=1, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + plt.axhline( + y=1, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Baseline" + ) # Legend ax.legend(loc="upper right") @@ -405,8 +427,10 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): plt.show() @staticmethod - def _find_optimal_cutoff(y_true: np.ndarray, - y_pred: np.ndarray) -> float: + def _find_optimal_cutoff( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> float: """Find the optimal probability cut off point for a classification model. Parameters @@ -425,8 +449,11 @@ def _find_optimal_cutoff(y_true: np.ndarray, return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, - thresholds: np.ndarray) -> float: + def _compute_optimal_cutoff( + fpr: np.ndarray, + tpr: np.ndarray, + thresholds: np.ndarray + ) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -454,8 +481,10 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: + def _compute_cumulative_gains( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> tuple: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -514,10 +543,15 @@ def _compute_lift_per_bin( tuple Includes x-labels, lifts per decile, and target incidence. """ - lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift) - for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)] + lifts = [ + ClassificationEvaluator + ._compute_lift( + y_true=y_true, + y_pred=y_pred, + lift_at=perc_lift + ) + for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True) + ] x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] @@ -562,14 +596,14 @@ def _compute_lift( avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) # Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) + data_sorted = ( + y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] + .reshape(stop, 1) + ) # Calculate lift (einsum is a very fast way of summing, but needs specific shape) inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] - return lift @@ -617,8 +651,10 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.qq = RegressionEvaluator._compute_qq_residuals(y_true, y_pred) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray) -> pd.Series: + def _compute_scalar_metrics( + y_true: np.ndarray, + y_pred: np.ndarray + ) -> pd.Series: """Compute various scalar performance measures. 
Parameters
@@ -645,8 +681,10 @@ def _compute_scalar_metrics(y_true: np.ndarray,
 })

 @staticmethod
- def _compute_qq_residuals(y_true: np.ndarray,
- y_pred: np.ndarray) -> pd.Series:
- """Compute various scalar performance measures.
+ def _compute_qq_residuals(
+ y_true: np.ndarray,
+ y_pred: np.ndarray
+ ) -> pd.Series:
+ """Compute the theoretical quantiles and standardized residuals for a Q-Q plot.

 Parameters
@@ -694,16 +732,16 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)):
 The instance is not fitted yet.
 """
 if self.y_true is None and self.y_pred is None:
- msg = ("This {} instance is not fitted yet. Call 'fit' with "
- "appropriate arguments before using this method.")
-
+ msg = (
+ "This {} instance is not fitted yet. Call 'fit' with "
+ "appropriate arguments before using this method."
+ )
 raise NotFittedError(msg.format(self.__class__.__name__))

 y_true = self.y_true
 y_pred = self.y_pred

 with plt.style.context("seaborn-whitegrid"):
-
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable

 x = np.arange(1, len(y_true)+1)
@@ -737,13 +775,13 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)):
 The instance is not fitted yet.
 """
 if self.qq is None:
- msg = ("This {} instance is not fitted yet. Call 'fit' with "
- "appropriate arguments before using this method.")
-
+ msg = (
+ "This {} instance is not fitted yet. Call 'fit' with "
+ "appropriate arguments before using this method."
+ )
 raise NotFittedError(msg.format(self.__class__.__name__))

 with plt.style.context("seaborn-whitegrid"):
-
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable

 x = self.qq["quantiles"]
diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py
index 5503349..e728dd0 100644
--- a/cobra/evaluation/pigs_tables.py
+++ b/cobra/evaluation/pigs_tables.py
@@ -38,10 +38,12 @@ def generate_pig_tables(
 DataFrame containing a PIG table for all predictors.
 """
 pigs = [
- compute_pig_table(basetable,
- column_name,
- target_column_name,
- id_column_name)
+ compute_pig_table(
+ basetable,
+ column_name,
+ target_column_name,
+ id_column_name
+ )
 for column_name in sorted(preprocessed_predictors)
 if column_name not in [id_column_name, target_column_name]
 ]
@@ -49,10 +51,12 @@ def generate_pig_tables(
 return output

-def compute_pig_table(basetable: pd.DataFrame,
- predictor_column_name: str,
- target_column_name: str,
- id_column_name: str) -> pd.DataFrame:
+def compute_pig_table(
+ basetable: pd.DataFrame,
+ predictor_column_name: str,
+ target_column_name: str,
+ id_column_name: str
+) -> pd.DataFrame:
 """Compute the PIG table of a given predictor for a given target.

 Parameters
@@ -76,12 +80,17 @@ def compute_pig_table(
 # group by the binned variable, compute the incidence
 # (=mean of the target for the given bin) and compute the bin size
 # (e.g. COUNT(id_column_name)). After that, rename the columns
- res = (basetable.groupby(predictor_column_name)
- .agg({target_column_name: "mean", id_column_name: "size"})
- .reset_index()
- .rename(columns={predictor_column_name: "label",
- target_column_name: "avg_target",
- id_column_name: "pop_size"}))
+ res = (
+ basetable
+ .groupby(predictor_column_name)
+ .agg({target_column_name: "mean", id_column_name: "size"})
+ .reset_index()
+ .rename(columns={
+ predictor_column_name: "label",
+ target_column_name: "avg_target",
+ id_column_name: "pop_size"
+ })
+ )

 # add the column name to a variable column
 # add the average incidence
@@ -136,9 +145,11 @@ def plot_incidence(
 the same set of variables.
""" if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected value was set for the model_type " - "parameter. Expected 'classification' or " - "'regression'.") + raise ValueError( + "An unexpected value was set for the model_type " + "parameter. Expected 'classification' or " + "'regression'." + ) df_plot = pig_tables[pig_tables['variable'] == variable].copy() @@ -149,8 +160,10 @@ def plot_incidence( 'the same set of variables.') df_plot['label'] = df_plot['label'].astype('category') - df_plot['label'].cat.reorder_categories(column_order, - inplace=True) + df_plot['label'].cat.reorder_categories( + column_order, + inplace=True + ) df_plot.sort_values(by=['label'], ascending=True, inplace=True) df_plot.reset_index(inplace=True) @@ -179,12 +192,18 @@ def plot_incidence( ax.plot(np.nan, "#939598", linewidth=6, label='bin size') # Set labels & ticks - ax.set_ylabel('incidence' if model_type == "classification" else "mean target value", - fontsize=16) + ax.set_ylabel( + 'incidence' if model_type == "classification" else "mean target value", + fontsize=16 + ) ax.set_xlabel(f'{variable} bins' '', fontsize=16) ax.xaxis.set_tick_params(labelsize=14) - plt.setp(ax.get_xticklabels(), - rotation=45, ha="right", rotation_mode="anchor") + plt.setp( + ax.get_xticklabels(), + rotation=45, + ha="right", + rotation_mode="anchor" + ) ax.yaxis.set_tick_params(labelsize=14) if model_type == "classification": @@ -247,9 +266,15 @@ def plot_incidence( else: title = "Mean target plot - " + variable fig.suptitle(title, fontsize=22) - ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), - loc=3, ncol=1, mode="expand", borderaxespad=0., - prop={"size": 14}) + ax.legend( + frameon=False, + bbox_to_anchor=(0., 1.01, 1., .102), + loc=3, + ncol=1, + mode="expand", + borderaxespad=0., + prop={"size": 14} + ) # Set order of layers ax.set_zorder(1) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 8f0a6b0..ae91220 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -40,13 +40,18 @@ def plot_univariate_predictor_quality( metric = "RMSE" ascending = True - df = (df_metric[df_metric["preselection"]] - .sort_values(by=metric+" selection", ascending=ascending)) - - df = pd.melt(df, id_vars=["predictor"], - value_vars=[metric+" train", metric+" selection"], - var_name="split", - value_name=metric) + df = ( + df_metric[df_metric["preselection"]] + .sort_values(by=metric+" selection", ascending=ascending) + ) + + df = pd.melt( + df, + id_vars=["predictor"], + value_vars=[metric+" train", metric+" selection"], + var_name="split", + value_name=metric + ) # plot data with plt.style.context("seaborn-ticks"): @@ -127,28 +132,39 @@ def plot_performance_curves( elif model_type == "regression": metric_name = "RMSE" - max_metric = np.round(max(max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance'])), 1) + max_metric = np.round( + max( + max(model_performance['train_performance']), + max(model_performance['selection_performance']), + max(model_performance['validation_performance']) + ), 1) with plt.style.context("seaborn-whitegrid"): - fig, ax = plt.subplots(figsize=dim) - plt.plot(model_performance['train_performance'], marker=".", - markersize=20, linewidth=3, label="train", - color=colors["train"]) - plt.plot(model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label="selection", - 
color=colors["selection"])
- plt.plot(model_performance['validation_performance'], marker=".",
- markersize=20, linewidth=3, label="validation",
- color=colors["validation"])
+ plt.plot(
+ model_performance['train_performance'], marker=".",
+ markersize=20, linewidth=3, label="train",
+ color=colors["train"]
+ )
+ plt.plot(
+ model_performance['selection_performance'], marker=".",
+ markersize=20, linewidth=3, label="selection",
+ color=colors["selection"]
+ )
+ plt.plot(
+ model_performance['validation_performance'], marker=".",
+ markersize=20, linewidth=3, label="validation",
+ color=colors["validation"]
+ )

 # Set x- and y-ticks
 ax.set_xticks(np.arange(len(model_performance['last_added_predictor'])))
- ax.set_xticklabels(model_performance['last_added_predictor'].tolist(),
- rotation=40, ha='right')
+ ax.set_xticklabels(
+ model_performance['last_added_predictor'].tolist(),
+ rotation=40,
+ ha='right'
+ )

 if model_type == "classification":
 ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05))
@@ -160,9 +176,11 @@ def plot_performance_curves(

 # Make pretty
 ax.legend(loc='lower right')
- fig.suptitle('Performance curves forward feature selection',
- fontsize=20)
- plt.title("Metric: "+metric_name, fontsize=15, loc="left")
+ fig.suptitle(
+ 'Performance curves forward feature selection',
+ fontsize=20
+ )
+ plt.title("Metric: " + metric_name, fontsize=15, loc="left")
 plt.ylabel('Model performance')

 if path is not None:
@@ -192,9 +210,11 @@ def plot_variable_importance(
 """
 with plt.style.context("seaborn-ticks"):
 fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable
- ax = sns.barplot(x="importance", y="predictor",
- data=df_variable_importance,
- color="cornflowerblue")
+ ax = sns.barplot(
+ x="importance", y="predictor",
+ data=df_variable_importance,
+ color="cornflowerblue"
+ )
 if title:
 ax.set_title(title)
 else:
diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
index ed5f119..9b897d9 100644
--- a/cobra/model_building/forward_selection.py
+++ b/cobra/model_building/forward_selection.py
@@ -1,7 +1,7 @@
 """Feature forward selection."""

 import logging
-from typing import Callable, Optional
+from typing import Callable, List, Optional, Union

 import pandas as pd
 from tqdm.auto import tqdm
@@ -15,6 +15,8 @@
 DEFAULT_FORCED_PREDICTORS = []
 DEFAULT_EXCLUDED_PREDICTORS = []

+Model = Union[LinearRegressionModel, LogisticRegressionModel, None]
+

 class ForwardFeatureSelection:
 """Perform forward feature selection for a given dataset using a given algorithm.
@@ -59,7 +61,7 @@

 self._fitted_models = []

- def get_model_from_step(self, step: int):
+ def get_model_from_step(self, step: int) -> Model:
 """Get fitted model from a particular step.

 Parameters
 In case step is larger than the number of available models.
 """
 if len(self._fitted_models) <= step:
- raise ValueError(f"No model available for step {step}. "
- "The first step starts from index 0.")
+ raise ValueError(
+ f"No model available for step {step}. "
+ "The first step starts from index 0."
+            )
 
         return self._fitted_models[step]
 
@@ -124,8 +128,10 @@ def compute_model_performances(
 
         predictor_set = set([])
         for model in self._fitted_models:
-            last_added_predictor = (set(model.predictors)
-                                    .difference(predictor_set))
+            last_added_predictor = (
+                set(model.predictors)
+                .difference(predictor_set)
+            )
             tmp = {
                 "predictors": model.predictors,
                 "last_added_predictor": list(last_added_predictor)[0]
@@ -144,7 +150,6 @@ def compute_model_performances(
                 })
 
             results.append(tmp)
-
             predictor_set = predictor_set.union(set(model.predictors))
 
         df = pd.DataFrame(results)
@@ -191,27 +196,41 @@ def fit(
         # remove excluded predictors from predictor lists
         forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS
         excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS
-        filtered_predictors = [var for var in predictors
-                               if (var not in excluded_predictors and
-                                   var not in forced_predictors)]
+        filtered_predictors = [
+            var for var in predictors
+            if (
+                var not in excluded_predictors
+                and var not in forced_predictors
+            )
+        ]
 
         # checks on predictor lists and self.max_predictors attr
         if len(forced_predictors) > self.max_predictors:
-            raise ValueError("Size of forced_predictors cannot be bigger than "
-                             "max_predictors.")
+            raise ValueError(
+                "Size of forced_predictors cannot be bigger than "
+                "max_predictors."
+            )
         elif len(forced_predictors) == self.max_predictors:
-            log.info("Size of forced_predictors equals max_predictors "
-                     "only one model will be trained...")
+            log.info(
+                "Size of forced_predictors equals max_predictors, "
+                "so only one model will be trained..."
+            )
             # train model with all forced_predictors (only)
-            (self._fitted_models
-             .append(self._train_model(train_data[train_data["split"] == "train"],
-                                       target_column_name,
-                                       forced_predictors)))
+            self._fitted_models.append(
+                self._train_model(
+                    train_data[train_data["split"] == "train"],
+                    target_column_name,
+                    forced_predictors
+                )
+            )
+
         else:
-            self._fitted_models = self._forward_selection(train_data,
-                                                          target_column_name,
-                                                          filtered_predictors,
-                                                          forced_predictors)
+            self._fitted_models = self._forward_selection(
+                train_data,
+                target_column_name,
+                filtered_predictors,
+                forced_predictors
+            )
 
     def _forward_selection(
         self,
@@ -219,7 +238,7 @@ def _forward_selection(
         target_column_name: str,
         predictors: list,
         forced_predictors: list = None
-    ) -> list:
+    ) -> List[Model]:
         """Perform the forward feature selection algorithm.
 
         The algorithm will compute a list of models (with increasing performance).
@@ -251,26 +270,35 @@ def _forward_selection(
 
         max_steps = 1 + min(self.max_predictors,
                             len(predictors) + len(forced_predictors))
 
-        for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
-                                                   "predictor..."):
+        for step in tqdm(
+            range(1, max_steps),
+            desc="Sequentially adding best predictor..."
+ ): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [var for var in forced_predictors - if var not in current_predictors] + candidate_predictors = [ + var for var in forced_predictors + if var not in current_predictors + ] else: - candidate_predictors = [var for var in (predictors - + forced_predictors) - if var not in current_predictors] - - model = self._find_next_best_model(train_data, - target_column_name, - candidate_predictors, - current_predictors) + candidate_predictors = [ + var for var in (predictors + forced_predictors) + if var not in current_predictors + ] + + model = self._find_next_best_model( + train_data, + target_column_name, + candidate_predictors, + current_predictors + ) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list(set(current_predictors) - .union(set(model.predictors))) + current_predictors = list( + set(current_predictors) + .union(set(model.predictors)) + ) fitted_models.append(model) # else: @@ -290,7 +318,7 @@ def _find_next_best_model( target_column_name: str, candidate_predictors: list, current_predictors: list - ): + ) -> Model: """ Find the next best model with candidate predictors. @@ -324,27 +352,36 @@ def _find_next_best_model( # placeholders best_model = None if self.MLModel == LogisticRegressionModel: - best_performance = -1 # AUC metric is used + best_performance = -1.0 # AUC metric is used elif self.MLModel == LinearRegressionModel: best_performance = float("inf") # RMSE metric is used else: - raise ValueError("No metric comparison method has been configured " - "for the given model_type specified as " - "ForwardFeatureSelection argument.") + raise ValueError( + "No metric comparison method has been configured " + "for the given model_type specified as " + "ForwardFeatureSelection argument." + ) fit_data = train_data[train_data["split"] == "train"] # data to fit the models with sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model(fit_data, target_column_name, - (current_predictors + [pred])) + model = self._train_model( + fit_data, + target_column_name, + (current_predictors + [pred]) + ) # Evaluate the model - performance = (model - .evaluate(sel_data[current_predictors + [pred]], - sel_data[target_column_name], - split="selection")) + performance = ( + model + .evaluate( + sel_data[current_predictors + [pred]], + sel_data[target_column_name], + split="selection" + ) + ) if self.pos_only and (not (model.get_coef() >= 0).all()): continue @@ -362,8 +399,12 @@ def _find_next_best_model( return best_model - def _train_model(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list): + def _train_model( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list + ) -> Model: """Train the model with a given set of predictors. 
Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index cad6381..58571b3 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -233,8 +233,10 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: orient="index").reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return ( + df.sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) def _is_valid_dict(self, model_dict: dict) -> bool: """Check if the model dictionary is valid.""" @@ -456,12 +458,16 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, + orient="index" + ).reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return ( + df.sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) @staticmethod def _is_valid_dict(model_dict: dict) -> bool: diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 48c960b..e4d1ff6 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -81,9 +81,13 @@ def compute_univariate_preselection( y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor]) - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) + result.append( + { + "predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection + } + ) df_auc = pd.DataFrame(result) @@ -92,8 +96,8 @@ def compute_univariate_preselection( # Identify those variables for which the AUC difference between train # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) - < preselect_overtrain_threshold) + preselect_overtrain = df_auc["AUC train"] - df_auc["AUC selection"] + auc_overtrain = preselect_overtrain < preselect_overtrain_threshold df_auc["preselection"] = auc_thresh & auc_overtrain @@ -111,9 +115,13 @@ def compute_univariate_preselection( y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor])) - result.append({"predictor": cleaned_predictor, - "RMSE train": rmse_train, - "RMSE selection": rmse_selection}) + result.append( + { + "predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection + } + ) df_rmse = pd.DataFrame(result) @@ -122,8 +130,8 @@ def compute_univariate_preselection( # Identify those variables for which the RMSE difference between train # and selection is within a user-defined ratio - rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC - < preselect_overtrain_threshold) + preselect_overtrain = df_rmse["RMSE selection"] - df_rmse["RMSE train"] # flip subtraction vs. AUC + rmse_overtrain = preselect_overtrain < preselect_overtrain_threshold df_rmse["preselection"] = rmse_thresh & rmse_overtrain @@ -148,19 +156,25 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: List of preselected predictors. 
""" if "AUC selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="AUC selection", ascending=False) - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist() + ) elif "RMSE selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="RMSE selection", ascending=True) # lower is better - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="RMSE selection", ascending=True) # lower is better + .predictor.tolist() + ) return [col + "_enc" for col in predictor_list] -def compute_correlations(target_enc_train_data: pd.DataFrame, - predictors: list) -> pd.DataFrame: +def compute_correlations( + target_enc_train_data: pd.DataFrame, + predictors: list +) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -178,8 +192,10 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [utils.clean_predictor_name(predictor) - for predictor in predictors] + predictors_cleaned = [ + utils.clean_predictor_name(predictor) + for predictor in predictors + ] # Change index and columns with the cleaned version of the predictors # e.g. change "var1_enc" with "var1" diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index bf60079..9d2f263 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -61,9 +61,11 @@ class CategoricalDataProcessor(BaseEstimator): Whether contingency table should be scaled before chi^2. """ - valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories"] + valid_keys = [ + "model_type", "regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories" + ] def __init__( self, @@ -81,7 +83,7 @@ def __init__( raise ValueError( "An unexpected model_type was provided. " "A valid model_type is either 'classification' or 'regression'." - ) + ) self.model_type = model_type self.regroup = regroup @@ -130,9 +132,11 @@ def set_attributes_from_dict(self, params: dict): _fitted_output = params.pop("_cleaned_categories_by_column", {}) if type(_fitted_output) != dict: - raise ValueError("_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output))) + raise ValueError( + "_cleaned_categories_by_column is expected to " + "be a dict but is of type {} instead" + .format(type(_fitted_output)) + ) # Clean out params dictionary to remove unknown keys (for safety!) params = {key: params[key] for key in params if key in self.valid_keys} @@ -147,8 +151,12 @@ def set_attributes_from_dict(self, params: dict): return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ): """Fit the CategoricalDataProcessor. 
Parameters @@ -166,12 +174,15 @@ def fit(self, data: pd.DataFrame, column_names: list, log.info("regroup was set to False, so no fitting is required") return None - for column_name in tqdm(column_names, desc="Fitting category " - "regrouping..."): - + for column_name in tqdm( + column_names, + desc="Fitting category regrouping..." + ): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting" .format(column_name) + ) continue cleaned_cats = self._fit_column(data, column_name, target_column) @@ -220,9 +231,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, combined_categories = set() # replace missings and get unique categories as a list - X = (CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object)) + X = ( + CategoricalDataProcessor + ._replace_missings(data[column_name]) + .astype(object) + ) unique_categories = list(X.unique()) @@ -235,21 +248,28 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = (CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold)) + small_categories = ( + CategoricalDataProcessor + ._get_small_categories( + X, + incidence, + self.category_size_threshold + ) + ) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = (CategoricalDataProcessor - ._compute_p_value(X, y, category, - model_type, - self.scale_contingency_table)) + pval = ( + CategoricalDataProcessor + ._compute_p_value( + X, y, category, + model_type, + self.scale_contingency_table + ) + ) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -261,8 +281,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, return set(unique_categories).difference(combined_categories) - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + column_names: list + ) -> pd.DataFrame: """Transform the data. Parameters @@ -279,24 +302,26 @@ def transform(self, data: pd.DataFrame, Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ("{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: if column_name not in data.columns: - log.warning("Unknown column '{}' will be skipped" - .format(column_name)) + log.warning("Unknown column '{}' will be skipped".format(column_name)) continue data = self._transform_column(data, column_name) return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column( + self, data: pd.DataFrame, + column_name: str + ) -> pd.DataFrame: """Create an additional column which combines categories into "Other". 
Parameters @@ -315,11 +340,13 @@ def _transform_column(self, data: pd.DataFrame, data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_missings( - data, - column_name_clean - )) + data.loc[:, column_name_clean] = ( + CategoricalDataProcessor + ._replace_missings( + data, + column_name_clean + ) + ) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -332,20 +359,26 @@ def _transform_column(self, data: pd.DataFrame, "and will be skipped".format(column_name)) return data - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories, - self.regroup_name)) + data.loc[:, column_name_clean] = ( + CategoricalDataProcessor + ._replace_categories( + data[column_name_clean], + categories, + self.regroup_name + ) + ) # change data to categorical - data.loc[:, column_name_clean] = (data[column_name_clean] - .astype("category")) + data.loc[:, column_name_clean] = data[column_name_clean].astype("category") return data - def fit_transform(self, data: pd.DataFrame, column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -367,9 +400,11 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, return self.transform(data, column_names) @staticmethod - def _get_small_categories(predictor_series: pd.Series, - incidence: float, - category_size_threshold: int) -> set: + def _get_small_categories( + predictor_series: pd.Series, + incidence: float, + category_size_threshold: int + ) -> set: """ Fetch categories with a size below a certain threshold. @@ -400,8 +435,10 @@ def _get_small_categories(predictor_series: pd.Series, return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings(data: pd.DataFrame, - column_names: Optional[list] = None) -> pd.DataFrame: + def _replace_missings( + data: pd.DataFrame, + column_names: Optional[list] = None + ) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -431,9 +468,13 @@ def _replace_missings(data: pd.DataFrame, return temp @staticmethod - def _compute_p_value(X: pd.Series, y: pd.Series, category: str, - model_type: str, - scale_contingency_table: bool) -> float: + def _compute_p_value( + X: pd.Series, + y: pd.Series, + category: str, + model_type: str, + scale_contingency_table: bool + ) -> float: """ Calculate p-value. @@ -483,14 +524,19 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal(df.y[df.other_categories == 0], - df.y[df.other_categories == 1])[1] + pval = stats.kruskal( + df.y[df.other_categories == 0], + df.y[df.other_categories == 1] + )[1] return pval @staticmethod - def _replace_categories(data: pd.Series, categories: set, - replace_with: str) -> pd.Series: + def _replace_categories( + data: pd.Series, + categories: set, + replace_with: str + ) -> pd.Series: """ Replace categories in set with "Other". 
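Note on the regrouping logic reformatted above: _compute_p_value boils down to a one-vs-rest significance test per category. A minimal sketch of the classification branch, under stated assumptions: the toy series and the pd.crosstab construction are illustrative, not the file's exact code; only stats.chi2_contingency mirrors the hunks above.

    # One-vs-rest chi^2 test per category, in the spirit of _compute_p_value.
    # The toy data below is hypothetical.
    import pandas as pd
    from scipy import stats

    X = pd.Series(["a", "a", "b", "b", "b", "c", "c", "c", "c", "c"])
    y = pd.Series([1, 0, 1, 1, 1, 0, 0, 0, 1, 0])

    # Flag every observation outside the category under test ("c")...
    df = pd.DataFrame({"other_categories": (X != "c").astype(int), "y": y})
    # ...and test whether the target distribution differs for "c" vs. the rest.
    contingency_table = pd.crosstab(df["other_categories"], df["y"])
    pval = stats.chi2_contingency(contingency_table, correction=False)[1]
    # pval > p_value_threshold means "c" is not significantly different and is
    # merged into the regroup_name bucket ("Other"), as the code above does.

For regression targets the same one-vs-rest split feeds stats.kruskal instead, as in the hunk above.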
diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py
index 1a903d9..7621ac8 100644
--- a/cobra/preprocessing/kbins_discretizer.py
+++ b/cobra/preprocessing/kbins_discretizer.py
@@ -1,6 +1,6 @@
-"""Binning of continous data."""
+"""Binning of continuous data."""
 # standard lib imports
 from copy import deepcopy
 from typing import List
 import numbers
 import logging
@@ -63,9 +64,11 @@ class KBinsDiscretizer(BaseEstimator):
     """
 
     valid_strategies = ("uniform", "quantile")
-    valid_keys = ["n_bins", "strategy", "closed", "auto_adapt_bins",
-                  "starting_precision", "label_format",
-                  "change_endpoint_format"]
+    valid_keys = [
+        "n_bins", "strategy", "closed", "auto_adapt_bins",
+        "starting_precision", "label_format",
+        "change_endpoint_format"
+    ]
 
     def __init__(
         self, n_bins: int = 10, strategy: str = "quantile",
@@ -104,14 +107,21 @@ def _validate_n_bins(self, n_bins: int):
            in case ``n_bins`` is not an integer or if ``n_bins < 2``
         """
         if not isinstance(n_bins, numbers.Integral):
-            raise ValueError("{} received an invalid n_bins type. "
-                             "Received {}, expected int."
-                             .format(KBinsDiscretizer.__name__,
-                                     type(n_bins).__name__))
+            raise ValueError(
+                "{} received an invalid n_bins type. Received {}, expected int."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    type(n_bins).__name__
+                )
+            )
         if n_bins < 2:
-            raise ValueError("{} received an invalid number "
-                             "of bins. Received {}, expected at least 2."
-                             .format(KBinsDiscretizer.__name__, n_bins))
+            raise ValueError(
+                "{} received an invalid number of bins. Received {}, expected at least 2."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    n_bins
+                )
+            )
 
     def attributes_to_dict(self) -> dict:
         """Return the attributes of KBinsDiscretizer as a dictionary.
@@ -148,9 +158,10 @@ def set_attributes_from_dict(self, params: dict):
         _bins_by_column = params.pop("_bins_by_column", {})
 
         if type(_bins_by_column) != dict:
-            raise ValueError("_bins_by_column is expected to be a dict "
-                             "but is of type {} instead"
-                             .format(type(_bins_by_column)))
+            raise ValueError(
+                "_bins_by_column is expected to be a dict but is of type {} instead"
+                .format(type(_bins_by_column))
+            )
 
         # Clean out params dictionary to remove unknown keys (for safety!)
         params = {key: params[key] for key in params if key in self.valid_keys}
@@ -177,17 +188,23 @@ def fit(self, data: pd.DataFrame, column_names: list):
             Names of the columns of the DataFrame to discretize
         """
         if self.strategy not in self.valid_strategies:
-            raise ValueError("{}: valid options for 'strategy' are {}. "
-                             "Got strategy={!r} instead."
-                             .format(KBinsDiscretizer.__name__,
-                                     self.valid_strategies, self.strategy))
-
-        for column_name in tqdm(column_names, desc="Computing "
-                                                   "discretization bins..."):
-
+            raise ValueError(
+                "{}: valid options for 'strategy' are {}. Got strategy={!r} instead."
+                .format(
+                    KBinsDiscretizer.__name__,
+                    self.valid_strategies, self.strategy
+                )
+            )
+
+        for column_name in tqdm(
+            column_names, desc="Computing discretization bins..."
+        ):
             if column_name not in data.columns:
-                log.warning("DataFrame has no column '{}', so it will be "
-                            "skipped in fitting" .format(column_name))
+                log.warning(
+                    "DataFrame has no column '{}', so it will be "
+                    "skipped in fitting"
+                    .format(column_name)
+                )
                 continue
 
             bins = self._fit_column(data, column_name)
@@ -195,8 +212,11 @@ def fit(self, data: pd.DataFrame, column_names: list):
             # Add to bins_by_column for later use
             self._bins_by_column[column_name] = bins
 
-    def _fit_column(self, data: pd.DataFrame,
-                    column_name: str) -> List[tuple]:
+    def _fit_column(
+        self,
+        data: pd.DataFrame,
+        column_name: str
+    ) -> List[tuple]:
         """Compute bins for a specific column in data.
 
         Parameters
@@ -214,25 +234,31 @@ def _fit_column(self, data: pd.DataFrame,
         col_min, col_max = data[column_name].min(), data[column_name].max()
 
         if col_min == col_max:
-            log.warning("Predictor '{}' is constant and "
-                        "will be ignored in computation".format(column_name))
+            log.warning(
+                "Predictor '{}' is constant and will be ignored in computation"
+                .format(column_name)
+            )
             return None
 
         prop_inf = (np.sum(np.isinf(data[column_name]))
                     / data[column_name].shape[0])
 
         if prop_inf > 0:
-            log.warning(f"Column {column_name} has "
-                        f"{prop_inf:.1%} inf values, thus it was skipped. "
-                        f"Consider dropping or transforming it.")
+            log.warning(
+                f"Column {column_name} has "
+                f"{prop_inf:.1%} inf values, thus it was skipped. "
+                f"Consider dropping or transforming it."
+            )
             return None
 
         prop_nan = data[column_name].isna().sum() / data[column_name].shape[0]
 
         if prop_nan >= 0.99:
-            log.warning(f"Column {column_name} is"
-                        f" {prop_nan:.1%}% NaNs, "
-                        f"consider dropping or transforming it.")
+            log.warning(
+                f"Column {column_name} is"
+                f" {prop_nan:.1%} NaNs, "
+                f"consider dropping or transforming it."
+            )
 
         n_bins = self.n_bins
         if self.auto_adapt_bins:
@@ -240,23 +266,37 @@ def _fit_column(self, data: pd.DataFrame,
             missing_pct = data[column_name].isnull().sum()/size
             n_bins = int(max(round((1 - missing_pct) * n_bins), 2))
 
-        bin_edges = self._compute_bin_edges(data, column_name, n_bins,
-                                            col_min, col_max)
+        bin_edges = self._compute_bin_edges(
+            data,
+            column_name,
+            n_bins,
+            col_min,
+            col_max
+        )
 
         if len(bin_edges) < 3:
-            log.warning("Only 1 bin was found for predictor '{}' so it will "
-                        "be ignored in computation".format(column_name))
+            log.warning(
+                "Only 1 bin was found for predictor '{}' so it will "
+                "be ignored in computation"
+                .format(column_name)
+            )
             return None
 
         if len(bin_edges) < n_bins + 1:
-            log.warning("The number of actual bins for predictor '{}' is {} "
-                        "which is smaller than the requested number of bins "
-                        "{}".format(column_name, len(bin_edges) - 1, n_bins))
+            log.warning(
+                "The number of actual bins for predictor '{}' is {} "
+                "which is smaller than the requested number of bins "
+                "{}"
+                .format(column_name, len(bin_edges) - 1, n_bins)
+            )
 
         return self._compute_bins_from_edges(bin_edges)
 
-    def transform(self, data: pd.DataFrame,
-                  column_names: list) -> pd.DataFrame:
+    def transform(
+        self,
+        data: pd.DataFrame,
+        column_names: list
+    ) -> pd.DataFrame:
         """Discretize the data in the given list of columns.
 
         This is done by mapping each number to
@@ -275,9 +315,10 @@ def transform(self, data: pd.DataFrame,
             data with additional discretized variables
         """
         if len(self._bins_by_column) == 0:
-            msg = ("{} instance is not fitted yet. Call 'fit' with "
-                   "appropriate arguments before using this method.")
-
+            msg = (
+                "{} instance is not fitted yet. 
Call 'fit' with "
+                "appropriate arguments before using this method."
+            )
             raise NotFittedError(msg.format(self.__class__.__name__))
 
         for column_name in tqdm(column_names, desc="Discretizing columns..."):
@@ -293,9 +334,11 @@ def transform(self, data: pd.DataFrame,
 
         return data
 
-    def _transform_column(self, data: pd.DataFrame,
-                          column_name: str,
-                          bins: List[tuple]) -> pd.DataFrame:
+    def _transform_column(
+        self, data: pd.DataFrame,
+        column_name: str,
+        bins: List[tuple]
+    ) -> pd.DataFrame:
         """Create a new column with binned values of column_name.
 
         Parameters
@@ -317,14 +360,18 @@ def _transform_column(self, data: pd.DataFrame,
         column_name_bin = column_name + "_bin"
 
         # use pd.cut to compute bins
-        data.loc[:, column_name_bin] = pd.cut(x=data[column_name],
-                                              bins=interval_idx)
+        data.loc[:, column_name_bin] = pd.cut(
+            x=data[column_name],
+            bins=interval_idx
+        )
 
         # Rename bins so that the output has a proper format
         bin_labels = self._create_bin_labels(bins)
 
-        data.loc[:, column_name_bin] = (data[column_name_bin]
-                                        .cat.rename_categories(bin_labels))
+        data.loc[:, column_name_bin] = (
+            data[column_name_bin]
+            .cat.rename_categories(bin_labels)
+        )
 
         if data[column_name_bin].isnull().sum() > 0:
 
@@ -337,8 +384,11 @@ def _transform_column(self, data: pd.DataFrame,
 
         return data
 
-    def fit_transform(self, data: pd.DataFrame,
-                      column_names: list) -> pd.DataFrame:
+    def fit_transform(
+        self,
+        data: pd.DataFrame,
+        column_names: list
+    ) -> pd.DataFrame:
         """Fit to data, then transform it.
 
         Parameters
@@ -356,9 +406,14 @@ def fit_transform(self, data: pd.DataFrame,
         self.fit(data, column_names)
         return self.transform(data, column_names)
 
-    def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
-                           n_bins: int, col_min: float,
-                           col_max: float) -> list:
+    def _compute_bin_edges(
+        self,
+        data: pd.DataFrame,
+        column_name: str,
+        n_bins: int,
+        col_min: float,
+        col_max: float
+    ) -> list:
         """Compute the desired bin edges.
 
         Parameters
@@ -381,9 +436,13 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
         """
         bin_edges = []
         if self.strategy == "quantile":
-            bin_edges = list(data[column_name]
-                             .quantile(np.linspace(0, 1, n_bins + 1),
-                                       interpolation="linear"))
+            bin_edges = list(
+                data[column_name]
+                .quantile(
+                    np.linspace(0, 1, n_bins + 1),
+                    interpolation="linear"
+                )
+            )
         elif self.strategy == "uniform":
             bin_edges = list(np.linspace(col_min, col_max, n_bins + 1))
 
@@ -397,8 +456,9 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
             bin_edges[-1] = np.inf
 
         if np.isnan(bin_edges).sum() > 0:
-            log.warning(f"Column {column_name} "
-                        "has NaNs present in bin definitions")
+            log.warning(
+                f"Column {column_name} has NaNs present in bin definitions"
+            )
 
         # Make absolutely sure bin edges are ordered,
        # in very rare situations this wasn't the case
@@ -499,10 +559,14 @@ def _create_index(
         """
         # check if closed is of the proper form
         if closed not in ["left", "right"]:
-            raise ValueError("{}: valid options for 'closed' are {}. "
-                             "Got strategy={!r} instead."
-                             .format(KBinsDiscretizer.__name__,
-                                     ["left", "right"], closed))
+            raise ValueError(
+                "{}: valid options for 'closed' are {}. "
+                "Got closed={!r} instead."
+ .format( + KBinsDiscretizer.__name__, + ["left", "right"], closed + ) + ) # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) @@ -530,8 +594,13 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: """ bin_labels = [] for interval in bins: - bin_labels.append(self.label_format.format(interval[0], - interval[1])) + bin_labels.append( + self.label_format + .format( + interval[0], + interval[1] + ) + ) # Format first and last bin as < x and > y resp. if self.change_endpoint_format: diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 5aa9bda..64c0fa9 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -156,19 +156,23 @@ def from_params( Class encapsulating CategoricalDataProcessor, KBinsDiscretizer, and TargetEncoder instances. """ - categorical_data_processor = CategoricalDataProcessor(model_type, - regroup, - regroup_name, keep_missing, - category_size_threshold, - p_value_threshold, - scale_contingency_table, - forced_categories) - - discretizer = KBinsDiscretizer(n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format) + categorical_data_processor = CategoricalDataProcessor( + model_type, + regroup, + regroup_name, keep_missing, + category_size_threshold, + p_value_threshold, + scale_contingency_table, + forced_categories + ) + + discretizer = KBinsDiscretizer( + n_bins, strategy, closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format + ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -199,8 +203,10 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " - "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( @@ -214,11 +220,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, - is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"] + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str + ): """Fit the data to the preprocessing pipeline. Parameters @@ -233,9 +248,13 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = ( + PreProcessor + ._get_variable_list( + continuous_vars, + discrete_vars + ) + ) log.info("Starting to fit pipeline") start = time.time() @@ -249,35 +268,55 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin)) - - train_data = self._discretizer.transform(train_data, - continuous_vars) + log.info( + "Fitting KBinsDiscretizer took {} seconds" + .format(time.time() - begin) + ) + + train_data = self._discretizer.transform( + train_data, + continuous_vars + ) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit(train_data, - discrete_vars, - target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin)) - - train_data = (self._categorical_data_processor - .transform(train_data, discrete_vars)) + self._categorical_data_processor.fit( + train_data, + discrete_vars, + target_column_name + ) + log.info( + "Fitting categorical_data_processor class took {} seconds" + .format(time.time() - begin) + ) + + train_data = ( + self._categorical_data_processor + .transform(train_data, discrete_vars) + ) begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, - target_column_name) - log.info("Fitting TargetEncoder took {} seconds" - .format(time.time() - begin)) + self._target_encoder.fit( + train_data, preprocessed_variable_names, + target_column_name + ) + log.info( + "Fitting TargetEncoder took {} seconds" + .format(time.time() - begin) + ) self._is_fitted = True # set fitted boolean to True - log.info("Fitting pipeline took {} seconds" - .format(time.time() - start)) + log.info( + "Fitting pipeline took {} seconds" + .format(time.time() - start) + ) - def transform(self, data: pd.DataFrame, continuous_vars: list, - discrete_vars: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list + ) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -302,33 +341,48 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, start = time.time() if not self._is_fitted: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") - + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = (PreProcessor - ._get_variable_list(continuous_vars, - discrete_vars)) + preprocessed_variable_names = ( + PreProcessor + ._get_variable_list( + continuous_vars, + discrete_vars + ) + ) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) if discrete_vars: - data = self._categorical_data_processor.transform(data, - discrete_vars) - - data = self._target_encoder.transform(data, - preprocessed_variable_names) + data = self._categorical_data_processor.transform( + data, + discrete_vars + ) + + data = self._target_encoder.transform( + data, + preprocessed_variable_names + ) - log.info("Transforming data took {} seconds" - .format(time.time() - start)) + log.info( + "Transforming data took {} seconds" + .format(time.time() - start) + ) return data - def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, - discrete_vars: list, - target_column_name: str) -> pd.DataFrame: + def fit_transform( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str + ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. Parameters @@ -347,8 +401,12 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list, pd.DataFrame Transformed (preprocessed) data. """ - self.fit(train_data, continuous_vars, discrete_vars, - target_column_name) + self.fit( + train_data, + continuous_vars, + discrete_vars, + target_column_name + ) return self.transform(train_data, continuous_vars, discrete_vars) @@ -382,8 +440,10 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError("The sum of train_prop, selection_prop and " - "validation_prop must be 1.0.") + raise ValueError( + "The sum of train_prop, selection_prop and " + "validation_prop must be 1.0." + ) if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -428,13 +488,17 @@ def serialize_pipeline(self) -> dict: } } - pipeline["categorical_data_processor"] = (self - ._categorical_data_processor - .attributes_to_dict()) + pipeline["categorical_data_processor"] = ( + self + ._categorical_data_processor + .attributes_to_dict() + ) pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = (self._target_encoder - .attributes_to_dict()) + pipeline["target_encoder"] = ( + self._target_encoder + .attributes_to_dict() + ) pipeline["_is_fitted"] = True @@ -450,13 +514,20 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. 
""" keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys - if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [ + key for key in keys + if key not in ["cls", "serialization_path"] + ] + ) input_keys = set() for key in pipeline: - if key in ["categorical_data_processor", "discretizer", - "target_encoder"]: + if key in [ + "categorical_data_processor", + "discretizer", + "target_encoder" + ]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 0a9028f..7485b6b 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -76,16 +76,22 @@ def __init__( if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: - raise ValueError("Valid options for 'imputation_strategy' are {}." - " Got imputation_strategy={!r} instead." - .format(self.valid_imputation_strategies, - imputation_strategy)) + raise ValueError( + "Valid options for 'imputation_strategy' are {}. " + "Got imputation_strategy={!r} instead." + .format( + self.valid_imputation_strategies, + imputation_strategy + ) + ) if weight == 0: - log.warning("The target encoder's additive smoothing weight is " - "set to 0. This disables smoothing and may make the " - "encoding prone to overfitting. Increase the weight " - "if needed.") + log.warning( + "The target encoder's additive smoothing weight is " + "set to 0. This disables smoothing and may make the " + "encoding prone to overfitting. Increase the weight " + "if needed." + ) self.weight = weight self.imputation_strategy = imputation_strategy @@ -149,8 +155,12 @@ def dict_to_series(key, value): return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ): """Fit the TargetEncoder to the data. Parameters @@ -169,8 +179,11 @@ def fit(self, data: pd.DataFrame, column_names: list, for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting" + .format(column) + ) continue self._mapping[column] = self._fit_column(data[column], y) @@ -205,8 +218,11 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: return numerator / denominator - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform( + self, + data: pd.DataFrame, + column_names: list + ) -> pd.DataFrame: """Replace (e.g. encode) values of each categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -231,8 +247,10 @@ def transform(self, data: pd.DataFrame, method. """ if (len(self._mapping) == 0) or (self._global_mean is None): - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): @@ -248,8 +266,11 @@ def transform(self, data: pd.DataFrame, return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column( + self, + data: pd.DataFrame, + column_name: str + ) -> pd.DataFrame: """Replace (e.g. encode) values of a categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -272,8 +293,10 @@ def _transform_column(self, data: pd.DataFrame, # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = (data[column_name].map(self._mapping[column_name]) - .astype("float")) + data[new_column] = ( + data[column_name].map(self._mapping[column_name]) + .astype("float") + ) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -281,20 +304,20 @@ def _transform_column(self, data: pd.DataFrame, # configured imputation strategy: if data[new_column].isnull().sum() > 0: if self.imputation_strategy == "mean": - data[new_column].fillna(self._global_mean, - inplace=True) + data[new_column].fillna(self._global_mean, inplace=True) elif self.imputation_strategy == "min": - data[new_column].fillna(data[new_column].min(), - inplace=True) + data[new_column].fillna(data[new_column].min(), inplace=True) elif self.imputation_strategy == "max": - data[new_column].fillna(data[new_column].max(), - inplace=True) + data[new_column].fillna(data[new_column].max(), inplace=True) return data - def fit_transform(self, data: pd.DataFrame, - column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, + data: pd.DataFrame, + column_names: list, + target_column: str + ) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters From 39a28fbf3cf6111b50eeafc813ca43485b7f37da Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 20 May 2022 11:07:41 +0200 Subject: [PATCH 5/9] feat: replace pylint with black --- .pylintrc | 585 ------------------ Makefile | 11 +- cobra/evaluation/__init__.py | 20 +- cobra/evaluation/evaluator.py | 261 +++----- cobra/evaluation/pigs_tables.py | 149 ++--- cobra/evaluation/plotting_utils.py | 102 ++- cobra/model_building/__init__.py | 14 +- cobra/model_building/forward_selection.py | 164 ++--- cobra/model_building/models.py | 86 +-- cobra/model_building/univariate_selection.py | 54 +- cobra/preprocessing/__init__.py | 5 +- .../categorical_data_processor.py | 161 ++--- cobra/preprocessing/kbins_discretizer.py | 150 ++--- cobra/preprocessing/preprocessor.py | 194 ++---- cobra/preprocessing/target_encoder.py | 75 +-- cobra/utils.py | 4 +- requirements.dev.txt | 2 +- 17 files changed, 472 insertions(+), 1565 deletions(-) delete mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index ee9601a..0000000 --- a/.pylintrc +++ /dev/null @@ -1,585 +0,0 @@ -[MASTER] - -# Specify a configuration file. -#rcfile= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Files or directories to be skipped. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the ignore-list. The -# regex matches against paths and can be in Posix or Windows format. 
-ignore-paths= - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. -ignore-patterns=^\.# - -# Pickle collected data for later comparisons. -persistent=yes - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - pylint.extensions.check_elif, - pylint.extensions.bad_builtin, - pylint.extensions.docparams, - pylint.extensions.for_any_all, - pylint.extensions.set_membership, - pylint.extensions.code_style, - pylint.extensions.overlapping_exceptions, - pylint.extensions.typing, - pylint.extensions.redefined_variable_type, - pylint.extensions.comparison_placement, - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-allow-list= - -# Minimum supported python version -py-version = 3.7.2 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# Specify a score threshold to be exceeded before program exits with error. -fail-under=10.0 - -# Return non-zero exit code if any of these messages/categories are detected, -# even if score is above --fail-under value. Syntax same as enable. Messages -# specified are enabled, while categories only check already-enabled messages. -fail-on= - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -# confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable= - use-symbolic-message-instead, - useless-suppression, - fixme - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" - -disable= - attribute-defined-outside-init, - duplicate-code, - invalid-name, - missing-docstring, - protected-access, - too-few-public-methods, - # handled by black - format, - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. 
You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables 'fatal', 'error', 'warning', 'refactor', 'convention' -# and 'info', which contain the number of messages in each category, as -# well as 'statement', which is the total number of statements analyzed. This -# score is used by the global evaluation report (RP0004). -evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Activate the evaluation score. -score=yes - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - -# The type of string formatting that logging methods do. `old` means using % -# formatting, `new` is for `{}` formatting. -logging-format-style=old - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - -# Regular expression of note tags to take in consideration. -#notes-rgx= - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Signatures are removed from the similarity computation -ignore-signatures=no - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_$|dummy - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of names allowed to shadow builtins -allowed-redefined-builtins= - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.* - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=100 - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Maximum number of lines in a module -max-module-lines=2000 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. 
empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_, - ax, - cv, - df, - exc, - i, - j, - l, - lr, - m, - n, - q, - qq, - s, - t, - v, - x, - X, - X_train, - X_test, - y, - - -# Good variable names regexes, separated by a comma. If names match any regex, -# they will always be accepted -good-names-rgxs= - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Bad variable names regexes, separated by a comma. If names match any regex, -# they will always be refused -bad-names-rgxs= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names -function-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names -variable-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names -attr-rgx=[a-z_][a-z0-9_]{2,}$ - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names -argument-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming style matching correct class constant names. -class-const-naming-style=UPPER_CASE - -# Regular expression matching correct class constant names. Overrides class- -# const-naming-style. -#class-const-rgx= - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,}$ - -# Regular expression which can overwrite the naming style set by typevar-naming-style. -#typevar-rgx= - -# Regular expression which should only match function or class names that do -# not require a docstring. Use ^(?!__init__$)_ to also check __init__. -no-docstring-rgx=__.*__ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# List of decorators that define properties, such as abc.abstractproperty. 
-property-classes=abc.abstractproperty - - -[TYPECHECK] - -# Regex pattern to define which classes are considered mixins if ignore-mixin- -# members is set to 'yes' -mixin-class-rgx=.*MixIn - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=REQUEST,acl_users,aq_parent,argparse.Namespace - -# List of decorators that create context managers from functions, such as -# contextlib.contextmanager. -contextmanager-decorators=contextlib.contextmanager - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# List of comma separated words that should be considered directives if they -# appear and the beginning of a comment and should not be checked. -spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=10 - -# Maximum number of locals for function / method body -max-locals=25 - -# Maximum number of return / yield for function / method body -max-returns=11 - -# Maximum number of branch for function / method body -max-branches=27 - -# Maximum number of statements in function / method body -max-statements=100 - -# Maximum number of parents for a class (see R0901). 
-max-parents=7 - -# List of qualified class names to ignore when counting class parents (see R0901). -ignored-parents= - -# Maximum number of attributes for a class (see R0902). -max-attributes=11 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=25 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# List of regular expressions of class ancestor names to -# ignore when counting public methods (see R0903). -exclude-too-few-public-methods= - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp,__post_init__ - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - -# Warn about protected attribute access inside special methods -check-protected-access-in-special-methods=no - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub,TERMIOS,Bastion,rexec - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception - - -[TYPING] - -# Set to ``no`` if the app / library does **NOT** need to support runtime -# introspection of type annotations. If you use type annotations -# **exclusively** for type checking of an application, you're probably fine. -# For libraries, evaluate if some users what to access the type hints at -# runtime first, e.g., through ``typing.get_type_hints``. Applies to Python -# versions 3.7 - 3.9 -runtime-typing = no - - -[DEPRECATED_BUILTINS] - -# List of builtins function names that should not be used, separated by a comma -bad-functions=map,input - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. 
When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit,argparse.parse_error - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=no - -# This flag controls whether the implicit-str-concat should generate a warning -# on implicit string concatenation in sequences defined over several lines. -check-str-concat-over-line-jumps=no - - -[CODE_STYLE] - -# Max line length for which to sill emit suggestions. Used to prevent optional -# suggestions which would get split by a code formatter (e.g., black). Will -# default to the setting for ``max-line-length``. -#max-line-length-suggestions= \ No newline at end of file diff --git a/Makefile b/Makefile index 29466d4..4789718 100644 --- a/Makefile +++ b/Makefile @@ -18,13 +18,8 @@ test-unit: pytest tests @echo 'unit tests OK' -lint: - pylint cobra - @echo 'lint OK' - -lint-minimal: - pylint -E cobra - @echo 'lint minimal OK' +black-check: + black --diff --line-length 120 cobra/ typecheck: mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports @@ -38,4 +33,4 @@ docstyle: pydocstyle cobra @echo 'docstyle OK' -code-qa: typecheck codestyle docstyle lint-minimal +code-qa: typecheck codestyle docstyle diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index d480bdb..8302ea9 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -13,12 +13,14 @@ # from .evaluator import Evaluator from .evaluator import ClassificationEvaluator, RegressionEvaluator -__all__ = ["generate_pig_tables", - "compute_pig_table", - "plot_incidence", - "plot_performance_curves", - "plot_variable_importance", - "plot_univariate_predictor_quality", - "plot_correlation_matrix", - "ClassificationEvaluator", - "RegressionEvaluator"] +__all__ = [ + "generate_pig_tables", + "compute_pig_table", + "plot_incidence", + "plot_performance_curves", + "plot_variable_importance", + "plot_univariate_predictor_quality", + "plot_correlation_matrix", + "ClassificationEvaluator", + "RegressionEvaluator", +] diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 3255fa2..22e034b 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,5 +1,6 @@ """Evaluate the created model.""" +from typing import Any, Union, cast import numpy as np import pandas as pd @@ -30,7 +31,7 @@ DEFAULT_LABELS = ["0", "1"] -class ClassificationEvaluator(): +class ClassificationEvaluator: """Evaluator class encapsulating classification model metrics and plotting functionality. Attributes @@ -60,26 +61,21 @@ class ClassificationEvaluator(): (by default 10, so deciles). 
""" - def __init__( - self, - probability_cutoff: float = None, - lift_at: float = 0.05, - n_bins: int = 10 - ): + def __init__(self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10): """Initialize the ClassificationEvaluator.""" - self.y_true = None - self.y_pred = None + self.y_true: np.ndarray + self.y_pred: np.ndarray self.lift_at = lift_at self.probability_cutoff = probability_cutoff self.n_bins = n_bins # Placeholder to store fitted output - self.scalar_metrics = None - self.roc_curve = None - self.confusion_matrix = None - self.lift_curve = None - self.cumulative_gains = None + self.scalar_metrics: pd.Series + self.roc_curve: dict[str, Any] + self.confusion_matrix: np.ndarray + self.lift_curve: tuple[list[float], list[float], float] + self.cumulative_gains: tuple[np.ndarray, np.ndarray] def fit(self, y_true: np.ndarray, y_pred: np.ndarray): """Fit the evaluator by computing the relevant evaluation metrics on the inputs. @@ -95,20 +91,14 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): # if probability_cutoff is not set, take the optimal cut-off if not self.probability_cutoff: - self.probability_cutoff = (ClassificationEvaluator. - _compute_optimal_cutoff(fpr, tpr, - thresholds)) + self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) # Transform probabilities to binary array using cut-off - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 - for pred in y_pred]) + y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 for pred in y_pred]) # Compute the various evaluation metrics - self.scalar_metrics = ClassificationEvaluator._compute_scalar_metrics( - y_true, - y_pred, - y_pred_b, - self.lift_at + self.scalar_metrics = cast( + pd.Series, ClassificationEvaluator._compute_scalar_metrics(y_true, y_pred, y_pred_b, self.lift_at) ) self.y_true = y_true @@ -121,10 +111,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): @staticmethod def _compute_scalar_metrics( - y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float + y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, lift_at: float ) -> pd.Series: """Compute various scalar performance measures. @@ -157,21 +144,19 @@ def _compute_scalar_metrics( The `column_order` and `pig_tables` parameters do not contain the same set of variables. """ - return pd.Series({ - "accuracy": accuracy_score(y_true, y_pred_b), - "AUC": roc_auc_score(y_true, y_pred), - "precision": precision_score(y_true, y_pred_b), - "recall": recall_score(y_true, y_pred_b), - "F1": f1_score(y_true, y_pred_b, average=None)[1], - "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), - f"lift at {lift_at}": np.round( - ClassificationEvaluator - ._compute_lift( - y_true=y_true, - y_pred=y_pred, - lift_at=lift_at - ), 2) - }) + return pd.Series( + { + "accuracy": accuracy_score(y_true, y_pred_b), + "AUC": roc_auc_score(y_true, y_pred), + "precision": precision_score(y_true, y_pred_b), + "recall": recall_score(y_true, y_pred_b), + "F1": f1_score(y_true, y_pred_b, average=None)[1], + "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), + f"lift at {lift_at}": np.round( + ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2 + ), + } + ) def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot ROC curve of the model. @@ -190,8 +175,7 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.roc_curve is None: msg = ( - "This {} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -199,13 +183,15 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.roc_curve["fpr"], - self.roc_curve["tpr"], - color="cornflowerblue", linewidth=3, - label=f"ROC curve (area = {auc:.3})") + ax.plot( + self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="cornflowerblue", + linewidth=3, + label=f"ROC curve (area = {auc:.3})", + ) - ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, - linestyle="--") + ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, linestyle="--") ax.set_xlabel("False Positive Rate", fontsize=15) ax.set_ylabel("True Positive Rate", fontsize=15) ax.legend(loc="lower right") @@ -216,11 +202,7 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): plt.show() - def plot_confusion_matrix( - self, path: str = None, - dim: tuple = (12, 8), - labels: list = None - ): + def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: list = None): """Plot the confusion matrix. Parameters @@ -240,8 +222,7 @@ def plot_confusion_matrix( labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -249,8 +230,10 @@ def plot_confusion_matrix( ax = sns.heatmap( self.confusion_matrix, annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Blues", - xticklabels=labels, yticklabels=labels + fmt="s", + cmap="Blues", + xticklabels=labels, + yticklabels=labels, ) ax.set_title("Confusion matrix", fontsize=20) @@ -276,35 +259,30 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, inc_rate = self.lift_curve - lifts = np.array(lifts)*inc_rate*100 + lifts = np.array(lifts) * inc_rate * 100 with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar( - x_labels[::-1], - lifts, - align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("response (%)", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) plt.axhline( - y=inc_rate*100, + y=inc_rate * 100, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, - label="Incidence" + label="Incidence", ) # Legend @@ -341,8 +319,7 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -351,22 +328,13 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("lift", fontsize=16) plt.xlabel("decile", fontsize=16) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline( - y=1, - color="darkorange", - linestyle="--", - xmin=0.05, - xmax=0.95, - linewidth=3, - label="Baseline" - ) + plt.axhline(y=1, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") # Legend ax.legend(loc="upper right") @@ -398,11 +366,14 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, - color="cornflowerblue", linewidth=3, - label="cumulative gains") - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, - ls="--", color="darkorange", label="random selection") + ax.plot( + self.cumulative_gains[0] * 100, + self.cumulative_gains[1] * 100, + color="cornflowerblue", + linewidth=3, + label="cumulative gains", + ) + ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, ls="--", color="darkorange", label="random selection") ax.set_title("Cumulative Gains curve", fontsize=20) @@ -427,10 +398,7 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): plt.show() @staticmethod - def _find_optimal_cutoff( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> float: + def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: """Find the optimal probability cut off point for a classification model. Parameters @@ -449,11 +417,7 @@ def _find_optimal_cutoff( return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff( - fpr: np.ndarray, - tpr: np.ndarray, - thresholds: np.ndarray - ) -> float: + def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -473,7 +437,7 @@ def _compute_optimal_cutoff( float Optimal probability cut-off point. """ - temp = np.absolute(tpr - (1-fpr)) + temp = np.absolute(tpr - (1 - fpr)) # index for optimal value is the one for which temp is minimal optimal_index = np.where(temp == min(temp))[0] @@ -481,10 +445,7 @@ def _compute_optimal_cutoff( return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> tuple: + def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -504,7 +465,7 @@ def _compute_cumulative_gains( With x-labels, and gains. 
""" # make y_true a boolean vector - y_true = (y_true == 1) + y_true = y_true == 1 sorted_indices = np.argsort(y_pred)[::-1] y_true = y_true[sorted_indices] @@ -522,10 +483,8 @@ def _compute_cumulative_gains( @staticmethod def _compute_lift_per_bin( - y_true: np.ndarray, - y_pred: np.ndarray, - n_bins: int = 10 - ) -> tuple: + y_true: np.ndarray, y_pred: np.ndarray, n_bins: int = 10 + ) -> tuple[list[float], list[float], float]: """Compute lift of the model for a given number of bins. Parameters @@ -544,25 +503,16 @@ def _compute_lift_per_bin( Includes x-labels, lifts per decile, and target incidence. """ lifts = [ - ClassificationEvaluator - ._compute_lift( - y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift - ) - for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True) + ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) + for perc_lift in np.linspace(1 / n_bins, 1, num=n_bins, endpoint=True) ] - x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] + x_labels = [len(lifts) - x for x in np.arange(0, len(lifts), 1)] - return x_labels, lifts, y_true.mean() + return x_labels, lifts, cast(float, y_true.mean()) @staticmethod - def _compute_lift( - y_true: np.ndarray, - y_pred: np.ndarray, - lift_at: float = 0.05 - ) -> float: + def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05) -> float: """Calculate lift on a specified level. Parameters @@ -592,22 +542,19 @@ def _compute_lift( # Calculate necessary variables nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) + stop = int(np.floor(nrows * lift_at)) + avg_incidence = np.einsum("ij->j", y_true_) / float(len(y_true_)) # Sort and filter data - data_sorted = ( - y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1) - ) + data_sorted = y_data[y_data[:, 1].argsort()[::-1]][:stop, 0].reshape(stop, 1) # Calculate lift (einsum is a very fast way of summing, but needs specific shape) - inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + inc_in_top_n = np.einsum("ij->j", data_sorted) / float(len(data_sorted)) + lift = np.round(inc_in_top_n / avg_incidence, 2)[0] return lift -class RegressionEvaluator(): +class RegressionEvaluator: """Evaluator class encapsulating regression model metrics and plotting functionality. Attributes @@ -651,10 +598,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.qq = RegressionEvaluator._compute_qq_residuals(y_true, y_pred) @staticmethod - def _compute_scalar_metrics( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> pd.Series: + def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Compute various scalar performance measures. 
Parameters @@ -673,18 +617,17 @@ def _compute_scalar_metrics( Mean squared error (expected value of the quadratic error) Root mean squared error (sqrt of expected value of the quadratic error) """ - return pd.Series({ - "R2": r2_score(y_true, y_pred), - "MAE": mean_absolute_error(y_true, y_pred), - "MSE": mean_squared_error(y_true, y_pred), - "RMSE": sqrt(mean_squared_error(y_true, y_pred)) - }) + return pd.Series( + { + "R2": r2_score(y_true, y_pred), + "MAE": mean_absolute_error(y_true, y_pred), + "MSE": mean_squared_error(y_true, y_pred), + "RMSE": sqrt(mean_squared_error(y_true, y_pred)), + } + ) @staticmethod - def _compute_qq_residuals( - y_true: np.ndarray, - y_pred: np.ndarray - ) -> pd.Series: + def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Compute various scalar performance measures. Parameters @@ -706,15 +649,17 @@ def _compute_qq_residuals( df = pd.DataFrame({"res": sorted((y_true - y_pred))}) # ascending order m, s = df["res"].mean(), df["res"].std() - df["z_res"] = df["res"].apply(lambda x: (x-m)/s) - df["rank"] = df.index+1 - df["percentile"] = df["rank"].apply(lambda x: x/(n+1)) # divide by n+1 to avoid inf + df["z_res"] = df["res"].apply(lambda x: (x - m) / s) + df["rank"] = df.index + 1 + df["percentile"] = df["rank"].apply(lambda x: x / (n + 1)) # divide by n+1 to avoid inf df["q_theoretical"] = norm.ppf(df["percentile"]) - return pd.Series({ - "quantiles": df["q_theoretical"].values, - "residuals": df["z_res"].values, - }) + return pd.Series( + { + "quantiles": df["q_theoretical"].values, + "residuals": df["z_res"].values, + } + ) def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """Plot predictions from the model against actual values. @@ -733,8 +678,7 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """ if self.y_true is None and self.y_pred is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -744,7 +688,7 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - x = np.arange(1, len(y_true)+1) + x = np.arange(1, len(y_true) + 1) ax.plot(x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3) ax.plot(x, y_pred, label="predictions", color="cornflowerblue", linewidth=3) @@ -775,10 +719,7 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): The instance is not fitted yet. """ if self.qq is None: - msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." - ) + msg = "This {} instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." 
raise NotFittedError(msg.format(self.__class__.__name__)) with plt.style.context("seaborn-whitegrid"): @@ -791,10 +732,10 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): ax.plot(x, y, label="current model", color="cornflowerblue", linewidth=3) ax.set_xlabel("Theoretical quantiles", fontsize=15) - ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")])))+1, 1)) + ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1)) ax.set_ylabel("Standardized residuals", fontsize=15) - ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1)) + ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1)) ax.legend(loc="best") ax.set_title("Q-Q plot", fontsize=20) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index e728dd0..dfbd85c 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -10,10 +10,7 @@ def generate_pig_tables( - basetable: pd.DataFrame, - id_column_name: str, - target_column_name: str, - preprocessed_predictors: list + basetable: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list ) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. @@ -38,12 +35,7 @@ def generate_pig_tables( DataFrame containing a PIG table for all predictors. """ pigs = [ - compute_pig_table( - basetable, - column_name, - target_column_name, - id_column_name - ) + compute_pig_table(basetable, column_name, target_column_name, id_column_name) for column_name in sorted(preprocessed_predictors) if column_name not in [id_column_name, target_column_name] ] @@ -52,10 +44,7 @@ def generate_pig_tables( def compute_pig_table( - basetable: pd.DataFrame, - predictor_column_name: str, - target_column_name: str, - id_column_name: str + basetable: pd.DataFrame, predictor_column_name: str, target_column_name: str, id_column_name: str ) -> pd.DataFrame: """Compute the PIG table of a given predictor for a given target. @@ -81,15 +70,10 @@ def compute_pig_table( # (=mean of the target for the given bin) and compute the bin size # (e.g. COUNT(id_column_name)). 
After that, rename the columns res = ( - basetable - .groupby(predictor_column_name) + basetable.groupby(predictor_column_name) .agg({target_column_name: "mean", id_column_name: "size"}) .reset_index() - .rename(columns={ - predictor_column_name: "label", - target_column_name: "avg_target", - id_column_name: "pop_size" - }) + .rename(columns={predictor_column_name: "label", target_column_name: "avg_target", id_column_name: "pop_size"}) ) # add the column name to a variable column @@ -97,21 +81,16 @@ def compute_pig_table( # replace population size by a percentage of total population res["variable"] = utils.clean_predictor_name(predictor_column_name) res["global_avg_target"] = global_avg_target - res["pop_size"] = res["pop_size"]/len(basetable.index) + res["pop_size"] = res["pop_size"] / len(basetable.index) # make sure to always return the data with the proper column order - column_order = ["variable", "label", "pop_size", - "global_avg_target", "avg_target"] + column_order = ["variable", "label", "pop_size", "global_avg_target", "avg_target"] return res[column_order] def plot_incidence( - pig_tables: pd.DataFrame, - variable: str, - model_type: str, - column_order: list = None, - dim: tuple = (12, 8) + pig_tables: pd.DataFrame, variable: str, model_type: str, column_order: list = None, dim: tuple = (12, 8) ): """Plot a Predictor Insights Graph (PIG). @@ -146,29 +125,22 @@ def plot_incidence( """ if model_type not in ["classification", "regression"]: raise ValueError( - "An unexpected value was set for the model_type " - "parameter. Expected 'classification' or " - "'regression'." + "An unexpected value was set for the model_type " "parameter. Expected 'classification' or " "'regression'." ) - df_plot = pig_tables[pig_tables['variable'] == variable].copy() + df_plot = pig_tables[pig_tables["variable"] == variable].copy() if column_order is not None: - if not set(df_plot['label']) == set(column_order): - raise ValueError( - 'The column_order and pig_tables parameters do not contain ' - 'the same set of variables.') - - df_plot['label'] = df_plot['label'].astype('category') - df_plot['label'].cat.reorder_categories( - column_order, - inplace=True - ) + if not set(df_plot["label"]) == set(column_order): + raise ValueError("The column_order and pig_tables parameters do not contain " "the same set of variables.") + + df_plot["label"] = df_plot["label"].astype("category") + df_plot["label"].cat.reorder_categories(column_order, inplace=True) - df_plot.sort_values(by=['label'], ascending=True, inplace=True) + df_plot.sort_values(by=["label"], ascending=True, inplace=True) df_plot.reset_index(inplace=True) else: - df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True) + df_plot.sort_values(by=["avg_target"], ascending=False, inplace=True) df_plot.reset_index(inplace=True) with plt.style.context("seaborn-ticks"): @@ -177,41 +149,42 @@ def plot_incidence( # -------------------------- # Left axis - average target # -------------------------- - ax.plot(df_plot['label'], df_plot['avg_target'], - color="#00ccff", marker=".", - markersize=20, linewidth=3, - label='incidence rate per bin' if model_type == "classification" else "mean target value per bin", - zorder=10) + ax.plot( + df_plot["label"], + df_plot["avg_target"], + color="#00ccff", + marker=".", + markersize=20, + linewidth=3, + label="incidence rate per bin" if model_type == "classification" else "mean target value per bin", + zorder=10, + ) - ax.plot(df_plot['label'], df_plot['global_avg_target'], - color="#022252", 
linestyle='--', linewidth=4, - label='average incidence rate' if model_type == "classification" else "global mean target value", - zorder=10) + ax.plot( + df_plot["label"], + df_plot["global_avg_target"], + color="#022252", + linestyle="--", + linewidth=4, + label="average incidence rate" if model_type == "classification" else "global mean target value", + zorder=10, + ) # Dummy line to have label on second axis from first - ax.plot(np.nan, "#939598", linewidth=6, label='bin size') + ax.plot(np.nan, "#939598", linewidth=6, label="bin size") # Set labels & ticks - ax.set_ylabel( - 'incidence' if model_type == "classification" else "mean target value", - fontsize=16 - ) - ax.set_xlabel(f'{variable} bins' '', fontsize=16) + ax.set_ylabel("incidence" if model_type == "classification" else "mean target value", fontsize=16) + ax.set_xlabel(f"{variable} bins" "", fontsize=16) ax.xaxis.set_tick_params(labelsize=14) - plt.setp( - ax.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor" - ) + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") ax.yaxis.set_tick_params(labelsize=14) if model_type == "classification": # Mean target values are between 0 and 1 (target incidence rate), # so format them as percentages - ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) - ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: f'{y:.1%}')) + ax.set_yticks(np.arange(0, max(df_plot["avg_target"]) + 0.05, 0.05)) + ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y:.1%}")) elif model_type == "regression": # If the difference between the highest avg. target of all bins # versus the global avg. target AND the difference between the @@ -223,40 +196,38 @@ def plot_incidence( # the bins and versus the global avg. target. # (Motivation for the AND above: if on one end there IS enough # difference, the effect that we discuss here does not occur.) - global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin. - if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25) - and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)): - ax.set_ylim(global_avg_target * 0.75, - global_avg_target * 1.25) + global_avg_target = max(df_plot["global_avg_target"]) # series of same number, for every bin. 
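# Sketch with made-up numbers (not part of this diff) of the y-limit rule
# applied just below: when every bin average sits within 25% of the global
# average, the y-axis is pinned to +/- 25% of that average, so a small
# spread between bins is not stretched to fill the whole plot.
import numpy as np

avg_target = np.array([98.0, 101.0, 103.0])  # hypothetical bin averages
global_avg_target = 100.0                    # hypothetical global average

ylim = None
if (np.abs(avg_target.max() - global_avg_target) / global_avg_target < 0.25) and (
    np.abs(avg_target.min() - global_avg_target) / global_avg_target < 0.25
):
    ylim = (global_avg_target * 0.75, global_avg_target * 1.25)

print(ylim)  # (75.0, 125.0): the 5-unit spread stays visually modest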
+ if (np.abs((max(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25) and ( + np.abs((min(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25 + ): + ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25) # Remove ticks but keep the labels - ax.tick_params(axis='both', which='both', length=0) - ax.tick_params(axis='y', colors="#00ccff") - ax.yaxis.label.set_color('#00ccff') + ax.tick_params(axis="both", which="both", length=0) + ax.tick_params(axis="y", colors="#00ccff") + ax.yaxis.label.set_color("#00ccff") # ----------------- # Right Axis - bins # ----------------- ax2 = ax.twinx() - ax2.bar(df_plot['label'], df_plot['pop_size'], - align='center', color="#939598", zorder=1) + ax2.bar(df_plot["label"], df_plot["pop_size"], align="center", color="#939598", zorder=1) # Set labels & ticks - ax2.set_xlabel(f'{variable} bins' '', fontsize=16) + ax2.set_xlabel(f"{variable} bins" "", fontsize=16) ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) - ax2.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: f'{y:.1%}')) - ax2.set_ylabel('population size', fontsize=16) - ax2.tick_params(axis='y', colors="#939598") - ax2.yaxis.label.set_color('#939598') + ax2.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y:.1%}")) + ax2.set_ylabel("population size", fontsize=16) + ax2.tick_params(axis="y", colors="#939598") + ax2.yaxis.label.set_color("#939598") # Despine & prettify sns.despine(ax=ax, right=True, left=True) sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') + ax2.spines["right"].set_color("white") ax2.grid(False) @@ -268,12 +239,12 @@ def plot_incidence( fig.suptitle(title, fontsize=22) ax.legend( frameon=False, - bbox_to_anchor=(0., 1.01, 1., .102), + bbox_to_anchor=(0.0, 1.01, 1.0, 0.102), loc=3, ncol=1, mode="expand", - borderaxespad=0., - prop={"size": 14} + borderaxespad=0.0, + prop={"size": 14}, ) # Set order of layers diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index ae91220..19fbf64 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -1,6 +1,7 @@ """Collection of plotting utils.""" # third party imports +from typing import cast import numpy as np import pandas as pd @@ -8,18 +9,10 @@ import seaborn as sns -DEFAULT_COLOURS = { - "train": "#0099bf", - "selection": "#ff9500", - "validation": "#8064a2" -} +DEFAULT_COLOURS = {"train": "#0099bf", "selection": "#ff9500", "validation": "#8064a2"} -def plot_univariate_predictor_quality( - df_metric: pd.DataFrame, - dim: tuple = (12, 8), - path: str = None -): +def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None): """Plot univariate quality of the predictors. 
Parameters @@ -40,17 +33,14 @@ def plot_univariate_predictor_quality( metric = "RMSE" ascending = True - df = ( - df_metric[df_metric["preselection"]] - .sort_values(by=metric+" selection", ascending=ascending) - ) + df = df_metric[df_metric["preselection"]].sort_values(by=metric + " selection", ascending=ascending) df = pd.melt( df, id_vars=["predictor"], - value_vars=[metric+" train", metric+" selection"], + value_vars=[metric + " train", metric + " selection"], var_name="split", - value_name=metric + value_name=metric, ) # plot data @@ -72,11 +62,7 @@ def plot_univariate_predictor_quality( plt.show() -def plot_correlation_matrix( - df_corr: pd.DataFrame, - dim: tuple = (12, 8), - path: str = None -): +def plot_correlation_matrix(df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None): """Plot correlation matrix amongst the predictors. Parameters @@ -89,8 +75,8 @@ def plot_correlation_matrix( Path to store the figure. """ fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.heatmap(df_corr, cmap='Blues') - ax.set_title('Correlation Matrix') + ax = sns.heatmap(df_corr, cmap="Blues") + ax.set_title("Correlation Matrix") if path is not None: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") @@ -103,7 +89,7 @@ def plot_performance_curves( dim: tuple = (12, 8), path: str = None, colors: dict = None, - metric_name: str = None + metric_name: str = None, ): """Plot performance curves for the train-selection-validation sets. @@ -131,40 +117,48 @@ def plot_performance_curves( metric_name = "AUC" elif model_type == "regression": metric_name = "RMSE" + metric_name = cast(str, metric_name) max_metric = np.round( max( - max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance']) - ), 1) + max(model_performance["train_performance"]), + max(model_performance["selection_performance"]), + max(model_performance["validation_performance"]), + ), + 1, + ) with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) plt.plot( - model_performance['train_performance'], marker=".", - markersize=20, linewidth=3, label="train", - color=colors["train"] + model_performance["train_performance"], + marker=".", + markersize=20, + linewidth=3, + label="train", + color=colors["train"], ) plt.plot( - model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label="selection", - color=colors["selection"] + model_performance["selection_performance"], + marker=".", + markersize=20, + linewidth=3, + label="selection", + color=colors["selection"], ) plt.plot( - model_performance['validation_performance'], marker=".", - markersize=20, linewidth=3, label="validation", - color=colors["validation"] + model_performance["validation_performance"], + marker=".", + markersize=20, + linewidth=3, + label="validation", + color=colors["validation"], ) # Set x- and y-ticks - ax.set_xticks(np.arange(len(model_performance['last_added_predictor']))) - ax.set_xticklabels( - model_performance['last_added_predictor'].tolist(), - rotation=40, - ha='right' - ) + ax.set_xticks(np.arange(len(model_performance["last_added_predictor"]))) + ax.set_xticklabels(model_performance["last_added_predictor"].tolist(), rotation=40, ha="right") if model_type == "classification": ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05)) @@ -172,16 +166,13 @@ def plot_performance_curves( # In regression, the scale of the y-axis can largely vary depending # on the dataset, it is easier to 
just set the y-axis bounds, # but not the tick distance. - ax.set_ylim(0, max_metric*1.1) + ax.set_ylim(0, max_metric * 1.1) # Make pretty - ax.legend(loc='lower right') - fig.suptitle( - 'Performance curves forward feature selection', - fontsize=20 - ) + ax.legend(loc="lower right") + fig.suptitle("Performance curves forward feature selection", fontsize=20) plt.title("Metric: " + metric_name, fontsize=15, loc="left") - plt.ylabel('Model performance') + plt.ylabel("Model performance") if path is not None: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") @@ -190,10 +181,7 @@ def plot_performance_curves( def plot_variable_importance( - df_variable_importance: pd.DataFrame, - title: str = None, - dim: tuple = (12, 8), - path: str = None + df_variable_importance: pd.DataFrame, title: str = None, dim: tuple = (12, 8), path: str = None ): """Plot variable importance of a given model. @@ -210,11 +198,7 @@ def plot_variable_importance( """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.barplot( - x="importance", y="predictor", - data=df_variable_importance, - color="cornflowerblue" - ) + ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") if title: ax.set_title(title) else: diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 288a2c4..c4d2a89 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -7,9 +7,11 @@ from .models import LogisticRegressionModel, LinearRegressionModel from .forward_selection import ForwardFeatureSelection -__all__ = ['compute_univariate_preselection', - 'get_preselected_predictors', - 'compute_correlations', - 'LogisticRegressionModel', - 'LinearRegressionModel', - 'ForwardFeatureSelection'] +__all__ = [ + "compute_univariate_preselection", + "get_preselected_predictors", + "compute_correlations", + "LogisticRegressionModel", + "LinearRegressionModel", + "ForwardFeatureSelection", +] diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 9b897d9..ee75b02 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,7 +1,7 @@ """Feature forward selection.""" import logging -from typing import Callable, Optional, Union +from typing import Callable, Optional, Set, Union, cast import pandas as pd from tqdm.auto import tqdm @@ -15,7 +15,7 @@ DEFAULT_FORCED_PREDICTORS = [] DEFAULT_EXCLUDED_PREDICTORS = [] -Model = Union[LinearRegressionModel, LogisticRegressionModel, None] +Model = Union[LinearRegressionModel, LogisticRegressionModel] class ForwardFeatureSelection: @@ -43,12 +43,7 @@ class ForwardFeatureSelection: List of fitted models. """ - def __init__( - self, - model_type: str = "classification", - max_predictors: int = 50, - pos_only: bool = True - ): + def __init__(self, model_type: str = "classification", max_predictors: int = 50, pos_only: bool = True): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": @@ -59,7 +54,7 @@ def __init__( self.max_predictors = max_predictors self.pos_only = pos_only - self._fitted_models = [] + self._fitted_models: list[Model] = [] def get_model_from_step(self, step: int) -> Model: """Get fitted model from a particular step. @@ -80,15 +75,13 @@ def get_model_from_step(self, step: int) -> Model: In case step is larger than the number of available models. 
""" if len(self._fitted_models) <= step: - raise ValueError( - f"No model available for step {step}. " - "The first step starts from index 0." - ) + raise ValueError(f"No model available for step {step}. " "The first step starts from index 0.") return self._fitted_models[step] def compute_model_performances( - self, data: pd.DataFrame, + self, + data: pd.DataFrame, target_column_name: str, splits: list = None, metric: Optional[Callable] = None, @@ -125,29 +118,25 @@ def compute_model_performances( """ splits = splits or DEFAULT_SPLIT_NAMES results = [] - predictor_set = set([]) + predictor_set: Set[str] = set() for model in self._fitted_models: - last_added_predictor = ( - set(model.predictors) - .difference(predictor_set) - ) - tmp = { - "predictors": model.predictors, - "last_added_predictor": list(last_added_predictor)[0] - } + last_added_predictor = set(model.predictors).difference(predictor_set) + tmp = {"predictors": model.predictors, "last_added_predictor": list(last_added_predictor)[0]} # Evaluate model on each dataset split, # e.g. train-selection-validation - tmp.update({ - f"{split}_performance": model.evaluate( - data[data["split"] == split], - data[data["split"] == split][target_column_name], - split=split, # parameter used for caching - metric=metric - ) - for split in splits - }) + tmp.update( + { + f"{split}_performance": model.evaluate( + data[data["split"] == split], + data[data["split"] == split][target_column_name], + split=split, # parameter used for caching + metric=metric, + ) + for split in splits + } + ) results.append(tmp) predictor_set = predictor_set.union(set(model.predictors)) @@ -158,11 +147,12 @@ def compute_model_performances( return df def fit( - self, train_data: pd.DataFrame, + self, + train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None, - excluded_predictors: list = None + excluded_predictors: list = None, ): """Fit the forward feature selection estimator. @@ -190,54 +180,34 @@ def fit( number of allowed predictors in the model. """ assert "split" in train_data.columns, "The train_data input df does not include a split column." - assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \ - "The train_data input df does not include a 'train' and 'selection' split." + assert ( + len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0 + ), "The train_data input df does not include a 'train' and 'selection' split." # remove excluded predictors from predictor lists forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS filtered_predictors = [ - var for var in predictors - if ( - var not in excluded_predictors - and var not in forced_predictors - ) + var for var in predictors if (var not in excluded_predictors and var not in forced_predictors) ] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: - raise ValueError( - "Size of forced_predictors cannot be bigger than " - "max_predictors." - ) + raise ValueError("Size of forced_predictors cannot be bigger than " "max_predictors.") elif len(forced_predictors) == self.max_predictors: - log.info( - "Size of forced_predictors equals max_predictors " - "only one model will be trained..." 
- ) + log.info("Size of forced_predictors equals max_predictors " "only one model will be trained...") # train model with all forced_predictors (only) self._fitted_models.append( - self._train_model( - train_data[train_data["split"] == "train"], - target_column_name, - forced_predictors - ) + self._train_model(train_data[train_data["split"] == "train"], target_column_name, forced_predictors) ) else: self._fitted_models = self._forward_selection( - train_data, - target_column_name, - filtered_predictors, - forced_predictors + train_data, target_column_name, filtered_predictors, forced_predictors ) def _forward_selection( - self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list, - forced_predictors: list = None + self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None ) -> list[Model]: """Perform the forward feature selection algorithm. @@ -264,41 +234,25 @@ def _forward_selection( number of predictors minus one (as indices start from 0). """ forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS - fitted_models = [] - current_predictors = [] + fitted_models: list[Model] = [] + current_predictors: list[str] = [] - max_steps = 1 + min(self.max_predictors, - len(predictors) + len(forced_predictors)) + max_steps = 1 + min(self.max_predictors, len(predictors) + len(forced_predictors)) - for step in tqdm( - range(1, max_steps), - desc="Sequentially adding best predictor..." - ): + for step in tqdm(range(1, max_steps), desc="Sequentially adding best predictor..."): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [ - var for var in forced_predictors - if var not in current_predictors - ] + candidate_predictors = [var for var in forced_predictors if var not in current_predictors] else: candidate_predictors = [ - var for var in (predictors + forced_predictors) - if var not in current_predictors + var for var in (predictors + forced_predictors) if var not in current_predictors ] - model = self._find_next_best_model( - train_data, - target_column_name, - candidate_predictors, - current_predictors - ) + model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list( - set(current_predictors) - .union(set(model.predictors)) - ) + current_predictors = list(set(current_predictors).union(set(model.predictors))) fitted_models.append(model) # else: @@ -313,11 +267,7 @@ def _forward_selection( return fitted_models def _find_next_best_model( - self, - train_data: pd.DataFrame, - target_column_name: str, - candidate_predictors: list, - current_predictors: list + self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list ) -> Model: """ Find the next best model with candidate predictors. 
@@ -367,20 +317,11 @@ def _find_next_best_model( for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model( - fit_data, - target_column_name, - (current_predictors + [pred]) - ) + model = self._train_model(fit_data, target_column_name, (current_predictors + [pred])) # Evaluate the model - performance = ( - model - .evaluate( - sel_data[current_predictors + [pred]], - sel_data[target_column_name], - split="selection" - ) + performance = model.evaluate( + sel_data[current_predictors + [pred]], sel_data[target_column_name], split="selection" ) if self.pos_only and (not (model.get_coef() >= 0).all()): @@ -388,23 +329,16 @@ def _find_next_best_model( # Check if the model is better than the current best model # and if it is, replace the current best. - if self.MLModel == LogisticRegressionModel \ - and performance > best_performance: # AUC metric is used + if self.MLModel == LogisticRegressionModel and performance > best_performance: # AUC metric is used best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel \ - and performance < best_performance: # RMSE metric is used + elif self.MLModel == LinearRegressionModel and performance < best_performance: # RMSE metric is used best_performance = performance best_model = model - return best_model + return cast(Model, best_model) - def _train_model( - self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list - ) -> Model: + def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> Model: """Train the model with a given set of predictors. Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 58571b3..408ead4 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -33,8 +33,7 @@ class LogisticRegressionModel: def __init__(self): """Initialize the LogisticRegressionModel class.""" - self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear', random_state=42) + self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver="liblinear", random_state=42) self._is_fitted = False # placeholder to keep track of a list of predictors self.predictors = [] @@ -52,16 +51,18 @@ def serialize(self) -> dict: "meta": "logistic-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.logit.get_params() + "params": self.logit.get_params(), } if self._is_fitted: - serialized_model.update({ - "classes_": self.logit.classes_.tolist(), - "coef_": self.logit.coef_.tolist(), - "intercept_": self.logit.intercept_.tolist(), - "n_iter_": self.logit.n_iter_.tolist(), - }) + serialized_model.update( + { + "classes_": self.logit.classes_.tolist(), + "coef_": self.logit.coef_.tolist(), + "intercept_": self.logit.intercept_.tolist(), + "n_iter_": self.logit.n_iter_.tolist(), + } + ) return serialized_model @@ -90,7 +91,7 @@ def deserialize(self, model_dict: dict): self.predictors = model_dict["predictors"] self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] - def get_coef(self) -> np.array: + def get_coef(self) -> np.ndarray: """Return the model coefficients. 
Returns @@ -151,11 +152,7 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate( - self, X: pd.DataFrame, y: pd.Series, - split: str = None, - metric: Optional[Callable] = None - ) -> float: + def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: """ Evaluate the model on a given dataset (X, y). @@ -188,7 +185,7 @@ def evaluate( y_pred = self.score_model(X) fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = (ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds)) + cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) performance = metric(y_true=y, y_pred=y_pred_b) @@ -222,26 +219,18 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() df.columns = ["predictor", "importance"] - return ( - df.sort_values(by="importance", ascending=False) - .reset_index(drop=True) - ) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) def _is_valid_dict(self, model_dict: dict) -> bool: """Check if the model dictionary is valid.""" - if ("meta" not in model_dict - or model_dict["meta"] != "logistic-regression"): + if "meta" not in model_dict or model_dict["meta"] != "logistic-regression": return False attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"] @@ -249,8 +238,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool: if not (key in model_dict or type(model_dict[key]) != list): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True @@ -292,14 +280,13 @@ def serialize(self) -> dict: "meta": "linear-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.linear.get_params() + "params": self.linear.get_params(), } if self._is_fitted: - serialized_model.update({ - "coef_": self.linear.coef_.tolist(), - "intercept_": self.linear.intercept_.tolist() - }) + serialized_model.update( + {"coef_": self.linear.coef_.tolist(), "intercept_": self.linear.intercept_.tolist()} + ) return serialized_model @@ -326,7 +313,7 @@ def deserialize(self, model_dict: dict): self.predictors = model_dict["predictors"] self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] - def get_coef(self) -> np.array: + def get_coef(self) -> np.ndarray: """Return the model coefficients. 
Returns @@ -387,11 +374,7 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.linear.predict(X[self.predictors]) - def evaluate( - self, X: pd.DataFrame, y: pd.Series, - split: str = None, - metric: Optional[Callable] = None - ) -> float: + def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: """Evaluate the model on a given dataset (X, y). The optional split @@ -451,29 +434,19 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict( - importance_by_variable, - orient="index" - ).reset_index() + df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() df.columns = ["predictor", "importance"] - return ( - df.sort_values(by="importance", ascending=False) - .reset_index(drop=True) - ) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) @staticmethod def _is_valid_dict(model_dict: dict) -> bool: """Check if the model dictionary is valid.""" - if ("meta" not in model_dict - or model_dict["meta"] != "linear-regression"): + if "meta" not in model_dict or model_dict["meta"] != "linear-regression": return False attr = ["coef_", "intercept_", "predictors"] @@ -481,8 +454,7 @@ def _is_valid_dict(model_dict: dict) -> bool: if not (key in model_dict or not isinstance(model_dict[key], list)): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index e4d1ff6..2d90b48 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -14,7 +14,7 @@ def compute_univariate_preselection( model_type: str = "classification", preselect_auc_threshold: float = 0.053, preselect_rmse_threshold: float = 5, - preselect_overtrain_threshold: float = 0.05 + preselect_overtrain_threshold: float = 0.05, ) -> pd.DataFrame: """Perform a preselection of predictors. @@ -74,21 +74,15 @@ def compute_univariate_preselection( cleaned_predictor = utils.clean_predictor_name(predictor) auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], - y_score=target_enc_train_data[predictor]) + y_true=target_enc_train_data[target_column], y_score=target_enc_train_data[predictor] + ) auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], - y_score=target_enc_selection_data[predictor]) - - result.append( - { - "predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection - } + y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor] ) + result.append({"predictor": cleaned_predictor, "AUC train": auc_train, "AUC selection": auc_selection}) + df_auc = pd.DataFrame(result) # Filter based on min. 
AUC @@ -107,22 +101,18 @@ def compute_univariate_preselection( for predictor in predictors: cleaned_predictor = utils.clean_predictor_name(predictor) - rmse_train = sqrt(mean_squared_error( - y_true=target_enc_train_data[target_column], - y_pred=target_enc_train_data[predictor])) - - rmse_selection = sqrt(mean_squared_error( - y_true=target_enc_selection_data[target_column], - y_pred=target_enc_selection_data[predictor])) + rmse_train = sqrt( + mean_squared_error(y_true=target_enc_train_data[target_column], y_pred=target_enc_train_data[predictor]) + ) - result.append( - { - "predictor": cleaned_predictor, - "RMSE train": rmse_train, - "RMSE selection": rmse_selection - } + rmse_selection = sqrt( + mean_squared_error( + y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor] + ) ) + result.append({"predictor": cleaned_predictor, "RMSE train": rmse_train, "RMSE selection": rmse_selection}) + df_rmse = pd.DataFrame(result) # Filter based on max. RMSE @@ -157,9 +147,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """ if "AUC selection" in df_metric.columns: predictor_list = ( - df_metric[df_metric["preselection"]] - .sort_values(by="AUC selection", ascending=False) - .predictor.tolist() + df_metric[df_metric["preselection"]].sort_values(by="AUC selection", ascending=False).predictor.tolist() ) elif "RMSE selection" in df_metric.columns: predictor_list = ( @@ -171,10 +159,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: return [col + "_enc" for col in predictor_list] -def compute_correlations( - target_enc_train_data: pd.DataFrame, - predictors: list -) -> pd.DataFrame: +def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -192,10 +177,7 @@ def compute_correlations( """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [ - utils.clean_predictor_name(predictor) - for predictor in predictors - ] + predictors_cleaned = [utils.clean_predictor_name(predictor) for predictor in predictors] # Change index and columns with the cleaned version of the predictors # e.g. 
change "var1_enc" with "var1" diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index b72d1a4..55e036b 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -5,7 +5,4 @@ from .categorical_data_processor import CategoricalDataProcessor from .preprocessor import PreProcessor -__all__ = ['KBinsDiscretizer', - 'TargetEncoder', - 'CategoricalDataProcessor', - 'PreProcessor'] +__all__ = ["KBinsDiscretizer", "TargetEncoder", "CategoricalDataProcessor", "PreProcessor"] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 9d2f263..6632720 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -2,7 +2,7 @@ # standard lib imports import re -from typing import Optional +from typing import Any, Optional, Set, Union import logging # third party imports @@ -62,9 +62,14 @@ class CategoricalDataProcessor(BaseEstimator): """ valid_keys = [ - "model_type", "regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories" + "model_type", + "regroup", + "regroup_name", + "keep_missing", + "category_size_threshold", + "p_value_threshold", + "scale_contingency_table", + "forced_categories", ] def __init__( @@ -76,7 +81,7 @@ def __init__( category_size_threshold: int = 5, p_value_threshold: float = 0.001, scale_contingency_table: bool = True, - forced_categories: dict = {} + forced_categories: dict = {}, ): """Initialize the CategoricalDataProcessor.""" if model_type not in ["classification", "regression"]: @@ -95,7 +100,7 @@ def __init__( self.forced_categories = forced_categories # dict to store fitted output in - self._cleaned_categories_by_column = {} + self._cleaned_categories_by_column: dict[str, Set[Any]] = {} def attributes_to_dict(self) -> dict: """Return the attributes of CategoricalDataProcessor as a dictionary. @@ -109,8 +114,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_cleaned_categories_by_column"] = { - key: list(value) - for key, value in self._cleaned_categories_by_column.items() + key: list(value) for key, value in self._cleaned_categories_by_column.items() } return params @@ -134,8 +138,7 @@ def set_attributes_from_dict(self, params: dict): if type(_fitted_output) != dict: raise ValueError( "_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output)) + "be a dict but is of type {} instead".format(type(_fitted_output)) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -145,18 +148,11 @@ def set_attributes_from_dict(self, params: dict): # of the following method from BaseEstimator: self.set_params(**params) - self._cleaned_categories_by_column = { - key: set(value) for key, value in _fitted_output.items() - } + self._cleaned_categories_by_column = {key: set(value) for key, value in _fitted_output.items()} return self - def fit( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the CategoricalDataProcessor. Parameters @@ -174,15 +170,9 @@ def fit( log.info("regroup was set to False, so no fitting is required") return None - for column_name in tqdm( - column_names, - desc="Fitting category regrouping..." 
- ): + for column_name in tqdm(column_names, desc="Fitting category regrouping..."): if column_name not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) continue cleaned_cats = self._fit_column(data, column_name, target_column) @@ -194,8 +184,7 @@ def fit( # Add to _cleaned_categories_by_column for later use self._cleaned_categories_by_column[column_name] = cleaned_cats - def _fit_column(self, data: pd.DataFrame, column_name: str, - target_column) -> set: + def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: """ Fit all necessary columns into "Other". @@ -218,8 +207,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, model_type = self.model_type if len(data[column_name].unique()) == 1: - log.warning(f"Predictor {column_name} is constant" - " and will be ignored in computation.") + log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") return set(data[column_name].unique()) y = data[target_column] @@ -228,48 +216,28 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, else: incidence = None - combined_categories = set() + combined_categories: Set[str] = set() # replace missings and get unique categories as a list - X = ( - CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object) - ) + X = CategoricalDataProcessor._replace_missings(data[column_name]).astype(object) unique_categories = list(X.unique()) # do not merge categories in case of dummies, i.e. 0 and 1 # (and possibly "Missing") - if (len(unique_categories) == 2 - or (len(unique_categories) == 3 - and "Missing" in unique_categories)): + if len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories): return set(unique_categories) # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = ( - CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold - ) - ) + small_categories = CategoricalDataProcessor._get_small_categories(X, incidence, self.category_size_threshold) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = ( - CategoricalDataProcessor - ._compute_p_value( - X, y, category, - model_type, - self.scale_contingency_table - ) - ) + pval = CategoricalDataProcessor._compute_p_value(X, y, category, model_type, self.scale_contingency_table) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -281,11 +249,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, return set(unique_categories).difference(combined_categories) - def transform( - self, - data: pd.DataFrame, - column_names: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Transform the data. Parameters @@ -302,10 +266,7 @@ def transform( Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ( - "{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." - ) + msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: @@ -318,10 +279,7 @@ def transform( return data - def _transform_column( - self, data: pd.DataFrame, - column_name: str - ) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Create an additional column which combines categories into "Other". Parameters @@ -340,13 +298,7 @@ def _transform_column( data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = ( - CategoricalDataProcessor - ._replace_missings( - data, - column_name_clean - ) - ) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings(data, column_name_clean) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -355,17 +307,11 @@ def _transform_column( # Log warning if categories is None, which indicates it is # not in fitted output if categories is None: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) + log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) return data - data.loc[:, column_name_clean] = ( - CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories, - self.regroup_name - ) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_categories( + data[column_name_clean], categories, self.regroup_name ) # change data to categorical @@ -373,12 +319,7 @@ def _transform_column( return data - def fit_transform( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -400,11 +341,7 @@ def fit_transform( return self.transform(data, column_names) @staticmethod - def _get_small_categories( - predictor_series: pd.Series, - incidence: float, - category_size_threshold: int - ) -> set: + def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: """ Fetch categories with a size below a certain threshold. @@ -431,14 +368,11 @@ def _get_small_categories( factor = 1 # Get all categories with a count below a threshold - bool_mask = (category_counts*factor) <= category_size_threshold + bool_mask = (category_counts * factor) <= category_size_threshold return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings( - data: pd.DataFrame, - column_names: Optional[list] = None - ) -> pd.DataFrame: + def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -469,11 +403,7 @@ def _replace_missings( @staticmethod def _compute_p_value( - X: pd.Series, - y: pd.Series, - category: str, - model_type: str, - scale_contingency_table: bool + X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool ) -> float: """ Calculate p-value. 
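For context on the hunk below: _compute_p_value is what decides whether a category keeps its own level or is folded into the regroup bucket. Here is a minimal, self-contained sketch of the one-vs-rest chi-squared test it applies in the classification case; the toy data and names are illustrative, not taken from cobra.

import numpy as np
import pandas as pd
from scipy import stats

# Toy data: is category "A" distinguishable from the rest with respect
# to a binary target? (values and names are illustrative only)
X = pd.Series(["A", "A", "B", "C", "B", "A", "C", "B"])
y = pd.Series([1, 1, 0, 0, 1, 1, 0, 0])

df = pd.DataFrame({"y": y})
df["other_categories"] = np.where(X == "A", 0, 1)  # one-vs-rest indicator

contingency_table = pd.crosstab(
    index=df["other_categories"], columns=df["y"], margins=False
)
pval = stats.chi2_contingency(
    contingency_table.values.astype(np.int64), correction=False
)[1]

# A p-value above p_value_threshold means "A" is statistically
# indistinguishable from the remaining categories and gets merged.
print(pval)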
@@ -509,34 +439,26 @@ def _compute_p_value( df["other_categories"] = np.where(X == category, 0, 1) if model_type == "classification": - contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], - margins=False) + contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], margins=False) # if true, we scale the "other" categories if scale_contingency_table: size_other_cats = contingency_table.iloc[1].sum() incidence_mean = y.mean() - contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contingency_table.iloc[1, 0] = (1 - incidence_mean) * size_other_cats contingency_table.iloc[1, 1] = incidence_mean * size_other_cats contingency_table = contingency_table.values.astype(np.int64) pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal( - df.y[df.other_categories == 0], - df.y[df.other_categories == 1] - )[1] + pval = stats.kruskal(df.y[df.other_categories == 0], df.y[df.other_categories == 1])[1] return pval @staticmethod - def _replace_categories( - data: pd.Series, - categories: set, - replace_with: str - ) -> pd.Series: + def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: """ Replace categories in set with "Other". @@ -557,5 +479,4 @@ def _replace_categories( pd.Series Series with replaced categories. """ - return data.apply( - lambda x: str(x) if x in categories else replace_with) + return data.apply(lambda x: str(x) if x in categories else replace_with) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 7621ac8..0ad6265 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -2,7 +2,7 @@ # standard lib imports from copy import deepcopy from this import d -from typing import List +from typing import Dict, List, Optional, Union import numbers import logging import math @@ -65,18 +65,24 @@ class KBinsDiscretizer(BaseEstimator): valid_strategies = ("uniform", "quantile") valid_keys = [ - "n_bins", "strategy", "closed", "auto_adapt_bins", - "starting_precision", "label_format", - "change_endpoint_format" + "n_bins", + "strategy", + "closed", + "auto_adapt_bins", + "starting_precision", + "label_format", + "change_endpoint_format", ] def __init__( - self, n_bins: int = 10, strategy: str = "quantile", + self, + n_bins: int = 10, + strategy: str = "quantile", closed: str = "right", auto_adapt_bins: bool = False, starting_precision: int = 0, label_format: str = "{} - {}", - change_endpoint_format: bool = False + change_endpoint_format: bool = False, ): """Initialize the KBinsDiscretizer.""" # validate number of bins @@ -108,18 +114,14 @@ def _validate_n_bins(self, n_bins: int): """ if not isinstance(n_bins, numbers.Integral): raise ValueError( - "{} received an invalid n_bins type. Received {}, expected int." - .format( - KBinsDiscretizer.__name__, - type(n_bins).__name__ + "{} received an invalid n_bins type. Received {}, expected int.".format( + KBinsDiscretizer.__name__, type(n_bins).__name__ ) ) if n_bins < 2: raise ValueError( - "{} received an invalid number of bins. Received {}, expected at least 2." - .format( - KBinsDiscretizer.__name__, - n_bins + "{} received an invalid number of bins. 
Received {}, expected at least 2.".format( + KBinsDiscretizer.__name__, n_bins ) ) @@ -135,8 +137,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_bins_by_column"] = { - key: [list(tup) for tup in value] if value else None - for key, value in self._bins_by_column.items() + key: [list(tup) for tup in value] if value else None for key, value in self._bins_by_column.items() } return params @@ -159,8 +160,7 @@ def set_attributes_from_dict(self, params: dict): if type(_bins_by_column) != dict: raise ValueError( - "_bins_by_column is expected to be a dict but is of type {} instead" - .format(type(_bins_by_column)) + "_bins_by_column is expected to be a dict but is of type {} instead".format(type(_bins_by_column)) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -171,8 +171,7 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: ([tuple(v) for v in value] if value else None) - for key, value in _bins_by_column.items() + key: ([tuple(v) for v in value] if value else None) for key, value in _bins_by_column.items() } return self @@ -189,22 +188,14 @@ def fit(self, data: pd.DataFrame, column_names: list): """ if self.strategy not in self.valid_strategies: raise ValueError( - "{}: valid options for 'strategy' are {}. Got strategy={!r} instead." - .format( - KBinsDiscretizer.__name__, - self.valid_strategies, self.strategy + "{}: valid options for 'strategy' are {}. Got strategy={!r} instead.".format( + KBinsDiscretizer.__name__, self.valid_strategies, self.strategy ) ) - for column_name in tqdm( - column_names, desc="Computing discretization bins..." - ): + for column_name in tqdm(column_names, desc="Computing discretization bins..."): if column_name not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" - .format(column_name) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) continue bins = self._fit_column(data, column_name) @@ -212,11 +203,7 @@ def fit(self, data: pd.DataFrame, column_names: list): # Add to bins_by_column for later use self._bins_by_column[column_name] = bins - def _fit_column( - self, - data: pd.DataFrame, - column_name: str - ) -> List[tuple]: + def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tuple]]: """Compute bins for a specific column in data. Parameters @@ -234,14 +221,10 @@ def _fit_column( col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning( - "Predictor '{}' is constant and will be ignored in computation" - .format(column_name) - ) + log.warning("Predictor '{}' is constant and will be ignored in computation".format(column_name)) return None - prop_inf = (np.sum(np.isinf(data[column_name])) - / data[column_name].shape[0]) + prop_inf = np.sum(np.isinf(data[column_name])) / data[column_name].shape[0] if prop_inf > 0: log.warning( @@ -254,31 +237,19 @@ def _fit_column( prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] if prop_nan >= 0.99: - log.warning( - f"Column {column_name} is" - f" {prop_nan:.1%}% NaNs, " - f"consider dropping or transforming it." 
-        )
+        log.warning(f"Column {column_name} is {prop_nan:.1%} NaNs, consider dropping or transforming it.")

         n_bins = self.n_bins

         if self.auto_adapt_bins:
             size = len(data.index)
-            missing_pct = data[column_name].isnull().sum()/size
+            missing_pct = data[column_name].isnull().sum() / size
             n_bins = int(max(round((1 - missing_pct) * n_bins), 2))

-        bin_edges = self._compute_bin_edges(
-            data,
-            column_name,
-            n_bins,
-            col_min,
-            col_max
-        )
+        bin_edges = self._compute_bin_edges(data, column_name, n_bins, col_min, col_max)

         if len(bin_edges) < 3:
             log.warning(
-                "Only 1 bin was found for predictor '{}' so it will "
-                "be ignored in computation"
-                .format(column_name)
+                "Only 1 bin was found for predictor '{}' so it will " "be ignored in computation".format(column_name)
             )
             return None

@@ -286,17 +257,12 @@
         log.warning(
             "The number of actual bins for predictor '{}' is {} "
             "which is smaller than the requested number of bins "
-            "{}"
-            .format(column_name, len(bin_edges) - 1, n_bins)
+            "{}".format(column_name, len(bin_edges) - 1, n_bins)
         )

         return self._compute_bins_from_edges(bin_edges)

-    def transform(
-        self,
-        data: pd.DataFrame,
-        column_names: list
-    ) -> pd.DataFrame:
+    def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame:
         """Discretize the data in the given list of columns.

         This is done by mapping each number to
@@ -315,16 +281,12 @@ def transform(
         data with additional discretized variables
         """
         if len(self._bins_by_column) == 0:
-            msg = (
-                "{} instance is not fitted yet. Call 'fit' with "
-                "appropriate arguments before using this method."
-            )
+            msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method."
             raise NotFittedError(msg.format(self.__class__.__name__))

         for column_name in tqdm(column_names, desc="Discretizing columns..."):
             if column_name not in self._bins_by_column:
-                log.warning("Column '{}' is not in fitted output "
-                            "and will be skipped".format(column_name))
+                log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name))
                 continue

             # can be None for a column with a constant value!
@@ -334,11 +296,7 @@ def transform(
         return data

-    def _transform_column(
-        self, data: pd.DataFrame,
-        column_name: str,
-        bins: List[tuple]
-    ) -> pd.DataFrame:
+    def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame:
         """Create a new column with binned values of column_name.

         Parameters
@@ -360,18 +318,12 @@ def _transform_column(
         column_name_bin = column_name + "_bin"

         # use pd.cut to compute bins
-        data.loc[:, column_name_bin] = pd.cut(
-            x=data[column_name],
-            bins=interval_idx
-        )
+        data.loc[:, column_name_bin] = pd.cut(x=data[column_name], bins=interval_idx)

         # Rename bins so that the output has a proper format
         bin_labels = self._create_bin_labels(bins)

-        data.loc[:, column_name_bin] = (
-            data[column_name_bin]
-            .cat.rename_categories(bin_labels)
-        )
+        data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories(bin_labels)

         if data[column_name_bin].isnull().sum() > 0:

@@ -384,11 +336,7 @@ def _transform_column(
         return data

-    def fit_transform(
-        self,
-        data: pd.DataFrame,
-        column_names: list
-    ) -> pd.DataFrame:
+    def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame:
        """Fit to data, then transform it. 
Parameters @@ -407,12 +355,7 @@ def fit_transform( return self.transform(data, column_names) def _compute_bin_edges( - self, - data: pd.DataFrame, - column_name: str, - n_bins: int, - col_min: float, - col_max: float + self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float ) -> list: """Compute the desired bin edges. @@ -434,6 +377,7 @@ def _compute_bin_edges( list list of bin edges from which to compute the bins """ + # fmt: off bin_edges = [] if self.strategy == "quantile": bin_edges = list( @@ -445,6 +389,7 @@ def _compute_bin_edges( ) elif self.strategy == "uniform": bin_edges = list(np.linspace(col_min, col_max, n_bins + 1)) + # fmt: on # nans lead to unexpected behavior during sorting, # by replacing with inf we ensure these stay at the @@ -456,9 +401,7 @@ def _compute_bin_edges( bin_edges[-1] = np.inf if np.isnan(bin_edges).sum() > 0: - log.warning( - f"Column {column_name} has NaNs present in bin definitions" - ) + log.warning(f"Column {column_name} has NaNs present in bin definitions") # Make absolutely sure bin edges are ordered, # in very rare situations this wasn't the case @@ -532,10 +475,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index( - intervals: List[tuple], - closed: str = "right" - ) -> pd.IntervalIndex: + def _create_index(intervals: List[tuple], closed: str = "right") -> pd.IntervalIndex: """ Create an pd.IntervalIndex based on a list of tuples. @@ -558,6 +498,7 @@ def _create_index( Description """ # check if closed is of the proper form + # fmt: off if closed not in ["left", "right"]: raise ValueError( "{}: valid options for 'closed' are {}. " @@ -567,6 +508,7 @@ def _create_index( ["left", "right"], closed ) ) + # fmt: on # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) @@ -593,6 +535,7 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: list of (formatted) bin labels """ bin_labels = [] + # fmt: off for interval in bins: bin_labels.append( self.label_format @@ -601,6 +544,7 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: interval[1] ) ) + # fmt: on # Format first and last bin as < x and > y resp. if self.change_endpoint_format: diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 64c0fa9..3a82efa 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -7,6 +7,7 @@ import logging from random import shuffle from datetime import datetime +from typing import Any, Set # third party imports import pandas as pd @@ -58,7 +59,7 @@ def __init__( categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, - is_fitted: bool = False + is_fitted: bool = False, ): """Initialize the PreProcessor class.""" self._categorical_data_processor = categorical_data_processor @@ -88,7 +89,7 @@ def from_params( scale_contingency_table: bool = True, forced_categories: dict = {}, weight: float = 0.0, - imputation_strategy: str = "mean" + imputation_strategy: str = "mean", ): """ Instantiate a PreProcessor from given or default params. 
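For context on the KBinsDiscretizer hunks above: the two binning strategies differ only in where the edges are placed. A minimal sketch with toy values follows; the exact percentile interpolation used upstream is hidden inside the `# fmt: off` block, so plain np.percentile is assumed here.

import numpy as np
import pandas as pd

# Toy numeric column; values are illustrative only.
data = pd.DataFrame({"age": [18, 22, 25, 31, 40, 41, 52, 60, 64, 70]})
n_bins = 4

# "quantile" strategy: edges at evenly spaced percentiles of the data,
# deduplicated so heavily repeated values cannot yield empty bins
quantile_edges = list(
    np.unique(np.percentile(data["age"].dropna(), np.linspace(0, 100, n_bins + 1)))
)

# "uniform" strategy: evenly spaced edges between the column min and max
uniform_edges = list(np.linspace(data["age"].min(), data["age"].max(), n_bins + 1))

print(quantile_edges)
print(uniform_edges)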
@@ -159,19 +160,16 @@ def from_params( categorical_data_processor = CategoricalDataProcessor( model_type, regroup, - regroup_name, keep_missing, + regroup_name, + keep_missing, category_size_threshold, p_value_threshold, scale_contingency_table, - forced_categories + forced_categories, ) discretizer = KBinsDiscretizer( - n_bins, strategy, closed, - auto_adapt_bins, - starting_precision, - label_format, - change_endpoint_format + n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -203,15 +201,10 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError( - "Invalid pipeline, as it does not " - "contain all and only the required parameters." - ) + raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") categorical_data_processor = CategoricalDataProcessor() - categorical_data_processor.set_attributes_from_dict( - pipeline["categorical_data_processor"] - ) + categorical_data_processor.set_attributes_from_dict(pipeline["categorical_data_processor"]) # model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() @@ -220,20 +213,9 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls( - categorical_data_processor, - discretizer, - target_encoder, - is_fitted=pipeline["_is_fitted"] - ) + return cls(categorical_data_processor, discretizer, target_encoder, is_fitted=pipeline["_is_fitted"]) - def fit( - self, - train_data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list, - target_column_name: str - ): + def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str): """Fit the data to the preprocessing pipeline. Parameters @@ -248,13 +230,7 @@ def fit( Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = ( - PreProcessor - ._get_variable_list( - continuous_vars, - discrete_vars - ) - ) + preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) log.info("Starting to fit pipeline") start = time.time() @@ -268,55 +244,25 @@ def fit( if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info( - "Fitting KBinsDiscretizer took {} seconds" - .format(time.time() - begin) - ) + log.info("Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin)) - train_data = self._discretizer.transform( - train_data, - continuous_vars - ) + train_data = self._discretizer.transform(train_data, continuous_vars) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit( - train_data, - discrete_vars, - target_column_name - ) - log.info( - "Fitting categorical_data_processor class took {} seconds" - .format(time.time() - begin) - ) + self._categorical_data_processor.fit(train_data, discrete_vars, target_column_name) + log.info("Fitting categorical_data_processor class took {} seconds".format(time.time() - begin)) - train_data = ( - self._categorical_data_processor - .transform(train_data, discrete_vars) - ) + train_data = self._categorical_data_processor.transform(train_data, discrete_vars) begin = time.time() - self._target_encoder.fit( - train_data, preprocessed_variable_names, - target_column_name - ) - log.info( - "Fitting TargetEncoder took {} seconds" - .format(time.time() - begin) - ) + self._target_encoder.fit(train_data, preprocessed_variable_names, target_column_name) + log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True - log.info( - "Fitting pipeline took {} seconds" - .format(time.time() - start) - ) + log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def transform( - self, - data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -342,46 +288,26 @@ def transform( if not self._is_fitted: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = ( - PreProcessor - ._get_variable_list( - continuous_vars, - discrete_vars - ) - ) + preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) if discrete_vars: - data = self._categorical_data_processor.transform( - data, - discrete_vars - ) + data = self._categorical_data_processor.transform(data, discrete_vars) - data = self._target_encoder.transform( - data, - preprocessed_variable_names - ) + data = self._target_encoder.transform(data, preprocessed_variable_names) - log.info( - "Transforming data took {} seconds" - .format(time.time() - start) - ) + log.info("Transforming data took {} seconds".format(time.time() - start)) return data def fit_transform( - self, - train_data: pd.DataFrame, - continuous_vars: list, - discrete_vars: list, - target_column_name: str + self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. @@ -401,21 +327,13 @@ def fit_transform( pd.DataFrame Transformed (preprocessed) data. """ - self.fit( - train_data, - continuous_vars, - discrete_vars, - target_column_name - ) + self.fit(train_data, continuous_vars, discrete_vars, target_column_name) return self.transform(train_data, continuous_vars, discrete_vars) @staticmethod def train_selection_validation_split( - data: pd.DataFrame, - train_prop: float = 0.6, - selection_prop: float = 0.2, - validation_prop: float = 0.2 + data: pd.DataFrame, train_prop: float = 0.6, selection_prop: float = 0.2, validation_prop: float = 0.2 ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. @@ -440,10 +358,7 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError( - "The sum of train_prop, selection_prop and " - "validation_prop must be 1.0." - ) + raise ValueError("The sum of train_prop, selection_prop and " "validation_prop must be 1.0.") if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -455,22 +370,19 @@ def train_selection_validation_split( size_train = int(train_prop * nrows) size_select = int(selection_prop * nrows) size_valid = int(validation_prop * nrows) - correction = nrows - (size_train+size_select+size_valid) + correction = nrows - (size_train + size_select + size_valid) split = ( - ['train'] * size_train - + ['train'] * correction - + ['selection'] * size_select - + ['validation'] * size_valid + ["train"] * size_train + ["train"] * correction + ["selection"] * size_select + ["validation"] * size_valid ) shuffle(split) - data['split'] = split + data["split"] = split return data - def serialize_pipeline(self) -> dict: + def serialize_pipeline(self) -> dict[str, Any]: """ Serialize the preprocessing pipeline. @@ -482,23 +394,13 @@ def serialize_pipeline(self) -> dict: dict Return the pipeline as a dictionary. 
""" - pipeline = { - "metadata": { - "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") - } - } - - pipeline["categorical_data_processor"] = ( - self - ._categorical_data_processor - .attributes_to_dict() - ) + pipeline: dict[str, Any] + pipeline = {"metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")}} + + pipeline["categorical_data_processor"] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() - pipeline["target_encoder"] = ( - self._target_encoder - .attributes_to_dict() - ) + pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() pipeline["_is_fitted"] = True @@ -514,20 +416,11 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set( - [ - key for key in keys - if key not in ["cls", "serialization_path"] - ] - ) + valid_keys = set([key for key in keys if key not in ["cls", "serialization_path"]]) - input_keys = set() + input_keys: Set[str] = set() for key in pipeline: - if key in [ - "categorical_data_processor", - "discretizer", - "target_encoder" - ]: + if key in ["categorical_data_processor", "discretizer", "target_encoder"]: input_keys = input_keys.union(set(pipeline[key].keys())) elif key != "metadata": input_keys.add(key) @@ -560,8 +453,7 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = ([col + "_processed" for col in discrete_vars] - + [col + "_bin" for col in continuous_vars]) + var_list = [col + "_processed" for col in discrete_vars] + [col + "_bin" for col in continuous_vars] if not var_list: raise ValueError("Variable var_list is None or empty list.") diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 7485b6b..cd6bc34 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -68,21 +68,14 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") - def __init__( - self, weight: float = 0.0, - imputation_strategy: str = "mean" - ): + def __init__(self, weight: float = 0.0, imputation_strategy: str = "mean"): """Initialize the TargetEncoder class.""" if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError( "Valid options for 'imputation_strategy' are {}. " - "Got imputation_strategy={!r} instead." - .format( - self.valid_imputation_strategies, - imputation_strategy - ) + "Got imputation_strategy={!r} instead.".format(self.valid_imputation_strategies, imputation_strategy) ) if weight == 0: @@ -98,7 +91,7 @@ def __init__( self._mapping = {} # placeholder for fitted output # placeholder for the global incidence of the data used for fitting - self._global_mean = None + self._global_mean: float def attributes_to_dict(self) -> dict: """Return the attributes of TargetEncoder in a dictionary. 
@@ -111,10 +104,7 @@ def attributes_to_dict(self) -> dict: """ params = self.get_params() - params["_mapping"] = { - key: value.to_dict() - for key, value in self._mapping.items() - } + params["_mapping"] = {key: value.to_dict() for key, value in self._mapping.items()} params["_global_mean"] = self._global_mean @@ -132,8 +122,7 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] - if ("imputation_strategy" in params and - params["imputation_strategy"] in self.valid_imputation_strategies): + if "imputation_strategy" in params and params["imputation_strategy"] in self.valid_imputation_strategies: self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -148,19 +137,11 @@ def dict_to_series(key, value): s.index.name = key return s - self._mapping = { - key: dict_to_series(key, value) - for key, value in _mapping.items() - } + self._mapping = {key: dict_to_series(key, value) for key, value in _mapping.items()} return self - def fit( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the TargetEncoder to the data. Parameters @@ -179,11 +160,7 @@ def fit( for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning( - "DataFrame has no column '{}', so it will be " - "skipped in fitting" - .format(column) - ) + log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column)) continue self._mapping[column] = self._fit_column(data[column], y) @@ -211,18 +188,13 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: stats = y.groupby(X).agg(["mean", "count"]) # Note: if self.weight = 0, we have the ordinary incidence replacement - numerator = (stats["count"] * stats["mean"] - + self.weight * self._global_mean) + numerator = stats["count"] * stats["mean"] + self.weight * self._global_mean denominator = stats["count"] + self.weight return numerator / denominator - def transform( - self, - data: pd.DataFrame, - column_names: list - ) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Replace (e.g. encode) values of each categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -248,29 +220,22 @@ def transform( """ if (len(self._mapping) == 0) or (self._global_mean is None): msg = ( - "This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): if column not in data.columns: - log.warning("Unknown column '{}' will be skipped." 
- .format(column)) + log.warning("Unknown column '{}' will be skipped.".format(column)) continue elif column not in self._mapping: - log.warning("Column '{}' is not in fitted output " - "and will be skipped.".format(column)) + log.warning("Column '{}' is not in fitted output " "and will be skipped.".format(column)) continue data = self._transform_column(data, column) return data - def _transform_column( - self, - data: pd.DataFrame, - column_name: str - ) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Replace (e.g. encode) values of a categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -293,10 +258,7 @@ def _transform_column( # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = ( - data[column_name].map(self._mapping[column_name]) - .astype("float") - ) + data[new_column] = data[column_name].map(self._mapping[column_name]).astype("float") # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -312,12 +274,7 @@ def _transform_column( return data - def fit_transform( - self, - data: pd.DataFrame, - column_names: list, - target_column: str - ) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters diff --git a/cobra/utils.py b/cobra/utils.py index b7727dd..0287947 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -8,6 +8,4 @@ def clean_predictor_name(predictor_name: str) -> str: This is done by stripping the redundant suffix (e.g. 
"_enc" or "_bin") off from the end of the predictor name to return a clean version of the predictor """ - return (predictor_name.replace("_enc", "") - .replace("_bin", "") - .replace("_processed", "")) + return predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") diff --git a/requirements.dev.txt b/requirements.dev.txt index 3d87710..9534dc0 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,6 +1,6 @@ +black>=22.3.0 mypy>=0.942 pycodestyle>=2.8.0 pydocstyle>=6.1.1 -pylint>=2.13.7 pytest>=7.1.1 pytest-mock>=3.7.0 \ No newline at end of file From aac52033e6fb460073b4bf75dfa203edc2208983 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 20 May 2022 13:18:55 +0200 Subject: [PATCH 6/9] feat: add make black command, line length 80 instead of 120 --- Makefile | 5 +- cobra/evaluation/evaluator.py | 112 ++++++++++++++---- cobra/evaluation/pigs_tables.py | 68 +++++++++-- cobra/evaluation/plotting_utils.py | 28 ++++- cobra/model_building/forward_selection.py | 112 ++++++++++++++---- cobra/model_building/models.py | 45 +++++-- cobra/model_building/univariate_selection.py | 54 +++++++-- cobra/preprocessing/__init__.py | 7 +- .../categorical_data_processor.py | 77 +++++++++--- cobra/preprocessing/kbins_discretizer.py | 63 +++++++--- cobra/preprocessing/preprocessor.py | 106 +++++++++++++---- cobra/preprocessing/target_encoder.py | 38 ++++-- cobra/utils.py | 4 +- 13 files changed, 564 insertions(+), 155 deletions(-) diff --git a/Makefile b/Makefile index 4789718..c0b115b 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,10 @@ test-unit: @echo 'unit tests OK' black-check: - black --diff --line-length 120 cobra/ + black --diff --line-length 80 cobra/ + +black: + black cobra/ typecheck: mypy cobra --allow-redefinition --allow-untyped-globals --ignore-missing-imports diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 22e034b..278bf9d 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -61,7 +61,9 @@ class ClassificationEvaluator: (by default 10, so deciles). 
""" - def __init__(self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10): + def __init__( + self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10 + ): """Initialize the ClassificationEvaluator.""" self.y_true: np.ndarray self.y_pred: np.ndarray @@ -91,14 +93,21 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): # if probability_cutoff is not set, take the optimal cut-off if not self.probability_cutoff: - self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) + self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) # Transform probabilities to binary array using cut-off - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 for pred in y_pred]) + y_pred_b = np.array( + [0 if pred <= self.probability_cutoff else 1 for pred in y_pred] + ) # Compute the various evaluation metrics self.scalar_metrics = cast( - pd.Series, ClassificationEvaluator._compute_scalar_metrics(y_true, y_pred, y_pred_b, self.lift_at) + pd.Series, + ClassificationEvaluator._compute_scalar_metrics( + y_true, y_pred, y_pred_b, self.lift_at + ), ) self.y_true = y_true @@ -106,8 +115,12 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} self.confusion_matrix = confusion_matrix(y_true, y_pred_b) - self.lift_curve = ClassificationEvaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins) - self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains(y_true, y_pred) + self.lift_curve = ClassificationEvaluator._compute_lift_per_bin( + y_true, y_pred, self.n_bins + ) + self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains( + y_true, y_pred + ) @staticmethod def _compute_scalar_metrics( @@ -153,7 +166,10 @@ def _compute_scalar_metrics( "F1": f1_score(y_true, y_pred_b, average=None)[1], "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), f"lift at {lift_at}": np.round( - ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=lift_at), 2 + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=lift_at + ), + 2, ), } ) @@ -175,7 +191,8 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.roc_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -202,7 +219,9 @@ def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): plt.show() - def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: list = None): + def plot_confusion_matrix( + self, path: str = None, dim: tuple = (12, 8), labels: list = None + ): """Plot the confusion matrix. Parameters @@ -222,7 +241,8 @@ def plot_confusion_matrix(self, path: str = None, dim: tuple = (12, 8), labels: labels = labels or DEFAULT_LABELS if self.confusion_matrix is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -259,7 +279,8 @@ def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)) """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -319,7 +340,8 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """ if self.lift_curve is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -334,7 +356,15 @@ def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=1, color="darkorange", linestyle="--", xmin=0.05, xmax=0.95, linewidth=3, label="Baseline") + plt.axhline( + y=1, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="Baseline", + ) # Legend ax.legend(loc="upper right") @@ -373,7 +403,14 @@ def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): linewidth=3, label="cumulative gains", ) - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, ls="--", color="darkorange", label="random selection") + ax.plot( + ax.get_xlim(), + ax.get_ylim(), + linewidth=3, + ls="--", + color="darkorange", + label="random selection", + ) ax.set_title("Cumulative Gains curve", fontsize=20) @@ -417,7 +454,9 @@ def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: return ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray) -> float: + def _compute_optimal_cutoff( + fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray + ) -> float: """Calculate the optimal probability cut-off point for a classification model. The optimal cut-off would be where TPR is high and FPR is low, hence @@ -445,7 +484,9 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, thresholds: np.nda return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + def _compute_cumulative_gains( + y_true: np.ndarray, y_pred: np.ndarray + ) -> tuple[np.ndarray, np.ndarray]: """Compute cumulative gains of the model. Code from (https://github.com/reiinakano/scikit-plot/blob/ @@ -503,7 +544,9 @@ def _compute_lift_per_bin( Includes x-labels, lifts per decile, and target incidence. """ lifts = [ - ClassificationEvaluator._compute_lift(y_true=y_true, y_pred=y_pred, lift_at=perc_lift) + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=perc_lift + ) for perc_lift in np.linspace(1 / n_bins, 1, num=n_bins, endpoint=True) ] @@ -512,7 +555,9 @@ def _compute_lift_per_bin( return x_labels, lifts, cast(float, y_true.mean()) @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05) -> float: + def _compute_lift( + y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05 + ) -> float: """Calculate lift on a specified level. Parameters @@ -589,7 +634,9 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): Model scores. 
""" # Compute the various evaluation metrics - self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics(y_true, y_pred) + self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics( + y_true, y_pred + ) self.y_true = y_true self.y_pred = y_pred @@ -651,7 +698,9 @@ def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: df["z_res"] = df["res"].apply(lambda x: (x - m) / s) df["rank"] = df.index + 1 - df["percentile"] = df["rank"].apply(lambda x: x / (n + 1)) # divide by n+1 to avoid inf + df["percentile"] = df["rank"].apply( + lambda x: x / (n + 1) + ) # divide by n+1 to avoid inf df["q_theoretical"] = norm.ppf(df["percentile"]) return pd.Series( @@ -678,7 +727,8 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """ if self.y_true is None and self.y_pred is None: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -690,7 +740,9 @@ def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): x = np.arange(1, len(y_true) + 1) - ax.plot(x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3) + ax.plot( + x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3 + ) ax.plot(x, y_pred, label="predictions", color="cornflowerblue", linewidth=3) ax.set_xlabel("Index", fontsize=15) @@ -728,14 +780,24 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): x = self.qq["quantiles"] y = self.qq["residuals"] - ax.plot(x, x, ls="--", label="perfect model", color="darkorange", linewidth=3) + ax.plot( + x, x, ls="--", label="perfect model", color="darkorange", linewidth=3 + ) ax.plot(x, y, label="current model", color="cornflowerblue", linewidth=3) ax.set_xlabel("Theoretical quantiles", fontsize=15) - ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1)) + ax.set_xticks( + range( + int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1 + ) + ) ax.set_ylabel("Standardized residuals", fontsize=15) - ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1)) + ax.set_yticks( + range( + int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1 + ) + ) ax.legend(loc="best") ax.set_title("Q-Q plot", fontsize=20) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index dfbd85c..08e89e2 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -10,7 +10,10 @@ def generate_pig_tables( - basetable: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list + basetable: pd.DataFrame, + id_column_name: str, + target_column_name: str, + preprocessed_predictors: list, ) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. @@ -44,7 +47,10 @@ def generate_pig_tables( def compute_pig_table( - basetable: pd.DataFrame, predictor_column_name: str, target_column_name: str, id_column_name: str + basetable: pd.DataFrame, + predictor_column_name: str, + target_column_name: str, + id_column_name: str, ) -> pd.DataFrame: """Compute the PIG table of a given predictor for a given target. 
@@ -73,7 +79,13 @@ def compute_pig_table( basetable.groupby(predictor_column_name) .agg({target_column_name: "mean", id_column_name: "size"}) .reset_index() - .rename(columns={predictor_column_name: "label", target_column_name: "avg_target", id_column_name: "pop_size"}) + .rename( + columns={ + predictor_column_name: "label", + target_column_name: "avg_target", + id_column_name: "pop_size", + } + ) ) # add the column name to a variable column @@ -90,7 +102,11 @@ def compute_pig_table( def plot_incidence( - pig_tables: pd.DataFrame, variable: str, model_type: str, column_order: list = None, dim: tuple = (12, 8) + pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8), ): """Plot a Predictor Insights Graph (PIG). @@ -125,14 +141,19 @@ def plot_incidence( """ if model_type not in ["classification", "regression"]: raise ValueError( - "An unexpected value was set for the model_type " "parameter. Expected 'classification' or " "'regression'." + "An unexpected value was set for the model_type " + "parameter. Expected 'classification' or " + "'regression'." ) df_plot = pig_tables[pig_tables["variable"] == variable].copy() if column_order is not None: if not set(df_plot["label"]) == set(column_order): - raise ValueError("The column_order and pig_tables parameters do not contain " "the same set of variables.") + raise ValueError( + "The column_order and pig_tables parameters do not contain " + "the same set of variables." + ) df_plot["label"] = df_plot["label"].astype("category") df_plot["label"].cat.reorder_categories(column_order, inplace=True) @@ -156,7 +177,9 @@ def plot_incidence( marker=".", markersize=20, linewidth=3, - label="incidence rate per bin" if model_type == "classification" else "mean target value per bin", + label="incidence rate per bin" + if model_type == "classification" + else "mean target value per bin", zorder=10, ) @@ -166,7 +189,9 @@ def plot_incidence( color="#022252", linestyle="--", linewidth=4, - label="average incidence rate" if model_type == "classification" else "global mean target value", + label="average incidence rate" + if model_type == "classification" + else "global mean target value", zorder=10, ) @@ -174,7 +199,10 @@ def plot_incidence( ax.plot(np.nan, "#939598", linewidth=6, label="bin size") # Set labels & ticks - ax.set_ylabel("incidence" if model_type == "classification" else "mean target value", fontsize=16) + ax.set_ylabel( + "incidence" if model_type == "classification" else "mean target value", + fontsize=16, + ) ax.set_xlabel(f"{variable} bins" "", fontsize=16) ax.xaxis.set_tick_params(labelsize=14) plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") @@ -196,9 +224,17 @@ def plot_incidence( # the bins and versus the global avg. target. # (Motivation for the AND above: if on one end there IS enough # difference, the effect that we discuss here does not occur.) - global_avg_target = max(df_plot["global_avg_target"]) # series of same number, for every bin. - if (np.abs((max(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25) and ( - np.abs((min(df_plot["avg_target"]) - global_avg_target)) / global_avg_target < 0.25 + global_avg_target = max( + df_plot["global_avg_target"] + ) # series of same number, for every bin. 
+ if ( + np.abs((max(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 + ) and ( + np.abs((min(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 ): ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25) @@ -212,7 +248,13 @@ def plot_incidence( # ----------------- ax2 = ax.twinx() - ax2.bar(df_plot["label"], df_plot["pop_size"], align="center", color="#939598", zorder=1) + ax2.bar( + df_plot["label"], + df_plot["pop_size"], + align="center", + color="#939598", + zorder=1, + ) # Set labels & ticks ax2.set_xlabel(f"{variable} bins" "", fontsize=16) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 19fbf64..5e77192 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -12,7 +12,9 @@ DEFAULT_COLOURS = {"train": "#0099bf", "selection": "#ff9500", "validation": "#8064a2"} -def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None): +def plot_univariate_predictor_quality( + df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot univariate quality of the predictors. Parameters @@ -33,7 +35,9 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, metric = "RMSE" ascending = True - df = df_metric[df_metric["preselection"]].sort_values(by=metric + " selection", ascending=ascending) + df = df_metric[df_metric["preselection"]].sort_values( + by=metric + " selection", ascending=ascending + ) df = pd.melt( df, @@ -62,7 +66,9 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, dim: tuple = (12, plt.show() -def plot_correlation_matrix(df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None): +def plot_correlation_matrix( + df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot correlation matrix amongst the predictors. Parameters @@ -158,7 +164,9 @@ def plot_performance_curves( # Set x- and y-ticks ax.set_xticks(np.arange(len(model_performance["last_added_predictor"]))) - ax.set_xticklabels(model_performance["last_added_predictor"].tolist(), rotation=40, ha="right") + ax.set_xticklabels( + model_performance["last_added_predictor"].tolist(), rotation=40, ha="right" + ) if model_type == "classification": ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05)) @@ -181,7 +189,10 @@ def plot_performance_curves( def plot_variable_importance( - df_variable_importance: pd.DataFrame, title: str = None, dim: tuple = (12, 8), path: str = None + df_variable_importance: pd.DataFrame, + title: str = None, + dim: tuple = (12, 8), + path: str = None, ): """Plot variable importance of a given model. @@ -198,7 +209,12 @@ def plot_variable_importance( """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # pylint: disable=unused-variable - ax = sns.barplot(x="importance", y="predictor", data=df_variable_importance, color="cornflowerblue") + ax = sns.barplot( + x="importance", + y="predictor", + data=df_variable_importance, + color="cornflowerblue", + ) if title: ax.set_title(title) else: diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index ee75b02..1733616 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -43,7 +43,12 @@ class ForwardFeatureSelection: List of fitted models. 
""" - def __init__(self, model_type: str = "classification", max_predictors: int = 50, pos_only: bool = True): + def __init__( + self, + model_type: str = "classification", + max_predictors: int = 50, + pos_only: bool = True, + ): """Initialize the ForwardFeatureSelection class.""" self.model_type = model_type if model_type == "classification": @@ -75,7 +80,10 @@ def get_model_from_step(self, step: int) -> Model: In case step is larger than the number of available models. """ if len(self._fitted_models) <= step: - raise ValueError(f"No model available for step {step}. " "The first step starts from index 0.") + raise ValueError( + f"No model available for step {step}. " + "The first step starts from index 0." + ) return self._fitted_models[step] @@ -122,7 +130,10 @@ def compute_model_performances( for model in self._fitted_models: last_added_predictor = set(model.predictors).difference(predictor_set) - tmp = {"predictors": model.predictors, "last_added_predictor": list(last_added_predictor)[0]} + tmp = { + "predictors": model.predictors, + "last_added_predictor": list(last_added_predictor)[0], + } # Evaluate model on each dataset split, # e.g. train-selection-validation @@ -179,26 +190,44 @@ def fit( In case the number of forced predictors is larger than the maximum number of allowed predictors in the model. """ - assert "split" in train_data.columns, "The train_data input df does not include a split column." assert ( - len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0 + "split" in train_data.columns + ), "The train_data input df does not include a split column." + assert ( + len( + set(["train", "selection"]).difference( + set(train_data["split"].unique()) + ) + ) + == 0 ), "The train_data input df does not include a 'train' and 'selection' split." # remove excluded predictors from predictor lists forced_predictors = forced_predictors or DEFAULT_FORCED_PREDICTORS excluded_predictors = excluded_predictors or DEFAULT_EXCLUDED_PREDICTORS filtered_predictors = [ - var for var in predictors if (var not in excluded_predictors and var not in forced_predictors) + var + for var in predictors + if (var not in excluded_predictors and var not in forced_predictors) ] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: - raise ValueError("Size of forced_predictors cannot be bigger than " "max_predictors.") + raise ValueError( + "Size of forced_predictors cannot be bigger than " "max_predictors." + ) elif len(forced_predictors) == self.max_predictors: - log.info("Size of forced_predictors equals max_predictors " "only one model will be trained...") + log.info( + "Size of forced_predictors equals max_predictors " + "only one model will be trained..." + ) # train model with all forced_predictors (only) self._fitted_models.append( - self._train_model(train_data[train_data["split"] == "train"], target_column_name, forced_predictors) + self._train_model( + train_data[train_data["split"] == "train"], + target_column_name, + forced_predictors, + ) ) else: @@ -207,7 +236,11 @@ def fit( ) def _forward_selection( - self, train_data: pd.DataFrame, target_column_name: str, predictors: list, forced_predictors: list = None + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = None, ) -> list[Model]: """Perform the forward feature selection algorithm. 
@@ -237,22 +270,34 @@ def _forward_selection( fitted_models: list[Model] = [] current_predictors: list[str] = [] - max_steps = 1 + min(self.max_predictors, len(predictors) + len(forced_predictors)) + max_steps = 1 + min( + self.max_predictors, len(predictors) + len(forced_predictors) + ) - for step in tqdm(range(1, max_steps), desc="Sequentially adding best predictor..."): + for step in tqdm( + range(1, max_steps), desc="Sequentially adding best predictor..." + ): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [var for var in forced_predictors if var not in current_predictors] + candidate_predictors = [ + var for var in forced_predictors if var not in current_predictors + ] else: candidate_predictors = [ - var for var in (predictors + forced_predictors) if var not in current_predictors + var + for var in (predictors + forced_predictors) + if var not in current_predictors ] - model = self._find_next_best_model(train_data, target_column_name, candidate_predictors, current_predictors) + model = self._find_next_best_model( + train_data, target_column_name, candidate_predictors, current_predictors + ) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list(set(current_predictors).union(set(model.predictors))) + current_predictors = list( + set(current_predictors).union(set(model.predictors)) + ) fitted_models.append(model) # else: @@ -267,7 +312,11 @@ def _forward_selection( return fitted_models def _find_next_best_model( - self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, current_predictors: list + self, + train_data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list, ) -> Model: """ Find the next best model with candidate predictors. @@ -312,16 +361,24 @@ def _find_next_best_model( "ForwardFeatureSelection argument." ) - fit_data = train_data[train_data["split"] == "train"] # data to fit the models with - sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with + fit_data = train_data[ + train_data["split"] == "train" + ] # data to fit the models with + sel_data = train_data[ + train_data["split"] == "selection" + ] # data to compare the models with for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model(fit_data, target_column_name, (current_predictors + [pred])) + model = self._train_model( + fit_data, target_column_name, (current_predictors + [pred]) + ) # Evaluate the model performance = model.evaluate( - sel_data[current_predictors + [pred]], sel_data[target_column_name], split="selection" + sel_data[current_predictors + [pred]], + sel_data[target_column_name], + split="selection", ) if self.pos_only and (not (model.get_coef() >= 0).all()): @@ -329,16 +386,23 @@ def _find_next_best_model( # Check if the model is better than the current best model # and if it is, replace the current best. 
- if self.MLModel == LogisticRegressionModel and performance > best_performance: # AUC metric is used + if ( + self.MLModel == LogisticRegressionModel + and performance > best_performance + ): # AUC metric is used best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel and performance < best_performance: # RMSE metric is used + elif ( + self.MLModel == LinearRegressionModel and performance < best_performance + ): # RMSE metric is used best_performance = performance best_model = model return cast(Model, best_model) - def _train_model(self, train_data: pd.DataFrame, target_column_name: str, predictors: list) -> Model: + def _train_model( + self, train_data: pd.DataFrame, target_column_name: str, predictors: list + ) -> Model: """Train the model with a given set of predictors. Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 408ead4..ba8befd 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -33,7 +33,9 @@ class LogisticRegressionModel: def __init__(self): """Initialize the LogisticRegressionModel class.""" - self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver="liblinear", random_state=42) + self.logit = LogisticRegression( + fit_intercept=True, C=1e9, solver="liblinear", random_state=42 + ) self._is_fitted = False # placeholder to keep track of a list of predictors self.predictors = [] @@ -152,7 +154,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """ Evaluate the model on a given dataset (X, y). 
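
A note on the two metrics used throughout these model classes: logistic models are compared on AUC, where higher is better, and linear models on RMSE, where lower is better, which is why the comparison in `_find_next_best_model` flips direction per model type. Both metrics in isolation, using scikit-learn (already imported elsewhere in these files):

    import numpy as np
    from sklearn.metrics import mean_squared_error, roc_auc_score

    y_true = np.array([0, 1, 1, 0, 1])
    y_score = np.array([0.2, 0.8, 0.6, 0.4, 0.9])

    auc = roc_auc_score(y_true, y_score)                 # maximised during selection
    rmse = np.sqrt(mean_squared_error(y_true, y_score))  # minimised during selection
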
@@ -185,7 +193,9 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Opt y_pred = self.score_model(X) fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds) + cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) performance = metric(y_true=y, y_pred=y_pred_b) @@ -219,11 +229,15 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] + utils.clean_predictor_name(predictor): stats.pearsonr( + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] return df.sort_values(by="importance", ascending=False).reset_index(drop=True) @@ -285,7 +299,10 @@ def serialize(self) -> dict: if self._is_fitted: serialized_model.update( - {"coef_": self.linear.coef_.tolist(), "intercept_": self.linear.intercept_.tolist()} + { + "coef_": self.linear.coef_.tolist(), + "intercept_": self.linear.intercept_.tolist(), + } ) return serialized_model @@ -374,7 +391,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.linear.predict(X[self.predictors]) - def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """Evaluate the model on a given dataset (X, y). 
The optional split @@ -434,11 +457,15 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: y_pred = self.score_model(data) importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr(data[predictor], y_pred)[0] + utils.clean_predictor_name(predictor): stats.pearsonr( + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] return df.sort_values(by="importance", ascending=False).reset_index(drop=True) diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 2d90b48..d6c1901 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -74,14 +74,22 @@ def compute_univariate_preselection( cleaned_predictor = utils.clean_predictor_name(predictor) auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], y_score=target_enc_train_data[predictor] + y_true=target_enc_train_data[target_column], + y_score=target_enc_train_data[predictor], ) auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], y_score=target_enc_selection_data[predictor] + y_true=target_enc_selection_data[target_column], + y_score=target_enc_selection_data[predictor], ) - result.append({"predictor": cleaned_predictor, "AUC train": auc_train, "AUC selection": auc_selection}) + result.append( + { + "predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection, + } + ) df_auc = pd.DataFrame(result) @@ -95,23 +103,35 @@ def compute_univariate_preselection( df_auc["preselection"] = auc_thresh & auc_overtrain - df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True) + df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index( + drop=True + ) elif model_type == "regression": for predictor in predictors: cleaned_predictor = utils.clean_predictor_name(predictor) rmse_train = sqrt( - mean_squared_error(y_true=target_enc_train_data[target_column], y_pred=target_enc_train_data[predictor]) + mean_squared_error( + y_true=target_enc_train_data[target_column], + y_pred=target_enc_train_data[predictor], + ) ) rmse_selection = sqrt( mean_squared_error( - y_true=target_enc_selection_data[target_column], y_pred=target_enc_selection_data[predictor] + y_true=target_enc_selection_data[target_column], + y_pred=target_enc_selection_data[predictor], ) ) - result.append({"predictor": cleaned_predictor, "RMSE train": rmse_train, "RMSE selection": rmse_selection}) + result.append( + { + "predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection, + } + ) df_rmse = pd.DataFrame(result) @@ -120,12 +140,16 @@ def compute_univariate_preselection( # Identify those variables for which the RMSE difference between train # and selection is within a user-defined ratio - preselect_overtrain = df_rmse["RMSE selection"] - df_rmse["RMSE train"] # flip subtraction vs. AUC + preselect_overtrain = ( + df_rmse["RMSE selection"] - df_rmse["RMSE train"] + ) # flip subtraction vs. 
AUC rmse_overtrain = preselect_overtrain < preselect_overtrain_threshold df_rmse["preselection"] = rmse_thresh & rmse_overtrain - df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better + df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index( + drop=True + ) # lower is better return df_out @@ -147,7 +171,9 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """ if "AUC selection" in df_metric.columns: predictor_list = ( - df_metric[df_metric["preselection"]].sort_values(by="AUC selection", ascending=False).predictor.tolist() + df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist() ) elif "RMSE selection" in df_metric.columns: predictor_list = ( @@ -159,7 +185,9 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: return [col + "_enc" for col in predictor_list] -def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: +def compute_correlations( + target_enc_train_data: pd.DataFrame, predictors: list +) -> pd.DataFrame: """Compute the correlations amongst the predictors in the DataFrame. Parameters @@ -177,7 +205,9 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) """ correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [utils.clean_predictor_name(predictor) for predictor in predictors] + predictors_cleaned = [ + utils.clean_predictor_name(predictor) for predictor in predictors + ] # Change index and columns with the cleaned version of the predictors # e.g. change "var1_enc" with "var1" diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index 55e036b..cd8579a 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -5,4 +5,9 @@ from .categorical_data_processor import CategoricalDataProcessor from .preprocessor import PreProcessor -__all__ = ["KBinsDiscretizer", "TargetEncoder", "CategoricalDataProcessor", "PreProcessor"] +__all__ = [ + "KBinsDiscretizer", + "TargetEncoder", + "CategoricalDataProcessor", + "PreProcessor", +] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 6632720..ba762ed 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -114,7 +114,8 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_cleaned_categories_by_column"] = { - key: list(value) for key, value in self._cleaned_categories_by_column.items() + key: list(value) + for key, value in self._cleaned_categories_by_column.items() } return params @@ -148,7 +149,9 @@ def set_attributes_from_dict(self, params: dict): # of the following method from BaseEstimator: self.set_params(**params) - self._cleaned_categories_by_column = {key: set(value) for key, value in _fitted_output.items()} + self._cleaned_categories_by_column = { + key: set(value) for key, value in _fitted_output.items() + } return self @@ -172,7 +175,10 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column: str): for column_name in tqdm(column_names, desc="Fitting category regrouping..."): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue cleaned_cats = 
self._fit_column(data, column_name, target_column) @@ -207,7 +213,10 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> se model_type = self.model_type if len(data[column_name].unique()) == 1: - log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") + log.warning( + f"Predictor {column_name} is constant" + " and will be ignored in computation." + ) return set(data[column_name].unique()) y = data[target_column] @@ -225,19 +234,25 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> se # do not merge categories in case of dummies, i.e. 0 and 1 # (and possibly "Missing") - if len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories): + if len(unique_categories) == 2 or ( + len(unique_categories) == 3 and "Missing" in unique_categories + ): return set(unique_categories) # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = CategoricalDataProcessor._get_small_categories(X, incidence, self.category_size_threshold) + small_categories = CategoricalDataProcessor._get_small_categories( + X, incidence, self.category_size_threshold + ) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = CategoricalDataProcessor._compute_p_value(X, y, category, model_type, self.scale_contingency_table) + pval = CategoricalDataProcessor._compute_p_value( + X, y, category, model_type, self.scale_contingency_table + ) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -266,7 +281,10 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: Data with additional transformed variables. """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: @@ -298,7 +316,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings(data, column_name_clean) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings( + data, column_name_clean + ) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -307,10 +327,15 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram # Log warning if categories is None, which indicates it is # not in fitted output if categories is None: - log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) return data - data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_categories( + data.loc[ + :, column_name_clean + ] = CategoricalDataProcessor._replace_categories( data[column_name_clean], categories, self.regroup_name ) @@ -319,7 +344,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram return data - def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fit and transform the data. Parameters @@ -341,7 +368,9 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: s return self.transform(data, column_names) @staticmethod - def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: + def _get_small_categories( + predictor_series: pd.Series, incidence: float, category_size_threshold: int + ) -> set: """ Fetch categories with a size below a certain threshold. @@ -372,7 +401,9 @@ def _get_small_categories(predictor_series: pd.Series, incidence: float, categor return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None) -> pd.DataFrame: + def _replace_missings( + data: pd.DataFrame, column_names: Optional[Union[list[str], str]] = None + ) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -403,7 +434,11 @@ def _replace_missings(data: pd.DataFrame, column_names: Optional[Union[list[str] @staticmethod def _compute_p_value( - X: pd.Series, y: pd.Series, category: str, model_type: str, scale_contingency_table: bool + X: pd.Series, + y: pd.Series, + category: str, + model_type: str, + scale_contingency_table: bool, ) -> float: """ Calculate p-value. 
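
Background for the hunk below: `_compute_p_value` tests whether one category's target distribution differs from all other categories combined. For classification this is a chi-squared test on a 2x2 contingency table; for regression, a Kruskal-Wallis test on the two groups' target values. A toy version of the classification branch, built from the same pandas/scipy calls (the data here is illustrative only):

    import pandas as pd
    from scipy import stats

    X = pd.Series(["A", "A", "B", "C", "B", "A"])  # candidate category: "A"
    y = pd.Series([1, 1, 0, 0, 1, 1])              # binary target

    other_categories = (X != "A").astype(int)      # 0 = "A", 1 = everything else
    contingency_table = pd.crosstab(index=other_categories, columns=y)
    pval = stats.chi2_contingency(contingency_table, correction=False)[1]
    # a large p-value means "A" is indistinguishable from the rest,
    # so it would be merged into the regrouped ("Other") category
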
@@ -439,7 +474,9 @@ def _compute_p_value( df["other_categories"] = np.where(X == category, 0, 1) if model_type == "classification": - contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], margins=False) + contingency_table = pd.crosstab( + index=df["other_categories"], columns=df["y"], margins=False + ) # if true, we scale the "other" categories if scale_contingency_table: @@ -453,12 +490,16 @@ def _compute_p_value( pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal(df.y[df.other_categories == 0], df.y[df.other_categories == 1])[1] + pval = stats.kruskal( + df.y[df.other_categories == 0], df.y[df.other_categories == 1] + )[1] return pval @staticmethod - def _replace_categories(data: pd.Series, categories: set, replace_with: str) -> pd.Series: + def _replace_categories( + data: pd.Series, categories: set, replace_with: str + ) -> pd.Series: """ Replace categories in set with "Other". diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index 0ad6265..84fae51 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -137,7 +137,8 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_bins_by_column"] = { - key: [list(tup) for tup in value] if value else None for key, value in self._bins_by_column.items() + key: [list(tup) for tup in value] if value else None + for key, value in self._bins_by_column.items() } return params @@ -160,7 +161,9 @@ def set_attributes_from_dict(self, params: dict): if type(_bins_by_column) != dict: raise ValueError( - "_bins_by_column is expected to be a dict but is of type {} instead".format(type(_bins_by_column)) + "_bins_by_column is expected to be a dict but is of type {} instead".format( + type(_bins_by_column) + ) ) # Clean out params dictionary to remove unknown keys (for safety!) @@ -171,7 +174,8 @@ def set_attributes_from_dict(self, params: dict): self.set_params(**params) self._bins_by_column = { - key: ([tuple(v) for v in value] if value else None) for key, value in _bins_by_column.items() + key: ([tuple(v) for v in value] if value else None) + for key, value in _bins_by_column.items() } return self @@ -195,7 +199,10 @@ def fit(self, data: pd.DataFrame, column_names: list): for column_name in tqdm(column_names, desc="Computing discretization bins..."): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue bins = self._fit_column(data, column_name) @@ -203,7 +210,9 @@ def fit(self, data: pd.DataFrame, column_names: list): # Add to bins_by_column for later use self._bins_by_column[column_name] = bins - def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tuple]]: + def _fit_column( + self, data: pd.DataFrame, column_name: str + ) -> Optional[List[tuple]]: """Compute bins for a specific column in data. 
Parameters @@ -221,7 +230,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning("Predictor '{}' is constant and will be ignored in computation".format(column_name)) + log.warning( + "Predictor '{}' is constant and will be ignored in computation".format( + column_name + ) + ) return None prop_inf = np.sum(np.isinf(data[column_name])) / data[column_name].shape[0] @@ -237,7 +250,11 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] if prop_nan >= 0.99: - log.warning(f"Column {column_name} is" f" {prop_nan:.1%}% NaNs, " f"consider dropping or transforming it.") + log.warning( + f"Column {column_name} is" + f" {prop_nan:.1%}% NaNs, " + f"consider dropping or transforming it." + ) n_bins = self.n_bins if self.auto_adapt_bins: @@ -249,7 +266,8 @@ def _fit_column(self, data: pd.DataFrame, column_name: str) -> Optional[List[tup if len(bin_edges) < 3: log.warning( - "Only 1 bin was found for predictor '{}' so it will " "be ignored in computation".format(column_name) + "Only 1 bin was found for predictor '{}' so it will " + "be ignored in computation".format(column_name) ) return None @@ -281,12 +299,18 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: data with additional discretized variables """ if len(self._bins_by_column) == 0: - msg = "{} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in tqdm(column_names, desc="Discretizing columns..."): if column_name not in self._bins_by_column: - log.warning("Column '{}' is not in fitted output " "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) continue # can be None for a column with a constant value! @@ -296,7 +320,9 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: return data - def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tuple]) -> pd.DataFrame: + def _transform_column( + self, data: pd.DataFrame, column_name: str, bins: List[tuple] + ) -> pd.DataFrame: """Create a new column with binned values of column_name. Parameters @@ -323,7 +349,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str, bins: List[tup # Rename bins so that the output has a proper format bin_labels = self._create_bin_labels(bins) - data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories(bin_labels) + data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories( + bin_labels + ) if data[column_name_bin].isnull().sum() > 0: @@ -355,7 +383,12 @@ def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: return self.transform(data, column_names) def _compute_bin_edges( - self, data: pd.DataFrame, column_name: str, n_bins: int, col_min: float, col_max: float + self, + data: pd.DataFrame, + column_name: str, + n_bins: int, + col_min: float, + col_max: float, ) -> list: """Compute the desired bin edges. 
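
For orientation: `_compute_bin_edges` turns a numeric column into `n_bins` interval edges according to the configured strategy. The quantile flavour of that idea is what `pandas.qcut` gives in one call, so a rough stand-in (not cobra's implementation, which additionally handles infinities, NaNs, too-few distinct edges and label formatting) looks like this:

    import pandas as pd

    values = pd.Series([1, 2, 2, 3, 5, 8, 13, 21])

    binned = pd.qcut(values, q=4)           # 4 bins with roughly equal populations
    print(binned.value_counts(sort=False))  # here: 2 observations per bin
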
@@ -475,7 +508,9 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index(intervals: List[tuple], closed: str = "right") -> pd.IntervalIndex: + def _create_index( + intervals: List[tuple], closed: str = "right" + ) -> pd.IntervalIndex: """ Create an pd.IntervalIndex based on a list of tuples. diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 3a82efa..c873b68 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -169,7 +169,13 @@ def from_params( ) discretizer = KBinsDiscretizer( - n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format + n_bins, + strategy, + closed, + auto_adapt_bins, + starting_precision, + label_format, + change_endpoint_format, ) target_encoder = TargetEncoder(weight, imputation_strategy) @@ -201,10 +207,15 @@ def from_pipeline(cls, pipeline: dict): and no others. """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline, as it does not " "contain all and only the required parameters.") + raise ValueError( + "Invalid pipeline, as it does not " + "contain all and only the required parameters." + ) categorical_data_processor = CategoricalDataProcessor() - categorical_data_processor.set_attributes_from_dict(pipeline["categorical_data_processor"]) + categorical_data_processor.set_attributes_from_dict( + pipeline["categorical_data_processor"] + ) # model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() @@ -213,9 +224,20 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, is_fitted=pipeline["_is_fitted"]) + return cls( + categorical_data_processor, + discretizer, + target_encoder, + is_fitted=pipeline["_is_fitted"], + ) - def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str): + def fit( + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, + ): """Fit the data to the preprocessing pipeline. Parameters @@ -230,7 +252,9 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: li Column name of the target. 
""" # get list of all variables - preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) log.info("Starting to fit pipeline") start = time.time() @@ -244,25 +268,39 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: li if continuous_vars: begin = time.time() self._discretizer.fit(train_data, continuous_vars) - log.info("Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin)) + log.info( + "Fitting KBinsDiscretizer took {} seconds".format(time.time() - begin) + ) train_data = self._discretizer.transform(train_data, continuous_vars) if discrete_vars: begin = time.time() - self._categorical_data_processor.fit(train_data, discrete_vars, target_column_name) - log.info("Fitting categorical_data_processor class took {} seconds".format(time.time() - begin)) + self._categorical_data_processor.fit( + train_data, discrete_vars, target_column_name + ) + log.info( + "Fitting categorical_data_processor class took {} seconds".format( + time.time() - begin + ) + ) - train_data = self._categorical_data_processor.transform(train_data, discrete_vars) + train_data = self._categorical_data_processor.transform( + train_data, discrete_vars + ) begin = time.time() - self._target_encoder.fit(train_data, preprocessed_variable_names, target_column_name) + self._target_encoder.fit( + train_data, preprocessed_variable_names, target_column_name + ) log.info("Fitting TargetEncoder took {} seconds".format(time.time() - begin)) self._is_fitted = True # set fitted boolean to True log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list) -> pd.DataFrame: + def transform( + self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list + ) -> pd.DataFrame: """Transform the data by applying the preprocessing pipeline. Parameters @@ -288,11 +326,14 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: li if not self._is_fitted: msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." ) raise NotFittedError(msg.format(self.__class__.__name__)) - preprocessed_variable_names = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + preprocessed_variable_names = PreProcessor._get_variable_list( + continuous_vars, discrete_vars + ) if continuous_vars: data = self._discretizer.transform(data, continuous_vars) @@ -307,7 +348,11 @@ def transform(self, data: pd.DataFrame, continuous_vars: list, discrete_vars: li return data def fit_transform( - self, train_data: pd.DataFrame, continuous_vars: list, discrete_vars: list, target_column_name: str + self, + train_data: pd.DataFrame, + continuous_vars: list, + discrete_vars: list, + target_column_name: str, ) -> pd.DataFrame: """Fit preprocessing pipeline and transform the data. @@ -333,7 +378,10 @@ def fit_transform( @staticmethod def train_selection_validation_split( - data: pd.DataFrame, train_prop: float = 0.6, selection_prop: float = 0.2, validation_prop: float = 0.2 + data: pd.DataFrame, + train_prop: float = 0.6, + selection_prop: float = 0.2, + validation_prop: float = 0.2, ) -> pd.DataFrame: """Add `split` column with train/selection/validation values to the dataset. 
@@ -358,7 +406,10 @@ def train_selection_validation_split( DataFrame with additional split column. """ if not math.isclose(train_prop + selection_prop + validation_prop, 1.0): - raise ValueError("The sum of train_prop, selection_prop and " "validation_prop must be 1.0.") + raise ValueError( + "The sum of train_prop, selection_prop and " + "validation_prop must be 1.0." + ) if train_prop == 0.0: raise ValueError("train_prop cannot be zero!") @@ -373,7 +424,10 @@ def train_selection_validation_split( correction = nrows - (size_train + size_select + size_valid) split = ( - ["train"] * size_train + ["train"] * correction + ["selection"] * size_select + ["validation"] * size_valid + ["train"] * size_train + + ["train"] * correction + + ["selection"] * size_select + + ["validation"] * size_valid ) shuffle(split) @@ -395,9 +449,13 @@ def serialize_pipeline(self) -> dict[str, Any]: Return the pipeline as a dictionary. """ pipeline: dict[str, Any] - pipeline = {"metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")}} + pipeline = { + "metadata": {"timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S")} + } - pipeline["categorical_data_processor"] = self._categorical_data_processor.attributes_to_dict() + pipeline[ + "categorical_data_processor" + ] = self._categorical_data_processor.attributes_to_dict() pipeline["discretizer"] = self._discretizer.attributes_to_dict() pipeline["target_encoder"] = self._target_encoder.attributes_to_dict() @@ -416,7 +474,9 @@ def _is_valid_pipeline(pipeline: dict) -> bool: Loaded pipeline from JSON file. """ keys = inspect.getfullargspec(PreProcessor.from_params).args - valid_keys = set([key for key in keys if key not in ["cls", "serialization_path"]]) + valid_keys = set( + [key for key in keys if key not in ["cls", "serialization_path"]] + ) input_keys: Set[str] = set() for key in pipeline: @@ -453,7 +513,9 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list: ValueError In case both lists are empty. """ - var_list = [col + "_processed" for col in discrete_vars] + [col + "_bin" for col in continuous_vars] + var_list = [col + "_processed" for col in discrete_vars] + [ + col + "_bin" for col in continuous_vars + ] if not var_list: raise ValueError("Variable var_list is None or empty list.") diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index cd6bc34..7cd3f6a 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -75,7 +75,9 @@ def __init__(self, weight: float = 0.0, imputation_strategy: str = "mean"): elif imputation_strategy not in self.valid_imputation_strategies: raise ValueError( "Valid options for 'imputation_strategy' are {}. 
" - "Got imputation_strategy={!r} instead.".format(self.valid_imputation_strategies, imputation_strategy) + "Got imputation_strategy={!r} instead.".format( + self.valid_imputation_strategies, imputation_strategy + ) ) if weight == 0: @@ -104,7 +106,9 @@ def attributes_to_dict(self) -> dict: """ params = self.get_params() - params["_mapping"] = {key: value.to_dict() for key, value in self._mapping.items()} + params["_mapping"] = { + key: value.to_dict() for key, value in self._mapping.items() + } params["_global_mean"] = self._global_mean @@ -122,7 +126,10 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] - if "imputation_strategy" in params and params["imputation_strategy"] in self.valid_imputation_strategies: + if ( + "imputation_strategy" in params + and params["imputation_strategy"] in self.valid_imputation_strategies + ): self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -137,7 +144,9 @@ def dict_to_series(key, value): s.index.name = key return s - self._mapping = {key: dict_to_series(key, value) for key, value in _mapping.items()} + self._mapping = { + key: dict_to_series(key, value) for key, value in _mapping.items() + } return self @@ -160,7 +169,10 @@ def fit(self, data: pd.DataFrame, column_names: list, target_column: str): for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " "skipped in fitting".format(column)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column) + ) continue self._mapping[column] = self._fit_column(data[column], y) @@ -220,7 +232,8 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """ if (len(self._mapping) == 0) or (self._global_mean is None): msg = ( - "This {} instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method." + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -229,7 +242,10 @@ def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: log.warning("Unknown column '{}' will be skipped.".format(column)) continue elif column not in self._mapping: - log.warning("Column '{}' is not in fitted output " "and will be skipped.".format(column)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped.".format(column) + ) continue data = self._transform_column(data, column) @@ -258,7 +274,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = data[column_name].map(self._mapping[column_name]).astype("float") + data[new_column] = ( + data[column_name].map(self._mapping[column_name]).astype("float") + ) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -274,7 +292,9 @@ def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFram return data - def fit_transform(self, data: pd.DataFrame, column_names: list, target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fit the encoder and transform the data. Parameters diff --git a/cobra/utils.py b/cobra/utils.py index 0287947..4efee0d 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -8,4 +8,6 @@ def clean_predictor_name(predictor_name: str) -> str: This is done by stripping the redundant suffix (e.g. "_enc" or "_bin") off from the end of the predictor name to return a clean version of the predictor """ - return predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + return ( + predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + ) From ae2296d06a718329b4bbfe89d332d69ca702ad6f Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Fri, 24 Jun 2022 15:52:43 +0200 Subject: [PATCH 7/9] chore: remove line length of 120, default is 88 --- Makefile | 2 +- cobra/evaluation/evaluator.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index c0b115b..3b20397 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ test-unit: @echo 'unit tests OK' black-check: - black --diff --line-length 80 cobra/ + black --diff cobra/ black: black cobra/ diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 278bf9d..985cd63 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -787,16 +787,12 @@ def plot_qq(self, path: str = None, dim: tuple = (12, 8)): ax.set_xlabel("Theoretical quantiles", fontsize=15) ax.set_xticks( - range( - int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1 - ) + range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1) ) ax.set_ylabel("Standardized residuals", fontsize=15) ax.set_yticks( - range( - int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1 - ) + range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1) ) ax.legend(loc="best") From 816a0449d81b51a9c9ed870a969ddf4d8f951840 Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Tue, 12 Jul 2022 14:35:36 +0200 Subject: [PATCH 8/9] chore: Python version 3.9 to work with typings --- .github/workflows/development_CI.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/development_CI.yaml b/.github/workflows/development_CI.yaml index e0f18f8..100c71e 100644 --- a/.github/workflows/development_CI.yaml +++ b/.github/workflows/development_CI.yaml @@ -16,10 +16,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.9 - name: Install dependencies run: | From 9b60ffe3fd2fa8d01eeaf16589411fb75996881b Mon Sep 17 00:00:00 2001 From: ZlaTanskY Date: Tue, 12 Jul 2022 14:36:26 +0200 Subject: [PATCH 9/9] chore: add pytest-cov --- requirements.dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.dev.txt b/requirements.dev.txt index 9534dc0..dc2121b 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -3,4 +3,5 @@ mypy>=0.942 pycodestyle>=2.8.0 pydocstyle>=6.1.1 pytest>=7.1.1 -pytest-mock>=3.7.0 \ No newline at end of file +pytest-mock>=3.7.0 +pytest-cov>=3.0.0 \ No newline at end of file
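
With pytest-cov in the dev requirements, a coverage run is typically invoked as `pytest --cov=cobra tests`. A matching Makefile target (hypothetical, not part of this patch, but in the style of the existing ones) could look like:

    test-coverage:
    	pytest --cov=cobra tests
    	@echo 'coverage OK'
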